knitr::opts_chunk$set(echo = TRUE) 

1.0. Read in sampeled data:

The first step is to read in the sampled species names. These names are assigned to a vector.

1.1. Percentage representation of parietal eye status by family:

1.2. Plot family composition of parietal eye:

library(ggplot2)
ggplot(family_composition, aes(x = Family)) + 
  geom_bar(aes(y = percent_present, fill = "1"), stat = "identity") + 
  geom_bar(aes(y = percent_absent, fill = "0"), stat = "identity") + # specify the two categories "1" and "0"
  labs(x = "Family", y = "Percentage", fill = "Parietal eye value:") + # assign bars to columns
  ggtitle("Composition of families by parietal eye value:") + # add title
  scale_fill_manual(values = c("1" = "green", "0" = "red"), 
                    name = "Parietal eye value:", 
                    labels = c("absent", "present")) + # assign colors according to parietal eye state
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1),
        title = element_text(face = "bold")) # adjust x-axis text by angel and change legend title to bold

library(knitr)
library(dplyr)
library(tidyr)

# filter families and exclude families with less than 10 representatives
filtered_data <- data_families %>%
  group_by(Family) %>% # group by the families of the species
  filter(n() > 10) %>%
  ungroup()

#saveRDS(filtered_data, file="C:/Users/Daniel/Desktop/results_R/filtered_data.RDS")

# summarise all data for the table
summary_table <- filtered_data %>%
  group_by(Family) %>%
  summarise(
    Present = mean(parietal_eye == 1) * 100,  # present prietal eye percentage
    Absent = 100 - Present,  # absent parietal eye percentage
    `Total count` = n()  # family count
  )

# adjust the table structure by adding column names to it
summary_table <- summary_table %>%
  pivot_longer(cols = c(Present, Absent), names_to = "Eye_Status", values_to = "Percentage") %>%
  pivot_wider(names_from = Eye_Status, values_from = Percentage)

# name the columns
summary_table <- summary_table[, c("Family", "Present", "Absent", "Total count")]

summary_table

Procedure for uncleaned data:

The first analyses and calculations are carried out with “uncleaned data”. Later, the data will be cleaned for more precise results. This comparison should also provide information about the quality of the cleaned data compared to the uncleaned data. Here are the results of the uncleaned data.

2.1. Download coordinates of GBIF (uncleaned):

At the very beginning, based on the sampled species names and the created “species_vector”, data about “scientificName”, “decimalLongitude” and “decimalLatitude” are downloaded from GBIF and put into a list called “gbif_data_uncleaned”:

library(rgbif)

gbif_data_uncleaned <- vector(mode = "list", length = length(species_vector)) 
for (i in 1:length(species_vector)) { # loop through "species_vector"
  results <- occ_data(scientificName = species_vector[i], hasCoordinate = TRUE, limit = 500)  #limit is set to n= 500 for downloaded cases of each species  
  results <- as.data.frame(results$data, col.names= c("scientificName", "decimalLongitude", "decimalLatitude")) # assignment of downloaded data to columns
  if (nrow(results) > 0) {
    gbif_data_uncleaned[[i]] <- results[,c("scientificName", "decimalLongitude", "decimalLatitude")]
    names(gbif_data_uncleaned)[i] <- results$scientificName[1]
  } else {
    gbif_data_uncleaned[[i]] <- NA # conditional statement differentiates between available and not available data
  } 
}
gbif_data_uncleaned
# saveRDS(gbif_data_uncleaned, file="C:/Users/Daniel/Desktop/results_R/gbif_data_uncleaned.RDS")

gbif_data_uncleaned <- readRDS("C:/Users/Daniel/Desktop/results_R/gbif_data_uncleaned.RDS")

The end of the list is shown as an example. This way you can see if the function that will be used later to clean up the downloaded coordinates is essential.

gbif_data_uncleaned <- readRDS("C:/Users/Daniel/Desktop/results_R/gbif_data_uncleaned.RDS")
tail_gbif_data_uncleaned<- tail(gbif_data_uncleaned$`Zonosaurus ornatus (Gray, 1831)`,22)
tail_gbif_data_uncleaned

Furthermore, the total number of downloaded individuals is re cored to get an overview about the sample size:

# counting total rows of gbif_data_uncleaned:  
total_rows_gbif_data_uncleaned <- 0
for (i in 1:length(gbif_data_uncleaned)) { # iteration through the list
  if (inherits(gbif_data_uncleaned[[i]], "data.frame") || inherits(gbif_data_uncleaned[[i]], "tbl_df")) {
    total_rows_gbif_data_uncleaned <- total_rows_gbif_data_uncleaned + nrow(gbif_data_uncleaned[[i]]) # adding up the rows 
  }
}
print(total_rows_gbif_data_uncleaned)
## [1] 175521

3.1. Centroid distribution of species (uncleaned):

The median of the coordinates for each species is calculated because of the large number of individuals and also because it is more accurate.

library(dplyr)
library(purrr)

coordsmedian_uncleaned <- gbif_data_uncleaned %>%
  map_df(~ .x %>%
           group_by(scientificName) %>% # group the data by their "scientificName"
           summarise(n = n(),
                     medianLat = median(decimalLatitude, na.rm = TRUE), # calculate median of latitude
                     medianLon = median(decimalLongitude, na.rm = TRUE))) %>% # calculate median of longitude
  group_by(scientificName) %>%
  summarise(medianLat = sum(n * medianLat) / sum(n),  # calculate the mean latitude for each species
            medianLon = sum(n * medianLon) / sum(n))  # calculate the mean longitude for each species
coordsmedian_uncleaned

#saveRDS(coordsmedian_uncleaned, file="C:/Users/Daniel/Desktop/results_R/coordsmedian_uncleaned.RDS")

#coordsmedian_uncleaned <- readRDS("C:/Users/Daniel/Desktop/results_R/coordsmedian_uncleaned.RDS")

4.1. Parietal eye in dependency on median latitude (uncleaned):

At this point the sampled state of the parietal eye for each species is added to a new column in the list “coordsmedian.parietal_uncleaned”. In addition, all species that do not match the names in the Excel table are excluded from the final list.

coordsmedian.parietal_uncleaned<- coordsmedian_uncleaned
coordsmedian.parietal_uncleaned$parietal_eye <- NA # add a new column
for (i in 1:nrow(squamata_Amph.Lac.Serp_xl)) { # Loop through rows of "squamata_Amph.Lac.Serp_xl"
  match_row <- which(sapply(strsplit(coordsmedian.parietal_uncleaned$scientificName, " "), function(x) any(x %in% strsplit(squamata_Amph.Lac.Serp_xl$Taxon[i], " ")[[1]]))) # find matching rows
  if (length(match_row) > 0) {
    coordsmedian.parietal_uncleaned$parietal_eye[match_row] <- squamata_Amph.Lac.Serp_xl$parietal_eye[i] # add the information to new column
  }
}
coordsmedian.parietal_uncleaned

#saveRDS(coordsmedian.parietal_uncleaned, file="C:/Users/Daniel/Desktop/results_R/coordsmedian.parietal_uncleaned.RDS")
#coordsmedian_uncleaned <- readRDS("C:/Users/Daniel/Desktop/results_R/coordsmedian.parietal_uncleaned.RDS")

coordsmedian.parietalfiltered_uncleaned <- coordsmedian.parietal_uncleaned %>% # filter "coordsmean_uncleaned"
  filter(parietal_eye == 1 | parietal_eye == 0)

#saveRDS(coordsmedian.parietalfiltered_uncleaned, #file="C:/Users/Daniel/Desktop/results_R/coordsmedian.parietalfiltered_uncleaned.RDS")

#coordsmedian.parietalfiltered_uncleaned <- readRDS("C:/Users/Daniel/Desktop/results_R/coordsmedian.parietalfiltered_uncleaned.RDS")

5.1. Geographical world map (uncleaned):

The state of the parietal eye within the sampled squamates is then plotted on a world map to provide an insight into the median distribution of each species with information on the state of the parietal eye.

# plot median geographical distribution by state of parietal eye - non interactive world map (uncleaned)
library(maps)
library(ggplot2)

coordsmedian.parietalfiltered_uncleaned <- readRDS("C:/Users/Daniel/Desktop/results_R/coordsmedian.parietalfiltered_uncleaned.RDS")
world_map_uncleaned <- map_data("world") # load data for the world map
point_colors <- c("#FF0000", "#99CC00") # assign colors to the two categories of the parietal eye
ggplot() +
  geom_polygon(data = world_map_uncleaned, aes(x = long, y = lat, group = group), fill = "#99CCFF", color = "#00FFCC") + # plot the world map
  geom_point(data = coordsmedian.parietalfiltered_uncleaned, aes(x = medianLon, y = medianLat, color = factor(parietal_eye)), # plot the median geographical coordinates of the species
             shape = 1,  
             size = 0.5) +  
  xlab("Longitude") +
  ylab("Latitude") +
  coord_fixed() +
  theme_bw() +
  labs(color = "State of parietal eye:") +
  ggtitle(expression(bold("Median geographical distribution by parietal eye (uncleaned data):"))) +
  scale_color_manual(values = c("#FF0000", "#99CC00"), breaks = c("0", "1"), # two categories of parietal eye
                     labels = c("absent", "present"), 
                     guide = guide_legend(override.aes = list(shape = 1, size = 3))) +  # Adjust legend 
  theme(legend.key.size = unit(0.3, "cm"),
        plot.title = element_text(size = rel(1), face = "bold")) 

7.0. Statistical analysation of median latitude (uncleaned):

In the following part, the dependence of the different median latitudinal distributions on the state of the parietal eye is compared. First, the globe is split into its two hemispheres to prevent the values from cancelling each other out. from cancelling each other out, and to see the differences in the two gradients towards the poles, since the Earth’s mass is not distributed centrally between the hemispheres, but more towards the north.

7.1.1. Statistics of north hemisphere (uncleaned):

coordsmediannorth_uncleaned <- coordsmedian.parietalfiltered_uncleaned %>%
  filter(medianLat >0 ) # Filter data for North Hemisphere

# Create boxplots with median, 25- and 75-quantiles
ggplot(coordsmediannorth_uncleaned %>% filter(medianLat > 0), aes(x = factor(parietal_eye), y = medianLat, fill = factor(parietal_eye))) +
  geom_boxplot(width = 0.75) + # Adjust boxplot width
  stat_summary(fun = median, geom = "text", aes(label = round(..y.., 2)), vjust = 1, size = 3, color = "#000000") + #calculate median
    stat_summary(fun = function(x) quantile(x, 0.25), geom = "text", aes(label = round(..y.., 2)), vjust = 1.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate 25-quantile
  stat_summary(fun = function(x) quantile(x, 0.75), geom = "text", aes(label = round(..y.., 2)), vjust = -0.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate 75-quantile
  xlab("Parietal eye") +
  ylab("Median latitude") +
  ggtitle("Median latitudinal distribution (North Hemisphere uncleaned):") + 
  theme_bw() +
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present")) + # Adjust legend with the two categories
  labs(fill = "Parietal eye:") + # Adjust legend title
  theme(plot.title = element_text(size = 11, face = "bold"), 
        legend.position = "right",
        legend.title = element_text(size = 12), 
        legend.text = element_text(size = 12)) 

7.2.1. Statistics of north hemisphere for coordinates in America (uncleaned):

The same procedure is done for the northern hemisphere, but for species living between the coordinates -125° and -30° degrees. This includes North and South America.

library(ggplot2)

coordsmediannorth_america_uncleaned <- coordsmedian.parietalfiltered_uncleaned %>%
  filter(medianLon >= -125 & medianLon <= -30 & medianLat > 0) # Filter data between -30° and -125° longitude for North American species

# Create boxplots with mean, 25- and 75-quantiles
ggplot(coordsmediannorth_america_uncleaned %>% filter(medianLat > 0), aes(x = factor(parietal_eye), y = medianLat, fill = factor(parietal_eye))) +
  geom_boxplot(width = 0.75) + # Adjust boxplot width
  stat_summary(fun = median, geom = "text", aes(label = round(..y.., 2)), vjust = 1, size = 3, color = "#000000") + # calculate median
  stat_summary(fun = function(x) quantile(x, 0.25), geom = "text", aes(label = round(..y.., 2)), vjust = 1.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate 25 quantile
  stat_summary(fun = function(x) quantile(x, 0.75), geom = "text", aes(label = round(..y.., 2)), vjust = -0.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate 75 quantile
  xlab("Parietal eye") +
  ylab("Median latitude") +
  ggtitle("Median latitudinal distribution (American North Hemisphere uncleaned):") + 
  theme_bw() +
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present")) + # Adjust legend 
  labs(fill = "Parietal eye:") + # Adjust legend title
  theme(plot.title = element_text(size = 11, face = "bold"), 
        legend.position = "right",
        legend.title = element_text(size = 12), 
        legend.text = element_text(size = 12)) 

7.3.1. Statistics of north hemisphere for all coordinates except America (uncleaned):

Despite the New World squamates, the Old World squamates are also analysed for the state of the parietal eye as a function of the median latitudinal distribution. For this procedure, the coordinate range is the opposite of that used for the Americas.

library(ggplot2)

coordsmediannorth_not_america_uncleaned <- coordsmedian.parietalfiltered_uncleaned %>%
  filter(!(medianLon >= -125 & medianLon <= -30) & medianLat > 0) # Filter data that are not in -30° and -125° longitude to exclude American species

# Create boxplots with mean, 25- and 75-quantiles
ggplot(coordsmediannorth_not_america_uncleaned %>% filter(medianLat > 0), aes(x = factor(parietal_eye), y = medianLat, fill = factor(parietal_eye))) +
  geom_boxplot(width = 0.75) + # Adjust boxplot width
  stat_summary(fun = median, geom = "text", aes(label = round(..y.., 2)), vjust = 1, size = 3, color = "#000000") + # calculate median
  stat_summary(fun = function(x) quantile(x, 0.25), geom = "text", aes(label = round(..y.., 2)), vjust = 1.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate 25-quantile
  stat_summary(fun = function(x) quantile(x, 0.75), geom = "text", aes(label = round(..y.., 2)), vjust = -0.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate 75-quantile
  xlab("Parietal eye") +
  ylab("Median latitude") +
  ggtitle("Median latitudinal distribution (Not American North Hemisphere uncleaned):") + 
  theme_bw() +
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present")) + # Adjust legend 
  labs(fill = "Parietal eye:") + # Adjust legend title
  theme(plot.title = element_text(size = 11, face = "bold"), 
        legend.position = "right",
        legend.title = element_text(size = 12), 
        legend.text = element_text(size = 12)) 

7.4.1. Statistics of south hemisphere (uncleaned):

The same procedure is used for the southern hemisphere. The structure of the code remains the same, the only difference being the selection of squamates whose species median latitude is in the southern part of the equator.

library(ggplot2)

coordsmediansouth_uncleaned <- coordsmedian.parietalfiltered_uncleaned %>%
  filter(medianLat < 0 ) # Filter data for South Hemisphere 

# Create boxplots with mean, 25- and 75-quantiles
ggplot(coordsmediansouth_uncleaned %>% filter(medianLat < 0), aes(x = factor(parietal_eye), y = medianLat, fill = factor(parietal_eye))) +
  geom_boxplot(width = 0.75) + # Adjust boxplot width
  stat_summary(fun = median, geom = "text", aes(label = round(..y.., 2)), vjust = 1, size = 3, color = "#000000") + # calculate median
  stat_summary(fun = function(x) quantile(x, 0.25), geom = "text", aes(label = round(..y.., 2)), vjust = 1.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate 25-quantile
  stat_summary(fun = function(x) quantile(x, 0.75), geom = "text", aes(label = round(..y.., 2)), vjust = -0.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate 75-uantile
  xlab("Parietal eye") +
  ylab("Median latitude") +
  ggtitle("Median latitudinal distribution (South Hemisphere uncleaned):") + 
  theme_bw() +
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present")) + # Adjust legend 
  labs(fill = "Parietal eye:") + # Adjust legend title
  theme(plot.title = element_text(size = 11, face = "bold"), 
        legend.position = "right",
        legend.title = element_text(size = 12), 
        legend.text = element_text(size = 12)) 

7.5.1. Statistics of south hemisphere for coordinates in America (uncleaned):

As above for the species living in the northern hemisphere of the Americas, here is the counterpart for the southern hemisphere.

library(ggplot2)

coordsmediansouth_america_uncleaned <- coordsmedian.parietalfiltered_uncleaned %>%
  filter(medianLon >= -125 & medianLon <= -30 & medianLat < 0) # Filter data that are in -30° and -125° longitude for South Hemisphere in America

# Create boxplots with mean, 25-and 75-quantiles
ggplot(coordsmediansouth_america_uncleaned, aes(x = factor(parietal_eye), y = medianLat, fill = factor(parietal_eye))) +
  geom_boxplot(width = 0.75) + 
  stat_summary(fun = median, geom = "text", aes(label = round(..y.., 2)), vjust = 1, size = 3, color = "#000000") + # calculate median
  stat_summary(fun = function(x) quantile(x, 0.25), geom = "text", aes(label = round(..y.., 2)), vjust = 1.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate 25-qunatile
  stat_summary(fun = function(x) quantile(x, 0.75), geom = "text", aes(label = round(..y.., 2)), vjust = -0.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate 75-quantile
  xlab("Parietal eye") +
  ylab("Median latitude") +
  ggtitle("Median latitudinal distribution (American South Hemisphere uncleaned):") + # Adjust title
  theme_bw() +
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present")) + # Adjust legend 
  labs(fill = "Parietal eye:") + # Adjust legend title
  theme(plot.title = element_text(size = 11, face = "bold"), 
        legend.position = "right",
        legend.title = element_text(size = 12), 
        legend.text = element_text(size = 12)) 

7.6.1. Statistics of south hemisphere for all coordinates except America (uncleaned):

Here is the calculation of species that have their median latitudinal distribution in the southern hemisphere of the globe, excluding species from America.

library(ggplot2)

coordsmediansouth_not_america_uncleaned <- coordsmedian.parietalfiltered_uncleaned %>%
  filter(!(medianLon >= -125 & medianLon <= -30) & medianLat < 0) # Filter data that are in -30° and -125° longitude for rest of the world despite America

# Create boxplots with mean, 25- and 75-quantiles
ggplot(coordsmediansouth_not_america_uncleaned, aes(x = factor(parietal_eye), y = medianLat, fill = factor(parietal_eye))) +
  geom_boxplot(width = 0.75) + 
  stat_summary(fun = median, geom = "text", aes(label = round(..y.., 2)), vjust = 1, size = 3, color = "#000000") + # calculate #median
  stat_summary(fun = function(x) quantile(x, 0.25), geom = "text", aes(label = round(..y.., 2)), vjust = 1.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate 25-qunatile
  stat_summary(fun = function(x) quantile(x, 0.75), geom = "text", aes(label = round(..y.., 2)), vjust = -0.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate 75-qunatile
  xlab("Parietal eye") +
  ylab("Median latitude") +
  ggtitle("Median latitudinal distribution (Not American South Hemisphere uncleaned):") + # Adjust title
  theme_bw() +
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present")) + # Adjust legend labels
  labs(fill = "Parietal Eye:") + # Adjust legend title
  theme(plot.title = element_text(size = 11, face = "bold"), # Increase title size
        legend.position = "right",
        legend.title = element_text(size = 12), 
        legend.text = element_text(size = 12)) 

In addition, a statistical evaluation of the dependence of the parietal eye state on the median latitudinal distribution of squamates in certain areas of the world is made.

8.1.1. t-test of North Hemisphere (uncleaned):

t_test_northhemisphere_uncleaned <- t.test(medianLat ~ parietal_eye, data = coordsmediannorth_uncleaned)
print(t_test_northhemisphere_uncleaned)

8.2.1. t-test of North Hemisphere for coordinates of America (uncleaned):

t_test_northhemisphere_america_uncleaned <- t.test(medianLat ~ parietal_eye, data = coordsmediannorth_america_uncleaned)
print(t_test_northhemisphere_america_uncleaned)

8.3.1. t-test of North Hemisphere for coordinates except America (uncleaned):

t_test_northhemisphere_not_america_uncleaned <- t.test(medianLat ~ parietal_eye, data = coordsmediannorth_not_america_uncleaned)
print(t_test_northhemisphere_not_america_uncleaned)

8.4.1. t-test of South Hemisphere (uncleaned):

t_test_southhemisphere_uncleaned <- t.test(medianLat ~ parietal_eye, data = coordsmediansouth_uncleaned)
print(t_test_southhemisphere_uncleaned)

8.5.1. t-test of South Hemisphere for coordinates of America (uncleaned):

t_test_southhemisphere_america_uncleaned <- t.test(medianLat ~ parietal_eye, data = coordsmediansouth_america_uncleaned)
print(t_test_southhemisphere_america_uncleaned)

8.6.1. t-test of South Hemisphere for coordinates except America (uncleaned):

t_test_southhemisphere_not_america_uncleaned <- t.test(medianLat ~ parietal_eye, data = coordsmediansouth_not_america_uncleaned)
print(t_test_southhemisphere_not_america_uncleaned)

Procedure for cleaned data:

At this point it is important to consider the accuracy of the data stored in “gbif_data_uncleaned”. There are several cases that indicate biased medians due to duplicate or incorrect coordinates, here is a little more insight into the data stored in “gbif_data_uncleand”.

tail_gbif_data_uncleaned<- tail(gbif_data_uncleaned$`Zonosaurus ornatus (Gray, 1831)`,22)
tail_gbif_data_uncleaned

2.2. Download coordinates of GBIF (cleaned):

It becomes clear that many of the downloaded data distort the median distribution of the species. To get rid of these coordinates, there is a useful package called “CoordinateCleaner”. Its functions remove duplicate or incorrect coordinates. In this way a more correct median distribution of the species can be calculated.

#extract with "CoordinateCleaner" from "GBIF" information about "scientificName", "decimalLongitude"and "decimalLatitude" and put them in a list called "gbif_data_cleaned" 
library(sp)
library(CoordinateCleaner)

gbif_data_cleaned <- vector(mode = "list", length = length(squamata_Amph.Lac.Serp_xl$Taxon)) # create a vector
removed_data_count <- 0 # start the count of removed data
for (i in 1:length(species_vector)) { # Iteration through species names
  results <- occ_data(scientificName = species_vector[i], hasCoordinate = TRUE, limit = 500)  # set download limit for each speces to n=500
  results <- as.data.frame(results$data, col.names = c("scientificName", "decimalLongitude", "decimalLatitude")) # store the results in the named columns
  if (nrow(results) > 0) {
    results_clean <- results
    before_cleaning <- nrow(results_clean) 
    results_clean <- cc_cap(results_clean) # remove coordinates of capitals
    removed_data_count <- removed_data_count + (before_cleaning - nrow(results_clean))
    results_clean <- cc_cen(results_clean) # remove coordinates of country centroids
    removed_data_count <- removed_data_count + (before_cleaning - nrow(results_clean))
    results_clean <- cc_dupl(results_clean) # remove duplicated coordinates
    removed_data_count <- removed_data_count + (before_cleaning - nrow(results_clean))
    results_clean <- cc_equ(results_clean)
    removed_data_count <- removed_data_count + (before_cleaning - nrow(results_clean))
    results_clean <- cc_gbif(results_clean) # remove coordinates of gbif headquaters
    removed_data_count <- removed_data_count + (before_cleaning - nrow(results_clean))
    results_clean <- cc_inst(results_clean) # remove coordinates of biological institutions
    removed_data_count <- removed_data_count + (before_cleaning - nrow(results_clean))
    results_clean <- cc_val(results_clean) # remove invalid coordinates
    removed_data_count <- removed_data_count + (before_cleaning - nrow(results_clean)) #count total number of removed coordinates 
    
    gbif_data_cleaned[[i]] <- results_clean[, c("scientificName", "decimalLongitude", "decimalLatitude")]
    names(gbif_data_cleaned)[i] <- species_vector[i]
  } else {
    gbif_data_cleaned[[i]] <- NA
  }
}
gbif_data_cleaned
# saveRDS(gbif_data_uncleaned, file="C:/Users/Daniel/Desktop/results_R/gbif_data_cleaned.RDS")
# gbif_data_cleaned<-readRDS("C:/Users/Daniel/Desktop/results_R/gbif_data_cleaned.RDS")

#gbif_data_cleaned<-readRDS("C:/Users/Daniel/Desktop/results_R/gbif_data_cleaned.RDS)

The total number of removed coordinates is also printed to give an idea of the huge potential of the “CoordinateCleaner” package:

removed_data_count <- readRDS("C:/Users/Daniel/Desktop/results_R/removed_data_count.RDS")
removed_data_count
## [1] 333108

All of these removed data may bias the result of the uncleaned analysis above.

2.2.1. Total number of rows in “gbif_data_cleaned”:

Additionally here is a display of the sample size of individuals within “gbif_data_cleaned”.

total_rows_gbif_data_cleaned <- 0
for (i in 1:length(gbif_data_cleaned)) {
  if (inherits(gbif_data_cleaned[[i]], "data.frame") || inherits(gbif_data_cleaned[[i]], "tbl_df")) {
    total_rows_gbif_data_cleaned <- total_rows_gbif_data_cleaned + nrow(gbif_data_cleaned[[i]])
  }
}

print(total_rows_gbif_data_cleaned)

With a sample size that is from such an enormous size and even is represented by cleaned coordinates, the statistical evaluation can be repeated.

3.2. Centroid distribution of species (cleaned):

# calculate the centroid distribution of each species from "gbif_data_cleaned":
coordsmedian_cleaned <- gbif_data_cleaned %>% #pipe in data of "gbif_data_uncleaned"
  map_df(~ .x %>%
           group_by(scientificName) %>% #group the data by "scientificName"
           summarise(n = n(),
                     medianLat = median(decimalLatitude, na.rm = TRUE), 
                     medianLon = median(decimalLongitude, na.rm = TRUE))) %>% 
  group_by(scientificName) %>%
  summarise(medianLat = sum(n * medianLat) / sum(n), # calculate the median latitude for each species
            medianLon = sum(n * medianLon) / sum(n)) # calculate the median longitude for each species
coordsmedian_cleaned
#saveRDS(coordsmedian_cleaned, file="C:/Users/Daniel/Desktop/results_R/coordsmedian_cleaned.RDS")

4.2. Parietal eye in dependence on coordinates (cleaned):

Next, the median coordinates of each species are again made dependent on the state of the parietal eye that has been sampled. A final step is to remove the “BOLD” species from the list. Furthermore, the list is cleared of species with no parietal eye state information available.

# add a new column to "coordsmean_uncleaned" called "parietal_eye" with information of "squamata_Amph.Lac.Serp_xl" and exclude GBIF Backbone Taxonomy (BOLD) in the list "coordsmedianfiltered.parietal_uncleaned"
coordsmedian.parietal_cleaned<- coordsmedian_cleaned
coordsmedian.parietal_cleaned$parietal_eye <- NA # create a new column
for (i in 1:nrow(squamata_Amph.Lac.Serp_xl)) { # Loop through rows of "squamata_Amph.Lac.Serp_xl"
  match_row <- which(sapply(strsplit(coordsmedian.parietal_cleaned$scientificName, " "), function(x) any(x %in% strsplit(squamata_Amph.Lac.Serp_xl$Taxon[i], " ")[[1]])))
  if (length(match_row) > 0) {
    coordsmedian.parietal_cleaned$parietal_eye[match_row] <- squamata_Amph.Lac.Serp_xl$parietal_eye[i] # add information to new column if there are matching names
  }
}
coordsmedian.parietal_cleaned
#saveRDS(coordsmedian.parietal_cleaned, file="C:/Users/Daniel/Desktop/results_R/coordsmedian.parietal_cleaned.RDS")

coordsmedian.parietalfiltered_cleaned <- coordsmedian.parietal_cleaned %>% # filter coordsmedian.parietal_cleaned for the both categories of the parietal eye to exclude all others
  filter(parietal_eye == 1 | parietal_eye == 0)

#saveRDS(coordsmedian.parietalfiltered_cleaned, file="C:/Users/Daniel/Desktop/results_R/coordsmedian.parietalfiltered_cleaned")

5.2. Geographical world map (cleaned):

At this point, the data is again plotted on a world map with information about the state of the parietal eye as a function of latitude.

library(ggplot2)

coordsmedian.parietalfiltered_cleaned <- readRDS("C:/Users/Daniel/Desktop/results_R/coordsmedian.parietalfiltered_cleaned")

# plot median geographical distribution by state of parietal eye - non interactive world map
coordsmedian.parietalfiltered_cleaned <- coordsmedian.parietalfiltered_cleaned %>%
  filter(!is.na(parietal_eye) & parietal_eye %in% c("0", "1")) # excluding rows with NA
world_map_cleaned <- map_data("world") # load data for world map
point_colors <- c("#FF0000", "#99CC00") # assigning the colors to the two categories
ggplot() +
  geom_polygon(data = world_map_cleaned, aes(x = long, y = lat, group = group), fill = "#99CCFF", color = "#00FFCC") + # plot world map
  geom_point(data = coordsmedian.parietalfiltered_cleaned, aes(x = medianLon, y = medianLat, color = factor(parietal_eye)), # plot median geographical coordinates
             shape = 1,  
             size = 0.5) +  
  xlab("Longitude") +
  ylab("Latitude") +
  coord_fixed() +
  theme_bw() +
  labs(color = "State of parietal eye:") +
  ggtitle(expression(bold("Median geographical distribution by parietal eye (cleaned data):"))) + # Adjust title
  scale_color_manual(values = c("#FF0000", "#99CC00"), breaks = c("0", "1"), # two categorical states assigned to colors
                     labels = c("absent", "present"), 
                     guide = guide_legend(override.aes = list(shape = 1, size = 3))) +  # Adjust legend
  theme(legend.key.size = unit(0.3, "cm"),
        plot.title = element_text(size = rel(1), face = "bold")) 

5.3. Latitudinal distribution in dpendence to families parietal eye state:

This graph shows the status of the parietal eye in families with at least 10 analysed species in relation to the latitudinal distribution. Only families with a constant parietal eye state of 0 or 100% are included in the statistics.`

library(dplyr)

coordsmedian_cleaned <- readRDS("C:/Users/Daniel/Desktop/results_R/coordsmedian_cleaned.RDS")
filtered_data<- readRDS("C:/Users/Daniel/Desktop/results_R/filtered_data.RDS")

filtered_data$medianLat<- NA # add new column named "medianLat"

# filter data for matching rows and ad information about "medianLat" for the species
for (i in 1:nrow(filtered_data)) {
  taxon <- filtered_data$Taxon[i]
  matching_rows <- grep(paste(unlist(strsplit(taxon, "\\s+")), collapse=".*"), coordsmedian_cleaned$scientificName)
  if (length(matching_rows) > 0) {
    matching_row <- coordsmedian_cleaned[matching_rows[1], ]
    filtered_data$medianLat[i] <- matching_row$medianLat
  }
}


# only keep families with 10 or more sampled species and 0% or 100% parietal eye occurence
keep_families <- c("Amphisbaenidae", "Diplodactylidae", "Gekkonidae", "Gymnophthalmidae", "Phyllodactylidae", "Sphaerodactylidae", "Teiidae", "Anguidae", "Anolidae", "Leiosauridae", "Liolaemidae", "Phrynosomatidae")

# Filter families
filtered_data <- filtered_data %>%
  filter(Family %in% keep_families)

# Calculate mean medianLat per family
mean_medianLat <- filtered_data %>%
  group_by(Family) %>%
  summarize(mean_medianLat = mean(medianLat, na.rm = TRUE))

# Merge Parietal Eye data
mean_medianLat <- merge(mean_medianLat, summary_table, by = "Family")

# Create new column parietal_eye
mean_medianLat$parietal_eye <- ifelse(mean_medianLat$Present == 100, 1, 0)

# Calculate positive and negative means for green (present) points
mean_green_positive <- mean(mean_medianLat$mean_medianLat[mean_medianLat$parietal_eye == 1 & mean_medianLat$mean_medianLat > 0], na.rm = TRUE)
mean_green_negative <- mean(mean_medianLat$mean_medianLat[mean_medianLat$parietal_eye == 1 & mean_medianLat$mean_medianLat < 0], na.rm = TRUE)

# Calculate positive and negative means for red (absent) points
mean_red_positive <- mean(mean_medianLat$mean_medianLat[mean_medianLat$parietal_eye == 0 & mean_medianLat$mean_medianLat > 0], na.rm = TRUE)
mean_red_negative <- mean(mean_medianLat$mean_medianLat[mean_medianLat$parietal_eye == 0 & mean_medianLat$mean_medianLat < 0], na.rm = TRUE)

ggplot(mean_medianLat, aes(x = Family, y = mean_medianLat, color = factor(parietal_eye))) +
  geom_point(size = 3) +
  geom_text(aes(label = round(mean_medianLat, 2)), hjust = -0.2, vjust = -0.4, size = 3, fontface = "plain") + 
  scale_color_manual(values = c("#FF0000", "#99CC00"), 
                     labels = c("0"= "absent", "1" = "present")) +  # adjust legend labels
  labs(x = "Family", y = "Median Lat", color = "Parietal eye:") +
  geom_hline(yintercept = c(mean_green_positive, mean_green_negative), color = "#99CC00", linetype = "solid") + # add mean line for green positives
  geom_hline(yintercept = c(mean_red_positive, mean_red_negative), color = "#FF0000", linetype = "solid") + # add mean line for red negatives
  annotate("text", y = mean_green_positive, x = 1, label = paste(round(mean_green_positive, 2)), hjust = 0.8, vjust = -0.6, size = 3, fontface = "plain") +
  annotate("text", y = mean_green_negative, x = 1, label = paste(round(mean_green_negative, 2)), hjust = 0.8, vjust = -0.6, size = 3, fontface = "plain") +
  annotate("text", y = mean_red_positive, x = 1, label = paste(round(mean_red_positive, 2)), hjust = 0.8, vjust = -0.6, size = 3, fontface = "plain") +
  annotate("text", y = mean_red_negative, x = 1, label = paste(round(mean_red_negative, 2)), hjust = 0.8, vjust = -0.6, size = 3, fontface = "plain") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), # adjust x-axis labels
        plot.title = element_text(face = "bold", size = 11),  # make the plot title bold
        legend.title = element_text(face = "bold")) +  # make the legend title bold
  ggtitle("Parietal eye state in dependence to families' median latitude:") +
  guides(color = guide_legend(override.aes = list(label = "")))  # correct the legend labels

6.1. Interactive world map (cleaned):

Although the distribution of parietal eye states as a function of latitude gives a first inside, it is not possible to assign the points to species. For this reason, an interactive world map with a drop-down menu and tooltip has been created, which gives a closer inset for each plotted point.

library(usethis)
library(devtools)
library(shiny)
library(leaflet)

# plot median geographical distribution by state of parietal eye -interactive world map with drop down menu and tooltip ("scientificName", "medianLat", "meanLon" and "parietal_eye")

coordsmedian.parietalfiltered_cleaned$point_colors <-  ifelse(coordsmedian.parietalfiltered_cleaned$parietal_eye == 1, "#99CC00", "#FF0000") # assign colors
# create Tooltip with information of the columns "scientificName", "medianLat", "medianLon" and "eye_status"
coordsmedian.parietalfiltered_cleaned$tooltip <- paste("Species:", "<i>", coordsmedian.parietalfiltered_cleaned$scientificName, "</i>",
                                                     "<br>Parietal Eye:", ifelse(coordsmedian.parietalfiltered_cleaned$parietal_eye == 1, "present", "absent"),
                                                     "<br>Median Latitude:", coordsmedian.parietalfiltered_cleaned$medianLat,
                                                     "<br>Median Longitude:", coordsmedian.parietalfiltered_cleaned$medianLon) # creating the three parameters of the Tooltip
# Define the UI
ui <- fluidPage(
  titlePanel("State of parietal eye in dependency to median geographical distribution (cleaned):"),  #add the title to the graphic
  fluidRow(
    column(3,  
           selectizeInput("selectScientificName", "Select Scientific Names",  # creating the drop down menu with its scientific names
                          choices = c("All", unique(gsub("\\(.?\\)|\\[.?\\]", "", coordsmedian.parietalfiltered_cleaned$scientificName))),  # optional choices of the drop down menu
                          multiple = TRUE,  # allow to select several scientific names
                          selected = "All"  # option of selecting all species
           )
    ),
    column(9,  
           leafletOutput("map")  
    )
  )
)

# Define the server logic
server <- function(input, output, session) {
  filtered_data_cleaned <- reactive({
    if ("All" %in% input$selectScientificName) {  
      return(coordsmedian.parietalfiltered_cleaned)  
    } else {
      return(coordsmedian.parietalfiltered_cleaned[coordsmedian.parietalfiltered_cleaned$scientificName %in% input$selectScientificName, ])  
    }
  }) # returning the selected scientific names within the drop down menu in the leaflet map
  
  # Render the leaflet map
  output$map <- renderLeaflet({
    leaflet() %>%  # Initialization
      addProviderTiles("OpenStreetMap.Mapnik") %>%  
      addCircleMarkers(data = filtered_data_cleaned(),  
                       lng = ~medianLon,  
                       lat = ~medianLat,  
                       color = ~point_colors,  
                       radius = 3,  
                       fillOpacity = 1,  
                       popup = ~tooltip) %>%   #Adjust the plot
      addLegend("bottomleft",  # Adjust the legend
                colors = c("#FF0000", "#99CC00"),  
                labels = c("absent", "present"),  
                title = "State of parietal eye")  
  })
}
shinyApp(ui = ui, server = server) # Run the Shiny app
# get access to interactive world map by: Listening on http://127.0.0.1:6064

By clicking on the following link the interactive world map can be opend.

7.0. Statistical analysis of latitude (cleaned):

Statistical analysis of the cleaned data is also essential for further work and to gain insight into the interplay between squamate parietal eye condition and latitudinal distribution.

7.1.2. Statistics of North Hemisphere (cleaned):

library(ggplot2)

# statistics of parietal eye state in dependency of latitudinal distribution:
coordsmediannorth_cleaned <- coordsmedian.parietalfiltered_cleaned %>%
  filter(medianLat >0 ) # Filter data for North Hemisphere

# Create boxplots with mean, 25- and 75-quantiles
ggplot(coordsmediannorth_cleaned %>% filter(medianLat > 0), aes(x = factor(parietal_eye), y = medianLat, fill = factor(parietal_eye))) +
  geom_boxplot(width = 0.75) + 
  stat_summary(fun = median, geom = "text", aes(label = round(..y.., 2)), vjust = 1, size = 3, color = "#000000") + # Median value
  stat_summary(fun = function(x) quantile(x, 0.25), geom = "text", aes(label = round(..y.., 2)), vjust = 1.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate the 25-quantile
  stat_summary(fun = function(x) quantile(x, 0.75), geom = "text", aes(label = round(..y.., 2)), vjust = -0.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate the 75-quantile
  xlab("Parietal eye") +
  ylab("Median latitude") +
  ggtitle("Median latitudinal distribution (North Hemisphere cleaned):") + # Adjust title
  theme_bw() +
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present")) + # Adjust legend 
  labs(fill = "Parietal eye:") + 
  theme(plot.title = element_text(size = 11, face = "bold"), 
        legend.position = "right",
        legend.title = element_text(size = 12), 
        legend.text = element_text(size = 12)) 

7.2.2. Statistics of North Hemisphere for coordinates in America (cleaned):

Again, the interplay is also calculated for American north hemisphere.

library(ggplot2)

coordsmediannorth_america_cleaned <- coordsmedian.parietalfiltered_cleaned %>%
  filter(medianLon >= -125 & medianLon <= -30 & medianLat > 0) # Filter data between -30° and -125° longitude for North hemisphere of America

# Create boxplots with mean, 25- and 75-quantiles
ggplot(coordsmediannorth_america_cleaned %>% filter(medianLat > 0), aes(x = factor(parietal_eye), y = medianLat, fill = factor(parietal_eye))) +
  geom_boxplot(width = 0.75) + 
  stat_summary(fun = median, geom = "text", aes(label = round(..y.., 2)), vjust = 1, size = 3, color = "#000000") + # calculate the median
  stat_summary(fun = function(x) quantile(x, 0.25), geom = "text", aes(label = round(..y.., 2)), vjust = 1.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate the 25-quantile
  stat_summary(fun = function(x) quantile(x, 0.75), geom = "text", aes(label = round(..y.., 2)), vjust = -0.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate the 75-quantile
  xlab("Parietal eye") +
  ylab("Median latitude") +
  ggtitle("Median latitudinal distribution (American North Hemisphere cleaned):") + # Adjust title
  theme_bw() +
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present")) + # Adjust legend labels
  labs(fill = "Parietal Eye:") + # Adjust legend title
  theme(plot.title = element_text(size = 11, face = "bold"), # Increase title size
        legend.position = "right",
        legend.title = element_text(size = 12), # Increase legend title size
        legend.text = element_text(size = 12)) # Increase legend text size

7.3.2. Statistics of North Hemisphere for all coordinates except America (cleaned):

Here the counterpart of the American North Hemisphere is calculated by looking at the rest of the North Hemisphere coordinates despite those of America.

library(ggplot2)

coordsmediannorth_not_america_cleaned <- coordsmedian.parietalfiltered_cleaned %>%
  filter(!(medianLon >= -125 & medianLon <= -30) & medianLat > 0) # Filter data despite -30° and -125° longitude to exclude North Hemisphere of America

# Create boxplots with mean, 25- and 75-quantiles
ggplot(coordsmediannorth_not_america_cleaned, aes(x = factor(parietal_eye), y = medianLat, fill = factor(parietal_eye))) +
  geom_boxplot(width = 0.75) + 
  stat_summary(fun = median, geom = "text", aes(label = round(..y.., 2)), vjust = 1, size = 3, color = "#000000") + # calculate the median
  stat_summary(fun = function(x) quantile(x, 0.25), geom = "text", aes(label = round(..y.., 2)), vjust = 1.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate the 25-quantile
  stat_summary(fun = function(x) quantile(x, 0.75), geom = "text", aes(label = round(..y.., 2)), vjust = -0.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate the 75-quantile
  xlab("Parietal eye") +
  ylab("Median latitude") +
  ggtitle("Median latitudinal distribution (Not American North Hemisphere cleaned):") + # Adjust title
  theme_bw() +
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present")) + # Adjust legend 
  labs(fill = "Parietal eye:") + 
  theme(plot.title = element_text(size = 11, face = "bold"), 
        legend.position = "right",
        legend.title = element_text(size = 12), 
        legend.text = element_text(size = 12)) 

7.4.2. Statistics of South Hemisphere (cleaned):

Here is the graphic that represents the cleaned data of the South Hemisphere of the whole globe.

library(ggplot2)

coordsmediansouth_cleaned <- coordsmedian.parietalfiltered_cleaned %>%
  filter(medianLat < 0 ) # Filter data for South Hemisphere

# Create boxplots with mean, 25- and 75-quantiles
ggplot(coordsmediansouth_cleaned %>% filter(medianLat < 0), aes(x = factor(parietal_eye), y = medianLat, fill = factor(parietal_eye))) +
  geom_boxplot(width = 0.75) + 
  stat_summary(fun = median, geom = "text", aes(label = round(..y.., 2)), vjust = 1, size = 3, color = "#000000") + # calculate the median
  stat_summary(fun = function(x) quantile(x, 0.25), geom = "text", aes(label = round(..y.., 2)), vjust = 1.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate the 25-quantile
  stat_summary(fun = function(x) quantile(x, 0.75), geom = "text", aes(label = round(..y.., 2)), vjust = -0.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate the 75-quantile
  xlab("Parietal eye") +
  ylab("Median latitude") +
  ggtitle("Median latitudinal distribution (South Hemisphere cleaned):") + # Adjust title
  theme_bw() +
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present")) + # Adjust legend 
  labs(fill = "Parietal eye:") +
  theme(plot.title = element_text(size = 11, face = "bold"), 
        legend.position = "right",
        legend.title = element_text(size = 12), 
        legend.text = element_text(size = 12)) 

7.5.2. Statistics of South Hemisphere for coordinates in America (cleaned):

In this part the statistics of American´s southern hemisphere are calculated

library(ggplot2)

coordsmediansouth_america_cleaned <- coordsmedian.parietalfiltered_cleaned %>%
  filter(medianLon >= -125 & medianLon <= -30 & medianLat < 0) # filtere coordinates for south hemisphere of America

# Create boxplots with mean, 25- and 75-quantiles
ggplot(coordsmediansouth_america_uncleaned, aes(x = factor(parietal_eye), y = medianLat, fill = factor(parietal_eye))) +
  geom_boxplot(width = 0.75) + 
  stat_summary(fun = median, geom = "text", aes(label = round(..y.., 2)), vjust = 1, size = 3, color = "#000000") + # calculate the median
  stat_summary(fun = function(x) quantile(x, 0.25), geom = "text", aes(label = round(..y.., 2)), vjust = 1.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate the 25-quantile
  stat_summary(fun = function(x) quantile(x, 0.75), geom = "text", aes(label = round(..y.., 2)), vjust = -0.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate the 75-quantile
  xlab("Parietal eye") +
  ylab("Median latitude") +
  ggtitle("Median latitudinal distribution (American South Hemisphere cleaned):") + # Adjust title
  theme_bw() +
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present")) + # Adjust legend 
  labs(fill = "Parietal eye:") + 
  theme(plot.title = element_text(size = 11, face = "bold"),
        legend.position = "right",
        legend.title = element_text(size = 12), 
        legend.text = element_text(size = 12)) 

7.6.2. Statistics of south hemisphere for all coordinates except America (uncleaned):

Furthermore, the counterpart of the American southern hemisphere coordinates is calculated by looking at the Old World squamates.

library(ggplot2)

coordsmediansouth_not_america_cleaned <- coordsmedian.parietalfiltered_cleaned %>%
  filter(!(medianLon >= -125 & medianLon <= -30) & medianLat < 0) # filter coordinates despite the South Hemisphere of America

# Create boxplots with mean, 25- and 75-quantiles
ggplot(coordsmediansouth_not_america_cleaned, aes(x = factor(parietal_eye), y = medianLat, fill = factor(parietal_eye))) +
  geom_boxplot(width = 0.75) + 
  stat_summary(fun = median, geom = "text", aes(label = round(..y.., 2)), vjust = 1, size = 3, color = "#000000") + # calculate the median
  stat_summary(fun = function(x) quantile(x, 0.25), geom = "text", aes(label = round(..y.., 2)), vjust = 1.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate the 25-quantile
  stat_summary(fun = function(x) quantile(x, 0.75), geom = "text", aes(label = round(..y.., 2)), vjust = -0.5, hjust = 1.1, size = 3, color = "#3399CC") + # calculate the 75-quantile
  xlab("Parietal eye") +
  ylab("Median latitude") +
  ggtitle("Median latitudinal distribution (Not American South Hemisphere cleaned):") + # Adjust title
  theme_bw() +
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present")) + # Adjust legend 
  labs(fill = "Parietal eye:") + 
  theme(plot.title = element_text(size = 11, face = "bold"), 
        legend.position = "right",
        legend.title = element_text(size = 12), 
        legend.text = element_text(size = 12)) 

7.7. combinde boxplot of all hemispheres:

This boxplot graph contains all the cleaned boxplots. This gives a better overview of the effect of different regions on the effect on parietal eye state.

all_data <- bind_rows(
  coordsmediannorth_cleaned %>% mutate(Location = "North"),
  coordsmediannorth_america_cleaned %>% mutate(Location = "North (America)"),
  coordsmediannorth_not_america_cleaned %>% mutate(Location = "North (not America"),
  coordsmediansouth_cleaned %>% mutate(Location = "South"),
  coordsmediansouth_america_cleaned %>% mutate(Location = "South (America)"),
  coordsmediansouth_not_america_cleaned %>% mutate(Location = "South (not America)") # create the combined list for the cleaned boxplots
)

# create the combined boxplot
plot_combined <- ggplot(all_data, aes(x = Location, y = medianLat, fill = factor(parietal_eye))) +
  geom_boxplot(width = 1) +
  stat_summary(data = subset(all_data, parietal_eye == 1), fun = median, geom = "text", aes(x = Location, label = round(..y.., 2)), vjust = -0.8, hjust= -0.1, size = 2.4, color = "#000000") + # calculate median for category 1
  stat_summary(data = subset(all_data, parietal_eye == 0), fun = median, geom = "text", aes(x = Location, label = round(..y.., 2)), vjust = -0.8, hjust= 1.1, size = 2.4, color = "#000000") + # calculate median for category 0
  ylab("Mean Latitude") +
  xlab("Hemisphere") +
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present")) + # Adjust legend  
  labs(fill = "Parietal eye:") +
  ggtitle("Median latitudinal distribution by Hemisphers, New- and Old World Squamates:") +
  theme_bw() +
  theme(plot.title = element_text(size = 9.5, face = "bold"), 
        legend.position = "right",
        legend.title = element_text(size = 9), 
        axis.text.x = element_text(angle = 45, hjust = 1))
print(plot_combined)

8.0. t-tests of cleaned data:

As with the unpurified data above, here is the purified data plotted above and tested with a t-test. For the statistical tests, the specific selected lists are used according to the region of the world analysed.

8.1.2. t-test of North Hemisphere (cleaned):

t_test_northhemisphere_cleaned <- t.test(medianLat ~ parietal_eye, data = coordsmediannorth_cleaned)
print(t_test_northhemisphere_cleaned)

8.2.2. t-test of North Hemisphere for coordinates of America (cleaned):

t_test_northhemisphere_america_cleaned <- t.test(medianLat ~ parietal_eye, data = coordsmediannorth_america_cleaned)
print(t_test_northhemisphere_america_cleaned)

8.3.2. t-test of North Hemisphere for coordinates except America (cleaned):

t_test_northhemisphere_not_america_cleaned <- t.test(medianLat ~ parietal_eye, data = coordsmediannorth_not_america_cleaned)
print(t_test_northhemisphere_not_america_cleaned)

8.4.2. t-test of South Hemisphere (cleaned):

t_test_southhemisphere_cleaned <- t.test(medianLat ~ parietal_eye, data = coordsmediansouth_cleaned)
print(t_test_southhemisphere_cleaned)

8.5.2. t-test of South Hemisphere for coordinates of America (cleaned):

t_test_southhemisphere_america_cleaned <- t.test(medianLat ~ parietal_eye, data = coordsmediansouth_america_cleaned)
print(t_test_southhemisphere_america_cleaned)

8.6.2. t-test of South Hemisphere for coordinates except America (cleaned):

t_test_southhemisphere_not_america_cleaned <- t.test(medianLat ~ parietal_eye, data = coordsmediansouth_not_america_cleaned)
print(t_test_southhemisphere_not_america_cleaned)

9.0. Parietal eye in dependence to circadian rhythm (cleaned):

Beside the dependency of the latitude to the state of the parietal eye, also the circadian rhythm is analysed. Therefore the researched data is used. There are three categories of circadian rhythm which are abbreviated by their first letter (nocturnal, diurnal, cathemeral)

9.1. Circadian rhythm of species (cleaned):

At first a list with information about the coordinates distribution of each species as well as their circadian rhythm is created. Additionally the rows that have no information in the excel data about the circadian rhythm are also excluded in the list “coordsmedian.parietal.cirica_cleaned”.

coordsmedian.parietal.cirica_cleaned <- coordsmedian.parietalfiltered_cleaned
coordsmedian.parietal.cirica_cleaned$circadian_rhythm <- NA # create new column 

for (i in 1:nrow(squamata_Amph.Lac.Serp_xl)) { # loop through "squamata_Amph.Lac.Serp_xl" and add the information to  "coordsmedian.parietal.cirica_cleaned" 
  match_row <- which(sapply(strsplit(coordsmedian.parietal.cirica_cleaned$scientificName, " "), 
                            function(x) any(x %in% strsplit(squamata_Amph.Lac.Serp_xl$Taxon[i], " ")[[1]]))) # searching for matching rows in both data
  if (length(match_row) > 0) {
    coordsmedian.parietal.cirica_cleaned$circadian_rhythm[match_row] <- squamata_Amph.Lac.Serp_xl$circadian_rhythm[i] # add information to the column
  }
}
coordsmedian.parietal.ciricafiltered_cleaned <- coordsmedian.parietal.cirica_cleaned[coordsmedian.parietal.cirica_cleaned$circadian_rhythm %in% c("c", "n", "d"), ] # filter data and exclude rows with "NA"

#saveRDS(coordsmedian.parietal.ciricafiltered_cleaned, file="C:/Users/Daniel/Desktop/results_R/coordsmedian.parietal.ciricafiltered_cleaned.RDS")

10.1. Plot of absolute circadian rhythm (cleaned):

With the list plotted above a graphical display of the data, which show the dependence of the circadian rhythm to the parietal eye state, can be made.

ggplot(coordsmedian.parietal.ciricafiltered_cleaned, aes(x = factor(circadian_rhythm), fill = factor(parietal_eye))) + # assigning the color factor
  geom_bar(position = "stack") +
  labs(title = "Distribution of parietal eye by circadian rhythm:",  # Making title bold
       x = "Circadian rhythm",
       y = "Count",
       fill = "Parietal eye:") +
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present")) +  # Adjust labels by colors
  theme(plot.title = element_text(face = "bold"))  

10.2. Plot of relative circadian rhythm (cleaned):

Furthermore a graphic showing a percentage presentation is also important.

# Calculate the proportions of the three categories
proportions <- coordsmedian.parietal.ciricafiltered_cleaned %>%
  group_by(parietal_eye, circadian_rhythm) %>%
  summarise(count = n()) %>%
  mutate(percent = count / sum(count))
# Plot with ggplot() the percentual circadian rhythm
ggplot(proportions, aes(x = circadian_rhythm, y = percent, fill = factor(parietal_eye))) +
  geom_bar(stat = "identity", position = "fill") +
  labs(title = "Distribution of parietal eye by circadian rhythm:",
       x = "Circadian rhythm",
       y = "Proportion",
       fill = "Parietal eye:") +
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present"))+ # Adjust labels by color
  theme(plot.title = element_text(face = "bold"))  

10.3. Plot of both displays (cleaned):

To avoid giving the impression that there is a clear tendency for cathemeral squamates to be diurnal, the absolute numbers have to be added to the graph.

library(ggplot2)
library(dplyr)
coordsmedian.parietal.ciricafiltered_cleaned <- readRDS("C:/Users/Daniel/Desktop/results_R/coordsmedian.parietal.ciricafiltered_cleaned.RDS")
# Calculate the proportions of the three categories
proportions <- coordsmedian.parietal.ciricafiltered_cleaned %>%
  group_by(parietal_eye, circadian_rhythm) %>%
  summarise(count = n()) %>%
  mutate(percent = count / sum(count))
`summarise()` has grouped output by 'parietal_eye'. You can override using the
`.groups` argument.
# Plot percentual circadian rhythm with ggplot()
ggplot(proportions, aes(x = circadian_rhythm, y = percent, fill = factor(parietal_eye))) +
  geom_bar(stat = "identity", position = "fill") +
  geom_text(aes(label = count), position = position_fill(vjust = 0.5), size = 3) +  # Add the absolute numbers
  labs(title = "Distribution of parietal eye by circadian rhythm:",
       x = "Circadian Rhythm",
       y = "Proportion",
       fill = "Parietal eye:") +
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present")) + # Adjust labels by colors
  theme(plot.title = element_text(face = "bold")) 

11.0. Statistical analysis of circadian rhythm (cleaned):

As proceeded above with the median latitude also the statistics for the circadian rhythm in dependence to the parietal eye state has to be calculated.

11.1. Chi-square test of circadian rhythm (cleaned):

chisq_circadianrhythm <-chisq.test(coordsmedian.parietal.ciricafiltered_cleaned$parietal_eye, coordsmedian.parietal.ciricafiltered_cleaned$circadian_rhythm)
chisq_circadianrhythm

11.2. Fishers exact test of circadian rhythm (cleaned):

fisher_test_circadianrhythm <- fisher.test(table(coordsmedian.parietal.ciricafiltered_cleaned$parietal_eye, coordsmedian.parietal.ciricafiltered_cleaned$circadian_rhythm))
print(fisher_test_circadianrhythm)

In both cases the chi square and the fishers exact test are highly significant results presented.

12.0. Parietal eye in dependence to habitat (cleaned):

The final analysed factor towards the parietal eye state is the habitat of the squamates.

12.1. Habitat of species (cleaned):

To analyse the habitat in dependence to the habitat, the researched habitats are assigned towards the species.

#create a new column in list "coordsmedian.parietal.habitat_cleaned" and fill it with data out of "squamata_Amph.Lac.Serp_xl$habitat_abb"
coordsmedian.parietal.habitat_cleaned<- coordsmedian.parietalfiltered_cleaned
coordsmedian.parietal.habitat_cleaned$habitat_abb<- NA # assign new column
for (i in 1:nrow(squamata_Amph.Lac.Serp_xl)) { # loop through "squamata_Amph.Lac.Serp_xl"
  match_row <- which(sapply(strsplit(coordsmedian.parietal.habitat_cleaned$scientificName, " "), function(x) any(x %in% strsplit(squamata_Amph.Lac.Serp_xl$Taxon[i], " ")[[1]]))) # Find matching rows of"coordsmedian.parietal.ciricafiltered.habitat_cleaned" and "squamata_Amph.Lac.Serp_xl$Taxon"
  if (length(match_row) > 0) {
    coordsmedian.parietal.habitat_cleaned$habitat_abb[match_row] <- squamata_Amph.Lac.Serp_xl$habitat_abb[i] # add information of "squamata_Amph.Lac.Serp_xl$habitat_abb" to "coordsmedian.parietal.cirica_cleaned$habitat_abb" if it matches
  }
}
coordsmedian.parietal.habitat_cleaned
#saveRDS(coordsmedian.parietal.habitat_cleaned, file="C:/Users/Daniel/Desktop/results_R/coordsmedian.parietal.habitat_cleaned.RDS")

13.1. Habitat categories of species (cleaned):

Due to the large number of different habitats, it is necessary to categorise them. There are several ways to categorise them. One of them is the IUCN Red List categorisation shown here. The allocation of the different habitat categories is shown below. The longevity of the habitats studied can be seen in the thesis.

#create a new column called "coordsmean.parietal.habitat_cleaned$habitat_category" that puts "squamata_Amph.Lac.Serp_xl$habitat_abb" in diffrent categories of IUCN red list

coordsmedian.parietal.habitat_cleaned$habitat_category <- NA # create a new column
habitat_mapping <- c(
  "ba" = "Marine Coastal/Supratidal",
  "be" = "Marine Coastal/Supratidal",
  "bj" = "Rocky Areas",
  "cl" = "Forest",
  "co" = "Marine Coastal/Supratidal",
  "de" = "Wetland",
  "dt" = "Desert",
  "du" = "Desert",
  "dv" = "Desert",
  "fo" = "Forest",
  "fw" = "Wetland",
  "gl" = "Grassland",
  "he" = "Forest",
  "jw" = "Forest",
  "la" = "Wetland",
  "lg" = "Wetland",
  "ls" = "Rocky Areas",
  "ma" = "Wetland",
  "mf" = "Rocky Areas",
  "mn" = "Rocky Areas",
  "mo" = "Rocky Areas",
  "mp" = "Rocky Areas",
  "mr" = "Rocky Areas",
  "pa" = "Grassland",
  "pr" = "Grassland",
  "ri" = "Wetland",
  "rm" = "Wetland",
  "sd" = "Desert",
  "sf" = "Forest",
  "sp" = "Grassland",
  "ty" = "Wetland",
  "un" = "Forest",
  "vr" = "Rocky Areas",
  "wa" = "Wetland",
  "wc" = "Forest"
) # assign new categories
coordsmedian.parietal.habitat_cleaned$habitat_category <- habitat_mapping[as.character(coordsmedian.parietal.habitat_cleaned$habitat_abb)]
coordsmedian.parietal.habitat_cleaned
#saveRDS(coordsmedian.parietal.habitat_cleaned, file="C:/Users/Daniel/Desktop/results_R/coordsmedian.parietal.habitat_cleaned.RDS")

14.1. Filter habitat of species (cleaned):

As proceeded before, rows without information of habitats have to be excluded from the list.

# create a new list in which only these rows are included, saving  information of the "habitat_category":
coordsmedian.parietal.habitatfiltered_cleaned<- coordsmedian.parietal.habitat_cleaned
coordsmedian.parietal.habitatfiltered_cleaned<- coordsmedian.parietal.habitatfiltered_cleaned[!is.na(coordsmedian.parietal.habitatfiltered_cleaned$habitat_category), ] # filter data and exclude these with NA

coordsmedian.parietal.habitatfiltered_cleaned$parietal_eye_filtered <- ifelse(is.na(coordsmedian.parietal.habitatfiltered_cleaned$habitat_category), NA,                                                                coordsmedian.parietal.habitatfiltered_cleaned$parietal_eye)

coordsmedian.parietal.habitatfiltered_cleaned

#saveRDS(coordsmedian.parietal.habitatfiltered_cleaned, file="C:/Users/Daniel/Desktop/results_R/coordsmedian.parietal.habitatfiltered_cleaned.RDS")

#coordsmedian.parietal.habitatfiltered_cleaned<- readRDS("C:/Users/Daniel/Desktop/results_R/coordsmedian.parietal.habitatfiltered_cleaned.RDS")

15.1. Plot of absolute habitat categories (cleaned):

Moreover the data for the habitat categories in dependence to the parietal eye state are displayed graphically.

# absolute plot of dependency of parietal eye in relation to habitat categorie of IUCN red list:

ggplot(coordsmedian.parietal.habitatfiltered_cleaned, aes(x = habitat_category, fill = factor(parietal_eye_filtered, labels = c("absent", "present")))) +
  geom_bar(position = "stack", stat = "count") +
  geom_text(stat = 'count', aes(label = ..count..), position = position_stack(vjust = +0.5), size = 2.5, color = "#000000") +  
  labs(title = "Absolute distribution of parietal eye by habitat according to IUCN Red list:",
       x = "Habitat category",
       y = "Count",
       fill = "Parietal eye:") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 0.85), 
        plot.title = element_text(size = 10, face = "bold"), 
        axis.title = element_text(size = 10),  
        axis.text = element_text(size = 8)) +  
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present"))  

15.2. Plot of percentage presentation of habitat categories (cleaned) :

For the same data there is also a percentage presentation of the habitat distribution.

coordsmedian.parietal.habitatfiltered_cleaned<- readRDS("C:/Users/Daniel/Desktop/results_R/coordsmedian.parietal.habitatfiltered_cleaned.RDS")
library(ggplot2)
# percentage presentation of parietal eye in dependence to habitat categories of IUCN Red list:
ggplot(coordsmedian.parietal.habitatfiltered_cleaned, aes(x = habitat_category, fill = factor(parietal_eye, labels = c("Absent", "Present")))) +
  geom_bar(position = "fill") +  
  geom_text(stat='count', aes(label=..count..), position = position_fill(vjust = 0.5), size = 2.5) +  # adjust geom-text
  labs(title = "Procentual distribution of parietal eye by habitat according to IUCN Red list:",
       x = "Habitat category",
       y = "Proportion",  
       fill = "Parietal eye:") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 0.85),  # adjust axis
        plot.title = element_text(size = 10, face = "bold"),  
        axis.title = element_text(size = 10),  
        axis.text = element_text(size = 8))+  
  scale_fill_manual(values = c("#FF0000", "#99CC00"), labels = c("absent", "present"))  # assign bicategorical filling colors

16.1. Statistical analysis of habitat categories (cleaned):

A statistical display of the interplay between the habitat and the parietal eye state is also given down below

# chi-square for "parietal_eye_filtered" in dependence of "habitat_category":
chisq_test_habitat <- chisq.test(coordsmedian.parietal.habitatfiltered_cleaned$parietal_eye_filtered, coordsmedian.parietal.habitatfiltered_cleaned$habitat_category)
chisq_test_habitat

The Chi-square test gives a highly significant result between the habitat of the species and the condition of the parietal eye.

17.0. Phylogenetic tree (cleaned):

Due to the fact that closely related species share a lot of phylogenetic history, character traits, or external factors such as parietal eye status or latitudinal distribution, species cannot be viewed independently. Therefore, phylogenetic structures must be taken into account.

17.1. Phylogenetic tree of squamata of the whole world:

Here is a display of the phylogenetic tree of all sampled squamates. Information on branch lengths, nodes and the overall topology of the tree is stored in Zheng’s phylogenetic tree

library(ape)
## 
## Attache Paket: 'ape'
## Das folgende Objekt ist maskiert 'package:dplyr':
## 
##     where
library(phytools)

# creating a new species_vector with names for the data of the phylogenetic tree of Zheng
species_vector_phyl <- gsub(" ", "_", species_vector)

tree_Zheng <- read.tree(file = "C:/Users/Daniel/Desktop/read in data/Zheng_Wiens.txt") # read in Zheng´s tree
matching_species <- intersect(species_vector_phyl, tree_Zheng$tip.label) #searching for matching species 
tree_squamata <- drop.tip(tree_Zheng, setdiff(tree_Zheng$tip.label, matching_species)) # excluding all species that do not mach both data structures


tip_colors <- rep("#FF0000", length(tree_squamata$tip.label))  # creating a default vector (red) for the parietal eye state
for (i in seq_along(tree_squamata$tip.label)) {
  tree_name <- gsub("_", " ", tree_squamata$tip.label[i])  # substitute the "_" by a space
  matching_rows <- grep(paste(unlist(strsplit(tree_name, " ")), collapse = "|"), coordsmedian.parietalfiltered_cleaned$scientificName, ignore.case = TRUE) # searching for matching species in "coordsmedian.parietalfiltered_cleaned$scientificName"
  if (length(matching_rows) > 0) {
    eye_status <- coordsmedian.parietalfiltered_cleaned$parietal_eye[matching_rows]
    tip_colors[i] <- ifelse(any(eye_status == 1), "#99CC00", "#FF0000")  # color the tip in dependence to the state of the "parietal_eye" 
  }
}
#write.tree(tree_squamata, file = "C:/Users/Daniel/Desktop/results_R/tree_squamata.txt")

pdf("C:/Users/Daniel/Desktop/results_R/tree_squamata.pdf", width = 20, height = 20) # save the phylogenetic tree as a pdf with state of parietal eye as colored tips

plot(tree_squamata, show.tip.label = TRUE, cex = 0.25, main = "Phylogenetic Tree of sampled Squamates with state of parietal eye:", cex.main = 1) # add title
legend("topleft", title = "State of Parietal Eye:", legend = c("absent", "present"), fill = c("#FF0000", "#99CC00"), cex = 0.8, inset = c(0.01, 0.01)) # add legend
tiplabels(pch = 21, bg = tip_colors, cex = 0.6, adj = c(-0.5, 0.5)) # adjust tiplabels
dev.off()
## png 
##   2

Because of the enormous amount of tip species, the tree can not be presented here in its details. The pedigree therefore is delivered in an additional pdf data.

17.2. Phylogenetic tree of squamata of American longitude (-125° to -30°):

There is also a display of the phylogenetic structure for the American squamates because, as can be seen above, their dependence on latitude is different.

matching_species_america <- subset(coordsmedian.parietalfiltered_uncleaned, medianLon >= -125 & medianLon <= -30 & medianLat > 0)$scientificName # filter by excluding american distributed species

matching_species_america <- unique(sapply(matching_species_america, function(x) paste(strsplit(x, " ")[[1]][1:2], collapse = "_"))) # extracting the species name while excluding authors name and year

species_vector_phyl_america <- intersect(species_vector_phyl, matching_species_america) # selectin American species
matching_species_america <- intersect(species_vector_phyl_america, tree_Zheng$tip.label) # searching for matching species


tree_squamata_america <- drop.tip(tree_Zheng, setdiff(tree_Zheng$tip.label, matching_species_america)) # excluding not matching species
tip_colors <- rep("#FF0000", length(tree_squamata_america$tip.label))  # creating a default vector (red) for the parietal eye state
for (i in seq_along(tree_squamata_america$tip.label)) {
  tree_name <- gsub("_", " ", tree_squamata_america$tip.label[i])  # substitute the "_" by a space
  matching_rows <- grep(paste(unlist(strsplit(tree_name, " ")), collapse = "|"), coordsmedian.parietalfiltered_cleaned$scientificName, ignore.case = TRUE) # searching for maching species in "coordsmedian.parietalfiltered_cleaned$scientificName"
  if (length(matching_rows) > 0) {
    eye_status <- coordsmedian.parietalfiltered_cleaned$parietal_eye[matching_rows]
    
    tip_colors[i] <- ifelse(any(eye_status == 1), "#99CC00", "#FF0000")  # # color the tip labels based on the two categories of "eye_status" 
  }
}

#write.tree(tree_squamata_america, file = "C:/Users/Daniel/Desktop/results_R/tree_squamata_america.txt")

#pdf("C:/Users/Daniel/Desktop/results_R/tree_squamata_america.pdf", width = 20, height = 20) # save the phylogenetic tree as a pdf with state of parietal eye as coloured tips

plot(tree_squamata_america, show.tip.label = TRUE, cex = 0.25, main = "Phylogenetic Tree of sampled Squamates within America (longitude from -125° to -30°) with state of parietal eye:", cex.main = 0.7) # add title
legend("topleft", title = "State of Parietal Eye:", legend = c("Absent", "Present"), fill = c("#FF0000", "#99CC00"), cex = 0.8, inset = c(0.01, 0.01)) # add legend
tiplabels(pch = 21, bg = tip_colors, cex = 0.6, adj = c(-0.5, 0.5)) # adjust tiplabels

dev.off()
## null device 
##           1

Same accords to this tree, which is also added separately as a pdf data to the thesis.

17.3. Phylogenetic tree of squamata in dependenc towards circadian rhythm:

matching_species_circadian <- coordsmedian.parietal.ciricafiltered_cleaned$scientificName[coordsmedian.parietal.ciricafiltered_cleaned$circadian_rhythm %in% c("n", "d")] # extract only species names with "d" or "n" in the following column, therefore exclude those with "c"

matching_species_circadian <- sapply(strsplit(matching_species_circadian, " "), function(x) paste(x[1:2], collapse = "_")) #create vector "matching_species_circadian" by "sapply"

matching_species_circadian <- intersect(matching_species_circadian, tree_Zheng$tip.label) # search for matching species in Zheng´s tree


tree_squamata_circadian <- drop.tip(tree_Zheng, setdiff(tree_Zheng$tip.label, matching_species_circadian))
matching_species_circadian <- gsub("_", " ", matching_species_circadian)

#write.tree(tree_squamata_circadian, file = "C:/Users/Daniel/Desktop/results_R/tree_circadian.txt")

17.4. Phylogenetic tree of squamata in dependenc to habitat:

matching_species_habitat <- sapply(strsplit(coordsmedian.parietal.habitatfiltered_cleaned$scientificName, " "), function(x) paste(x[1:2], collapse = "_")) # create a vector with species having information saved about their habitat

matching_species_habitat <- intersect(matching_species_habitat, tree_Zheng$tip.label)  #matching species with Zheng´s tree



tree_squamata_habitat <- drop.tip(tree_Zheng, setdiff(tree_Zheng$tip.label, matching_species_habitat)) # create tree for species with habitat information

matching_species_circadian <- gsub("_", " ", matching_species_habitat) # substitute space

#write.tree(tree_squamata_habitat, file = "C:/Users/Daniel/Desktop/results_R/tree_habitat.txt")

In later work, the threshold will be calculated using the threshBayes function. The prerequisite for this model is Brownian motion. Therefore, the phylogenetic tree must be tested for Brownian motion. Lambda can have values between 0 and 1, where 1 means that the analysed tree follows Brownian motion. To get an even more precise result with “threshBayes”, the tree can be modified in its branch lengths with the calculated value.

18.0. Pagel`s Lambda (cleaned):

library(geiger)

lambda_data <- data.frame(row.names = matching_species) # create a data frame out of the matching species that are the rownames
rownames(lambda_data)<-gsub("_", " ", rownames(lambda_data)) #substitution
lambda_data$medianLat <- NA # create a new column
for (i in seq_len(nrow(lambda_data))) {
  name <- rownames(lambda_data)[i]
  matching_rows <- grep(name, coordsmedian.parietalfiltered_cleaned$scientificName) # searching for matching rows
  if (length(matching_rows) > 0) {
    lambda_data$medianLat[i] <- coordsmedian.parietalfiltered_cleaned$medianLat[matching_rows[1]] # add information about median latitude to the new assigend column
  }
}
# Because of unequal scientific names in two cases the data of them have to be added manually
lambda_data["Malayotyphlops_ruber", "medianLat"] <- 12.08537
lambda_data["Hemidactylus_imbricatus", "medianLat"] <- -29.1

rownames(lambda_data)<-gsub(" ", "_", rownames(lambda_data))# substitution

#saveRDS(lambda_data, file= "C:/Users/Daniel/Desktop/results_R/lambda_data.RDS")
#lambda_data<-readRDS("C:/Users/Daniel/Desktop/results_R/lambda_data.RDS")

With these results lambda can be calculated.

library(geiger)
## Warning: Paket 'geiger' wurde unter R Version 4.3.3 erstellt
lambda_data <- readRDS("C:/Users/Daniel/Desktop/results_R/lambda_data.RDS")

results_geiger_lambda_data <- fitContinuous(phy = tree_squamata, dat = lambda_data)
results_geiger_lambda_data
## GEIGER-fitted comparative model of continuous data
##  fitted 'BM' model parameters:
##  sigsq = 3.683858
##  z0 = 5.083600
## 
##  model summary:
##  log-likelihood = -1690.090886
##  AIC = 3384.181772
##  AICc = 3384.210898
##  free parameters = 2
## 
## Convergence diagnostics:
##  optimization iterations = 100
##  failed iterations = 0
##  number of iterations with same best fit = 100
##  frequency of best fit = 1.000
## 
##  object summary:
##  'lik' -- likelihood function
##  'bnd' -- bounds for likelihood search
##  'res' -- optimization iteration summary
##  'opt' -- maximum likelihood parameter estimates
#saveRDS(results_geiger_lambda_data, file= "C:/Users/Daniel/Desktop/results_R/results_geiger_lambda_data.RDS")
library(geiger)
fitContinuous(phy= tree_squamata, dat= lambda_data, model= "lambda")
## GEIGER-fitted comparative model of continuous data
##  fitted 'lambda' model parameters:
##  lambda = 0.981244
##  sigsq = 3.290608
##  z0 = 5.065844
## 
##  model summary:
##  log-likelihood = -1684.983941
##  AIC = 3375.967881
##  AICc = 3376.026275
##  free parameters = 3
## 
## Convergence diagnostics:
##  optimization iterations = 100
##  failed iterations = 0
##  number of iterations with same best fit = 15
##  frequency of best fit = 0.150
## 
##  object summary:
##  'lik' -- likelihood function
##  'bnd' -- bounds for likelihood search
##  'res' -- optimization iteration summary
##  'opt' -- maximum likelihood parameter estimates

With the value of lambda that best describes the phylogenetic structure, the branch lengths can be adjusted: best.

tree_rescaled<-rescale(x=tree_squamata, model = "lambda", 0.981244) # create the rescaled tree

write.tree(tree_rescaled, file = "C:/Users/Daniel/Desktop/results_R/tree_rescaled.txt")

pdf("C:/Users/Daniel/Desktop/results_R/tree_rescaled_america.pdf", width = 20, height = 20) # save the phylogenetic tree as a pdf with state of parietal eye as colored tips

18.2. Pagel`s Lambda in dependence of latitude for species in America (cleaned):

The same is true for squamates in America, as this result indicates a different situation.

lambda_data_america <- data.frame(row.names = matching_species_america) # create a data frame out of matching american species vector
rownames(lambda_data_america)<-gsub("_", " ", rownames(lambda_data_america)) # substitution
lambda_data_america$medianLat <- NA # create a new column
for (i in seq_len(nrow(lambda_data_america))) {
  name <- rownames(lambda_data_america)[i]
  matching_rows <- grep(name, coordsmedian.parietalfiltered_cleaned$scientificName) # searching for matching rows
  if (length(matching_rows) > 0) {
    lambda_data_america$medianLat[i] <- coordsmedian.parietalfiltered_cleaned$medianLat[matching_rows[1]] # add mean latitude to the new created column 
  }
}

lambda_data_america
rownames(lambda_data_america)<-gsub(" ", "_", rownames(lambda_data_america)) # substitution

#saveRDS(lambda_data_america, file= "C:/Users/Daniel/Desktop/results_R/lambda_data_america.RDS")
lambda_data_america<-readRDS("C:/Users/Daniel/Desktop/results_R/lambda_data_america.RDS")

results_geiger_lambda_data_america<- fitContinuous(phy = tree_squamata_america, dat = lambda_data_america)
results_geiger_lambda_data_america
## GEIGER-fitted comparative model of continuous data
##  fitted 'BM' model parameters:
##  sigsq = 1.551216
##  z0 = 16.952798
## 
##  model summary:
##  log-likelihood = -414.564381
##  AIC = 833.128762
##  AICc = 833.236870
##  free parameters = 2
## 
## Convergence diagnostics:
##  optimization iterations = 100
##  failed iterations = 0
##  number of iterations with same best fit = 100
##  frequency of best fit = 1.000
## 
##  object summary:
##  'lik' -- likelihood function
##  'bnd' -- bounds for likelihood search
##  'res' -- optimization iteration summary
##  'opt' -- maximum likelihood parameter estimates
#saveRDS(results_geiger_lambda_data_america, file= "C:/Users/Daniel/Desktop/results_R/results_geiger_lambda_data_america.RDS")
fitContinuous(phy= tree_squamata_america, dat= lambda_data_america, model= "lambda")
## GEIGER-fitted comparative model of continuous data
##  fitted 'lambda' model parameters:
##  lambda = 0.876633
##  sigsq = 0.950292
##  z0 = 17.183140
## 
##  model summary:
##  log-likelihood = -412.436030
##  AIC = 830.872061
##  AICc = 831.090242
##  free parameters = 3
## 
## Convergence diagnostics:
##  optimization iterations = 100
##  failed iterations = 0
##  number of iterations with same best fit = 14
##  frequency of best fit = 0.140
## 
##  object summary:
##  'lik' -- likelihood function
##  'bnd' -- bounds for likelihood search
##  'res' -- optimization iteration summary
##  'opt' -- maximum likelihood parameter estimates
tree_rescaled_america<-rescale(x=tree_squamata_america, model = "lambda", 0.876626) # create the rescaeld tree with the calculated value for lambda
tree_rescaled_america
## 
## Phylogenetic tree with 114 tips and 113 internal nodes.
## 
## Tip labels:
##   Anelytropsis_papillosus, Coleonyx_brevis, Coleonyx_variegatus, Coleonyx_mitratus, Coleonyx_elegans, Aristelliger_georgeensis, ...
## 
## Rooted; includes branch lengths.
write.tree(tree_rescaled_america, file = "C:/Users/Daniel/Desktop/results_R/tree_rescaled_america.txt")

pdf("C:/Users/Daniel/Desktop/results_R/tree_rescaled_america.pdf", width = 20, height = 20) # save the phylogenetic tree as a pdf with state of parietal eye as colored tips

18.0 Bayes Threshold analysation for phylogenetic signals:

Analysis of the data with the phylogenetic trees using Bayesian analysis.

tree_squamata <- read.tree(file = "C:/Users/Daniel/Desktop/results_R/tree_squamata.txt") # read in tree of squamates 
tree_squamata_america <- read.tree(file = "C:/Users/Daniel/Desktop/results_R/tree_squamata_america.txt") # read in tree of squamates in ameica

bayes_data <- data.frame(row.names = matching_species) # create a data frame with the matching species with Zheng´s tree

18.1. Bayes data parietal eye:

First, the data frames must be created with the species as the row name and the parietal eye as the column.

# add a new column with information about the parietal eye state
bayes_parietal<- bayes_data
rownames(bayes_parietal) <- gsub("_", " ", rownames(bayes_parietal))
bayes_parietal$parietal_eye<- NA # create a new column
for (i in seq_len(nrow(bayes_parietal))) {
  name <- rownames(bayes_parietal)[i]
  matching_row <- grep(name, coordsmedian.parietalfiltered_cleaned$scientificName, value = TRUE) # searching for matching rows
  if (length(matching_row) > 0) {
    bayes_parietal$parietal_eye[i] <- coordsmedian.parietalfiltered_cleaned$parietal_eye[grep(name, coordsmedian.parietalfiltered_cleaned$scientificName)][1] # add information about parietal eye to the new created column
  }
}

#saveRDS(bayes_parietal, file="C:/Users/Daniel/Desktop/results_R/bayes_parietal.RDS")

bayes_parietal <- readRDS("C:/Users/Daniel/Desktop/results_R/bayes_parietal.RDS")

19.2.1. Bayes data Latitude of whole world:

# add a new column with information about the mean latitudinal distribution
bayes_medianLat<- bayes_parietal
rownames(bayes_medianLat) <- gsub("_", " ", rownames(bayes_medianLat)) # substitution
bayes_medianLat$medianLat <- NA # create a new column
for (i in seq_len(nrow(bayes_medianLat))) {
  name <- rownames(bayes_medianLat)[i]
  matching_rows <- grep(name, coordsmedian.parietalfiltered_cleaned$scientificName) # searching for matching rows
  if (length(matching_rows) > 0) {
    bayes_medianLat$medianLat[i] <- coordsmedian.parietalfiltered_cleaned$medianLat[matching_rows[1]] # add the information of matching rows to the new created column 
  }
}
# Because of unequal scientific names in two cases the data of them have to be added manually
bayes_medianLat["Malayotyphlops ruber", "parietal_eye"] <- 0
bayes_medianLat["Malayotyphlops ruber", "medianLat"] <- 12.08537

bayes_medianLat["Hemidactylus imbricatus", "parietal_eye"] <- 0
bayes_medianLat["Hemidactylus imbricatus", "medianLat"] <- -29.1
bayes_medianLat$parietal_eye <- factor(bayes_medianLat$parietal_eye, levels = c("0", "1"))

#saveRDS(bayes_medianLat, file= "C:/Users/Daniel/Desktop/results_R/bayes_medianLat.RDS")

bayes_medianLat <- readRDS("C:/Users/Daniel/Desktop/results_R/bayes_meanLat.RDS")

19.2.2. Bayes data Latitude of squamata within America:

Same as proceeded above is done for the species living in America

bayes_medianLat <- readRDS("C:/Users/Daniel/Desktop/results_R/bayes_meanLat.RDS")
matching_species_america <- gsub("_", " ", matching_species_america) # substitution

bayes_species_america <- matching_species_america # create a new vector with only american species

bayes_medianLat_america <- subset(bayes_medianLat, rownames(bayes_medianLat) %in% bayes_species_america) # create a new data frame with information for bayesian analysation

bayes_medianLat_america$parietal_eye <- factor(bayes_medianLat_america$parietal_eye, levels = c("0", "1")) # assign the levels of the column as factor 

#saveRDS(bayes_medianLat_america, file= "C:/Users/Daniel/Desktop/results_R/bayes_medianLat_america.RDS")
bayes_medianLat_america <- readRDS("C:/Users/Daniel/Desktop/results_R/bayes_medianLat_america.RDS")

19.2.3. Bayes data circadian rhythm:

And, as with the circadian rhythm, the data frame is created for later analysis by “threshBayes”.

bayes_circadian <- data.frame(row.names = matching_species_circadian) # create a data frame by using matching species

# add a new column with information about the mean latitudinal distribution
rownames(bayes_circadian) <- gsub("_", " ", rownames(bayes_circadian))
bayes_circadian$circadian_rhythm <- NA # create a new column
for (i in seq_len(nrow(bayes_circadian))) {
  name <- rownames(bayes_circadian)[i]
  matching_rows <- grep(name, coordsmedian.parietal.ciricafiltered_cleaned$scientificName) # searching for matching rows
  if (length(matching_rows) > 0) {
    bayes_circadian$circadian_rhythm[i] <- coordsmedian.parietal.ciricafiltered_cleaned$circadian_rhythm[matching_rows[1]] # add information of the matching rows to the new created column "circadian_rhythm"
  }
}

bayes_circadian$parietal_eye<- NA # create a new column
for (i in seq_len(nrow(bayes_circadian))) {
  name <- rownames(bayes_circadian)[i]
  matching_row <- grep(name, coordsmedian.parietalfiltered_cleaned$scientificName, value = TRUE) # searching for matching rows
  if (length(matching_row) > 0) {
    bayes_circadian$parietal_eye[i] <- coordsmedian.parietalfiltered_cleaned$parietal_eye[grep(name, coordsmedian.parietalfiltered_cleaned$scientificName)][1] # add information about the parietal eye to the matching rows in the new created column
  }
}
rownames(bayes_circadian)<- gsub(" ", "_", rownames(bayes_circadian)) # substitution

#saveRDS(bayes_circadian, file= "C:/Users/Daniel/Desktop/results_R/bayes_circadian.RDS")

bayes_circadian <- readRDS("C:/Users/Daniel/Desktop/results_R/circadian.RDS")

19.2.3.. Bayes data habitat:

Create a species data frame with habitat information to calculate Threshold Bayes later.

bayes_habitat <- data.frame(row.names = matching_species_habitat) # create a data frame with matching species

# add a new column with information about the 
rownames(bayes_habitat) <- gsub("_", " ", rownames(bayes_habitat))
bayes_habitat$habitat_category <- NA # create a new column
for (i in seq_len(nrow(bayes_habitat))) {
  name <- rownames(bayes_habitat)[i]
  matching_rows <- grep(name, coordsmedian.parietal.habitatfiltered_cleaned$scientificName,) # searching for matching rows
  if (length(matching_rows) > 0) {
    bayes_habitat$habitat_category[i] <- coordsmedian.parietal.habitatfiltered_cleaned$habitat_category[matching_rows[1]] # add informatiom about the habitat category to the euqually named column, if the species names match with the rownames 
  }
}
#rename all habitats that are not of the category "Forest" to sunny while the category "Forest" is renamed to "shady", because there is only a binary structure possible for "threshBayes".
bayes_habitat$habitat_category <- ifelse(bayes_habitat$habitat_category == "Forest", "shady", "sunny")

bayes_habitat$parietal_eye<- NA # create a new column
for (i in seq_len(nrow(bayes_habitat))) {
  name <- rownames(bayes_habitat)[i]
  matching_row <- grep(name, coordsmedian.parietal.habitatfiltered_cleaned$scientificName, value = TRUE) # searching for matching rows
  if (length(matching_row) > 0) {
    bayes_habitat$parietal_eye[i] <- coordsmedian.parietal.habitatfiltered_cleaned$parietal_eye[grep(name, coordsmedian.parietal.habitatfiltered_cleaned$scientificName)][1] # add information about the parietal eye state to the new created column
  }
}
rownames(bayes_habitat)<- gsub(" ", "_", rownames(bayes_habitat)) # substitution

#saveRDS(bayes_habitat, file= "C:/Users/Daniel/results_R/Desktop/bayes_habitat.RDS")

#bayes_habitat <- readRDS("C:/Users/Daniel/Desktop/results_R/bayes_habitat.RDS")

19.3.1. Results and plots of Thresh Bayes:

Evaluate the results and plot them.

bayes_result_medianLat <- threshBayes(tree_rescaled, bayes_medianLat, types = c("discrete", "continuous"), trait = 2, burnin= 0.5, ngen=100000) # calculate with "threshBayes" the dependence to "meanLat" and the parietal eye state using the phylogenetic tree
plot(bayes_result_medianLat)

plot(density(bayes_result_medianLat))

#saveRDS(bayes_result_medianLat, file="C:/Users/Daniel/Desktop/results_R/bayes_result_medianLat.RDS")
bayes_result_medianLat <- readRDS("C:/Users/Daniel/Desktop/results_R/bayes_result_medianLat.RDS")
plot(bayes_result_medianLat)

19.3.2. Results and plots of Thresh Bayes for American species:

Evaluate the results and plot them.

rownames(bayes_medianLat_america) <- gsub(" ", "_", rownames(bayes_medianLat_america)) # substitution

bayes_result_medianLat_america <- threshBayes(tree_rescaled_america, bayes_medianLat_america, types = c("discrete", "continuous"), trait = 2, burnin= 0.5, ngen=100000) #  calculate with threshBayes the dependence to "meanLat" and the parietal eye state of American species using the phylogenetic tree 

plot(bayes_result_medianLat_america)
plot(density(bayes_result_medianLat_america))

#saveRDS(bayes_result_medianLat_america, file="C:/Users/Daniel/Desktop/results_R/bayes_result_medianLat_america.RDS")
bayes_result_medianLat_america <- readRDS("C:/Users/Daniel/Desktop/results_R/bayes_result_medianLat_america.RDS")
bayes_result_medianLat_america <- readRDS("C:/Users/Daniel/Desktop/results_R/bayes_result_medianLat_america.RDS")
plot(bayes_result_medianLat_america)

19.3.3. Results and plots of Thresh Bayes for species with circadian rhythm:

Evaluate the results and plot them.

bayes_circadian <- readRDS("C:/Users/Daniel/Desktop/results_R/circadian.RDS")
bayes_result_circadian <- threshBayes(tree_squamata_circadian, bayes_circadian, types = c("discrete", "discrete"), trait = 2, burnin= 0.5, ngen=100000) # calculate with "threshBayes" the dependence to "meanLat" and the parietal eye state using the phylogenetic tree

plot(bayes_result_circadian)

plot(density(bayes_result_circadian))

#saveRDS(bayes_result_circadian, file="C:/Users/Daniel/Desktop/results_R/bayes_result_circadian.RDS")
bayes_result_circadian <- readRDS("C:/Users/Daniel/Desktop/results_R/bayes_result_circadian.RDS")
plot(bayes_result_circadian)

19.3.4. Results and plots of Thresh Bayes for species with habitat:

Evaluate the results and plot them.

bayes_habitat <- readRDS("C:/Users/Daniel/Desktop/results_R/bayes_habitat.RDS")
bayes_result_habitat <- threshBayes(tree_squamata_habitat, bayes_habitat, types = c("discrete", "discrete"), trait = 2, burnin= 0.5, ngen=100000) # calculate with "threshBayes" the dependence to "meanLat" and the parietal eye state using the phylogenetic tree

plot(bayes_result_habitat)

plot(density(bayes_result_habitat))

saveRDS(bayes_result_habitat, file="C:/Users/Daniel/Desktop/read in data/bayes_result_habitat.RDS")
bayes_result_habitat <- readRDS("C:/Users/Daniel/Desktop/results_R/bayes_result_habitat.RDS")
plot(bayes_result_habitat)

19.4.0. Statistical significance of Bayes correlation coefficient:

The correlation coefficient can be calculated with a t-test for its significance It can be calculated with the following formula:

t_value_bayes_median <- r * sqrt((n - 2) / (1 - r^2))
# for |r| = correlation value of "threshBayes" and n= sample size

19.4.1. Statistical analysation of Bayes correlation coefficient of species of the whole world:

The correlation coefficient can be calculated with a t-test for its significance

#n= 415, |r|= 0.00038
t_value_bayes_medianLat <- 0.00038 * sqrt((415 - 2) / (1 - (0.00038)^2))
t_value_bayes_medianLat
## [1] 0.007722513

19.4.2. Statistical analysation of Bayes correlation coefficient of species located in america:

The correlation coefficient can be calculated with a t-test for its significance

#n= 113, |r|= 0.27716
t_value_bayes_medianLat_america <- 0.27716 * sqrt((113 - 2) / (1 - (0.27716)^2))
t_value_bayes_medianLat_america
## [1] 3.039123

19.4.3. Statistical analysation of Bayes correlation coefficient in dependence towards ciecadian rhythm:

The correlation coefficient can be calculated with a t-test for its significance

#n= 120, |r|= 0.34057
t_value_bayes_circadian <-0.34057 * sqrt((120 - 2) / (1 - (0.34057)^2))
t_value_bayes_circadian
## [1] 3.93476

19.4.4. Statistical analysation of Bayes correlation coefficient in dependenctowards habitat categories:

The correlation coefficient can be calculated with a t-test for its significance

#n= 100, |r|= -0.28822
t_value_bayes_habitat1 <-0.28822 * sqrt((100 - 2) / (1 - (0.28822)^2))
t_value_bayes_habitat1
## [1] 2.979677