Read in phyloseq object
ps2 <- readRDS("output/rds/ps2.rds")
# export filtered
dir.create("output/otu_tables/filtered")
seqateurs::summarise_taxa(ps2, "species", "SampleID") %>%
spread(key="SampleID", value="totalRA") %>%
write.csv(file = "output/otu_tables/filtered/filtered_spp_sum.csv")
seqateurs::summarise_taxa(ps2, "genus", "SampleID") %>%
spread(key="SampleID", value="totalRA") %>%
write.csv(file = "output/otu_tables/filtered/filtered_gen_sum.csv")
#Rename taxa - only keep first 30 characters
taxa_names(ps2) <- substr(paste0("SV", seq(ntaxa(ps2)),"-",tax_table(ps2)[,7]), 1,30)
#Check carsonella presence
cars <- speedyseq::psmelt(ps2) %>%
filter(Abundance > 0) %>%
group_by(psyllid_spp) %>%
summarise(n = count(genus=="Candidatus Carsonella", na.rm = TRUE))
Create species merged table
# Merge species for beta diversity
ps.sppmerged <- ps2 %>%
merge_samples(group = "psyllid_spp", fun=mean)
#This loses the sample metadata - Need to add it agian
sample_data(ps.sppmerged) <- sample_data(ps2) %>%
as("matrix") %>%
as.data.frame() %>%
filter(!duplicated(psyllid_spp)) %>%
magrittr::set_rownames(.$psyllid_spp)
seqs <- refseq(ps2)
tree <- phy_tree(ps2)
#make new phyloseq object
ps3 <- phyloseq(tax_table(ps.sppmerged),
sample_data(ps.sppmerged),
otu_table(otu_table(ps.sppmerged), taxa_are_rows = FALSE),
refseq(seqs),
phy_tree(tree))
Read in psyllid phylogeny
psyllid_tree <- read.tree(text=readLines("sample_data/psyllid_beast_tree.nwk"))
# Match names with sample sheet
psyllid_tree$tip.label <- psyllid_tree$tip.label %>%
str_squish() %>%
str_replace_all(pattern="\\.", replacement=" ") %>%
str_replace_all(pattern="Acizzia hakae", replacement="Acizzia hakeae") %>%
str_replace_all(pattern="POLLENISLAND", replacement="POLLEN ISLAND") %>%
str_replace_all(pattern="Ctenarytaina fuchsiae$", replacement="Ctenarytaina fuchsia A") %>%
str_replace_all(pattern="Ctenarytaina fuchsiaeB", replacement="Ctenarytaina fuchsia B") %>%
str_replace_all(pattern="Ctenarytaina fuchsiaeC", replacement="Ctenarytaina fuchsia C") %>%
str_replace_all(pattern="Ctenarytaina clavata", replacement="Ctenarytaina clavata sp ") %>%
str_replace_all(pattern="Ctenarytaina clavata sp $", replacement="Ctenarytaina clavata sp A") %>%
str_replace_all(pattern="Ctenarytaina sp$", replacement="Ctenarytaina sp ") %>%
str_replace_all(pattern="Ctenarytaina spA", replacement="Ctenarytaina sp A") %>%
str_replace_all(pattern="Ctenarytaina spB", replacement="Ctenarytaina sp B") %>%
str_replace_all(pattern="Ctenarytaina unknown", replacement="Ctenarytaina insularis") %>%
str_replace_all(pattern="Psylla apicalisA", replacement="Psylla frodobagginsi") %>%
str_replace_all(pattern="Psylla apicalisB", replacement="Psylla apicalis") %>%
str_replace_all(pattern="carmichaeliae", replacement="carmichaeliae ") %>%
str_replace_all(pattern="Trioza sp", replacement="Trioza sp ") %>%
str_replace_all(pattern="Trioza acutaB", replacement="Trioza acuta B") %>%
str_replace_all(pattern="Trioza gourlay", replacement="Trioza gourlayi") %>%
str_replace_all(pattern="BRENDAMAY", replacement="BRENDA MAY") %>%
str_replace_all(pattern="PRICES", replacement="PRICES VALLEY") %>%
str_replace_all(pattern="Acizzia sp", replacement="Acizzia errabunda") %>%
str_replace_all(pattern="Trioza ", replacement="Powellia ") %>%
str_replace_all(pattern="Triozid sp", replacement="Casuarinicola sp") %>%
str_replace_all(pattern="Powellia adventicia", replacement="Trioza adventicia") %>%
str_replace_all(pattern="Powellia curta", replacement="Trioza curta") %>%
str_replace_all(pattern=" ", replacement="_") %>%
trimws(which="right")
# Subset to only those in sample data
setdiff(psyllid_tree$tip.label,sample_data(ps2)$psyllid_spp)
setdiff(sample_data(ps2)$psyllid_spp, psyllid_tree$tip.label)
psyllid_tree$tip.label[!psyllid_tree$tip.label %in% sample_data(ps2)$psyllid_spp]
psyllid_tree$tip.label[!sample_data(ps2)$psyllid_spp %in% psyllid_tree$tip.label ]
pruned.tree <- drop.tip(psyllid_tree, psyllid_tree$tip.label[!psyllid_tree$tip.label %in% sample_data(ps2)$psyllid_spp] )
Summary statistics
# N unique species and samples
speedyseq::psmelt(ps2) %>%
summarise(ntaxa= n_distinct(psyllid_spp), n_samples = n_distinct(Sample_Name), n_hostplants = n_distinct(hostplant_spp))
# Spread of reads
speedyseq::psmelt(ps2) %>%
group_by(Sample_Name) %>%
summarise(Abundance = sum(Abundance)) %>%
ungroup() %>%
summarise(mean = mean(Abundance),
se = sd(Abundance)/sqrt(length(Abundance)),
max = max(Abundance),
min = min(Abundance))
# Spread of ASVs
speedyseq::psmelt(ps2) %>%
group_by(Sample_Name) %>%
dplyr::filter(Abundance > 0) %>%
summarise(counts = n_distinct(OTU)) %>%
ungroup() %>%
summarise(mean = mean(counts),
se = sd(counts)/sqrt(length(counts)),
max = max(counts),
min = min(counts))
#Fraction of reads assigned to each taxonomic rank
speedyseq::psmelt(ps2) %>%
gather("Rank","Name",rank_names(ps2)) %>%
group_by(Rank) %>%
mutate(Name = replace(Name, str_detect(Name, "__"),NA)) %>% # This line turns the "__" we added to lower ranks back to NA's
dplyr::summarise(Reads_classified = sum(Abundance * !is.na(Name))) %>%
mutate(Frac_reads = Reads_classified / sum(sample_sums(ps2))) %>%
mutate(Rank = factor(Rank, rank_names(ps2))) %>%
arrange(Rank)
#Fraction of ASV's assigned to each taxonomic rank
tax_table(ps2) %>%
as("matrix") %>%
as_tibble(rownames="OTU") %>%
gather("Rank","Name",rank_names(ps2)) %>%
group_by(Rank) %>%
mutate(Name = replace(Name, str_detect(Name, "__"), NA)) %>% # This line turns the "__" we added to lower ranks back to NA's
dplyr::summarise(OTUs_classified = sum(!is.na(Name))) %>%
mutate(Frac_OTUs = OTUs_classified / ntaxa(ps2)) %>%
mutate(Rank = factor(Rank, rank_names(ps2))) %>%
arrange(Rank)
# Unique taxa at each rank
speedyseq::psmelt(ps2) %>%
dplyr::select(rank_names(ps2)) %>%
pivot_longer(everything(),
names_to = "Rank",
values_to = "value") %>%
mutate(value = case_when(
str_detect(value, "__") ~ as.character(NA),
!str_detect(value, "__") ~ value
)) %>%
drop_na() %>%
group_by(Rank) %>%
summarise_all(funs(n_distinct)) %>%
mutate(Rank = factor(Rank, rank_names(ps2))) %>%
arrange(Rank)
# Each different phylum ranked by its overall relative abundance
sample_data(ps2) %>%
as("matrix") %>%
as.data.frame() %>%
pull(psyllid_spp) %>%
table() %>%
sort()
# Transform to per sample relative abundance, then transform to whole dataset relative abundance
Prevalence / Abundance summary
View prevalence of different phyla across the dataset
# Calculate taxon prevalence across the data set at OTU level
prevdf <- apply(X = otu_table(ps2), MARGIN = ifelse(taxa_are_rows(ps2), yes = 1, no = 2), FUN = function(x){sum(x > 0)})
prevdf <- data.frame(Prevalence = prevdf,
TotalAbundance = taxa_sums(ps2),
tax_table(ps2))
#Prevalence plot
gg.prev <- subset(prevdf, phylum %in% get_taxa_unique(ps2, "phylum")) %>%
ggplot(aes(TotalAbundance, Prevalence / nsamples(ps2),color=order)) +
geom_point(size = 3, alpha = 0.7) +
scale_x_log10() +
xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
facet_wrap(~phylum) +
theme(legend.position="none") +
ggtitle("Phylum Prevalence in All Samples\nColored by Order")
pdf(file="figs/prevalence.pdf", width = 11, height = 8 , paper="a4r")
plot(gg.prev)
try(dev.off(), silent=TRUE)
# Prevalecne at phylum
ps.phylum <- speedyseq::tax_glom(ps2, taxrank="phylum")
prevdf_phylum <- apply(X = otu_table(ps.phylum ), MARGIN = ifelse(taxa_are_rows(ps.phylum ), yes = 1, no = 2), FUN = function(x){sum(x > 0)})
prevdf_phylum <- data.frame(Prevalence = prevdf_phylum,
TotalAbundance = taxa_sums(ps.phylum),
tax_table(ps.phylum)) %>%
dplyr::mutate(RA = TotalAbundance / sum(TotalAbundance)) %>%
remove_rownames() %>%
magrittr::set_rownames(.$phylum) %>%
dplyr::select(-rank_names(ps.phylum))
# Prevalence within Proteobacteria
ps.prot <- subset_taxa(ps2, phylum=="Proteobacteria") %>%
speedyseq::tax_glom(taxrank="order")
prevdf_prot <- apply(X = otu_table(ps.prot ), MARGIN = ifelse(taxa_are_rows(ps.prot ), yes = 1, no = 2), FUN = function(x){sum(x > 0)})
prevdf_prot <- data.frame(Prevalence = prevdf_prot,
TotalAbundance = taxa_sums(ps.prot),
tax_table(ps.prot)) %>%
dplyr::mutate(RA = TotalAbundance / sum(TotalAbundance)) %>%
remove_rownames() %>%
magrittr::set_rownames(.$order) %>%
dplyr::select(-rank_names(ps.prot))
# Genus Prevalence
ps.gen <- speedyseq::tax_glom(ps2, taxrank="genus")
prevdf_gen <- apply(X = otu_table(ps.gen ), MARGIN = ifelse(taxa_are_rows(ps.gen ), yes = 1, no = 2), FUN = function(x){sum(x > 0)})
prevdf_gen <- data.frame(Prevalence = prevdf_gen,
TotalAbundance = taxa_sums(ps.gen),
tax_table(ps.gen)) %>%
dplyr::mutate(RA = TotalAbundance / sum(TotalAbundance)) %>%
remove_rownames() %>%
mutate(genus = make.unique(genus)) %>%
magrittr::set_rownames(.$genus) %>%
dplyr::select(-rank_names(ps.gen))
# Genus Prevalence across species rather than specimens
ps.gen <- speedyseq::tax_glom(ps3, taxrank="genus")
prevdf_gen <- apply(X = otu_table(ps.gen ), MARGIN = ifelse(taxa_are_rows(ps.gen ), yes = 1, no = 2), FUN = function(x){sum(x > 0)})
prevdf_gen <- data.frame(Prevalence = prevdf_gen,
TotalAbundance = taxa_sums(ps.gen),
tax_table(ps.gen)) %>%
dplyr::mutate(RA = TotalAbundance / sum(TotalAbundance)) %>%
remove_rownames() %>%
mutate(genus = make.unique(genus)) %>%
magrittr::set_rownames(.$genus) %>%
dplyr::select(-rank_names(ps.gen))
# Prevalence of symbionts across psyllid species
speedyseq::psmelt(ps2) %>%
mutate(total_spp = n_distinct(psyllid_spp), total_specimen = n_distinct(Sample_Name)) %>%
filter(Abundance > 0) %>%
filter(genus %in% c("Candidatus Carsonella", "Arsenophonus", "Sodalis")) %>%
group_by(genus, total_spp, total_specimen)%>%
summarise(n_species = n_distinct(psyllid_spp), n_specimen = n_distinct(Sample_Name)) %>%
ungroup()%>%
mutate(prop_species = n_species / total_spp,
prop_specimen = n_specimen / total_specimen)
# Number of symbiont OTUs per psyllid species
speedyseq::psmelt(ps2) %>%
filter(Abundance > 0) %>%
filter(genus %in% c("Candidatus Carsonella", "Arsenophonus", "Sodalis")) %>%
group_by(psyllid_spp, genus) %>%
summarise(n = n_distinct(OTU)) %>%
ggplot(aes(x = psyllid_spp, y = n, fill=genus))+
geom_col(show.legend = FALSE)+
facet_grid(genus~.)+
theme(axis.text.x = element_text(angle=45, hjust=1)) +
labs(x = "Psyllid Species",
y = "Number of distinct ASVs")
se <- function(x) sqrt(var(x)/length(x))
# Mean abundance of genera
genera_abund <- speedyseq::psmelt(ps2) %>%
filter(Abundance > 0) %>%
group_by(SampleID) %>%
mutate_at(vars(Abundance), ~ . / sum(.) ) %>%
ungroup %>%
group_by(genus) %>%
summarise(mean_ra = mean(Abundance), upper = max(Abundance), lower = min(Abundance), se = se(Abundance))
Alpha diversity metrics
dir.create("output/alpha")
# Get richness measures
richness <- phyloseq::estimate_richness(ps2, measures=c("Shannon")) %>%
rownames_to_column("Sample_Name") %>%
mutate(Sample_Name = Sample_Name %>%
str_remove("^X") %>%
str_replace_all("\\.", " "))
#Set number of randomisations for calculating significance
# Calculate Faith's PD-index & Species richness - with Standard errors
#sespd <- picante::ses.pd(as(phyloseq::otu_table(ps2), "matrix"), phyloseq::phy_tree(ps2), null.model = "taxa.labels", include.root = F, runs = 99)
pd <- picante::pd(as(phyloseq::otu_table(ps2), "matrix"), phyloseq::phy_tree(ps2), include.root = FALSE)
# Join together
div_table <- pd %>%
rownames_to_column("Sample_Name") %>%
dplyr::select(Sample_Name, alpha = SR, pd = PD) %>%
left_join(richness, by="Sample_Name") %>%
left_join(sample_data(ps2) %>%
as("matrix") %>%
as.data.frame() %>%
filter(!duplicated(Sample_Name)) %>%
dplyr::select(Sample_Name, psyllid_spp, psyllid_genus, psyllid_family, hostplant_spp, seqrun, genus_geo),
by = "Sample_Name")
# Summarise means
div_table %>%
summarise_if(is.numeric, mean)
# Difference between species for alpha diversity ANOVA
report::report(aov(alpha ~seqrun+psyllid_family+psyllid_genus+psyllid_spp, data=div_table))
report::report(aov(Shannon ~seqrun+psyllid_family+psyllid_genus+psyllid_spp, data=div_table))
report::report(aov(pd ~seqrun+psyllid_family+psyllid_genus+psyllid_spp, data=div_table))
# Difference between all genera for alpha diversity ANOVA
report::report(aov(alpha ~seqrun+psyllid_family+psyllid_genus, data=div_table))
report::report(aov(Shannon ~seqrun+psyllid_family+psyllid_genus, data=div_table))
report::report(aov(pd ~seqrun+psyllid_family+psyllid_genus, data=div_table))
# Difference between genus/geography factors ANOVA
report::report(aov(alpha ~seqrun+psyllid_family+genus_geo, data=div_table))
report::report(aov(Shannon ~seqrun+psyllid_family+genus_geo, data=div_table))
report::report(aov(pd ~seqrun+psyllid_family+genus_geo, data=div_table))
## Major genera only
# Difference between all major genera for alpha diversity ANOVA
div_table2 <- div_table %>%
dplyr::filter(psyllid_genus %in% c("Powellia", "Ctenarytaina", "Psylla"))
mg_div <- bind_rows(broom::tidy(TukeyHSD(aov(alpha ~seqrun+psyllid_family+psyllid_genus,
data=div_table2))) %>% mutate(type="Richness"),
broom::tidy(TukeyHSD(aov(Shannon ~seqrun+psyllid_family+psyllid_genus,
data=div_table2))) %>% mutate(type="Shannon"),
broom::tidy(TukeyHSD(aov(pd ~seqrun+psyllid_family+psyllid_genus,
data=div_table2))) %>% mutate(type="Phylogenetic"),
broom::tidy(TukeyHSD(aov(alpha ~seqrun+psyllid_family+genus_geo,
data=div_table2))) %>% mutate(type="Richness"),
broom::tidy(TukeyHSD(aov(alpha ~seqrun+psyllid_family+genus_geo,
data=div_table2))) %>% mutate(type="Shannon"),
broom::tidy(TukeyHSD(aov(pd ~seqrun+psyllid_family+genus_geo,
data=div_table2))) %>% mutate(type="Phylogenetic")
)
write_csv(mg_div, "output/alpha/major_genera_alpha.csv")
# Difference between major genera only ANOVA
report::report(aov(alpha ~seqrun+psyllid_family+psyllid_genus, data=div_table2))
report::report(aov(Shannon ~seqrun+psyllid_family+psyllid_genus, data=div_table2))
report::report(aov(pd ~seqrun+psyllid_family+psyllid_genus, data=div_table2))
# Difference between between major genera/geography factors ANOVA
report::report(aov(alpha ~seqrun+psyllid_family+genus_geo, data=div_table2))
report::report(aov(Shannon ~seqrun+psyllid_family+genus_geo, data=div_table2))
report::report(aov(alpha ~seqrun+psyllid_family+genus_geo, data=div_table2))
# Association with phylogeny
dat <- div_table %>%
filter(psyllid_spp %in% pruned.tree$tip.label) %>% #Subset to common
group_by(psyllid_spp) %>%
dplyr::select(-where(is.character)) %>%
summarise_all(mean) %>%
arrange(match(psyllid_spp, pruned.tree$tip.label)) %>%
as.data.frame() %>%
magrittr::set_rownames(.$psyllid_spp) %>%
dplyr::select(-psyllid_spp)
# Add positive and negative controls
dat$random <- rnorm(length(dat$alpha), sd = 10) #Random association
dat$bm <- rTraitCont(pruned.tree) #Brownian motion
# Make phylosignal object and measure signal between univariate traits.
p4d <- phylobase::phylo4d(pruned.tree, dat)
signal <- phylosignal::phyloSignal(p4d = p4d, methods = c("I", "Lambda", "K"), reps = 999)%>%
as.data.frame() %>%
rownames_to_column("measure")
# print phylogenetic signal
signal
write_csv(signal, "output/alpha/phylosignal.csv")
# Locate signal
lipa <- lipaMoran(p4d, reps=999)
lipa.p4d <- lipaMoran(p4d, as.p4d = TRUE, reps=999)
barplot.phylo4d(lipa.p4d, bar.col = (lipa$p.value < 0.05) + 1, center = FALSE, scale = FALSE) + title("Non-rarefied")
#write out lipa
lipa_out <- cbind(lipa$lipa %>%
as.data.frame %>%
rename_all(funs(paste0(., "_stat"))),
lipa$p.value %>%
as.data.frame %>%
rename_all(funs(paste0(., "_pval")))
)%>%
rownames_to_column("psyllid_spp")
write_csv(lipa_out, "output/alpha/lipa.csv")
Rarefied
See if the pattern holds even with rarefaction to lowest sample
# Rarefied richness
ps2_rare <- rarefy_even_depth(ps2, sample.size = min(sample_sums(ps2)),
rngseed = 666, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)
# Get richness measures
richness_rare <- phyloseq::estimate_richness(ps2_rare, measures=c("Shannon")) %>%
rownames_to_column("Sample_Name") %>%
mutate(Sample_Name = Sample_Name %>%
str_remove("^X") %>%
str_replace_all("\\.", " "))
#Set number of randomisations for calculating significance
# Calculate Faith's PD-index & Species richness - with Standard errors
#sespd_rare <- picante::ses.pd(as(phyloseq::otu_table(ps2_rare), "matrix"), phyloseq::phy_tree(ps2_rare), null.model = #"taxa.labels", include.root = F, runs = 99)
pd_rare <- picante::pd(as(phyloseq::otu_table(ps2_rare), "matrix"), phyloseq::phy_tree(ps2_rare), include.root = FALSE)
# Join together
div_table_rare <- pd_rare %>%
rownames_to_column("Sample_Name") %>%
dplyr::select(Sample_Name, alpha = SR, pd = PD) %>%
left_join(richness, by="Sample_Name") %>%
left_join(sample_data(ps2_rare) %>%
as("matrix") %>%
as.data.frame() %>%
filter(!duplicated(Sample_Name)) %>%
dplyr::select(Sample_Name, psyllid_spp, psyllid_genus, genus_geo),
by = "Sample_Name")
# Summarise means
div_table_rare %>%
summarise_if(is.numeric, mean)
# Difference between all major genera for alpha diversity ANOVA
div_table_rare2 <- div_table_rare %>%
dplyr::filter(psyllid_genus %in% c("Powellia", "Ctenarytaina", "Psylla"))
mg_div_rare <- bind_rows(
broom::tidy(TukeyHSD(aov(alpha ~seqrun+psyllid_family+psyllid_genus,
data=div_table_rare2))) %>% mutate(type="Richness"),
broom::tidy(TukeyHSD(aov(Shannon ~seqrun+psyllid_family+psyllid_genus,
data=div_table_rare2))) %>% mutate(type="Shannon"),
broom::tidy(TukeyHSD(aov(pd ~seqrun+psyllid_family+psyllid_genus,
data=div_table_rare2))) %>% mutate(type="Phylogenetic"),
broom::tidy(TukeyHSD(aov(alpha ~seqrun+psyllid_family+genus_geo,
data=div_table_rare2))) %>% mutate(type="Richness"),
broom::tidy(TukeyHSD(aov(alpha ~seqrun+psyllid_family+genus_geo,
data=div_table_rare2))) %>% mutate(type="Shannon"),
broom::tidy(TukeyHSD(aov(pd ~seqrun+psyllid_family+genus_geo,
data=div_table_rare2))) %>% mutate(type="Phylogenetic")
)
write_csv(mg_div_rare, "output/alpha/major_genera_alpha_rarefied.csv")
# Difference between genus factors ANOVA
report::report(aov(alpha ~seqrun+psyllid_family+psyllid_genus, data=div_table_rare2))
report::report(aov(Shannon ~seqrun+psyllid_family+psyllid_genus, data=div_table_rare2))
report::report(aov(pd ~seqrun+psyllid_family+psyllid_genus, data=div_table_rare2))
# Difference between genus/geography factors ANOVA
report::report(aov(alpha ~seqrun+psyllid_family+genus_geo, data=div_table_rare2))
report::report(aov(Shannon ~seqrun+psyllid_family+genus_geo, data=div_table_rare2))
report::report(aov(alpha ~seqrun+psyllid_family+genus_geo, data=div_table_rare2))
dat <- div_table_rare %>%
filter(psyllid_spp %in% pruned.tree$tip.label) %>% #Subset to common
group_by(psyllid_spp) %>%
dplyr::select(-where(is.character)) %>%
summarise_all(mean) %>%
arrange(match(psyllid_spp, pruned.tree$tip.label)) %>%
as.data.frame() %>%
magrittr::set_rownames(.$psyllid_spp) %>%
dplyr::select(-psyllid_spp)
# Add positive and negative controls
dat$random <- rnorm(length(dat$alpha), sd = 10) #Random association
dat$bm <- rTraitCont(pruned.tree) #Brownian motion
# Make phylosignal object and measure signal between univariate traits.
p4d <- phylobase::phylo4d(pruned.tree, dat)
signal_rare <- phylosignal::phyloSignal(p4d = p4d, methods = c("I", "Lambda", "K"), reps = 999) %>%
as.data.frame() %>%
rownames_to_column("measure")
# print phylogenetic signal
signal_rare
write_csv(signal_rare, "output/alpha/phylosignal_rarefied.csv")
# Locate signal
lipa <- lipaMoran(p4d, reps=999)
lipa.p4d <- lipaMoran(p4d, as.p4d = TRUE, reps=999)
barplot.phylo4d(lipa.p4d, bar.col = (lipa$p.value < 0.05) + 1, center = FALSE, scale = FALSE) + title("Rarefied")
#write out lipa
lipa_out_rare <- cbind(lipa$lipa %>%
as.data.frame %>%
rename_all(funs(paste0(., "_stat"))),
lipa$p.value %>%
as.data.frame %>%
rename_all(funs(paste0(., "_pval")))
)%>%
rownames_to_column("psyllid_spp")
write_csv(lipa_out_rare, "output/alpha/lipa_rarefied.csv")
Alpha no Gammaproteobacteria
# Rarefied richness
ps2_subset <- ps2 %>%
subset_taxa(class != "Gammaproteobacteria") %>% #is this working?
filter_taxa(function(x) mean(x) > 0, TRUE)#Drop missing taxa from table
ps2_subset <- prune_samples(sample_sums(ps2_subset) >0 , ps2_subset)
message(nsamples(ps2) - nsamples(ps2_subset), " Samples and ", ntaxa(ps2) - ntaxa(ps2_subset), " taxa Dropped")
# Get richness measures
richness_subset <- phyloseq::estimate_richness(ps2_subset, measures=c("Shannon")) %>%
rownames_to_column("Sample_Name") %>%
mutate(Sample_Name = Sample_Name %>%
str_remove("^X") %>%
str_replace_all("\\.", " "))
#Set number of randomisations for calculating significance
# Calculate Faith's PD-index & Species richness - with Standard errors
#sespd_subset <- picante::ses.pd(as(phyloseq::otu_table(ps2_subset), "matrix"), phyloseq::phy_tree(ps2_subset), null.model = "taxa.labels", include.root = F, runs = 99)
pd_subset <- picante::pd(as(phyloseq::otu_table(ps2_subset), "matrix"), phyloseq::phy_tree(ps2_subset), include.root = FALSE)
# Join together
div_table_subset <- pd_subset %>%
rownames_to_column("Sample_Name") %>%
dplyr::select(Sample_Name, alpha = SR, pd = PD) %>%
left_join(richness, by="Sample_Name") %>%
left_join(sample_data(ps2_subset) %>%
as("matrix") %>%
as.data.frame() %>%
filter(!duplicated(Sample_Name)) %>%
dplyr::select(Sample_Name, psyllid_spp, psyllid_genus, genus_geo),
by = "Sample_Name")
# Summarise means
div_table_subset %>%
summarise_if(is.numeric, ~mean(.x, na.rm=TRUE))
# Difference between all major genera for alpha diversity ANOVA
div_table_subset2 <- div_table_subset %>%
dplyr::filter(psyllid_genus %in% c("Powellia", "Ctenarytaina", "Psylla"))
mg_div_subset <- bind_rows(
broom::tidy(TukeyHSD(aov(alpha ~psyllid_genus, data=div_table_subset2))) %>% mutate(type="Richness"),
broom::tidy(TukeyHSD(aov(Shannon ~psyllid_genus, data=div_table_subset2))) %>% mutate(type="Shannon"),
broom::tidy(TukeyHSD(aov(pd ~psyllid_genus, data=div_table_subset2))) %>% mutate(type="Phylogenetic"),
broom::tidy(TukeyHSD(aov(alpha ~genus_geo, data=div_table_subset2))) %>% mutate(type="Richness"),
broom::tidy(TukeyHSD(aov(alpha ~genus_geo, data=div_table_subset2))) %>% mutate(type="Shannon"),
broom::tidy(TukeyHSD(aov(pd ~genus_geo, data=div_table_subset2))) %>% mutate(type="Phylogenetic")
)
write_csv(mg_div_subset, "output/alpha/major_genera_alpha_nogamma.csv")
# Difference between genus factors ANOVA
report::report(aov(alpha ~psyllid_genus, data=div_table_subset2))
report::report(aov(Shannon ~psyllid_genus, data=div_table_subset2))
report::report(aov(pd ~psyllid_genus, data=div_table_subset2))
# Difference between genus/geography factors ANOVA
report::report(aov(alpha ~genus_geo, data=div_table_subset2))
report::report(aov(Shannon ~genus_geo, data=div_table_subset2))
report::report(aov(alpha ~genus_geo, data=div_table_subset2))
Beta diversity
Microbe distances
ps2_dist <- ps2
#ps2_dist <- ps2_filt
# Get OTU tables
otutab <- otu_table(ps2_dist)
#Impute zeroes for compositional distances
otutab_n0 <- as.matrix(zCompositions::cmultRepl(otutab, method="BL", output="p-counts"))
#Root & label phylogenetic tree
phy_tree(ps2_dist) <- multi2di(phy_tree(ps2_dist))
phy_tree(ps2_dist) <- makeNodeLabel(phy_tree(ps2_dist), method="number", prefix='n')
name.balance(phy_tree(ps2_dist), tax_table(ps2_dist), 'n1')
#Calculate different distance metrics
metrics <- c("Bray", "Jaccard", "Aitchison","Philr", "Unifrac", "WUnifrac")
distlist <- vector("list", length=length(metrics))
names(distlist) <- metrics
distlist$Jaccard <- as.matrix(vegdist(otutab, method="jac",binary = T))
distlist$Bray <- as.matrix(vegdist(otutab, method="bray"))
distlist$Aitchison <- as.matrix(vegdist(CoDaSeq::codaSeq.clr(otutab_n0), method="euclidean"))
distlist$Philr <- as.matrix(vegdist(philr::philr(otutab_n0, phy_tree(ps2_dist),
part.weights='enorm.x.gm.counts',
ilr.weights='blw.sqrt'), method="euclidean", na.rm=TRUE))
distlist$Unifrac <- as.matrix(phyloseq::UniFrac(ps2_dist, weighted=FALSE, parallel = TRUE))
distlist$WUnifrac <- as.matrix(phyloseq::UniFrac(ps2_dist, weighted=TRUE, parallel = TRUE))
# Create low abundance filtered dataset
filterfun1 <- function(x){
x[(x / sum(x)) < (1e-4)] <- 0
return(x)
}
ps2_filt <- transform_sample_counts(ps2, fun = filterfun1) %>%
filter_taxa(function(x) mean(x) > 0, TRUE) #Drop missing taxa from table
print(paste0((ntaxa(ps2)-ntaxa(ps2_filt)), " taxa under threshold removed"))
# Get OTU tables
otutab <- otu_table(ps2_filt)
#Impute zeroes for compositional distances
otutab_n0 <- as.matrix(zCompositions::cmultRepl(otutab, method="BL", output="p-counts"))
#Root & label phylogenetic tree
phy_tree(ps2_filt) <- multi2di(phy_tree(ps2_filt))
phy_tree(ps2_filt) <- makeNodeLabel(phy_tree(ps2_filt), method="number", prefix='n')
name.balance(phy_tree(ps2_filt), tax_table(ps2_filt), 'n1')
#Calculate different distance metrics
metrics <- c("Bray", "Jaccard", "Aitchison","Philr", "Unifrac", "WUnifrac")
distlist_filt <- vector("list", length=length(metrics))
names(distlist_filt) <- metrics
distlist_filt$Jaccard <- as.matrix(vegdist(otutab, method="jac",binary = T))
distlist_filt$Bray <- as.matrix(vegdist(otutab, method="bray"))
distlist_filt$Aitchison <- as.matrix(vegdist(CoDaSeq::codaSeq.clr(otutab_n0), method="euclidean"))
distlist_filt$Philr <- as.matrix(vegdist(philr::philr(otutab_n0, phy_tree(ps2_filt),
part.weights='enorm.x.gm.counts',
ilr.weights='blw.sqrt'), method="euclidean", na.rm=TRUE))
distlist_filt$Unifrac <- as.matrix(phyloseq::UniFrac(ps2_filt, weighted=FALSE, parallel = TRUE))
distlist_filt$WUnifrac <- as.matrix(phyloseq::UniFrac(ps2_filt, weighted=TRUE, parallel = TRUE))
# Create dataset without gammaproteoba
ps2_subset <- ps2 %>%
subset_taxa(class != "Gammaproteobacteria") %>% #is this working?
filter_taxa(function(x) mean(x) > 0, TRUE)#Drop missing taxa from table
ps2_subset <- prune_samples(sample_sums(ps2_subset) >0 , ps2_subset)
message(nsamples(ps2) - nsamples(ps2_subset), " Samples and ", ntaxa(ps2) - ntaxa(ps2_subset), " taxa Dropped")
# Get OTU tables
otutab_subset <- otu_table(ps2_subset)
#Impute zeroes for compositional distances
otutab_subset_n0 <- as.matrix(zCompositions::cmultRepl(otutab_subset, method="BL", output="p-counts"))
#Root phylogenetic tree
phy_tree(ps2_subset) <- multi2di(phy_tree(ps2_subset))
phy_tree(ps2_subset) <- makeNodeLabel(phy_tree(ps2_subset), method="number", prefix='n')
name.balance(phy_tree(ps2_subset), tax_table(ps2_subset), 'n1')
#Calculate different distance metrics
metrics <- c("Bray", "Jaccard", "Aitchison","Philr", "Unifrac", "WUnifrac")
distlist_subset <- vector("list", length=length(metrics))
names(distlist_subset) <- metrics
distlist_subset$Jaccard <- as.matrix(vegdist(otutab_subset, method="jac",binary = T))
distlist_subset$Bray <- as.matrix(vegdist(otutab_subset, method="bray"))
distlist_subset$Aitchison <- as.matrix(vegdist(CoDaSeq::codaSeq.clr(otutab_subset_n0), method="euclidean"))
distlist_subset$Philr <- as.matrix(vegdist(philr::philr(otutab_subset_n0, phy_tree(ps2_subset),
part.weights='enorm.x.gm.counts',
ilr.weights='blw.sqrt'), method="euclidean", na.rm=TRUE))
distlist_subset$Unifrac <- as.matrix(phyloseq::UniFrac(ps2_subset, weighted=FALSE, parallel = TRUE))
distlist_subset$WUnifrac <- as.matrix(phyloseq::UniFrac(ps2_subset, weighted=TRUE, parallel = TRUE))
# Mantel test to check concordance of beta diversity pre and post filtering
purrr::map2(distlist, distlist_filt,~{
subsample <- intersect(colnames(.x), colnames(.y))
as.data.frame(vegan::mantel(.x[subsample, subsample], .y[subsample, subsample])[c("statistic","signif","permutations")])
}) %>%
bind_rows(.id="dist")
# Mantel test to check concordance of beta diversity pre and post subset
purrr::map2(distlist, distlist_subset,~{
subsample <- intersect(colnames(.x), colnames(.y))
as.data.frame(vegan::mantel(.x[subsample, subsample], .y[subsample, subsample])[c("statistic","signif","permutations")])
}) %>%
bind_rows(.id="dist")
Adonis & Betadisper
# ADONIS is constructed heirarchially to marginalise techical variance then moving down the taxonomic ranks
# Adonis test
metadata <- sample_data(ps2) %>%
as("data.frame")
adonis_results <- distlist %>%
purrr::map(function(x) {
y <- as.dist(x[metadata$Sample_Name, metadata$Sample_Name])
bind_rows(
broom::tidy(adonis2(y~seqrun+psyllid_family+psyllid_genus+psyllid_spp, method="euclidean", data=metadata,
permutations=999, by="terms")) %>%
mutate(test = paste(term[!term %in% c("Residual", "Total")], collapse="-")),
#broom::tidy(adonis2(y~seqrun+hostplant_spp+psyllid_spp, method="euclidean", data=metadata,
# permutations=999, by="margin")) %>%
# mutate(test = paste(term[!term %in% c("Residual", "Total")], collapse="-")),
broom::tidy(adonis2(y~seqrun+hostplant_spp, method="euclidean", data=metadata,
permutations=999, by="terms")) %>%
mutate(test = paste(term[!term %in% c("Residual", "Total")], collapse="-"))
)
}) %>%
bind_rows(.id="dist")
# Check homogeneity
betadisper_results <- distlist %>%
purrr::map(function(x) {
y <- as.dist(x[metadata$Sample_Name, metadata$Sample_Name])
bind_rows(
as_tibble(permutest(vegan::betadisper(y, metadata$psyllid_spp))$tab, rownames="term") %>%
mutate(test="psyllid_spp"),
as_tibble(permutest(vegan::betadisper(y, metadata$hostplant_spp))$tab, rownames="term") %>%
mutate(test="hostplant_spp"),
)
}) %>%
bind_rows(.id="dist")
dir.create("output/beta")
write_csv(adonis_results, "output/beta/adonis_fulldata.csv")
write_csv(betadisper_results, "output/beta/betadisper_fulldata.csv")
Same tree dissimilarities
Look at the similarities in the microbiome of the psyllid specimens collected from the same host plant
hostplant_metadata <- metadata %>% mutate(ingroup = case_when(
Sample_Name %in% c("94","107", "113","93","106", "112") ~ "fraxini-fraxinicola",
Sample_Name %in% c("200big", "201big", "200small", "201small") ~ "apicalis-frodobagginsi",
TRUE ~ "other"
))
broom::tidy(adonis2(distlist$Aitchison~ingroup + psyllid_spp, method="euclidean",
data=hostplant_metadata))
pairwise.adonis2(distlist$Aitchison~ingroup + psyllid_spp, method="euclidean",
data=hostplant_metadata)
Barplot
# Plot tree
p <- ggtree(pruned.tree) + geom_tiplab(align=TRUE) + geom_nodelab(geom='label') +
scale_x_continuous(expand=c(0, 0.1))
# Plot bar
ps3_bar <- ps3 %>%
speedyseq::tax_glom(taxrank = "order") %>% # agglomerate at Order level
transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
speedyseq::psmelt() %>%
mutate(plotlabel = phylum) %>%
mutate(plotlabel = case_when(
Abundance >= 0.01 & phylum=="Proteobacteria" ~ paste0("P - ", order), # Change this to whatever taxrank we want
Abundance >= 0.01 & !phylum=="Proteobacteria"~ phylum ,
Abundance < 0.01 ~ "NA"
)) %>%
dplyr::na_if("NA") %>%
dplyr::select(psyllid_spp, plotlabel, phylum, order, Abundance) %>%
left_join(p$data %>%
as_data_frame %>%
dplyr::filter(isTip) %>%
dplyr::select(y, label) %>%
dplyr::rename(psyllid_spp=label))
gg.bar <- ggplot(ps3_bar, aes(x=y, y=Abundance, fill=plotlabel)) +
geom_col() +
coord_flip()+
scale_fill_manual(values=colorRampPalette(brewer.pal(9, "Set1"))(length(unique(ps3_bar$plotlabel))-1), na.value="grey") +
base_theme +
theme(legend.position = "bottom",
#panel.grid.major.x = element_line(colour="grey92", size=0.5, linetype="dashed"),
strip.background = element_rect(fill = "grey92",
colour = "black", size = 1),
axis.text.y = element_blank(),
axis.ticks.y=element_blank(),
axis.title.y=element_blank()) +
scale_y_continuous(expand=c(0,0), labels = scales::percent)+
scale_x_continuous(expand=c(0,0)) +
labs(x = NULL ,
y = "Relative Abundance",
fill = NULL)
# Make richness plots
gg.rich <- div_table %>%
group_by(psyllid_spp) %>%
summarise(alpha=mean(alpha), pd=mean(pd)/10e+7, Shannon=mean(Shannon)) %>%
ungroup() %>%
pivot_longer(-psyllid_spp, names_to="measure",
values_to = "value") %>%
left_join(lipa_out %>%
dplyr::select(psyllid_spp, alpha_pval, pd_pval, Shannon_pval) %>%
pivot_longer(-psyllid_spp,
names_to="measure",
values_to = "pval") %>%
mutate(measure = str_remove(measure, "_pval"))) %>%
left_join(p$data %>%
as_data_frame %>%
dplyr::filter(isTip) %>%
dplyr::select(y, label) %>%
dplyr::rename(psyllid_spp=label)) %>%
mutate_at(vars(measure), funs(factor(., levels=c("alpha","Shannon","pd")))) %>%
ggplot(aes(x=y, y=value, fill=pval<0.05)) +
geom_col() +
facet_grid(~measure, scales="free") +
coord_flip()+
base_theme +
theme(legend.position = "bottom",
panel.grid.major.x = element_line(colour="grey92", size=0.5, linetype="dashed"),
# strip.background = element_rect(fill = "grey92",
# colour = "black", size = 1),
axis.text.y = element_blank(),
axis.ticks.y=element_blank(),
axis.title.y=element_blank(),
axis.text.x = element_text(angle=45, hjust=1)) +
scale_fill_manual(values=c("darkgray", "darkred")) +
scale_x_continuous(breaks=scales::pretty_breaks(n=1))+
scale_y_continuous(expand=c(0,0))+
scale_x_continuous(expand=c(0,0))
## Collection_hist
gg.spp <- sample_data(ps2) %>%
as("matrix") %>%
as.data.frame() %>%
dplyr::rename(label = psyllid_spp) %>%
group_by(label) %>%
summarise(n_species = n()) %>%
left_join(p$data %>%
filter(isTip) %>% dplyr::select(c(label, y))) %>%
filter(!is.na(y)) %>%
ggplot(aes(x=y, y=1, fill=n_species)) +
geom_tile() +
geom_text(aes(label=n_species))+
coord_flip() +
theme_void() +
theme(legend.position = "bottom") +
scale_fill_distiller(palette = "Reds", direction=1)+
scale_y_continuous(expand=c(0,0))+
scale_x_continuous(expand=c(0,0))
# Make tree with no underscores in name
p2 <- p
p2$data$label <- str_replace(p2$data$label, "_", " ")
#Arrange
Fig1 <- p2 + gg.spp + gg.bar + gg.rich + plot_layout(nrow=1, widths=c(1,0.08,2,0.6))
pdf(file="figs/Beta.pdf", width = 8, height = 11 , paper="a4")
plot(Fig1)
try(dev.off(), silent=TRUE)
Phylosymbiosis
Prepare distance matrices
## Psyllid phylogeny cophenetic distance matrix
phylo.dist <- cophenetic(pruned.tree) %>%
sqrt() %>%
as.data.frame() %>%
rownames_to_column("psyllid_spp.x") %>%
pivot_longer(cols=-psyllid_spp.x,
names_to="psyllid_spp.y",
values_to = "dist") %>%
right_join(sample_data(ps2) %>%
as("matrix") %>%
as.data.frame() %>%
dplyr::select(Sample_Name, psyllid_spp) %>%
dplyr::rename(psyllid_spp.x = psyllid_spp, Sample_Name.x = Sample_Name),
by="psyllid_spp.x")%>%
right_join(sample_data(ps2) %>%
as("matrix") %>%
as.data.frame() %>%
dplyr::select(Sample_Name, psyllid_spp) %>%
dplyr::rename(psyllid_spp.y = psyllid_spp, Sample_Name.y = Sample_Name),
by="psyllid_spp.y") %>%
filter(!is.na(dist)) %>%
dplyr::select(-psyllid_spp.y, -psyllid_spp.x) %>%
pivot_wider(names_from = Sample_Name.y, values_from = dist) %>%
column_to_rownames("Sample_Name.x") %>%
as.matrix()
plant.tree <- read.tree("sample_data/plant_tree.nwk")
plant.dist <- cophenetic(plant.tree) %>%
sqrt() %>%
as.data.frame() %>%
rownames_to_column("hostplant_spp.x") %>%
pivot_longer(cols=-hostplant_spp.x,
names_to="hostplant_spp.y",
values_to = "dist") %>%
right_join(sample_data(ps2) %>%
as("matrix") %>%
as.data.frame() %>%
dplyr::select(Sample_Name, hostplant_spp) %>%
dplyr::rename(hostplant_spp.x = hostplant_spp, Sample_Name.x = Sample_Name),
by="hostplant_spp.x")%>%
right_join(sample_data(ps2) %>%
as("matrix") %>%
as.data.frame() %>%
dplyr::select(Sample_Name, hostplant_spp) %>%
dplyr::rename(hostplant_spp.y = hostplant_spp, Sample_Name.y = Sample_Name),
by="hostplant_spp.y") %>%
filter(!is.na(dist))%>%
dplyr::select(-hostplant_spp.y, -hostplant_spp.x) %>%
pivot_wider(names_from = Sample_Name.y, values_from = dist) %>%
column_to_rownames("Sample_Name.x") %>%
as.matrix()
# Spatial distance matrix
envData <- sample_data(ps2) %>%
as("matrix") %>%
as.data.frame() %>%
dplyr::select(long, lat) %>%
mutate(long = as.numeric(long), lat=as.numeric(lat))%>%
drop_na()
spat.dist <- spDists(as.matrix(envData), longlat=TRUE) %>%
as.data.frame() %>%
magrittr::set_rownames(rownames(envData) %>% str_replace(pattern="\\_S(.*)$",replacement="") %>% make.unique()) %>%
magrittr::set_colnames(rownames(envData) %>% str_replace(pattern="\\_S(.*)$",replacement="") %>% make.unique()) %>%
rownames_to_column("SampleID") %>%
mutate(SampleID = SampleID %>% str_replace(pattern="\\_S(.*)$",replacement="") %>% make.unique()) %>%
column_to_rownames("SampleID") %>%
as.matrix()
Whole dataset
set.seed(909)
dir.create("output/phylosymbiosis")
# Matrix correlations
#only use samples present in all
subsample <- Reduce(intersect, list(rownames(otu_table(ps2)), colnames(phylo.dist), colnames(plant.dist), colnames(spat.dist)))
# Mantel test
mantel_results <- distlist %>%
purrr::map(function(x){
run_mantel(x, dists = c("phylo.dist", "plant.dist", "spat.dist"),
subsample = subsample, type ="mantel", nboot=1000)
}) %>%
bind_rows(.id="dist")
write_csv(mantel_results %>%
dplyr::select(-one_of("pval1","pval2")),#only keep two sided P values
"output/phylosymbiosis/mantel_fulldata.csv")
# Partial Mantel Test
pmantel_results <- distlist %>%
purrr::map(function(x){
run_mantel(x, dists = c("phylo.dist", "plant.dist", "spat.dist"),
subsample = subsample, type = "partial", nboot=1000)
}) %>%
bind_rows(.id="dist")
write_csv(pmantel_results %>% dplyr::select(-one_of("pval1","pval2")),
"output/phylosymbiosis/pmantel_fulldata.csv")
## Plot mantels
gg.mantels <- bind_rows(mantel_results, pmantel_results) %>%
dplyr::mutate(dist1 = case_when(
str_detect(dist1, "plant.dist") ~ "Hostplant phylogeny",
str_detect(dist1, "phylo.dist") ~ "Psyllid phylogeny",
str_detect(dist1, "spat.dist") ~ "Spatial distance"
)) %>%
filter(dist == "Aitchison") %>%
mutate(dist1 = factor(dist1, levels= c("Spatial distance","Hostplant phylogeny", "Psyllid phylogeny"))) %>%
mutate(type = type %>%
str_replace("mantel", "Mantel") %>%
str_replace("partial_Mantel", "Partial Mantel")) %>%
ggplot(aes(x=mantelr, y=dist1, colour=dist1)) +
geom_vline(xintercept = 0, colour="black", linetype=2) +
geom_pointrange(aes(xmin=`llim.2.5%`, xmax=`ulim.97.5%`), size=1) +
scale_color_manual(values=c("Hostplant phylogeny"="#b2df8a","Spatial distance"="#a6cee3", "Psyllid phylogeny"="#1f78b4"))+
facet_wrap(~type, ncol=2) +
labs( x = "Mantel R", y = NULL, colour=NULL) +
base_theme +
theme(legend.position = "none",
panel.grid.major = element_line())
gg.mantels
pdf(file="figs/fig3_mantels.pdf", width = 8, height = 4, paper="a4r")
plot(gg.mantels)
try(dev.off(), silent=TRUE)
Without Gammaproteobacteria
# Adonis test
metadata <- sample_data(ps2_subset) %>%
as("data.frame")
adonis_results <- distlist_subset %>%
purrr::map(function(x) {
y <- as.dist(x[metadata$Sample_Name, metadata$Sample_Name])
bind_rows(
broom::tidy(adonis2(y~seqrun+psyllid_family+psyllid_genus+psyllid_spp, method="euclidean", data=metadata,
permutations=999)) %>%
mutate(test = paste(term[!term %in% c("Residual", "Total")], collapse="-")),
broom::tidy(adonis2(y~seqrun+hostplant_spp, method="euclidean", data=metadata,
permutations=999)) %>%
mutate(test = paste(term[!term %in% c("Residual", "Total")], collapse="-"))
)
}) %>%
bind_rows(.id="dist")
write_csv(adonis_results, "output/beta/adonis_subset.csv")
# Matrix correlations
#only use samples present in all
subsample <- Reduce(intersect, list(rownames(otu_table(ps2_subset)), colnames(phylo.dist), colnames(plant.dist), colnames(spat.dist)))
# Mantel test
mantel_results <- distlist_subset %>%
purrr::map(function(x){
run_mantel(x, dists=c("phylo.dist", "plant.dist", "spat.dist"),
subsample=subsample, type="mantel")
}) %>%
bind_rows(.id="dist")
write_csv(mantel_results %>% dplyr::select(-pval1, -pval2), "output/phylosymbiosis/mantel_subset.csv")
# Partial Mantel Test
pmantel_results <- distlist_subset %>%
purrr::map(function(x){
run_mantel(x, dists=c("phylo.dist", "plant.dist", "spat.dist"),
subsample=subsample, type="partial")
}) %>%
bind_rows(.id="dist")
write_csv(pmantel_results %>% dplyr::select(-pval1,-pval2), "output/phylosymbiosis/pmantel_subset.csv")
PCA plots
#set distance
pca_dist <- distlist$Aitchison
#PCA
r.pcx <- prcomp(pca_dist)
pc_samp <- data.frame(Sample_Name = rownames(r.pcx$x), r.pcx$x[, 1:2])%>%
left_join(sample_data(ps2) %>%
as("matrix") %>%
as.data.frame(), by="Sample_Name")
# calculate percent variance explained for the axis labels
pc1 <- round(r.pcx$sdev[1]^2/sum(r.pcx$sdev^2),2)
pc2 <- round(r.pcx$sdev[2]^2/sum(r.pcx$sdev^2),2)
pc_ylab <- paste("PC1: ", pc1, sep="")
pc_xlab <- paste("PC2: ", pc2, sep="")
## colour by psyllid phylogeny
dend <- as.dendrogram(force.ultrametric(pruned.tree))
membership <- as.data.frame(cutree(dend, k=11)) %>%
rownames_to_column("psyllid_spp") %>%
magrittr::set_colnames(c("psyllid_spp", "cluster"))
pca_phylo <- pc_samp %>%
left_join(membership)
gg.pca <- ggplot(data=pca_phylo, aes(x=PC2, y=PC1, colour=as.factor(cluster))) +
geom_point(alpha=0.5, size=3) + #, shape=21, colour="black"
#geom_point(data=pc_otu,aes(PC1, PC2)) +
theme_classic() +
scale_colour_brewer(palette="Paired") +
geom_hline(yintercept = 0, linetype=2, alpha=0.5) +
geom_vline(xintercept = 0, linetype=2, alpha=0.5) +
xlab(pc_xlab) +
ylab(pc_ylab) +
theme(legend.position = "none") +
scale_y_reverse(position = "right")
#coord_fixed(ratio=pc2/pc1) # Scale plot by variance explained
p1 <- ggtree(pruned.tree) +
scale_x_continuous(expand=c(0, 0.2)) +
theme_tree2() +
theme(legend.position = "none")
colours_p1 <- p1$data %>%
left_join(membership %>%
dplyr::rename(label = psyllid_spp)
) %>%
mutate(new_label = label %>% str_replace_all("_", " "))
p1 <- p1 %<+% colours_p1 +
geom_tippoint(aes(colour=as.factor(cluster))) +
geom_tiplab(aes(colour=as.factor(cluster), label=new_label))+
scale_colour_brewer(palette="Paired")
gg.psyllid_pca <- p1 + gg.pca + plot_annotation(title="Psyllid phylogenetic distance")
gg.psyllid_pca
pdf(file="figs/psyllid_pca.pdf", width = 8, height = 11, paper="a4")
plot(gg.psyllid_pca)
try(dev.off(), silent=TRUE)
# Colour by plant phylogeny
plant.tree <- read.tree("sample_data/plant_tree.nwk")
plant.tree <- drop.tip(plant.tree, "Sophora_microphylla_-kowhai" )
plant.tree <- multi2di(plant.tree)
dend <- as.dendrogram(plant.tree )
#test <- color_branches(dend, k=12)
#plot(test)
membership <- as.data.frame(cutree(dend, k=12)) %>%
rownames_to_column("hostplant_spp") %>%
magrittr::set_colnames(c("hostplant_spp", "cluster"))
pca_phylo <- pc_samp %>%
left_join(membership)
gg.pca <- ggplot(data=pca_phylo, aes(x=PC2, y=PC1, colour=as.factor(cluster))) +
geom_point(alpha=0.5, size=3) + #, shape=21, colour="black"
#geom_point(data=pc_otu,aes(PC1, PC2)) +
theme_classic() +
scale_colour_brewer(palette="Paired") +
geom_hline(yintercept = 0, linetype=2, alpha=0.5) +
geom_vline(xintercept = 0, linetype=2, alpha=0.5) +
xlab(pc_xlab) +
ylab(pc_ylab) +
theme(legend.position = "none") +
scale_y_reverse(position = "right")
#coord_fixed(ratio=pc2/pc1) # Scale plot by variance explained
p2 <- ggtree(plant.tree) +
scale_x_continuous(expand=c(0, 70)) +
theme_tree2() +
theme(legend.position = "none")
colours_p2 <- p2$data %>%
left_join(membership %>%
dplyr::rename(label = hostplant_spp)
)%>%
mutate(new_label = label %>% str_replace_all("_", " "))
p2 <- p2 %<+% colours_p2 +
geom_tippoint(aes(colour=as.factor(cluster))) +
geom_tiplab(aes(colour=as.factor(cluster), label=new_label))+
scale_colour_brewer(palette="Paired")
gg.plant_pca <- p2 + gg.pca + plot_annotation(title="Plant phylogenetic distance")
gg.plant_pca
pdf(file="figs/plant_pca.pdf", width = 8, height = 11, paper="a4")
plot(gg.plant_pca)
try(dev.off(), silent=TRUE)
# HC of spatial distance
dend <- hclust(as.dist(spat.dist), method="average")
#test <- color_branches(dend, k=12)
#plot(test)
membership <- as.data.frame(cutree(dend, k=12)) %>%
rownames_to_column("Sample_Name") %>%
magrittr::set_colnames(c("Sample_Name", "cluster"))
pca_phylo <- pc_samp %>%
left_join(membership)
gg.pca <- ggplot(data=pca_phylo, aes(x=PC2, y=PC1, colour=as.factor(cluster))) +
geom_point(alpha=0.5, size=3) + #, shape=21, colour="black"
#geom_point(data=pc_otu,aes(PC1, PC2)) +
theme_classic() +
scale_colour_brewer(palette="Paired") +
geom_hline(yintercept = 0, linetype=2, alpha=0.5) +
geom_vline(xintercept = 0, linetype=2, alpha=0.5) +
xlab(pc_xlab) +
ylab(pc_ylab) +
theme(legend.position = "none") +
scale_y_reverse(position = "right")
#coord_fixed(ratio=pc2/pc1) # Scale plot by variance explained
p3 <- ggtree(as.phylo(dend) )+
scale_x_continuous(expand=c(0, 400)) +
theme_tree2() +
theme(legend.position = "none")
colours_p3 <- p3$data %>%
left_join(membership %>%
dplyr::rename(label = Sample_Name)
)%>%
mutate(new_label = label %>% str_replace_all("_", " "))
p3 <- p3 %<+% colours_p3 +
geom_tippoint(aes(colour=as.factor(cluster))) +
#geom_tiplab(aes(colour=as.factor(cluster), label=new_label))+
scale_colour_brewer(palette="Paired")
gg.spat_pca <- p3 + gg.pca + plot_annotation(title="Spatial Distance")
gg.spat_pca
pdf(file="figs/spatial_pca.pdf", width = 8, height = 11, paper="a4")
plot(gg.spat_pca)
try(dev.off(), silent=TRUE)
Microbiome 3 genera heatmap
filterfun1 <- function(x){
x[(x / sum(x)) < (1e-4)] <- 0 #1e-4 is 0.01% threshold
return(x)
}
ps3_filt <- transform_sample_counts(ps3, fun = filterfun1)%>%
filter_taxa(function(x) mean(x) > 0, TRUE) #Drop missing taxa from table
print(paste0((ntaxa(ps3)-ntaxa(ps3_filt)), " taxa under threshold removed"))
#Get co-occurance matrix
coocur <- ps3_filt %>%
subset_samples(psyllid_genus %in% c("Powellia", "Ctenarytaina", "Psylla")) %>%
filter_taxa(function(x) mean(x) > 0, TRUE) %>%
otu_table %>%
as.matrix() %>%
apply(2, function(x) ifelse(x > 0, 1, 0))
colnames(coocur) <- colnames(coocur) %>%
str_replace_all(" |-", "_")
rownames(coocur) <- rownames(coocur) %>%
str_replace_all(" |-", "_")
# H cophenetic distance
h_tree <- pruned.tree
h_tree$tip.label <- h_tree$tip.label %>%
str_replace_all(" |-", "_")
h_tree <- drop.tip(h_tree, setdiff(h_tree$tip.label, rownames(coocur)))
# P cophenetic distance
s_tree <- phy_tree(ps3)
s_tree$tip.label <- s_tree$tip.label %>%
str_replace_all(" |-", "_")
s_tree <- drop.tip(s_tree, setdiff(s_tree$tip.label, colnames(coocur)))
coocur <- coocur[h_tree$tip.label, s_tree$tip.label]
# Read in Paco runs
paco_run_powellia <- readRDS("output/cophylogeny/paco_powellia_microbe_filtered_asym.rds")
paco_run_cten <- readRDS("output/cophylogeny/paco_ctenarytaina_microbe_filtered_asym.rds")
paco_run_psylla <- readRDS("output/cophylogeny/paco_psylla_microbe_filtered_asym.rds")
paco_run_powellia$gof
paco_run_cten$gof
paco_run_psylla$gof
z_trans <- function(x){
(x - mean(x, na.rm=TRUE)) / sd(x, na.rm=TRUE)
}
# Get link importance
links <- do.call("list", mget(grep("paco_run_",names(.GlobalEnv),value=TRUE))) %>%
purrr::map(function(x){
genus_name <- rownames(x$H)[1] %>% str_remove("_.*$")
data.frame(
joint=names(x$jackknife),
values=unname(x$jackknife),
#upper=unname(x$jackknife$upper),
genus = genus_name
)
}) %>%
bind_rows() %>%
separate(joint, into=c("Sample_Name", "OTU"), sep="-", extra="merge", remove=FALSE) %>%
group_by(genus) %>%
mutate(values = values^2)%>%
mutate(mean_val = mean(values)) %>%
mutate(signif = case_when(
values < mean_val ~ 1,
values > mean_val ~ 0
))%>%
group_by(genus) %>%
mutate(
#st_dev = sd(values),
# z_values = values/st_dev
z_values = z_trans(values)
) %>%
ungroup()
# Plot links
links %>%
dplyr::rename(label = joint) %>%
mutate(label = as.factor(label),
label=reorder_within(label, values, genus))%>%
#arrange(-values) %>%
ggplot(aes(x=label, y=values, colour=as.factor(signif))) + #
geom_point(show.legend = FALSE) +
facet_wrap(~genus, scales = "free")+
#geom_errorbar(aes(ymin=values, ymax=upper)) +
geom_hline(aes(yintercept = mean_val), lty=3) +
scale_x_reordered()+
scale_color_manual(values=c("steelblue", "darkorange1"), name = "Significant") +
theme_classic()+
theme(axis.text.x = element_blank(), legend.position = "right")
#Get observed residuals of Procrustean superimposition
paco_residuals <- do.call("list", mget(grep("paco_run_",names(.GlobalEnv),value=TRUE))) %>%
purrr::map(function(x){
res <- residuals_paco(x$proc, type = "interaction")
genus_name <- rownames(x$H)[1] %>% str_remove("_.*$")
data.frame(
OTU=names(res),
values=unname(res),
genus=genus_name
)
})%>%
bind_rows()
# Plot residuals
ggplot(paco_residuals, aes(x=values))+
geom_density(fill='grey70')+
theme_bw()+
facet_grid(genus~.) +
xlab('Procrustes residuals')+
ylab('Frequency')
# Parafit run
PF_run_powellia <- readRDS("output/cophylogeny/parafit_powellia_microbe_filtered.rds")
PF_run_cten <- readRDS("output/cophylogeny/parafit_ctenarytaina_microbe_filtered.rds")
PF_run_psylla <- readRDS("output/cophylogeny/parafit_psylla_microbe_filtered.rds")
PF_run_powellia$ParaFitGlobal
PF_run_powellia$p.global
PF_run_cten$ParaFitGlobal
PF_run_cten$p.global
PF_run_psylla$ParaFitGlobal
PF_run_psylla$p.global
# Joining isnt working here because they were made separately - so getting the rownames from cooccur no logner works
PF_links <- do.call("list", mget(grep("PF_run_",names(.GlobalEnv),value=TRUE))) %>%
purrr::map(function(x){
genus_name <- names(x$para.per.host[1]) %>% str_remove("_.*$")
as.data.frame(x$link.table) %>%
left_join(enframe(names(x$para.per.host), name = "Host", value="psyllid_spp") %>%
mutate(Host = as.numeric(Host))) %>%
left_join(enframe(names(x$host.per.para), name = "Parasite", value="OTU") %>%
mutate(Parasite = as.numeric(Parasite))) %>%
mutate(genus = genus_name)
}) %>%
bind_rows() %>%
mutate(signif = case_when(
p.F1 < 0.05 ~ 1,
p.F1 > 0.05 ~ 0
)) %>%
group_by(genus) %>%
mutate(z_values = z_trans(F1.stat)) %>%
ungroup()
# Proportion of significant links
PF_links %>%
group_by(genus) %>%
summarise(s = count(signif > 0), ns = count(signif == 0))
# Cophyloplot
coocur.lut <- which(coocur ==1, arr.ind=TRUE)
assoc <- cbind(rownames(coocur)[coocur.lut[,1]], colnames(coocur)[coocur.lut[,2]])
# Rotate the nodes using phytools
library(phytools)
obj <- cophylo(tr1=h_tree, tr2=s_tree, assoc=assoc, rotate=FALSE)
tree1 <- obj[["trees"]][[1]]
p1 <- ggtree(tree1, ladderize=FALSE, aes(colour=links))
weights_p1 <- p1$data %>%
left_join(links %>%
group_by(Sample_Name) %>%
summarise(values = mean(signif)) %>%
#summarise(values = mean(z_values)) %>%
dplyr::rename(label = Sample_Name)
)
## Get values for higher nodes
weights_p1 <- weights_p1 %>%
left_join(data.frame(node=weights_p1$node, links = weights_p1$node %>% purrr::map_dbl(average_descendants, tree=tree1, df=weights_p1)))
p1 <- p1 %<+% weights_p1 +
scale_color_gradient(low="steelblue", high="darkorange1") +
# scale_colour_gradient(high="steelblue", low="darkorange1", trans="log10") +
theme(legend.position = "none")
# OTU tree
tree2 <- obj[["trees"]][[2]]
# make a clade label list
tax_groups <- as.data.frame(tax_table(ps3)) %>%
rownames_to_column("label") %>%
dplyr::mutate(label = label %>% str_replace_all("-", "_") %>%
str_replace_all(" ", "_")) %>%
dplyr::select(label, genus) %>%
filter(label %in% tree2$tip.label) %>%
group_by(genus)
group_name <- group_keys(tax_groups) %>%
mutate(group_name = genus %>% str_remove_all("\\[|\\]"))
cls <- tax_groups %>%
group_split() %>%
purrr::map(pull, label) %>%
set_names(group_name$group_name)
newtree <- groupOTU(tree2, cls)
p2 <- ggtree(newtree, ladderize=TRUE, aes(colour=links))
weights_p2 <- p2$data %>%
left_join(links %>%
group_by(OTU) %>%
summarise(values = mean(signif)) %>%
#summarise(values = mean(z_values)) %>%
dplyr::rename(label = OTU)
)
#Should be able to use castor to make this faster
weights_p2 <- weights_p2 %>%
left_join(data.frame(node=weights_p2$node, links = weights_p2$node %>% purrr::map_dbl(average_descendants, tree=tree2, df=weights_p2)))
p2 <- p2 %<+% weights_p2 +
#scale_colour_gradient(high="steelblue", low="darkorange1", trans="log10") +
scale_color_gradient(low="steelblue", high="darkorange1") +
theme(legend.position = "none")
# Plot heatmaps
heatmap_dat <- obj$assoc %>%
as_data_frame() %>%
magrittr::set_colnames(c("label.x", "label.y")) %>%
mutate(genus = label.x %>% str_remove("_.*$")) %>%
left_join(links %>%
dplyr::select(label.x = Sample_Name, label.y = OTU, signif_paco=signif, val_paco = z_values)) %>%
left_join(PF_links %>%
dplyr::select(label.x = psyllid_spp, label.y = OTU, signif_para=signif, val_para = z_values)) %>%
left_join(p1$data %>% dplyr::select(label, y) %>% dplyr::rename(label.x = label), by="label.x") %>%
left_join(p2$data %>% dplyr::select(label, y) %>% dplyr::rename(label.y = label), by="label.y") %>%
rownames_to_column("assoc") %>%
dplyr::rename(Sample_Name = label.x,
OTU = label.y,
pos_x = y.x,
pos_y = y.y) %>%
mutate(signif_paco = replace_na(signif_paco, 0),
signif_para = replace_na(signif_para, 0)) %>%
mutate(highlight = case_when(
signif_paco==0 & signif_para==0 ~ "NS",
signif_paco==0 & signif_para==1 ~ "para",
signif_paco==1 & signif_para==0 ~ "paco",
signif_paco==1 & signif_para==1 ~ "both"
))
gg.heatmap <- heatmap_dat %>%
mutate(OTU = OTU %>% str_replace_all("_", " "),
Sample_Name = Sample_Name %>% str_replace_all("_"," ")) %>%
dplyr::select(Sample_Name, OTU, genus, highlight, pos_x, pos_y, val_para, val_paco) %>%
dplyr::mutate(Sample_Name = factor(Sample_Name),
OTU = factor(OTU),
genus=factor(genus, levels=c("Psylla" ,"Powellia", "Ctenarytaina"))) %>%
ggplot(aes(x= fct_reorder(Sample_Name, pos_x), y=fct_reorder(OTU, pos_y), fill=highlight )) +
geom_tile() +
theme_bw() +
facet_grid(~genus, drop = TRUE, scales="free", space ="free")+
theme(axis.text.x = element_text(angle=45, hjust=1),
axis.title.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.y = element_blank(),
legend.position = "none") +
scale_y_discrete(expand=c(0,0))+
scale_y_discrete(expand=c(0,0)) +
#scale_fill_gradient(high="steelblue", low="darkorange1")
scale_fill_manual(values=c("NS"="steelblue","paco"="darkorange1","para"="#da2b91", "both"="#91da2b"), na.translate=FALSE)
gg.heatmap
# Density plot of mismatch
density_dat <- heatmap_dat %>%
left_join(p2$data %>% dplyr::select(OTU = label, y)) %>%
group_by(OTU, y) %>%
summarise(sum_paco = sum(signif_paco, na.rm=TRUE), sum_para = sum(signif_para, na.rm=TRUE)) %>%
mutate(total_sum = sum(sum_paco, sum_para, na.rm=TRUE)) %>%
#summarise(total_sum = mean(val_paco, na.rm=TRUE)) %>%
ungroup()
density_labels <- density_dat %>%
left_join(tax_table(ps3) %>%
as("matrix") %>%
as_tibble(rownames="OTU") %>%
mutate(OTU = OTU %>% str_replace_all(" |-", "_")))%>%
arrange(y)
chunk = 50
n <- nrow(density_labels)
r <- rep(1:ceiling(n/chunk),each=chunk)[1:n]
density_labels_chunked <- split(density_labels,r)
density_labels <- density_labels_chunked %>%
purrr::map(function(x){
df <- x %>%
mutate(zscore = (total_sum - mean(total_sum, na.rm=TRUE))/sd(total_sum, na.rm=TRUE)) %>%
filter( total_sum > 3, zscore > 3,) #,
if(nrow(df) > 0){
out <- df %>%
summarise(y = mean(y), total_sum = max(total_sum), annot = case_when(
length(unique(genus)) == 1 ~ names(which.max(table(genus))),
length(unique(genus)) > 1~ names(which.max(table(family)))))
return(out)
}
}) %>%
bind_rows()
gg.density <- density_dat %>%
filter(total_sum > 0) %>%
ggplot(aes(x = y, y=total_sum, colour=total_sum)) +
geom_point(size=0.01, alpha=1)+
scale_color_gradient(low="steelblue", high="darkorange1") +
geom_text(data=density_labels, aes(label = annot), hjust=0) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
theme_void() +
theme(legend.position = "none")+
coord_flip()
gg.density
# Instead of density could i label lineages with the greatest value?
top <- wrap_elements(grid::textGrob('')) +(p1+ coord_flip() + scale_x_reverse(expand=c(0,0))+ scale_y_continuous(expand=c(0,0))) + wrap_elements(grid::textGrob('')) +plot_layout(widths=c(0.5,3, 0.1))
bottom <- p2+ scale_y_continuous(expand=c(0,0)) + gg.heatmap + gg.density + plot_layout(widths=c(0.5,3, 0.1))
gg.treemap <- top / bottom + plot_layout(heights=c(0.5,3))
gg.treemap
pdf(file="figs/3genus_heatmaps.pdf", width = 8, height = 11, paper="a4")
plot(gg.treemap)
try(dev.off(), silent=TRUE)
# Make plot ranking links, and nodes
gg.host_links <- heatmap_dat %>%
dplyr::rename(label = Sample_Name) %>%
drop_na() %>%
group_by(label) %>%
summarise(signif_paco = sum(signif_paco), signif_para= sum(signif_para)) %>%
pivot_longer(starts_with("signif_"),
names_to="type",
values_to="values") %>%
mutate(type = type %>% str_remove("signif_")) %>%
mutate(label = label %>%
str_replace_all("_", " ") %>%
str_replace(" sp", " sp."))%>%
mutate(label = as.factor(label))%>%
filter(values > 0) %>%
arrange(-values) %>%
ggplot(aes(x=fct_reorder(label, values, sum ), y=values, fill=type)) +
geom_col() +
scale_fill_manual(values=c("NS"="steelblue","paco"="darkorange1","para"="#da2b91", "both"="#91da2b"), na.translate=FALSE)+
theme_classic()+
theme(axis.text.y = element_text(face="italic"),
legend.position = "bottom")+
labs(y = "Number of Signficant links", x=NULL, title ="Psyllid taxa") +
coord_flip()
gg.sym_links <- heatmap_dat %>%
dplyr::rename(label = OTU) %>%
left_join(tax_groups %>% dplyr::rename(otu_genus = genus)) %>%
drop_na() %>%
group_by(otu_genus) %>%
summarise(signif_paco = sum(signif_paco), signif_para= sum(signif_para)) %>%
pivot_longer(starts_with("signif_"),
names_to="type",
values_to="values") %>%
dplyr::rename(label = otu_genus) %>%
dplyr::mutate(label = label %>%
str_remove("\\/.*$") %>%
str_replace("NA_canariense", "Bradyrhizobium_canariense") %>%
str_remove("^s__")
) %>%
mutate(type = type %>% str_remove("signif_")) %>%
mutate(label = as.factor(label),
label=fct_reorder(label, values, sum ))%>%
group_by(label) %>%
mutate(total = sum(values)) %>%
ungroup() %>%
filter(total > 0) %>%
top_n(80, total) %>%
arrange(-values) %>%
ggplot(aes(x=label, y=values, fill=type)) +
geom_col() +
scale_fill_manual(values=c("NS"="steelblue","paco"="darkorange1","para"="#da2b91", "both"="#91da2b"), na.translate=FALSE)+
theme_classic()+
theme(axis.text.y = element_text(face="italic"),
legend.position = "bottom")+
labs(y = "Number of Signficant links", x=NULL, title ="Microbial genera") +
coord_flip()
gg.host_links + gg.sym_links
pdf(file="figs/3genus_fit_summary.pdf", width = 8, height = 11, paper="a4")
plot(gg.host_links + gg.sym_links)
try(dev.off(), silent=TRUE)
Tanglegrams
Psyllid ~ Carsonella
#Flag top abundance carsonella by sample
top_carson <- ps2 %>%
transform_sample_counts(function (x) x/sum(x)) %>%
speedyseq::psmelt() %>%
filter(psyllid_spp %in% pruned.tree$tip.label) %>%
filter(genus=="Candidatus Carsonella") %>%
group_by(Sample) %>%
filter(Abundance > 0) %>%
top_n(1, wt=Abundance) %>%
mutate(top = TRUE)
coocur <- ps2 %>%
transform_sample_counts(function (x) x/sum(x)) %>%
speedyseq::psmelt() %>%
filter(psyllid_spp %in% pruned.tree$tip.label) %>%
left_join(top_carson) %>%
filter(genus=="Candidatus Carsonella", top==TRUE) %>%
dplyr::select(OTU, psyllid_spp, SampleID, Abundance) %>%
mutate(OTU = OTU %>%
str_replace_all(" |-", "_")) %>%
dplyr::group_by(OTU, psyllid_spp) %>%
summarise(Abundance = sum(Abundance)) %>%
pivot_wider(id_cols = psyllid_spp,
names_from = OTU,
values_from=Abundance,
values_fill = list(Abundance = 0)) %>%
column_to_rownames("psyllid_spp") %>%
as.matrix() %>%
apply(2, function(x) ifelse(x > 0, 1, 0))
# H cophenetic distance
h_tree <- pruned.tree
h_tree$tip.label <- h_tree$tip.label %>%
str_replace_all(" |-", "_")
h_tree <- drop.tip(h_tree, setdiff(h_tree$tip.label, rownames(coocur)))
h_dist <- sqrt(cophenetic(h_tree))
# P cophenetic distance
s_tree <- phy_tree(ps3)
s_tree$tip.label <- s_tree$tip.label %>%
str_replace_all(" |-", "_")
s_tree <- drop.tip(s_tree, setdiff(s_tree$tip.label, colnames(coocur)))
s_dist <- sqrt(cophenetic(s_tree)/1e+6 ) ##convert to Mya so integers are small enough for PACO
coocur <- coocur[h_tree$tip.label, s_tree$tip.label]
# prepare paco data
D <- prepare_paco_data(H=h_dist, P=s_dist, HP=coocur)
# Add pcord
D <- add_pcoord(D, correction='none')
p_host <- ggplot(D$H_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
geom_point() +
theme_bw() +
ggtitle("H PCA")
p_para <- ggplot(D$P_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
geom_point() +
theme_bw()+
ggtitle("P PCA")
plot(p_host + p_para)
# run paco
paco_run <- PACo(D, nperm=999, seed=909, method='quasiswap', symmetric=FALSE)
#print overall significance
print(paco_run$gof)
# Get interaction-specific cophylogenetic contributions using jacknifing
paco_run <- paco_links(paco_run)
links <- data.frame(joint=names(paco_run$jackknife), #losing sv50 here?
values=unname(paco_run$jackknife)#,
#upper=unname(paco_run$jackknife)
) %>%
mutate(OTU = joint) %>%
separate(OTU, into=c("Sample_Name", "OTU"), sep="-", extra="merge") %>%
mutate(signif = case_when(
values < mean(.$values) ~ 1,
values > mean(.$values) ~ 0
))
# Plot links
links %>%
dplyr::rename(label = joint) %>%
mutate(label = as.factor(label),
label=fct_reorder(label, values, sum))%>%
arrange(-values) %>%
ggplot(aes(x=label, y=values, colour=as.factor(signif))) +
geom_point(show.legend = FALSE) +
#geom_errorbar(aes(ymin=values, ymax=upper)) +
geom_hline(yintercept = mean(links$values)) +
scale_color_manual(values=c("steelblue", "darkorange1")) +
theme_classic()+
theme(axis.text.x = element_blank())
dir.create("output/cophylogeny/psyllid_carsonella")
write_csv(links, "output/cophylogeny/psyllid_carsonella/psyllid_carsonella_links.csv")
write_csv(links %>%
group_by(OTU) %>%
summarise(values = mean(values)), "output/cophylogeny/psyllid_carsonella/carsonella_weights.csv")
write_csv(links %>%
group_by(Sample_Name) %>%
summarise(values = mean(values)), "output/cophylogeny/psyllid_carsonella/psyllid_weights.csv")
#Get observed residuals of Procrustean superimposition
paco_residuals <- residuals_paco(paco_run$proc, type = "interaction")
# Visualise residuals
res <- data.frame(OTU=names(paco_residuals), values=unname(paco_residuals)) %>%
separate(OTU, into=c("Sample_Name", "OTU"), sep="-", extra="merge")
ggplot(res, aes(x=values))+
geom_density(fill='grey70')+
theme_bw()+
xlab('Procrustes residuals')+
ylab('Frequency')
# Parafit run
PF_run <- parafit(h_dist, s_dist,coocur, nperm=999, test.links=TRUE)
PF_run$ParaFitGlobal
PF_run$p.global
PF_links <- as.data.frame(PF_run$link.table) %>%
left_join(enframe(names(PF_run$para.per.host), name = "Host", value="psyllid_spp") %>%
mutate(Host = as.numeric(Host))) %>%
left_join(enframe(names(PF_run$host.per.para), name = "Parasite", value="OTU") %>%
mutate(Parasite = as.numeric(Parasite)))%>%
mutate(signif = case_when(
p.F1 < 0.05 ~ 1,
p.F1 > 0.05 ~ 0
))
# Cophyloplot
coocur.lut <- which(coocur ==1, arr.ind=TRUE)
assoc <- cbind(rownames(coocur)[coocur.lut[,1]], colnames(coocur)[coocur.lut[,2]])
# Rotate the nodes using phytools
obj <- cophylo(tr1=h_tree, tr2=s_tree, assoc=assoc, rotate=TRUE)
# psyllid_tree
tree1 <- obj[["trees"]][[1]]
p1 <- ggtree(tree1)
atmeto_node <- p1$data %>%
filter(str_detect(label, "Atmeto")) %>%
pull(node)
root_node <- p1$data %>%
filter(str_detect(label, "Atmeto")) %>%
pull(parent)
tree1 <- groupOTU(tree1, .node=atmeto_node) # Make the atmeto dotted to indicate outgroup was rescaled
p1 <- ggtree(tree1, ladderize=FALSE, aes(colour=links,linetype=group))
weights_p1 <- p1$data %>%
left_join(links %>%
group_by(Sample_Name) %>%
summarise(PA_values = mean(signif)) %>%
dplyr::rename(label = Sample_Name)
)%>%
left_join(PF_links %>%
group_by(psyllid_spp) %>%
summarise(PF_values = mean(signif)) %>%
dplyr::rename(label = psyllid_spp)
) %>%
mutate(values = PA_values + PF_values)
#Scale the atmetocranium and root nodes to be shorter
p1$data[p1$data$node %in% atmeto_node, "x"] <- max(p1$data$x)
p1$data[p1$data$node %in% root_node, "x"] <- 0.2 #root
p1$data$node[p1$data$node]
## Get values for higher nodes
weights_p1 <- weights_p1 %>%
left_join(data.frame(node=weights_p1$node, links = weights_p1$node %>% purrr::map_dbl(average_descendants, tree=tree1, df=weights_p1)))
p1 <- p1 %<+% weights_p1 + geom_tippoint(aes(colour=links)) +
scale_color_gradient(low="steelblue", high="darkorange1") +
theme(legend.position = "none")
# OTU tree
tree2 <- obj[["trees"]][[2]]
s_tree <- drop.tip(tree2, setdiff(tree2$tip.label, obj$assoc[,2]))
p2 <- ggtree(tree2 , ladderize=FALSE, aes(colour=links))
weights_p2 <- p2$data %>%
left_join(links %>%
group_by(OTU) %>%
summarise(PA_values = mean(signif)) %>%
dplyr::rename(label = OTU)
)%>%
left_join(PF_links %>%
group_by(OTU) %>%
summarise(PF_values = mean(signif)) %>%
dplyr::rename(label = OTU)
) %>%
mutate(values = PA_values + PF_values)
weights_p2 <- weights_p2 %>%
left_join(data.frame(node=weights_p2$node, links = weights_p2$node %>% purrr::map_dbl(average_descendants, tree=tree2, df=weights_p2)))
p2 <- p2 %<+% weights_p2 + geom_tippoint(aes(colour=links)) +
scale_color_gradient(low="steelblue", high="darkorange1") +
theme(legend.position = "none")
# Tanglegram
tangle <- obj$assoc %>%
as_data_frame() %>%
magrittr::set_colnames(c("label.x", "label.y")) %>%
left_join(links %>%
dplyr::select(label.x = Sample_Name, label.y = OTU, signif_paco=signif)) %>%
left_join(PF_links %>%
dplyr::select(label.x = psyllid_spp, label.y = OTU, signif_para=signif)) %>%
left_join(p1$data %>% dplyr::select(label, y) %>% dplyr::rename(label.x = label), by="label.x") %>%
left_join(p2$data %>% dplyr::select(label, y) %>% dplyr::rename(label.y = label), by="label.y") %>%
rownames_to_column("assoc") %>%
rename_all(~str_replace(.x,pattern="\\.", replacement="_")) %>%
pivot_longer(ends_with(c("_x", "_y")),
names_to=c(".value", "tree"),
names_sep = "_"
) %>%
mutate(signif_paco = replace_na(signif_paco, 0),
signif_para = replace_na(signif_para, 0)) %>%
mutate(signif = case_when(
signif_paco==0 & signif_para==0 ~ "NS",
signif_paco==0 & signif_para==1 ~ "para",
signif_paco==1 & signif_para==0 ~ "paco",
signif_paco==1 & signif_para==1 ~ "both"
)) %>%
mutate(tree = tree %>%
str_replace("x", "host")%>%
str_replace("y", "microbe")) %>%
filter(!is.na(label))%>%
group_by(tree) %>%
mutate(y = y / max(y)) %>%
mutate(label = label %>% str_replace_all("_", " "))
gg.tangle <- ggplot(tangle, aes(x=tree, y=y, group=assoc, colour=as.factor(signif))) +
geom_line(alpha=0.8) +
geom_text(data = tangle %>%
filter(tree=="host"),
aes(label=label),stat = 'unique', hjust=1, check_overlap = TRUE)+
geom_text(data = tangle %>%
filter(tree=="microbe"),
aes(label=label),stat = 'unique', hjust=0, check_overlap = TRUE)+
scale_colour_manual(values=c("NS"="steelblue", "paco"="darkorange1", "para"="#da2b91", "both"="#91da2b"), na.translate=FALSE)+
scale_x_discrete(expand = expansion(add=c(0.8,0.8))) +
theme_void() +
scale_y_continuous(expand=c(0.005,0.005))+
theme(legend.position = "bottom") +
labs(colour="Significance:")
gg.carson_tangle <- p1 + gg.tangle + (p2 + scale_x_reverse()) #+ plot_layout(widths = c(2, 1, 2))
gg.carson_tangle
pdf(file="figs/carsonella_tanglegram.pdf", width = 8, height = 11, paper="a4")
plot(gg.carson_tangle)
try(dev.off(), silent=TRUE)
Psyllid ~ Sodalis
#Flag top abundance sodalis by sample
top_sodalis <- ps2 %>%
transform_sample_counts(function (x) x/sum(x)) %>%
speedyseq::psmelt() %>%
filter(psyllid_spp %in% pruned.tree$tip.label) %>%
filter(genus=="Sodalis") %>%
group_by(Sample) %>%
filter(Abundance > 0) %>%
top_n(1, wt=Abundance) %>%
mutate(top = TRUE)
coocur <- ps2 %>%
transform_sample_counts(function (x) x/sum(x)) %>%
speedyseq::psmelt() %>%
filter(psyllid_spp %in% pruned.tree$tip.label) %>%
left_join(top_sodalis) %>%
filter(genus=="Sodalis", top==TRUE) %>%
dplyr::select(OTU, psyllid_spp, SampleID, Abundance) %>%
mutate(OTU = OTU %>%
str_replace_all(" |-", "_")) %>%
dplyr::group_by(OTU, psyllid_spp) %>%
summarise(Abundance = sum(Abundance)) %>%
pivot_wider(id_cols = psyllid_spp,
names_from = OTU,
values_from=Abundance,
values_fill = list(Abundance = 0)) %>%
column_to_rownames("psyllid_spp") %>%
as.matrix() %>%
apply(2, function(x) ifelse(x > 0, 1, 0))
coocur <- coocur[ rowSums(coocur) > 0,colSums(coocur) > 0]
# H cophenetic distance
h_tree <- pruned.tree
h_tree$tip.label <- h_tree$tip.label %>%
str_replace_all(" |-", "_")
h_tree <- drop.tip(h_tree, setdiff(h_tree$tip.label, rownames(coocur)))
h_dist <- sqrt(cophenetic(h_tree))
# P cophenetic distance
s_tree <- phy_tree(ps3)
s_tree$tip.label <- s_tree$tip.label %>%
str_replace_all(" |-", "_")
s_tree <- drop.tip(s_tree, setdiff(s_tree$tip.label, colnames(coocur)))
s_dist <- sqrt(cophenetic(s_tree)/1e+6 ) ##convert to Mya so integers are small enough for PACO
#alternatively - sqrt(cophenetic(s_tree))
coocur <- coocur[h_tree$tip.label, s_tree$tip.label]
# prepare paco data
D <- prepare_paco_data(H=h_dist, P=s_dist, HP=coocur)
# Add pcord
D <- add_pcoord(D, correction='none')
p_host <- ggplot(D$H_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
geom_point() +
theme_bw() +
ggtitle("H PCA")
p_para <- ggplot(D$P_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
geom_point() +
theme_bw()+
ggtitle("P PCA")
plot(p_host + p_para)
# run paco
paco_run <- PACo(D, nperm=999, seed=909, method='quasiswap', symmetric=TRUE)
#print overall significance
print(paco_run$gof)
# Get interaction-specific cophylogenetic contributions using jacknifing
paco_run <- paco_links(paco_run)
links <- data.frame(joint=names(paco_run$jackknife), #losing sv50 here?
values=unname(paco_run$jackknife)#,
#upper=unname(paco_run$jackknife$upper)
) %>%
mutate(OTU = joint) %>%
separate(OTU, into=c("Sample_Name", "OTU"), sep="-", extra="merge") %>%
mutate(signif = case_when(
values < mean(.$values) ~ 1,
values > mean(.$values) ~ 0
))
# Plot links
links %>%
dplyr::rename(label = joint) %>%
mutate(label = as.factor(label),
label=fct_reorder(label, values, sum))%>%
arrange(-values) %>%
ggplot(aes(x=label, y=values, colour=as.factor(signif))) +
geom_point(show.legend = FALSE) +
# geom_errorbar(aes(ymin=values, ymax=upper)) +
geom_hline(yintercept = mean(links$values)) +
scale_color_manual(values=c("steelblue", "darkorange1")) +
theme_classic()+
theme(axis.text.x = element_blank())
dir.create("output/cophylogeny/psyllid_secondary")
write_csv(links, "output/cophylogeny/psyllid_secondary/psyllid_secondary_links.csv")
write_csv(links %>%
group_by(OTU) %>%
summarise(values = mean(values)), "output/cophylogeny/psyllid_secondary/secondary_symbiont_weights.csv")
write_csv(links %>%
group_by(Sample_Name) %>%
summarise(values = mean(values)), "output/cophylogeny/psyllid_secondary/psyllid_weights.csv")
#Get observed residuals of Procrustean superimposition
paco_residuals <- residuals_paco(paco_run$proc, type = "interaction")
# Visualise residuals
res <- data.frame(OTU=names(paco_residuals), values=unname(paco_residuals)) %>%
separate(OTU, into=c("Sample_Name", "OTU"), sep="-", extra="merge")
ggplot(res, aes(x=values))+
geom_density(fill='grey70')+
theme_bw()+
xlab('Procrustes residuals')+
ylab('Frequency')
# Parafit run
PF_run <- parafit(h_dist, s_dist,coocur, nperm=999, test.links=TRUE)
PF_run$ParaFitGlobal
PF_run$p.global
PF_links <- as.data.frame(PF_run$link.table) %>%
left_join(enframe(names(PF_run$para.per.host), name = "Host", value="psyllid_spp") %>%
mutate(Host = as.numeric(Host))) %>%
left_join(enframe(names(PF_run$host.per.para), name = "Parasite", value="OTU") %>%
mutate(Parasite = as.numeric(Parasite)))%>%
mutate(signif = case_when(
p.F1 < 0.05 ~ 1,
p.F1 > 0.05 ~ 0
))
# Cophyloplot
coocur.lut <- which(coocur ==1, arr.ind=TRUE)
assoc <- cbind(rownames(coocur)[coocur.lut[,1]], colnames(coocur)[coocur.lut[,2]])
# Rotate the nodes using phytools
library(phytools)
obj <- cophylo(tr1=h_tree, tr2=s_tree, assoc=assoc, rotate=TRUE)
# psyllid_tree
tree1 <- obj[["trees"]][[1]]
p1 <- ggtree(tree1)
atmeto_node <- p1$data %>%
filter(str_detect(label, "Atmeto")) %>%
pull(node)
root_node <- p1$data %>%
filter(str_detect(label, "Atmeto")) %>%
pull(parent)
tree1 <- groupOTU(tree1, .node=atmeto_node) # Make the atmeto dotted to indicate outgroup was rescaled
p1 <- ggtree(tree1, ladderize=FALSE, aes(colour=links,linetype=group))
weights_p1 <- p1$data %>%
left_join(links %>%
group_by(Sample_Name) %>%
summarise(PA_values = mean(signif)) %>%
dplyr::rename(label = Sample_Name)
)%>%
left_join(PF_links %>%
group_by(psyllid_spp) %>%
summarise(PF_values = mean(signif)) %>%
dplyr::rename(label = psyllid_spp)
) %>%
mutate(values = PA_values + PF_values)
#Scale the atmetocranium and root nodes to be shorter
p1$data[p1$data$node %in% atmeto_node, "x"] <- max(p1$data$x)
p1$data[p1$data$node %in% root_node, "x"] <- 0.2 #root
p1$data$node[p1$data$node]
## Get values for higher nodes
weights_p1 <- weights_p1 %>%
left_join(data.frame(node=weights_p1$node, links = weights_p1$node %>% purrr::map_dbl(average_descendants, tree=tree1, df=weights_p1)))
p1 <- p1 %<+% weights_p1 + geom_tippoint(aes(colour=links)) +
scale_color_gradient(low="steelblue", high="darkorange1") +
theme(legend.position = "none")
# OTU tree
tree2 <- obj[["trees"]][[2]]
s_tree <- drop.tip(tree2, setdiff(tree2$tip.label, obj$assoc[,2]))
p2 <- ggtree(tree2 , ladderize=FALSE, aes(colour=links))
weights_p2 <- p2$data %>%
left_join(links %>%
group_by(OTU) %>%
summarise(PA_values = mean(signif)) %>%
dplyr::rename(label = OTU)
)%>%
left_join(PF_links %>%
group_by(OTU) %>%
summarise(PF_values = mean(signif)) %>%
dplyr::rename(label = OTU)
) %>%
mutate(values = PA_values + PF_values)
weights_p2 <- weights_p2 %>%
left_join(data.frame(node=weights_p2$node, links = weights_p2$node %>% purrr::map_dbl(average_descendants, tree=tree2, df=weights_p2)))
p2 <- p2 %<+% weights_p2 + geom_tippoint(aes(colour=links)) +
scale_color_gradient(low="steelblue", high="darkorange1") +
theme(legend.position = "none")
# Tanglegram
tangle <- obj$assoc %>%
as_data_frame() %>%
magrittr::set_colnames(c("label.x", "label.y")) %>%
left_join(links %>%
dplyr::select(label.x = Sample_Name, label.y = OTU, signif_paco=signif)) %>%
left_join(PF_links %>%
dplyr::select(label.x = psyllid_spp, label.y = OTU, signif_para=signif)) %>%
left_join(p1$data %>% dplyr::select(label, y) %>% dplyr::rename(label.x = label), by="label.x") %>%
left_join(p2$data %>% dplyr::select(label, y) %>% dplyr::rename(label.y = label), by="label.y") %>%
rownames_to_column("assoc") %>%
rename_all(~str_replace(.x,pattern="\\.", replacement="_")) %>%
pivot_longer(ends_with(c("_x", "_y")),
names_to=c(".value", "tree"),
names_sep = "_"
) %>%
mutate(signif_paco = replace_na(signif_paco, 0),
signif_para = replace_na(signif_para, 0)) %>%
mutate(signif = case_when(
signif_paco==0 & signif_para==0 ~ "NS",
signif_paco==0 & signif_para==1 ~ "para",
signif_paco==1 & signif_para==0 ~ "paco",
signif_paco==1 & signif_para==1 ~ "both"
)) %>%
mutate(tree = tree %>%
str_replace("x", "host")%>%
str_replace("y", "microbe")) %>%
filter(!is.na(label))%>%
group_by(tree) %>%
mutate(y = y / max(y))%>%
mutate(label = label %>% str_replace_all("_", " "))
gg.tangle <- ggplot(tangle, aes(x=tree, y=y, group=assoc, colour=as.factor(signif))) +
geom_line(alpha=0.8) +
geom_text(data = tangle %>%
filter(tree=="host"),
aes(label=label),stat = 'unique', hjust=1, check_overlap = TRUE)+
geom_text(data = tangle %>%
filter(tree=="microbe"),
aes(label=label),stat = 'unique', hjust=0, check_overlap = TRUE)+
scale_colour_manual(values=c("NS"="steelblue", "paco"="darkorange1", "para"="#da2b91", "both"="#91da2b"), na.translate=FALSE)+
scale_x_discrete(expand = expansion(add=c(0.8,0.8))) +
theme_void() +
scale_y_continuous(expand=c(0.01,0.01))+
theme(legend.position = "bottom") +
labs(colour="Significance:")
gg.sodalis_tangle <- p1 + gg.tangle + (p2 + scale_x_reverse()) #+ plot_layout(widths = c(2, 1, 2))
gg.sodalis_tangle
pdf(file="figs/sodalis_tanglegram.pdf", width = 8, height = 11, paper="a4")
plot(gg.sodalis_tangle)
try(dev.off(), silent=TRUE)
Psyllid ~ Arsenophonus
# Subset to top arsenophonus
top_arse <- ps2 %>%
transform_sample_counts(function (x) x/sum(x)) %>%
speedyseq::psmelt() %>%
filter(psyllid_spp %in% pruned.tree$tip.label) %>%
filter(genus=="Arsenophonus") %>%
group_by(Sample) %>%
filter(Abundance > 0) %>%
top_n(1, wt=Abundance) %>%
mutate(top = TRUE)
coocur <- ps2 %>%
transform_sample_counts(function (x) x/sum(x)) %>%
speedyseq::psmelt() %>%
filter(psyllid_spp %in% pruned.tree$tip.label) %>%
left_join(top_arse) %>%
filter(genus=="Arsenophonus", top==TRUE) %>%
dplyr::select(OTU, psyllid_spp, SampleID, Abundance) %>%
mutate(OTU = OTU %>%
str_replace_all(" |-", "_")) %>%
dplyr::group_by(OTU, psyllid_spp) %>%
summarise(Abundance = sum(Abundance)) %>%
pivot_wider(id_cols = psyllid_spp,
names_from = OTU,
values_from=Abundance,
values_fill = list(Abundance = 0)) %>%
column_to_rownames("psyllid_spp") %>%
as.matrix() %>%
apply(2, function(x) ifelse(x > 0, 1, 0))
coocur <- coocur[ rowSums(coocur) > 0,colSums(coocur) > 0]
# H cophenetic distance
h_tree <- pruned.tree
h_tree$tip.label <- h_tree$tip.label %>%
str_replace_all(" |-", "_")
h_tree <- drop.tip(h_tree, setdiff(h_tree$tip.label, rownames(coocur)))
h_dist <- sqrt(cophenetic(h_tree))
# P cophenetic distance
s_tree <- phy_tree(ps3)
s_tree$tip.label <- s_tree$tip.label %>%
str_replace_all(" |-", "_")
s_tree <- drop.tip(s_tree, setdiff(s_tree$tip.label, colnames(coocur)))
s_dist <- sqrt(cophenetic(s_tree)/1e+6 ) ##convert to Mya so integers are small enough for PACO
#alternatively - sqrt(cophenetic(s_tree))
coocur <- coocur[h_tree$tip.label, s_tree$tip.label]
# prepare paco data
D <- prepare_paco_data(H=h_dist, P=s_dist, HP=coocur)
# Add pcord
D <- add_pcoord(D, correction='none')
p_host <- ggplot(D$H_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
geom_point() +
theme_bw() +
ggtitle("H PCA")
p_para <- ggplot(D$P_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
geom_point() +
theme_bw()+
ggtitle("P PCA")
plot(p_host + p_para)
# run paco
paco_run <- PACo(D, nperm=999, seed=909, method='quasiswap', symmetric=FALSE)
#print overall significance
print(paco_run$gof)
# Get interaction-specific cophylogenetic contributions using jacknifing
paco_run <- paco_links(paco_run)
links <- data.frame(joint=names(paco_run$jackknife),
values=unname(paco_run$jackknife)#,
#upper=unname(paco_run$jackknife$upper)
) %>%
mutate(OTU = joint) %>%
separate(OTU, into=c("Sample_Name", "OTU"), sep="-", extra="merge") %>%
mutate(signif = case_when(
values < mean(.$values) ~ 1,
values > mean(.$values) ~ 0
))
# Plot links
links %>%
dplyr::rename(label = joint) %>%
mutate(label = as.factor(label),
label=fct_reorder(label, values, sum))%>%
arrange(-values) %>%
ggplot(aes(x=label, y=values, colour=as.factor(signif))) +
geom_point(show.legend = FALSE) +
geom_errorbar(aes(ymin=values, ymax=upper)) +
geom_hline(yintercept = mean(links$values)) +
scale_color_manual(values=c("steelblue", "darkorange1")) +
theme_classic()+
theme(axis.text.x = element_blank())
dir.create("output/cophylogeny/psyllid_secondary")
write_csv(links, "output/cophylogeny/psyllid_secondary/psyllid_secondary_links.csv")
write_csv(links %>%
group_by(OTU) %>%
summarise(values = mean(values)), "output/cophylogeny/psyllid_secondary/secondary_symbiont_weights.csv")
write_csv(links %>%
group_by(Sample_Name) %>%
summarise(values = mean(values)), "output/cophylogeny/psyllid_secondary/psyllid_weights.csv")
#Get observed residuals of Procrustean superimposition
paco_residuals <- residuals_paco(paco_run$proc, type = "interaction")
# Visualise residuals
res <- data.frame(OTU=names(paco_residuals), values=unname(paco_residuals)) %>%
separate(OTU, into=c("Sample_Name", "OTU"), sep="-", extra="merge")
ggplot(res, aes(x=values))+
geom_density(fill='grey70')+
theme_bw()+
xlab('Procrustes residuals')+
ylab('Frequency')
# Parafit run
PF_run <- parafit(h_dist, s_dist,coocur, nperm=999, test.links=TRUE)
PF_run$ParaFitGlobal
PF_run$p.global
PF_links <- as.data.frame(PF_run$link.table) %>%
left_join(enframe(names(PF_run$para.per.host), name = "Host", value="psyllid_spp") %>%
mutate(Host = as.numeric(Host))) %>%
left_join(enframe(names(PF_run$host.per.para), name = "Parasite", value="OTU") %>%
mutate(Parasite = as.numeric(Parasite)))%>%
mutate(signif = case_when(
p.F1 < 0.05 ~ 1,
p.F1 > 0.05 ~ 0
))
# Cophyloplot
coocur.lut <- which(coocur ==1, arr.ind=TRUE)
assoc <- cbind(rownames(coocur)[coocur.lut[,1]], colnames(coocur)[coocur.lut[,2]])
# Rotate the nodes using phytools
library(phytools)
obj <- cophylo(tr1=h_tree, tr2=s_tree, assoc=assoc, rotate=TRUE)
# psyllid_tree
tree1 <- obj[["trees"]][[1]]
p1 <- ggtree(tree1)
atmeto_node <- p1$data %>%
filter(str_detect(label, "Atmeto")) %>%
pull(node)
root_node <- p1$data %>%
filter(str_detect(label, "Atmeto")) %>%
pull(parent)
tree1 <- groupOTU(tree1, .node=atmeto_node) # Make the atmeto dotted to indicate outgroup was rescaled
p1 <- ggtree(tree1, ladderize=FALSE, aes(colour=links,linetype=group))
weights_p1 <- p1$data %>%
left_join(links %>%
group_by(Sample_Name) %>%
summarise(PA_values = mean(signif)) %>%
dplyr::rename(label = Sample_Name)
)%>%
left_join(PF_links %>%
group_by(psyllid_spp) %>%
summarise(PF_values = mean(signif)) %>%
dplyr::rename(label = psyllid_spp)
) %>%
mutate(values = PA_values + PF_values)
#Scale the atmetocranium and root nodes to be shorter
p1$data[p1$data$node %in% atmeto_node, "x"] <- max(p1$data$x)
p1$data[p1$data$node %in% root_node, "x"] <- 0.2 #root
p1$data$node[p1$data$node]
## Get values for higher nodes
weights_p1 <- weights_p1 %>%
left_join(data.frame(node=weights_p1$node, links = weights_p1$node %>% purrr::map_dbl(average_descendants, tree=tree1, df=weights_p1)))
p1 <- p1 %<+% weights_p1 + geom_tippoint(aes(colour=links)) +
scale_color_gradient(low="steelblue", high="darkorange1") +
theme(legend.position = "none")
# OTU tree
tree2 <- obj[["trees"]][[2]]
s_tree <- drop.tip(tree2, setdiff(tree2$tip.label, obj$assoc[,2]))
p2 <- ggtree(tree2 , ladderize=FALSE, aes(colour=links))
weights_p2 <- p2$data %>%
left_join(links %>%
group_by(OTU) %>%
summarise(PA_values = mean(signif)) %>%
dplyr::rename(label = OTU)
)%>%
left_join(PF_links %>%
group_by(OTU) %>%
summarise(PF_values = mean(signif)) %>%
dplyr::rename(label = OTU)
) %>%
mutate(values = PA_values + PF_values)
weights_p2 <- weights_p2 %>%
left_join(data.frame(node=weights_p2$node, links = weights_p2$node %>% purrr::map_dbl(average_descendants, tree=tree2, df=weights_p2)))
p2 <- p2 %<+% weights_p2 + geom_tippoint(aes(colour=links)) +
scale_color_gradient(low="steelblue", high="darkorange1") +
theme(legend.position = "none")
# Tanglegram
tangle <- obj$assoc %>%
as_data_frame() %>%
magrittr::set_colnames(c("label.x", "label.y")) %>%
left_join(links %>%
dplyr::select(label.x = Sample_Name, label.y = OTU, signif_paco=signif)) %>%
left_join(PF_links %>%
dplyr::select(label.x = psyllid_spp, label.y = OTU, signif_para=signif)) %>%
left_join(p1$data %>% dplyr::select(label, y) %>% dplyr::rename(label.x = label), by="label.x") %>%
left_join(p2$data %>% dplyr::select(label, y) %>% dplyr::rename(label.y = label), by="label.y") %>%
rownames_to_column("assoc") %>%
rename_all(~str_replace(.x,pattern="\\.", replacement="_")) %>%
pivot_longer(ends_with(c("_x", "_y")),
names_to=c(".value", "tree"),
names_sep = "_"
) %>%
mutate(signif_paco = replace_na(signif_paco, 0),
signif_para = replace_na(signif_para, 0)) %>%
mutate(signif = case_when(
signif_paco==0 & signif_para==0 ~ "NS",
signif_paco==0 & signif_para==1 ~ "para",
signif_paco==1 & signif_para==0 ~ "paco",
signif_paco==1 & signif_para==1 ~ "both"
)) %>%
mutate(tree = tree %>%
str_replace("x", "host")%>%
str_replace("y", "microbe")) %>%
filter(!is.na(label))%>%
group_by(tree) %>%
mutate(y = y / max(y))%>%
mutate(label = label %>% str_replace_all("_", " "))
gg.tangle <- ggplot(tangle, aes(x=tree, y=y, group=assoc, colour=as.factor(signif))) +
geom_line(alpha=0.8) +
geom_text(data = tangle %>%
filter(tree=="host"),
aes(label=label),stat = 'unique', hjust=1, check_overlap = TRUE)+
geom_text(data = tangle %>%
filter(tree=="microbe"),
aes(label=label),stat = 'unique', hjust=0, check_overlap = TRUE)+
scale_colour_manual(values=c("NS"="steelblue", "paco"="darkorange1", "para"="#da2b91", "both"="#91da2b"), na.translate=FALSE)+
scale_x_discrete(expand = expansion(add=c(0.8,0.8))) +
theme_void() +
scale_y_continuous(expand=c(0.005,0.005))+
theme(legend.position = "bottom") +
labs(colour="Significance:")
gg.arse_tangle <- p1 + gg.tangle + (p2 + scale_x_reverse()) #+ plot_layout(widths = c(2, 1, 2))
gg.arse_tangle
pdf(file="figs/arsenophonus_tanglegram.pdf", width = 8, height = 11, paper="a4")
plot(gg.arse_tangle)
try(dev.off(), silent=TRUE)
Psyllid ~ hostplant
Tanglegram of all plants and psyllids!
plant.tree <- read.tree("sample_data/plant_tree.nwk")
plant.tree <- drop.tip(plant.tree , "Sophora_microphylla_-kowhai" )
#Prepare co-occurance matrix
coocur <- sample_data(ps2) %>%
as_tibble() %>%
dplyr::select(psyllid_spp, hostplant_spp) %>%
filter(!is.na(psyllid_spp)) %>%
unique() %>%
mutate(psyllid_spp = psyllid_spp %>%
str_replace_all(" |-", "_"),
hostplant_spp = hostplant_spp %>%
str_replace_all(" |-", "_"),
presence = 1
) %>%
pivot_wider(names_from = "hostplant_spp",
values_from="presence",
values_fill = list(presence = 0)) %>%
column_to_rownames("psyllid_spp") %>%
t()
# H cophenetic distance
h_tree <- plant.tree
h_tree$tip.label <- h_tree$tip.label %>%
str_replace_all(" |-", "_")
h_tree <- drop.tip(h_tree, setdiff(h_tree$tip.label, rownames(coocur)))
h_dist <- sqrt(cophenetic(h_tree))
# P cophenetic distance
s_tree <- pruned.tree
s_tree$tip.label <- s_tree$tip.label %>%
str_replace_all(" |-", "_")
s_tree <- drop.tip(s_tree, setdiff(s_tree$tip.label, colnames(coocur)))
s_dist <- sqrt(cophenetic(s_tree))
coocur <- coocur[h_tree$tip.label, s_tree$tip.label]
# prepare paco data
D <- prepare_paco_data(H=h_dist, P=s_dist, HP=coocur)
# Add pcord
D <- add_pcoord(D, correction='none')
p_host <- ggplot(D$H_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
geom_point() +
theme_bw() +
ggtitle("H PCA")
p_para <- ggplot(D$P_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
geom_point() +
theme_bw()+
ggtitle("P PCA")
plot(p_host + p_para)
# run paco
paco_run <- PACo(D, nperm=999, seed=909, method='quasiswap', symmetric=FALSE)
#print overall significance
print(paco_run$gof)
# Get interaction-specific cophylogenetic contributions using leave-one-out jacknifing
paco_run <- paco_links(paco_run, .parallel = TRUE)
# get links
links <- data.frame(
joint=names(paco_run$jackknife),
values=unname(paco_run$jackknife)#,
#upper=unname(paco_run$jackknife$upper)
) %>%
mutate(OTU = joint) %>%
separate(OTU, into=c("hostplant_spp", "psyllid_spp"), sep="-", extra="merge") %>%
mutate(signif = case_when(
values < mean(.$values) ~ 1,
values > mean(.$values) ~ 0
))
# Plot links
links %>%
dplyr::rename(label = joint) %>%
mutate(label = as.factor(label),
label=fct_reorder(label, values, sum))%>%
arrange(-values) %>%
ggplot(aes(x=label, y=values, colour=as.factor(signif))) +
geom_point(show.legend = FALSE) +
# geom_errorbar(aes(ymin=values, ymax=upper)) +
geom_hline(yintercept = mean(links$values)) +
scale_color_manual(values=c("steelblue", "darkorange1")) +
theme_classic()+
theme(axis.text.x = element_blank())
dir.create("output/cophylogeny/psyllid_hostplant")
write_csv(links, "output/cophylogeny/psyllid_hostplant/psyllid_hostplant_links.csv")
write_csv(links %>%
group_by(psyllid_spp) %>%
summarise(values = mean(values)), "output/cophylogeny/psyllid_hostplant/psyllid_weights.csv")
write_csv(links %>%
group_by(hostplant_spp) %>%
summarise(values = mean(values)), "output/cophylogeny/psyllid_hostplant/hostplant_weights.csv")
#Get observed residuals of Procrustean superimposition
paco_residuals <- residuals_paco(paco_run$proc, type = "interaction")
# Visualise residuals
res <- data.frame(OTU=names(paco_residuals), values=unname(paco_residuals)) %>%
separate(OTU, into=c("hostplant_spp", "psyllid_spp"), sep="-", extra="merge")
ggplot(res, aes(x=values))+
geom_density(fill='grey70')+
#facet_wrap(~psyllid_genus) +
theme_bw()+
xlab('Procrustes residuals')+
ylab('Frequency')
# Parafit run
PF_run <- parafit(h_dist, s_dist, t(coocur), nperm=999, test.links=TRUE, silent=FALSE)
PF_run$ParaFitGlobal
PF_run$p.global
PF_links <- as.data.frame(PF_run$link.table) %>%
left_join(enframe(names(PF_run$para.per.host), name = "Host", value="hostplant_spp") %>%
mutate(Host = as.numeric(Host))) %>%
left_join(enframe(names(PF_run$host.per.para), name = "Parasite", value="psyllid_spp") %>%
mutate(Parasite = as.numeric(Parasite)))%>%
mutate(signif = case_when(
p.F1 < 0.05 ~ 1,
p.F1 > 0.05 ~ 0
))
# Cophyloplot
coocur.lut <- which(coocur ==1, arr.ind=TRUE)
assoc <- cbind(rownames(coocur)[coocur.lut[,1]], colnames(coocur)[coocur.lut[,2]])
# Rotate the nodes using phytools
obj <- cophylo(tr1=h_tree, tr2=s_tree, assoc=assoc, rotate=TRUE)
# Extract the goods for ggtree
# plant tree
tree1 <- obj[["trees"]][[1]]
p1 <- ggtree(tree1, ladderize=FALSE, aes(colour=links))
weights_p1 <- p1$data %>%
left_join(links %>%
group_by(hostplant_spp) %>%
summarise(PA_values = mean(signif)) %>%
dplyr::rename(label = hostplant_spp)
)%>%
left_join(PF_links %>%
group_by(hostplant_spp) %>%
summarise(PF_values = mean(signif)) %>%
dplyr::rename(label = hostplant_spp)
) %>%
mutate(values = PA_values + PF_values)
## Get values for higher nodes
weights_p1 <- weights_p1 %>%
left_join(data.frame(node=weights_p1$node, links = weights_p1$node %>% purrr::map_dbl(average_descendants, tree=tree1, df=weights_p1)))
p1 <- p1 %<+% weights_p1 + geom_tippoint(aes(colour=links)) +
scale_color_gradient(low="steelblue", high="darkorange1") +
theme(legend.position = "none")
# psyllid_tree
tree2 <- obj[["trees"]][[2]]
s_tree <- drop.tip(tree2, setdiff(tree2$tip.label, obj$assoc[,2]))
p2 <- ggtree(tree2)
atmeto_node <- p2$data %>%
filter(str_detect(label, "Atmeto")) %>%
pull(node)
root_node <- p2$data %>%
filter(str_detect(label, "Atmeto")) %>%
pull(parent)
tree2 <- groupOTU(tree2, .node=atmeto_node) # Make the atmeto dotted to indicate outgroup was rescaled
p2 <- ggtree(tree2 , ladderize=TRUE, aes(colour=links, linetype=group))
weights_p2 <- p2$data %>%
left_join(links %>%
group_by(psyllid_spp) %>%
summarise(PA_values = mean(signif)) %>%
dplyr::rename(label = psyllid_spp)
)%>%
left_join(PF_links %>%
group_by(psyllid_spp) %>%
summarise(PF_values = mean(signif)) %>%
dplyr::rename(label = psyllid_spp)
) %>%
mutate(values = PA_values + PF_values)
#Scale the atmetocranium and root nodes to be shorter
p2$data[p2$data$node %in% atmeto_node, "x"] <- max(p2$data$x)
p2$data[p2$data$node %in% root_node, "x"] <- 0.2 #root
p2$data$node[p2$data$node]
weights_p2 <- weights_p2 %>%
left_join(data.frame(node=weights_p2$node, links = weights_p2$node %>% purrr::map_dbl(average_descendants, tree=tree2, df=weights_p2)))
p2 <- p2 %<+% weights_p2 + geom_tippoint(aes(colour=links)) +
scale_color_gradient(low="steelblue", high="darkorange1") +
theme(legend.position = "none")
# Tanglegram
tangle <- obj$assoc %>%
as_data_frame() %>%
magrittr::set_colnames(c("label.x", "label.y")) %>%
left_join(links %>%
dplyr::select(label.x = hostplant_spp, label.y = psyllid_spp, signif_paco=signif)) %>%
left_join(PF_links %>%
dplyr::select(label.x = hostplant_spp, label.y = psyllid_spp, signif_para=signif)) %>%
left_join(p1$data %>% dplyr::select(label, y) %>% dplyr::rename(label.x = label), by="label.x") %>%
left_join(p2$data %>% dplyr::select(label, y) %>% dplyr::rename(label.y = label), by="label.y") %>%
rownames_to_column("assoc") %>%
rename_all(~str_replace(.x,pattern="\\.", replacement="_")) %>%
pivot_longer(ends_with(c("_x", "_y")),
names_to=c(".value", "tree"),
names_sep = "_"
) %>%
mutate(signif_paco = replace_na(signif_paco, 0),
signif_para = replace_na(signif_para, 0)) %>%
mutate(signif = case_when(
signif_paco==0 & signif_para==0 ~ "NS",
signif_paco==0 & signif_para==1 ~ "para",
signif_paco==1 & signif_para==0 ~ "paco",
signif_paco==1 & signif_para==1 ~ "both"
)) %>%
mutate(tree = tree %>%
str_replace("x", "host")%>%
str_replace("y", "microbe")) %>%
filter(!is.na(label))%>%
group_by(tree) %>%
mutate(y = y / max(y))%>%
mutate(label = label %>% str_replace_all("_", " "))
gg.tangle <- ggplot(tangle, aes(x=factor(tree, levels=c("microbe", "host")), y=y, group=assoc, colour=as.factor(signif))) +
geom_line(alpha=0.8) +
geom_text(data = tangle %>%
filter(tree=="host"),
aes(label=label),stat = 'unique', hjust=0, check_overlap = TRUE)+
geom_text(data = tangle %>%
filter(tree=="microbe"),
aes(label=label),stat = 'unique', hjust=1, check_overlap = TRUE)+
scale_colour_manual(values=c("NS"="steelblue", "paco"="darkorange1", "para"="#da2b91", "both"="#91da2b"), na.translate=FALSE)+
scale_x_discrete(expand = expansion(add=c(0.8,0.8))) +
scale_y_continuous(expand=c(0.005,0.005))+
theme_void() +
theme(legend.position = "bottom") +
labs(colour="Significance")
gg.plant_tangle <- p2 + gg.tangle + (p1 + scale_x_reverse()) #+ plot_layout(widths = c(2, 1, 2))
gg.plant_tangle
pdf(file="figs/plant_tanglegram.pdf", width = 8, height = 11, paper="a4")
plot(gg.plant_tangle)
try(dev.off(), silent=TRUE)
Powellia
Powellia ~ Carsonella
#Flag top abundance carsonella by sample
top_carson <- ps2 %>%
transform_sample_counts(function (x) x/sum(x)) %>%
speedyseq::psmelt() %>%
filter(psyllid_spp %in% pruned.tree$tip.label) %>%
filter(genus=="Candidatus Carsonella") %>%
group_by(Sample) %>%
filter(Abundance > 0) %>%
top_n(1, wt=Abundance) %>%
mutate(top = TRUE)
coocur <- ps2 %>%
subset_samples(psyllid_genus == "Powellia") %>%
filter_taxa(function(x) mean(x) > 0, TRUE) %>%
transform_sample_counts(function (x) x/sum(x)) %>%
speedyseq::psmelt() %>%
filter(psyllid_spp %in% pruned.tree$tip.label) %>%
left_join(top_carson) %>%
filter(genus=="Candidatus Carsonella", top==TRUE) %>%
dplyr::select(OTU, psyllid_spp, SampleID, Abundance) %>%
mutate(OTU = OTU %>%
str_replace_all(" |-", "_")) %>%
dplyr::group_by(OTU, psyllid_spp) %>%
summarise(Abundance = sum(Abundance)) %>%
pivot_wider(id_cols = psyllid_spp,
names_from = OTU,
values_from=Abundance,
values_fill = list(Abundance = 0)) %>%
column_to_rownames("psyllid_spp") %>%
as.matrix() %>%
apply(2, function(x) ifelse(x > 0, 1, 0))
# H cophenetic distance
h_tree <- pruned.tree
h_tree$tip.label <- h_tree$tip.label %>%
str_replace_all(" |-", "_")
h_tree <- drop.tip(h_tree, setdiff(h_tree$tip.label, rownames(coocur)))
h_dist <- sqrt(cophenetic(h_tree))
# P cophenetic distance
s_tree <- phy_tree(ps3)
s_tree$tip.label <- s_tree$tip.label %>%
str_replace_all(" |-", "_")
s_tree <- drop.tip(s_tree, setdiff(s_tree$tip.label, colnames(coocur)))
s_dist <- sqrt(cophenetic(s_tree)/1e+6 ) ##convert to Mya so integers are small enough for PACO
coocur <- coocur[h_tree$tip.label, s_tree$tip.label]
# prepare paco data
D <- prepare_paco_data(H=h_dist, P=s_dist, HP=coocur)
# Add pcord
D <- add_pcoord(D, correction='none')
p_host <- ggplot(D$H_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
geom_point() +
theme_bw() +
ggtitle("H PCA")
p_para <- ggplot(D$P_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
geom_point() +
theme_bw()+
ggtitle("P PCA")
plot(p_host + p_para)
# run paco
paco_run <- PACo(D, nperm=999, seed=909, method='quasiswap', symmetric=FALSE)
#print overall significance
print(paco_run$gof)
# Get interaction-specific cophylogenetic contributions using jacknifing
paco_run <- paco_links(paco_run)
links <- data.frame(joint=names(paco_run$jackknife),
values=unname(paco_run$jackknife)#,
#upper=unname(paco_run$jackknife$upper)
) %>%
mutate(OTU = joint) %>%
separate(OTU, into=c("Sample_Name", "OTU"), sep="-", extra="merge") %>%
mutate(signif = case_when(
values < mean(.$values) ~ 1,
values > mean(.$values) ~ 0
))
# Plot links
links %>%
dplyr::rename(label = joint) %>%
mutate(label = as.factor(label),
label=fct_reorder(label, values, sum))%>%
arrange(-values) %>%
ggplot(aes(x=label, y=values, colour=as.factor(signif))) +
geom_point(show.legend = FALSE) +
#geom_errorbar(aes(ymin=values, ymax=upper)) +
geom_hline(yintercept = mean(links$values)) +
scale_color_manual(values=c("steelblue", "darkorange1")) +
theme_classic()+
theme(axis.text.x = element_blank())
dir.create("output/cophylogeny/trioza_carsonella")
write_csv(links, "output/cophylogeny/trioza_carsonella/trioza_carsonella_links.csv")
write_csv(links %>%
group_by(OTU) %>%
summarise(values = mean(values)), "output/cophylogeny/trioza_carsonella/carsonella_weights.csv")
write_csv(links %>%
group_by(Sample_Name) %>%
summarise(values = mean(values)), "output/cophylogeny/trioza_carsonella/psyllid_weights.csv")
#Get observed residuals of Procrustean superimposition
paco_residuals <- residuals_paco(paco_run$proc, type = "interaction")
# Visualise residuals
res <- data.frame(OTU=names(paco_residuals), values=unname(paco_residuals)) %>%
separate(OTU, into=c("Sample_Name", "OTU"), sep="-", extra="merge")
ggplot(res, aes(x=values))+
geom_density(fill='grey70')+
theme_bw()+
xlab('Procrustes residuals')+
ylab('Frequency')
# Parafit run
PF_run <- parafit(h_dist, s_dist, t(coocur), nperm=999, test.links=TRUE, silent=FALSE)
PF_run$ParaFitGlobal
PF_run$p.global
PF_links <- as.data.frame(PF_run$link.table) %>%
left_join(enframe(names(PF_run$para.per.host), name = "Host", value="psyllid_spp") %>%
mutate(Host = as.numeric(Host))) %>%
left_join(enframe(names(PF_run$host.per.para), name = "Parasite", value="OTU") %>%
mutate(Parasite = as.numeric(Parasite)))%>%
mutate(signif = case_when(
p.F1 < 0.05 ~ 1,
p.F1 > 0.05 ~ 0
))
# Cophyloplot
coocur.lut <- which(coocur ==1, arr.ind=TRUE)
assoc <- cbind(rownames(coocur)[coocur.lut[,1]], colnames(coocur)[coocur.lut[,2]])
# Rotate the nodes using phytools
obj <- cophylo(tr1=h_tree, tr2=s_tree, assoc=assoc, rotate=TRUE)
# psyllid_tree
tree1 <- obj[["trees"]][[1]]
p1 <- ggtree(tree1)
atmeto_node <- p1$data %>%
filter(str_detect(label, "Atmeto")) %>%
pull(node)
root_node <- p1$data %>%
filter(str_detect(label, "Atmeto")) %>%
pull(parent)
tree1 <- groupOTU(tree1, .node=atmeto_node) # Make the atmeto dotted to indicate outgroup was rescaled
p1 <- ggtree(tree1, ladderize=FALSE, aes(colour=links,linetype=group))
weights_p1 <- p1$data %>%
left_join(links %>%
group_by(Sample_Name) %>%
summarise(PA_values = mean(signif)) %>%
dplyr::rename(label = Sample_Name)
)%>%
left_join(PF_links %>%
group_by(psyllid_spp) %>%
summarise(PF_values = mean(signif)) %>%
dplyr::rename(label = psyllid_spp)
) %>%
mutate(values = PA_values + PF_values)
#Scale the atmetocranium and root nodes to be shorter
p1$data[p1$data$node %in% atmeto_node, "x"] <- max(p1$data$x)
p1$data[p1$data$node %in% root_node, "x"] <- 0.2 #root
p1$data$node[p1$data$node]
## Get values for higher nodes
weights_p1 <- weights_p1 %>%
left_join(data.frame(node=weights_p1$node, links = weights_p1$node %>% purrr::map_dbl(average_descendants, tree=tree1, df=weights_p1)))
p1 <- p1 %<+% weights_p1 + geom_tippoint(aes(colour=links)) +
scale_color_gradient(low="steelblue", high="darkorange1") +
theme(legend.position = "none")
# OTU tree
tree2 <- obj[["trees"]][[2]]
s_tree <- drop.tip(tree2, setdiff(tree2$tip.label, obj$assoc[,2]))
p2 <- ggtree(tree2 , ladderize=TRUE, aes(colour=links))
weights_p2 <- p2$data %>%
left_join(links %>%
group_by(OTU) %>%
summarise(PA_values = mean(signif)) %>%
dplyr::rename(label = OTU)
)%>%
left_join(PF_links %>%
group_by(OTU) %>%
summarise(PF_values = mean(signif)) %>%
dplyr::rename(label = OTU)
) %>%
mutate(values = PA_values + PF_values)
weights_p2 <- weights_p2 %>%
left_join(data.frame(node=weights_p2$node, links = weights_p2$node %>% purrr::map_dbl(average_descendants, tree=tree2, df=weights_p2)))
p2 <- p2 %<+% weights_p2 + geom_tippoint(aes(colour=links)) +
scale_color_gradient(low="steelblue", high="darkorange1") +
theme(legend.position = "none")
# Tanglegram
tangle <- obj$assoc %>%
as_data_frame() %>%
magrittr::set_colnames(c("label.x", "label.y")) %>%
left_join(links %>%
dplyr::select(label.x = Sample_Name, label.y = OTU, signif_paco=signif)) %>%
left_join(PF_links %>%
dplyr::select(label.x = psyllid_spp, label.y = OTU, signif_para=signif)) %>%
left_join(p1$data %>% dplyr::select(label, y) %>% dplyr::rename(label.x = label), by="label.x") %>%
left_join(p2$data %>% dplyr::select(label, y) %>% dplyr::rename(label.y = label), by="label.y") %>%
rownames_to_column("assoc") %>%
rename_all(~str_replace(.x,pattern="\\.", replacement="_")) %>%
pivot_longer(ends_with(c("_x", "_y")),
names_to=c(".value", "tree"),
names_sep = "_"
) %>%
mutate(signif_paco = replace_na(signif_paco, 0),
signif_para = replace_na(signif_para, 0)) %>%
mutate(signif = case_when(
signif_paco==0 & signif_para==0 ~ "NS",
signif_paco==0 & signif_para==1 ~ "para",
signif_paco==1 & signif_para==0 ~ "paco",
signif_paco==1 & signif_para==1 ~ "both"
)) %>%
mutate(tree = tree %>%
str_replace("x", "host")%>%
str_replace("y", "microbe")) %>%
filter(!is.na(label))%>%
group_by(tree) %>%
mutate(y = y / max(y))%>%
mutate(label = label %>% str_replace_all("_", " "))
gg.tangle <- ggplot(tangle, aes(x=tree, y=y, group=assoc, colour=as.factor(signif))) +
geom_line(alpha=0.8) +
geom_text(data = tangle %>%
filter(tree=="host"),
aes(label=label),stat = 'unique', hjust=1, check_overlap = TRUE)+
geom_text(data = tangle %>%
filter(tree=="microbe"),
aes(label=label),stat = 'unique', hjust=0, check_overlap = TRUE)+
scale_colour_manual(values=c("NS"="steelblue", "paco"="darkorange1", "para"="#da2b91", "both"="#91da2b"), na.translate=FALSE)+
scale_x_discrete(expand = expansion(add=c(0.8,0.8))) +
theme_void() +
scale_y_continuous(expand=c(0.005,0.005))+
theme(legend.position = "bottom") +
labs(colour="Significance:")
gg.trioza_carson_tangle <- p1 + gg.tangle + (p2 + scale_x_reverse())
gg.trioza_carson_tangle
pdf(file="figs/powellia_carsonella_tanglegram.pdf", width = 8, height = 11, paper="a4")
plot(gg.trioza_carson_tangle)
try(dev.off(), silent=TRUE)
Powellia ~ hostplant
plant.tree <- read.tree("sample_data/plant_tree.nwk")
plant.tree <- drop.tip(plant.tree , "Sophora_microphylla_-kowhai" )
#Prepare co-occurance matrix
coocur <- sample_data(ps2) %>%
as_tibble() %>%
filter(psyllid_genus == "Powellia") %>%
dplyr::select(psyllid_spp, hostplant_spp) %>%
filter(!is.na(psyllid_spp)) %>%
unique() %>%
mutate(psyllid_spp = psyllid_spp %>%
str_replace_all(" |-", "_"),
hostplant_spp = hostplant_spp %>%
str_replace_all(" |-", "_"),
presence = 1
) %>%
pivot_wider(names_from = "hostplant_spp",
values_from="presence",
values_fill = list(presence = 0)) %>%
column_to_rownames("psyllid_spp") %>%
t()
# H cophenetic distance
h_tree <- plant.tree
h_tree$tip.label <- h_tree$tip.label %>%
str_replace_all(" |-", "_")
h_tree <- drop.tip(h_tree, setdiff(h_tree$tip.label, rownames(coocur)))
h_dist <- sqrt(cophenetic(h_tree))
# P cophenetic distance
s_tree <- pruned.tree
s_tree$tip.label <- s_tree$tip.label %>%
str_replace_all(" |-", "_")
s_tree <- drop.tip(s_tree, setdiff(s_tree$tip.label, colnames(coocur)))
s_dist <- sqrt(cophenetic(s_tree))
coocur <- coocur[h_tree$tip.label, s_tree$tip.label]
# prepare paco data
D <- prepare_paco_data(H=h_dist, P=s_dist, HP=coocur)
# Add pcord
D <- add_pcoord(D, correction='none')
p_host <- ggplot(D$H_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
geom_point() +
theme_bw() +
ggtitle("H PCA")
p_para <- ggplot(D$P_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
geom_point() +
theme_bw()+
ggtitle("P PCA")
plot(p_host + p_para)
# run paco
paco_run <- PACo(D, nperm=999, seed=909, method='quasiswap', symmetric=FALSE) #Symetric - is one meant to track the evolution of another?
#print overall significance
print(paco_run$gof)
# Procrustes diagnostic plots
plot(paco_run$proc)
plot(paco_run$proc, kind=2)
# Get interaction-specific cophylogenetic contributions using jacknifing
paco_run <- paco_links(paco_run)
links <- data.frame(
joint=names(paco_run$jackknife),
values=unname(paco_run$jackknife)#,
#upper=unname(paco_run$jackknife$upper)
) %>%
mutate(OTU = joint) %>%
separate(OTU, into=c("hostplant_spp", "psyllid_spp"), sep="-", extra="merge") %>%
mutate(signif = case_when(
values < mean(.$values) ~ 1,
values > mean(.$values) ~ 0
))
# Plot links
links %>%
dplyr::rename(label = joint) %>%
mutate(label = as.factor(label),
label=fct_reorder(label, values, sum))%>%
arrange(-values) %>%
ggplot(aes(x=label, y=values, colour=as.factor(signif))) +
geom_point(show.legend = FALSE) +
#geom_errorbar(aes(ymin=values, ymax=upper)) +
geom_hline(yintercept = mean(links$values)) +
scale_color_manual(values=c("steelblue", "darkorange1")) +
theme_classic()+
theme(axis.text.x = element_text(angle=45, hjust=1))
dir.create("output/cophylogeny/trioza_hostplant")
write_csv(links, "output/cophylogeny/trioza_hostplant/psyllid_hostplant_links.csv")
write_csv(links %>%
group_by(psyllid_spp) %>%
summarise(values = mean(values)), "output/cophylogeny/trioza_hostplant/psyllid_weights.csv")
write_csv(links %>%
group_by(hostplant_spp) %>%
summarise(values = mean(values)), "output/cophylogeny/trioza_hostplant/hostplant_weights.csv")
#Get observed residuals of Procrustean superimposition
paco_residuals <- residuals_paco(paco_run$proc, type = "interaction")
# Visualise residuals
res <- data.frame(OTU=names(paco_residuals), values=unname(paco_residuals)) %>%
separate(OTU, into=c("hostplant_spp", "psyllid_spp"), sep="-", extra="merge")
ggplot(res, aes(x=values))+
geom_density(fill='grey70')+
theme_bw()+
xlab('Procrustes residuals')+
ylab('Frequency')
# Parafit run
PF_run <- parafit(h_dist, s_dist, t(coocur), nperm=999, test.links=TRUE, silent=TRUE)
PF_run$ParaFitGlobal
PF_run$p.global
PF_links <- as.data.frame(PF_run$link.table) %>%
left_join(enframe(names(PF_run$para.per.host), name = "Host", value="hostplant_spp") %>%
mutate(Host = as.numeric(Host))) %>%
left_join(enframe(names(PF_run$host.per.para), name = "Parasite", value="psyllid_spp") %>%
mutate(Parasite = as.numeric(Parasite)))%>%
mutate(signif = case_when(
p.F1 < 0.05 ~ 1,
p.F1 > 0.05 ~ 0
))
# Cophyloplot
coocur.lut <- which(coocur ==1, arr.ind=TRUE)
assoc <- cbind(rownames(coocur)[coocur.lut[,1]], colnames(coocur)[coocur.lut[,2]])
# Rotate the nodes using phytools
obj <- cophylo(tr1=h_tree, tr2=s_tree, assoc=assoc, rotate=TRUE)
# Extract the goods for ggtree
# plant tree
tree1 <- obj[["trees"]][[1]]
p1 <- ggtree(tree1, ladderize=FALSE, aes(colour=links))
weights_p1 <- p1$data %>%
left_join(links %>%
group_by(hostplant_spp) %>%
summarise(PA_values = mean(signif)) %>%
dplyr::rename(label = hostplant_spp)
)%>%
left_join(PF_links %>%
group_by(hostplant_spp) %>%
summarise(PF_values = mean(signif)) %>%
dplyr::rename(label = hostplant_spp)
) %>%
mutate(values = PA_values + PF_values)
## Get values for higher nodes
weights_p1 <- weights_p1 %>%
left_join(data.frame(node=weights_p1$node, links = weights_p1$node %>% purrr::map_dbl(average_descendants, tree=tree1, df=weights_p1)))
p1 <- p1 %<+% weights_p1 + geom_tippoint(aes(colour=links)) +
scale_color_gradient(low="steelblue", high="darkorange1") +
theme(legend.position = "none")
# psyllid_tree
tree2 <- obj[["trees"]][[2]]
s_tree <- drop.tip(tree2, setdiff(tree2$tip.label, obj$assoc[,2]))
p2 <- ggtree(tree2 , ladderize=TRUE, aes(colour=links))
weights_p2 <- p2$data %>%
left_join(links %>%
group_by(psyllid_spp) %>%
summarise(PA_values = mean(signif)) %>%
dplyr::rename(label = psyllid_spp)
)%>%
left_join(PF_links %>%
group_by(psyllid_spp) %>%
summarise(PF_values = mean(signif)) %>%
dplyr::rename(label = psyllid_spp)
) %>%
mutate(values = PA_values + PF_values)
#Scale the atmetocranium and root nodes to be shorter
p2$data[p2$data$node %in% atmeto_node, "x"] <- max(p2$data$x)
p2$data[p2$data$node %in% root_node, "x"] <- 0.2 #root
p2$data$node[p2$data$node]
weights_p2 <- weights_p2 %>%
left_join(data.frame(node=weights_p2$node, links = weights_p2$node %>% purrr::map_dbl(average_descendants, tree=tree2, df=weights_p2)))
p2 <- p2 %<+% weights_p2 + geom_tippoint(aes(colour=links)) +
scale_color_gradient(low="steelblue", high="darkorange1") +
theme(legend.position = "none")
# Tanglegram
tangle <- obj$assoc %>%
as_data_frame() %>%
magrittr::set_colnames(c("label.x", "label.y")) %>%
left_join(links %>%
dplyr::select(label.x = hostplant_spp, label.y = psyllid_spp, signif_paco=signif)) %>%
left_join(PF_links %>%
dplyr::select(label.x = hostplant_spp, label.y = psyllid_spp, signif_para=signif)) %>%
left_join(p1$data %>% dplyr::select(label, y) %>% dplyr::rename(label.x = label), by="label.x") %>%
left_join(p2$data %>% dplyr::select(label, y) %>% dplyr::rename(label.y = label), by="label.y") %>%
rownames_to_column("assoc") %>%
rename_all(~str_replace(.x,pattern="\\.", replacement="_")) %>%
pivot_longer(ends_with(c("_x", "_y")),
names_to=c(".value", "tree"),
names_sep = "_"
) %>%
mutate(signif_paco = replace_na(signif_paco, 0),
signif_para = replace_na(signif_para, 0)) %>%
mutate(signif = case_when(
signif_paco==0 & signif_para==0 ~ "NS",
signif_paco==0 & signif_para==1 ~ "para",
signif_paco==1 & signif_para==0 ~ "paco",
signif_paco==1 & signif_para==1 ~ "both"
)) %>%
mutate(tree = tree %>%
str_replace("x", "host")%>%
str_replace("y", "microbe")) %>%
filter(!is.na(label))%>%
group_by(tree) %>%
mutate(y = y / max(y))%>%
mutate(label = label %>% str_replace_all("_", " "))
gg.tangle <- ggplot(tangle, aes(x=factor(tree, levels=c("microbe", "host")), y=y, group=assoc, colour=as.factor(signif))) +
geom_line(alpha=0.8) +
geom_text(data = tangle %>%
filter(tree=="host"),
aes(label=label),stat = 'unique', hjust=0, check_overlap = TRUE)+
geom_text(data = tangle %>%
filter(tree=="microbe"),
aes(label=label),stat = 'unique', hjust=1, check_overlap = TRUE)+
scale_colour_manual(values=c("NS"="steelblue", "paco"="darkorange1", "para"="#da2b91", "both"="#91da2b"), na.translate=FALSE)+
scale_x_discrete(expand = expansion(add=c(0.8,0.8))) +
scale_y_continuous(expand=c(0.005,0.005))+
theme_void() +
theme(legend.position = "bottom") +
labs(colour="Significance")
gg.powellia_tangle <- p2 + gg.tangle + (p1 + scale_x_reverse())
gg.powellia_tangle
pdf(file="figs/powellia_plant_tanglegram.pdf", width = 8, height = 11, paper="a4")
plot(gg.powellia_tangle)
try(dev.off(), silent=TRUE)
---
title: "Psyllid microbiome"
subtitle: "Statistical analysis"
author: "Alexander Piper"
date: "`r Sys.Date()`"
output:
  html_document:
    includes:
      after_body: footer.html
    highlighter: null
    theme: "flatly"
    code_download: TRUE
    toc: TRUE
    toc_float:
      collapsed: FALSE
      smooth_scroll: TRUE
    df_print: paged
  pdf_document: default
editor_options: 
  chunk_output_type: console
---


```{r setup, eval=TRUE, warning=FALSE, message=FALSE, error=FALSE}
# Knitr global setup - change eval to true to run code
library(knitr)
knitr::opts_chunk$set(echo = TRUE, eval=FALSE, message=FALSE, error=FALSE,fig.show = "hold", fig.keep = "all")
opts_chunk$set(dev = 'png')
```


```{r load packages}
# Load packages

#Set required packages
.cran_packages <- c("tidyverse", 
                    "patchwork", 
                    "vegan", 
                    "seqinr",
                    "ape", 
                    "sp",
                    "maptools",
                    "rgeos",
                    "data.table", 
                    "RColorBrewer",
                    "ggtree", 
                    "castor", 
                    "picante",
                    "phylosignal", 
                    "adephylo",
                    "dendextend",
                    "paco",
                    "tidytext",
                    "phytools",
                    "ecodist")
.bioc_packages <- c("dada2",
                    "microbiome",
                    "phyloseq", 
                    "DECIPHER",
                    "Biostrings",
                    "ShortRead", 
                    "philr",
                    "ALDEx2")

# Install all missing packages
.inst <- .cran_packages %in% installed.packages()
if(any(!.inst)) {
   install.packages(.cran_packages[!.inst])
}
.inst <- .bioc_packages %in% installed.packages()
if(any(!.inst)) {
  if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
  BiocManager::install(.bioc_packages[!.inst], ask = F)
}

#Load all packages
sapply(c(.cran_packages,.bioc_packages), require, character.only = TRUE)

# Github packages
devtools::install_github("alexpiper/taxreturn")
library(taxreturn)
devtools::install_github("alexpiper/seqateurs")
library(seqateurs)
devtools::install_github("mikemc/speedyseq")
library(speedyseq)
devtools::install_github('ggloor/CoDaSeq/CoDaSeq')
library(CoDaSeq)
devtools::install_github("easystats/report")
library(report)

devtools::install_github("pmartinezarbizu/pairwiseAdonis/pairwiseAdonis")
library(pairwiseAdonis)

#Source internal functions
source('R/BDTT.R')
source('R/phylosymbiosis.R')
source('R/helper_functions.R')
source('R/themes.R')
options(stringsAsFactors = FALSE)
```

# Read in phyloseq object

```{r read in phyloseq}
ps2 <- readRDS("output/rds/ps2.rds")

# export filtered
dir.create("output/otu_tables/filtered")
seqateurs::summarise_taxa(ps2, "species", "SampleID") %>%
  spread(key="SampleID", value="totalRA") %>%
  write.csv(file = "output/otu_tables/filtered/filtered_spp_sum.csv")

seqateurs::summarise_taxa(ps2, "genus", "SampleID") %>%
  spread(key="SampleID", value="totalRA") %>%
  write.csv(file = "output/otu_tables/filtered/filtered_gen_sum.csv")

#Rename taxa - only keep first 30 characters
taxa_names(ps2) <- substr(paste0("SV", seq(ntaxa(ps2)),"-",tax_table(ps2)[,7]), 1,30)

#Check carsonella presence
cars <- speedyseq::psmelt(ps2) %>%
  filter(Abundance > 0) %>%
  group_by(psyllid_spp) %>%
  summarise(n = count(genus=="Candidatus Carsonella", na.rm = TRUE))

```

## Create species merged table

```{r mergedspp}
# Merge species for beta diversity
ps.sppmerged <- ps2 %>%
    merge_samples(group = "psyllid_spp", fun=mean)

#This loses the sample metadata - Need to add it agian
sample_data(ps.sppmerged) <- sample_data(ps2) %>%
  as("matrix") %>%
  as.data.frame() %>%
  filter(!duplicated(psyllid_spp)) %>%
  magrittr::set_rownames(.$psyllid_spp)

seqs <- refseq(ps2)
tree <- phy_tree(ps2)
#make new phyloseq object
ps3 <- phyloseq(tax_table(ps.sppmerged),
               sample_data(ps.sppmerged),
               otu_table(otu_table(ps.sppmerged), taxa_are_rows = FALSE),
               refseq(seqs),
               phy_tree(tree))

```

## Read in psyllid phylogeny
```{R psyllid phylogeny}
psyllid_tree <- read.tree(text=readLines("sample_data/psyllid_beast_tree.nwk"))

# Match names with sample sheet
psyllid_tree$tip.label <- psyllid_tree$tip.label %>%
  str_squish() %>%
  str_replace_all(pattern="\\.", replacement=" ") %>%
  str_replace_all(pattern="Acizzia hakae", replacement="Acizzia hakeae") %>%
  str_replace_all(pattern="POLLENISLAND", replacement="POLLEN ISLAND") %>%
  str_replace_all(pattern="Ctenarytaina fuchsiae$", replacement="Ctenarytaina fuchsia A") %>%
  str_replace_all(pattern="Ctenarytaina fuchsiaeB", replacement="Ctenarytaina fuchsia B") %>%
  str_replace_all(pattern="Ctenarytaina fuchsiaeC", replacement="Ctenarytaina fuchsia C") %>%
  str_replace_all(pattern="Ctenarytaina clavata", replacement="Ctenarytaina clavata sp ") %>%
  str_replace_all(pattern="Ctenarytaina clavata sp $", replacement="Ctenarytaina clavata sp A") %>%
  str_replace_all(pattern="Ctenarytaina sp$", replacement="Ctenarytaina sp ") %>%
  str_replace_all(pattern="Ctenarytaina spA", replacement="Ctenarytaina sp A") %>%
    str_replace_all(pattern="Ctenarytaina spB", replacement="Ctenarytaina sp B") %>%
  str_replace_all(pattern="Ctenarytaina unknown", replacement="Ctenarytaina insularis") %>%  
  str_replace_all(pattern="Psylla apicalisA", replacement="Psylla frodobagginsi") %>%
  str_replace_all(pattern="Psylla apicalisB", replacement="Psylla apicalis") %>%
  str_replace_all(pattern="carmichaeliae", replacement="carmichaeliae ") %>%
  str_replace_all(pattern="Trioza sp", replacement="Trioza sp ") %>%
  str_replace_all(pattern="Trioza acutaB", replacement="Trioza acuta B") %>%
  str_replace_all(pattern="Trioza gourlay", replacement="Trioza gourlayi") %>%
  str_replace_all(pattern="BRENDAMAY", replacement="BRENDA MAY") %>%
  str_replace_all(pattern="PRICES", replacement="PRICES VALLEY") %>%  
  str_replace_all(pattern="Acizzia sp", replacement="Acizzia errabunda") %>% 
  str_replace_all(pattern="Trioza ", replacement="Powellia ") %>%
  str_replace_all(pattern="Triozid sp", replacement="Casuarinicola sp") %>%
  str_replace_all(pattern="Powellia adventicia", replacement="Trioza adventicia") %>%
  str_replace_all(pattern="Powellia curta", replacement="Trioza curta") %>%
  str_replace_all(pattern=" ", replacement="_") %>%
  trimws(which="right")


# Subset to only those in sample data
setdiff(psyllid_tree$tip.label,sample_data(ps2)$psyllid_spp)
setdiff(sample_data(ps2)$psyllid_spp, psyllid_tree$tip.label)

psyllid_tree$tip.label[!psyllid_tree$tip.label %in% sample_data(ps2)$psyllid_spp]
psyllid_tree$tip.label[!sample_data(ps2)$psyllid_spp %in% psyllid_tree$tip.label ]
pruned.tree <- drop.tip(psyllid_tree, psyllid_tree$tip.label[!psyllid_tree$tip.label %in% sample_data(ps2)$psyllid_spp] )

```


## Summary statistics

```{r sum taxa}
# N unique species and samples
speedyseq::psmelt(ps2) %>%
  summarise(ntaxa= n_distinct(psyllid_spp), n_samples = n_distinct(Sample_Name), n_hostplants = n_distinct(hostplant_spp))

# Spread of reads
speedyseq::psmelt(ps2) %>%
  group_by(Sample_Name) %>%
  summarise(Abundance = sum(Abundance)) %>%
  ungroup() %>%
  summarise(mean = mean(Abundance), 
            se = sd(Abundance)/sqrt(length(Abundance)),
            max = max(Abundance),
            min = min(Abundance))

# Spread of ASVs
speedyseq::psmelt(ps2) %>%
  group_by(Sample_Name) %>%
  dplyr::filter(Abundance > 0) %>%
  summarise(counts = n_distinct(OTU)) %>%
  ungroup() %>%
  summarise(mean = mean(counts), 
            se = sd(counts)/sqrt(length(counts)),
            max = max(counts),
            min = min(counts))

#Fraction of reads assigned to each taxonomic rank
speedyseq::psmelt(ps2) %>%
  gather("Rank","Name",rank_names(ps2)) %>%
  group_by(Rank) %>% 
  mutate(Name = replace(Name, str_detect(Name, "__"),NA)) %>% # This line turns the "__" we added to lower ranks back to NA's
  dplyr::summarise(Reads_classified = sum(Abundance * !is.na(Name))) %>%
  mutate(Frac_reads = Reads_classified / sum(sample_sums(ps2))) %>%
  mutate(Rank = factor(Rank, rank_names(ps2))) %>%
  arrange(Rank)

#Fraction of ASV's assigned to each taxonomic rank
tax_table(ps2) %>%
  as("matrix") %>%
  as_tibble(rownames="OTU") %>%
  gather("Rank","Name",rank_names(ps2)) %>%
  group_by(Rank) %>%
  mutate(Name = replace(Name, str_detect(Name, "__"), NA)) %>% # This line turns the "__" we added to lower ranks back to NA's
  dplyr::summarise(OTUs_classified = sum(!is.na(Name))) %>%
  mutate(Frac_OTUs = OTUs_classified / ntaxa(ps2)) %>%
  mutate(Rank = factor(Rank, rank_names(ps2))) %>%
  arrange(Rank)

# Unique taxa at each rank
speedyseq::psmelt(ps2) %>%
  dplyr::select(rank_names(ps2)) %>%
  pivot_longer(everything(),
               names_to = "Rank",
               values_to = "value") %>%
  mutate(value = case_when(
    str_detect(value, "__") ~ as.character(NA),
    !str_detect(value, "__") ~ value
  )) %>%
  drop_na() %>%
  group_by(Rank) %>%
  summarise_all(funs(n_distinct)) %>%
  mutate(Rank = factor(Rank, rank_names(ps2))) %>%
  arrange(Rank)

# Each different phylum ranked by its overall relative abundance
sample_data(ps2) %>%
  as("matrix") %>%
  as.data.frame() %>%
  pull(psyllid_spp) %>%
  table() %>%
  sort()
# Transform to per sample relative abundance, then transform to whole dataset relative abundance
```

## Prevalence / Abundance summary

View prevalence of different phyla across the dataset

```{r prevalence-assessment}
# Calculate taxon prevalence across the data set at OTU level
prevdf <- apply(X = otu_table(ps2), MARGIN = ifelse(taxa_are_rows(ps2), yes = 1, no = 2), FUN = function(x){sum(x > 0)})
prevdf <- data.frame(Prevalence = prevdf, 
                     TotalAbundance = taxa_sums(ps2),
                     tax_table(ps2))
#Prevalence plot
gg.prev <- subset(prevdf, phylum %in% get_taxa_unique(ps2, "phylum")) %>%
  ggplot(aes(TotalAbundance, Prevalence / nsamples(ps2),color=order)) +
  geom_point(size = 3, alpha = 0.7) +
  scale_x_log10() +
  xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
  facet_wrap(~phylum) +
  theme(legend.position="none") +
  ggtitle("Phylum Prevalence in All Samples\nColored by Order")

pdf(file="figs/prevalence.pdf", width = 11, height = 8 , paper="a4r")
  plot(gg.prev)
try(dev.off(), silent=TRUE)
  
  
# Prevalecne at phylum
ps.phylum <- speedyseq::tax_glom(ps2, taxrank="phylum")
prevdf_phylum <- apply(X = otu_table(ps.phylum ), MARGIN = ifelse(taxa_are_rows(ps.phylum ), yes = 1, no = 2), FUN = function(x){sum(x > 0)})
prevdf_phylum <- data.frame(Prevalence = prevdf_phylum, 
                     TotalAbundance = taxa_sums(ps.phylum),
                     tax_table(ps.phylum)) %>%
  dplyr::mutate(RA = TotalAbundance / sum(TotalAbundance)) %>%
  remove_rownames() %>%
  magrittr::set_rownames(.$phylum) %>%
  dplyr::select(-rank_names(ps.phylum))

# Prevalence within Proteobacteria
ps.prot <- subset_taxa(ps2, phylum=="Proteobacteria") %>%
          speedyseq::tax_glom(taxrank="order")
prevdf_prot <- apply(X = otu_table(ps.prot ), MARGIN = ifelse(taxa_are_rows(ps.prot ), yes = 1, no = 2), FUN = function(x){sum(x > 0)})
prevdf_prot <- data.frame(Prevalence = prevdf_prot, 
                     TotalAbundance = taxa_sums(ps.prot),
                     tax_table(ps.prot)) %>%
  dplyr::mutate(RA = TotalAbundance / sum(TotalAbundance)) %>%
  remove_rownames() %>%
  magrittr::set_rownames(.$order) %>%
  dplyr::select(-rank_names(ps.prot))
  
# Genus Prevalence
ps.gen <- speedyseq::tax_glom(ps2, taxrank="genus") 
prevdf_gen <- apply(X = otu_table(ps.gen ), MARGIN = ifelse(taxa_are_rows(ps.gen ), yes = 1, no = 2), FUN = function(x){sum(x > 0)})
prevdf_gen <- data.frame(Prevalence = prevdf_gen, 
                     TotalAbundance = taxa_sums(ps.gen),
                     tax_table(ps.gen)) %>%
  dplyr::mutate(RA = TotalAbundance / sum(TotalAbundance)) %>%
  remove_rownames() %>%
  mutate(genus = make.unique(genus)) %>%
  magrittr::set_rownames(.$genus) %>%
  dplyr::select(-rank_names(ps.gen))

# Genus Prevalence across species rather than specimens
ps.gen <- speedyseq::tax_glom(ps3, taxrank="genus") 
prevdf_gen <- apply(X = otu_table(ps.gen ), MARGIN = ifelse(taxa_are_rows(ps.gen ), yes = 1, no = 2), FUN = function(x){sum(x > 0)})
prevdf_gen <- data.frame(Prevalence = prevdf_gen, 
                     TotalAbundance = taxa_sums(ps.gen),
                     tax_table(ps.gen)) %>%
  dplyr::mutate(RA = TotalAbundance / sum(TotalAbundance)) %>%
  remove_rownames() %>%
  mutate(genus = make.unique(genus)) %>%
  magrittr::set_rownames(.$genus) %>%
  dplyr::select(-rank_names(ps.gen))

# Prevalence of symbionts across psyllid species
speedyseq::psmelt(ps2) %>%
  mutate(total_spp  = n_distinct(psyllid_spp), total_specimen = n_distinct(Sample_Name)) %>%
  filter(Abundance > 0) %>%
  filter(genus %in% c("Candidatus Carsonella", "Arsenophonus", "Sodalis")) %>%
  group_by(genus, total_spp, total_specimen)%>%
  summarise(n_species = n_distinct(psyllid_spp), n_specimen = n_distinct(Sample_Name)) %>%
  ungroup()%>%
  mutate(prop_species = n_species / total_spp,
         prop_specimen = n_specimen / total_specimen)

# Number of symbiont OTUs per psyllid species
speedyseq::psmelt(ps2) %>%
  filter(Abundance > 0) %>%
  filter(genus %in% c("Candidatus Carsonella", "Arsenophonus", "Sodalis")) %>%
  group_by(psyllid_spp, genus) %>%
  summarise(n = n_distinct(OTU)) %>%
  ggplot(aes(x = psyllid_spp, y = n, fill=genus))+
  geom_col(show.legend = FALSE)+
  facet_grid(genus~.)+
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  labs(x = "Psyllid Species",
       y = "Number of distinct ASVs")

se <- function(x) sqrt(var(x)/length(x))

# Mean abundance of  genera
genera_abund <- speedyseq::psmelt(ps2) %>%
  filter(Abundance > 0) %>%
  group_by(SampleID) %>%
    mutate_at(vars(Abundance), ~ . / sum(.) ) %>%
  ungroup %>%
  group_by(genus) %>%
  summarise(mean_ra = mean(Abundance), upper = max(Abundance), lower = min(Abundance), se = se(Abundance)) 
  
```

## Alpha diversity metrics

```{r alpha diversity}
dir.create("output/alpha")
# Get richness measures
richness <- phyloseq::estimate_richness(ps2, measures=c("Shannon")) %>%
  rownames_to_column("Sample_Name") %>%
  mutate(Sample_Name = Sample_Name %>% 
           str_remove("^X") %>%
           str_replace_all("\\.", " "))

#Set number of randomisations for calculating significance
# Calculate Faith's PD-index & Species richness - with Standard errors
#sespd <- picante::ses.pd(as(phyloseq::otu_table(ps2), "matrix"),  phyloseq::phy_tree(ps2), null.model = "taxa.labels", include.root = F, runs = 99)

pd <- picante::pd(as(phyloseq::otu_table(ps2), "matrix"),  phyloseq::phy_tree(ps2), include.root = FALSE)

# Join together
div_table <- pd %>%
  rownames_to_column("Sample_Name") %>%
  dplyr::select(Sample_Name, alpha = SR, pd = PD) %>%
  left_join(richness, by="Sample_Name") %>%
  left_join(sample_data(ps2) %>% 
              as("matrix") %>%
              as.data.frame() %>%
              filter(!duplicated(Sample_Name)) %>%
              dplyr::select(Sample_Name, psyllid_spp, psyllid_genus, psyllid_family, hostplant_spp, seqrun, genus_geo),
            by = "Sample_Name") 

# Summarise means
div_table %>%
  summarise_if(is.numeric, mean)



# Difference between species for alpha diversity ANOVA
report::report(aov(alpha ~seqrun+psyllid_family+psyllid_genus+psyllid_spp, data=div_table))
report::report(aov(Shannon ~seqrun+psyllid_family+psyllid_genus+psyllid_spp, data=div_table))
report::report(aov(pd ~seqrun+psyllid_family+psyllid_genus+psyllid_spp, data=div_table))

# Difference between all genera for alpha diversity ANOVA
report::report(aov(alpha ~seqrun+psyllid_family+psyllid_genus, data=div_table))
report::report(aov(Shannon ~seqrun+psyllid_family+psyllid_genus, data=div_table))
report::report(aov(pd ~seqrun+psyllid_family+psyllid_genus, data=div_table))

# Difference between genus/geography factors ANOVA
report::report(aov(alpha ~seqrun+psyllid_family+genus_geo, data=div_table))
report::report(aov(Shannon ~seqrun+psyllid_family+genus_geo, data=div_table))
report::report(aov(pd ~seqrun+psyllid_family+genus_geo, data=div_table))

## Major genera only
# Difference between all major genera for alpha diversity ANOVA
div_table2 <- div_table %>%
  dplyr::filter(psyllid_genus %in% c("Powellia", "Ctenarytaina", "Psylla"))

mg_div <- bind_rows(broom::tidy(TukeyHSD(aov(alpha ~seqrun+psyllid_family+psyllid_genus, 
                                   data=div_table2))) %>% mutate(type="Richness"),
          broom::tidy(TukeyHSD(aov(Shannon ~seqrun+psyllid_family+psyllid_genus,
                                   data=div_table2))) %>% mutate(type="Shannon"),
          broom::tidy(TukeyHSD(aov(pd ~seqrun+psyllid_family+psyllid_genus,
                                   data=div_table2))) %>% mutate(type="Phylogenetic"),
          broom::tidy(TukeyHSD(aov(alpha ~seqrun+psyllid_family+genus_geo,
                                   data=div_table2))) %>% mutate(type="Richness"),
          broom::tidy(TukeyHSD(aov(alpha ~seqrun+psyllid_family+genus_geo,
                                   data=div_table2))) %>% mutate(type="Shannon"),
          broom::tidy(TukeyHSD(aov(pd ~seqrun+psyllid_family+genus_geo,
                                   data=div_table2))) %>% mutate(type="Phylogenetic")
          )
write_csv(mg_div, "output/alpha/major_genera_alpha.csv")

# Difference between major genera only ANOVA
report::report(aov(alpha ~seqrun+psyllid_family+psyllid_genus, data=div_table2))
report::report(aov(Shannon ~seqrun+psyllid_family+psyllid_genus, data=div_table2))
report::report(aov(pd ~seqrun+psyllid_family+psyllid_genus, data=div_table2))

# Difference between between major genera/geography factors ANOVA
report::report(aov(alpha ~seqrun+psyllid_family+genus_geo, data=div_table2))
report::report(aov(Shannon ~seqrun+psyllid_family+genus_geo, data=div_table2))
report::report(aov(alpha ~seqrun+psyllid_family+genus_geo, data=div_table2))

# Association with phylogeny
dat <- div_table  %>%
  filter(psyllid_spp %in% pruned.tree$tip.label) %>% #Subset to common 
  group_by(psyllid_spp) %>%
  dplyr::select(-where(is.character)) %>%
  summarise_all(mean) %>%
  arrange(match(psyllid_spp, pruned.tree$tip.label)) %>%
  as.data.frame() %>%
  magrittr::set_rownames(.$psyllid_spp) %>%
  dplyr::select(-psyllid_spp)

# Add positive and negative controls
dat$random <- rnorm(length(dat$alpha), sd = 10) #Random association
dat$bm <- rTraitCont(pruned.tree) #Brownian motion

# Make phylosignal object and measure signal between univariate traits.
p4d <- phylobase::phylo4d(pruned.tree, dat)	
signal <- phylosignal::phyloSignal(p4d = p4d, methods = c("I", "Lambda", "K"), reps = 999)%>%
  as.data.frame() %>%
  rownames_to_column("measure")

# print phylogenetic signal
signal
write_csv(signal, "output/alpha/phylosignal.csv")

# Locate signal
lipa <- lipaMoran(p4d, reps=999)
lipa.p4d <- lipaMoran(p4d, as.p4d = TRUE, reps=999)
barplot.phylo4d(lipa.p4d, bar.col = (lipa$p.value < 0.05) + 1, center = FALSE, scale = FALSE) + title("Non-rarefied")

#write out lipa
lipa_out <- cbind(lipa$lipa %>%
                    as.data.frame %>%
                    rename_all(funs(paste0(., "_stat"))),
                  lipa$p.value %>%
                    as.data.frame %>%
                    rename_all(funs(paste0(., "_pval")))
                               )%>%
  rownames_to_column("psyllid_spp")
write_csv(lipa_out, "output/alpha/lipa.csv")
```

## Rarefied

See if the pattern holds even with rarefaction to lowest sample

```{r alpha rarefied}
# Rarefied richness
ps2_rare <- rarefy_even_depth(ps2, sample.size = min(sample_sums(ps2)),
  rngseed = 666, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)

# Get richness measures
richness_rare <- phyloseq::estimate_richness(ps2_rare, measures=c("Shannon")) %>%
  rownames_to_column("Sample_Name") %>%
  mutate(Sample_Name = Sample_Name %>% 
           str_remove("^X") %>%
           str_replace_all("\\.", " "))

#Set number of randomisations for calculating significance
# Calculate Faith's PD-index & Species richness - with Standard errors
#sespd_rare <- picante::ses.pd(as(phyloseq::otu_table(ps2_rare), "matrix"),  phyloseq::phy_tree(ps2_rare), null.model = #"taxa.labels", include.root = F, runs = 99)

pd_rare <- picante::pd(as(phyloseq::otu_table(ps2_rare), "matrix"),  phyloseq::phy_tree(ps2_rare), include.root = FALSE)

# Join together
div_table_rare <- pd_rare %>%
  rownames_to_column("Sample_Name") %>%
  dplyr::select(Sample_Name, alpha = SR, pd = PD) %>%
  left_join(richness, by="Sample_Name") %>%
  left_join(sample_data(ps2_rare) %>% 
              as("matrix") %>%
              as.data.frame() %>%
              filter(!duplicated(Sample_Name)) %>%
              dplyr::select(Sample_Name, psyllid_spp, psyllid_genus, genus_geo),
            by = "Sample_Name") 

# Summarise means
div_table_rare %>%
  summarise_if(is.numeric, mean)

# Difference between all major genera for alpha diversity ANOVA
div_table_rare2 <- div_table_rare %>%
  dplyr::filter(psyllid_genus %in% c("Powellia", "Ctenarytaina", "Psylla"))

mg_div_rare <- bind_rows(
  broom::tidy(TukeyHSD(aov(alpha ~seqrun+psyllid_family+psyllid_genus,
                           data=div_table_rare2))) %>% mutate(type="Richness"),
  broom::tidy(TukeyHSD(aov(Shannon ~seqrun+psyllid_family+psyllid_genus,
                           data=div_table_rare2))) %>% mutate(type="Shannon"),
  broom::tidy(TukeyHSD(aov(pd ~seqrun+psyllid_family+psyllid_genus,
                           data=div_table_rare2))) %>% mutate(type="Phylogenetic"),
  broom::tidy(TukeyHSD(aov(alpha ~seqrun+psyllid_family+genus_geo,
                           data=div_table_rare2))) %>% mutate(type="Richness"),
  broom::tidy(TukeyHSD(aov(alpha ~seqrun+psyllid_family+genus_geo,
                           data=div_table_rare2))) %>% mutate(type="Shannon"),
  broom::tidy(TukeyHSD(aov(pd ~seqrun+psyllid_family+genus_geo,
                           data=div_table_rare2))) %>% mutate(type="Phylogenetic")
          )
write_csv(mg_div_rare, "output/alpha/major_genera_alpha_rarefied.csv")

# Difference between genus factors ANOVA
report::report(aov(alpha ~seqrun+psyllid_family+psyllid_genus, data=div_table_rare2))
report::report(aov(Shannon ~seqrun+psyllid_family+psyllid_genus, data=div_table_rare2))
report::report(aov(pd ~seqrun+psyllid_family+psyllid_genus, data=div_table_rare2))

# Difference between genus/geography factors ANOVA
report::report(aov(alpha ~seqrun+psyllid_family+genus_geo, data=div_table_rare2))
report::report(aov(Shannon ~seqrun+psyllid_family+genus_geo, data=div_table_rare2))
report::report(aov(alpha ~seqrun+psyllid_family+genus_geo, data=div_table_rare2))


dat <- div_table_rare  %>%
  filter(psyllid_spp %in% pruned.tree$tip.label) %>% #Subset to common 
  group_by(psyllid_spp) %>%
  dplyr::select(-where(is.character)) %>%
  summarise_all(mean) %>%
  arrange(match(psyllid_spp, pruned.tree$tip.label)) %>%
  as.data.frame() %>%
  magrittr::set_rownames(.$psyllid_spp) %>%
  dplyr::select(-psyllid_spp)

# Add positive and negative controls
dat$random <- rnorm(length(dat$alpha), sd = 10) #Random association
dat$bm <- rTraitCont(pruned.tree) #Brownian motion

# Make phylosignal object and measure signal between univariate traits.
p4d <- phylobase::phylo4d(pruned.tree, dat)	
signal_rare <- phylosignal::phyloSignal(p4d = p4d, methods = c("I", "Lambda", "K"), reps = 999) %>%
  as.data.frame() %>%
  rownames_to_column("measure")

# print phylogenetic signal
signal_rare
write_csv(signal_rare, "output/alpha/phylosignal_rarefied.csv")

# Locate signal
lipa <- lipaMoran(p4d, reps=999)
lipa.p4d <- lipaMoran(p4d, as.p4d = TRUE, reps=999)
barplot.phylo4d(lipa.p4d, bar.col = (lipa$p.value < 0.05) + 1, center = FALSE, scale = FALSE) + title("Rarefied")

#write out lipa
lipa_out_rare <- cbind(lipa$lipa %>%
                    as.data.frame %>%
                    rename_all(funs(paste0(., "_stat"))),
                  lipa$p.value %>%
                    as.data.frame %>%
                    rename_all(funs(paste0(., "_pval")))
                               )%>%
  rownames_to_column("psyllid_spp")
write_csv(lipa_out_rare, "output/alpha/lipa_rarefied.csv")    
```	

## Alpha no Gammaproteobacteria

```{r No gamma richness}
# Rarefied richness
ps2_subset <- ps2 %>%
 subset_taxa(class != "Gammaproteobacteria") %>% #is this working?
 filter_taxa(function(x) mean(x) > 0, TRUE)#Drop missing taxa from table
ps2_subset <- prune_samples(sample_sums(ps2_subset) >0 , ps2_subset)
message(nsamples(ps2) - nsamples(ps2_subset), " Samples and ", ntaxa(ps2) - ntaxa(ps2_subset), " taxa Dropped")

# Get richness measures
richness_subset <- phyloseq::estimate_richness(ps2_subset, measures=c("Shannon")) %>%
  rownames_to_column("Sample_Name") %>%
  mutate(Sample_Name = Sample_Name %>% 
           str_remove("^X") %>%
           str_replace_all("\\.", " "))

#Set number of randomisations for calculating significance
# Calculate Faith's PD-index & Species richness - with Standard errors
#sespd_subset <- picante::ses.pd(as(phyloseq::otu_table(ps2_subset), "matrix"),  phyloseq::phy_tree(ps2_subset), null.model = "taxa.labels", include.root = F, runs = 99)

pd_subset <- picante::pd(as(phyloseq::otu_table(ps2_subset), "matrix"),  phyloseq::phy_tree(ps2_subset), include.root = FALSE)

# Join together
div_table_subset <- pd_subset %>%
  rownames_to_column("Sample_Name") %>%
  dplyr::select(Sample_Name, alpha = SR, pd = PD) %>%
  left_join(richness, by="Sample_Name") %>%
  left_join(sample_data(ps2_subset) %>% 
              as("matrix") %>%
              as.data.frame() %>%
              filter(!duplicated(Sample_Name)) %>%
              dplyr::select(Sample_Name, psyllid_spp, psyllid_genus, genus_geo),
            by = "Sample_Name") 

# Summarise means
div_table_subset %>%
  summarise_if(is.numeric, ~mean(.x, na.rm=TRUE))

# Difference between all major genera for alpha diversity ANOVA
div_table_subset2 <- div_table_subset %>%
  dplyr::filter(psyllid_genus %in% c("Powellia", "Ctenarytaina", "Psylla"))

mg_div_subset <- bind_rows(
  broom::tidy(TukeyHSD(aov(alpha ~psyllid_genus, data=div_table_subset2))) %>% mutate(type="Richness"),
  broom::tidy(TukeyHSD(aov(Shannon ~psyllid_genus, data=div_table_subset2))) %>% mutate(type="Shannon"),
  broom::tidy(TukeyHSD(aov(pd ~psyllid_genus, data=div_table_subset2))) %>% mutate(type="Phylogenetic"),
  broom::tidy(TukeyHSD(aov(alpha ~genus_geo, data=div_table_subset2))) %>% mutate(type="Richness"),
  broom::tidy(TukeyHSD(aov(alpha ~genus_geo, data=div_table_subset2))) %>% mutate(type="Shannon"),
  broom::tidy(TukeyHSD(aov(pd ~genus_geo, data=div_table_subset2))) %>% mutate(type="Phylogenetic")
          )
write_csv(mg_div_subset, "output/alpha/major_genera_alpha_nogamma.csv")

# Difference between genus factors ANOVA
report::report(aov(alpha ~psyllid_genus, data=div_table_subset2))
report::report(aov(Shannon ~psyllid_genus, data=div_table_subset2))
report::report(aov(pd ~psyllid_genus, data=div_table_subset2))


# Difference between genus/geography factors ANOVA
report::report(aov(alpha ~genus_geo, data=div_table_subset2))
report::report(aov(Shannon ~genus_geo, data=div_table_subset2))
report::report(aov(alpha ~genus_geo, data=div_table_subset2))

```	

# Beta diversity

## Microbe distances
```{r distlist}
ps2_dist <- ps2
#ps2_dist <- ps2_filt

# Get OTU tables
otutab <- otu_table(ps2_dist)
#Impute zeroes for compositional distances
otutab_n0 <- as.matrix(zCompositions::cmultRepl(otutab, method="BL", output="p-counts"))

#Root & label phylogenetic tree
phy_tree(ps2_dist) <- multi2di(phy_tree(ps2_dist))
phy_tree(ps2_dist) <- makeNodeLabel(phy_tree(ps2_dist), method="number", prefix='n')
name.balance(phy_tree(ps2_dist), tax_table(ps2_dist), 'n1')

#Calculate different distance metrics
metrics <- c("Bray", "Jaccard", "Aitchison","Philr", "Unifrac", "WUnifrac")  
distlist <- vector("list", length=length(metrics))
names(distlist) <- metrics

distlist$Jaccard <- as.matrix(vegdist(otutab, method="jac",binary = T))
distlist$Bray <- as.matrix(vegdist(otutab, method="bray"))
distlist$Aitchison <- as.matrix(vegdist(CoDaSeq::codaSeq.clr(otutab_n0), method="euclidean"))
distlist$Philr <- as.matrix(vegdist(philr::philr(otutab_n0, phy_tree(ps2_dist),
                                                part.weights='enorm.x.gm.counts',
                                                ilr.weights='blw.sqrt'), method="euclidean", na.rm=TRUE))
distlist$Unifrac <- as.matrix(phyloseq::UniFrac(ps2_dist, weighted=FALSE, parallel = TRUE))
distlist$WUnifrac <- as.matrix(phyloseq::UniFrac(ps2_dist, weighted=TRUE, parallel = TRUE))

# Create low abundance filtered dataset
filterfun1 <- function(x){
  x[(x / sum(x)) < (1e-4)] <- 0
  return(x)
}
ps2_filt  <- transform_sample_counts(ps2, fun = filterfun1) %>%
  filter_taxa(function(x) mean(x) > 0, TRUE) #Drop missing taxa from table

print(paste0((ntaxa(ps2)-ntaxa(ps2_filt)), " taxa under threshold removed"))

# Get OTU tables
otutab <- otu_table(ps2_filt)
#Impute zeroes for compositional distances
otutab_n0 <- as.matrix(zCompositions::cmultRepl(otutab, method="BL", output="p-counts"))
#Root & label phylogenetic tree
phy_tree(ps2_filt) <- multi2di(phy_tree(ps2_filt))
phy_tree(ps2_filt) <- makeNodeLabel(phy_tree(ps2_filt), method="number", prefix='n')
name.balance(phy_tree(ps2_filt), tax_table(ps2_filt), 'n1')

#Calculate different distance metrics
metrics <- c("Bray", "Jaccard", "Aitchison","Philr", "Unifrac", "WUnifrac")  
distlist_filt <- vector("list", length=length(metrics))
names(distlist_filt) <- metrics

distlist_filt$Jaccard <- as.matrix(vegdist(otutab, method="jac",binary = T))
distlist_filt$Bray <- as.matrix(vegdist(otutab, method="bray"))
distlist_filt$Aitchison <- as.matrix(vegdist(CoDaSeq::codaSeq.clr(otutab_n0), method="euclidean"))
distlist_filt$Philr <- as.matrix(vegdist(philr::philr(otutab_n0, phy_tree(ps2_filt),
                                                part.weights='enorm.x.gm.counts',
                                                ilr.weights='blw.sqrt'), method="euclidean", na.rm=TRUE))
distlist_filt$Unifrac <- as.matrix(phyloseq::UniFrac(ps2_filt, weighted=FALSE, parallel = TRUE))
distlist_filt$WUnifrac <- as.matrix(phyloseq::UniFrac(ps2_filt, weighted=TRUE, parallel = TRUE))

# Create dataset without gammaproteoba
ps2_subset <- ps2 %>%
 subset_taxa(class != "Gammaproteobacteria") %>% #is this working?
 filter_taxa(function(x) mean(x) > 0, TRUE)#Drop missing taxa from table
ps2_subset <- prune_samples(sample_sums(ps2_subset) >0 , ps2_subset)
message(nsamples(ps2) - nsamples(ps2_subset), " Samples and ", ntaxa(ps2) - ntaxa(ps2_subset), " taxa Dropped")

# Get OTU tables
otutab_subset <- otu_table(ps2_subset)
#Impute zeroes for compositional distances
otutab_subset_n0 <- as.matrix(zCompositions::cmultRepl(otutab_subset, method="BL", output="p-counts"))
#Root phylogenetic tree
phy_tree(ps2_subset) <- multi2di(phy_tree(ps2_subset))
phy_tree(ps2_subset) <- makeNodeLabel(phy_tree(ps2_subset), method="number", prefix='n')
name.balance(phy_tree(ps2_subset), tax_table(ps2_subset), 'n1')

#Calculate different distance metrics
metrics <- c("Bray", "Jaccard", "Aitchison","Philr", "Unifrac", "WUnifrac")  
distlist_subset <- vector("list", length=length(metrics))
names(distlist_subset) <- metrics

distlist_subset$Jaccard <- as.matrix(vegdist(otutab_subset, method="jac",binary = T))
distlist_subset$Bray <- as.matrix(vegdist(otutab_subset, method="bray"))
distlist_subset$Aitchison <- as.matrix(vegdist(CoDaSeq::codaSeq.clr(otutab_subset_n0), method="euclidean"))
distlist_subset$Philr <- as.matrix(vegdist(philr::philr(otutab_subset_n0, phy_tree(ps2_subset),
                                                part.weights='enorm.x.gm.counts',
                                                ilr.weights='blw.sqrt'), method="euclidean", na.rm=TRUE))
distlist_subset$Unifrac <- as.matrix(phyloseq::UniFrac(ps2_subset, weighted=FALSE, parallel = TRUE))
distlist_subset$WUnifrac <- as.matrix(phyloseq::UniFrac(ps2_subset, weighted=TRUE, parallel = TRUE))

# Mantel test to check concordance of beta diversity pre and post filtering

purrr::map2(distlist, distlist_filt,~{
  subsample <- intersect(colnames(.x), colnames(.y))
  as.data.frame(vegan::mantel(.x[subsample, subsample], .y[subsample, subsample])[c("statistic","signif","permutations")])
}) %>%
  bind_rows(.id="dist")

# Mantel test to check concordance of beta diversity pre and post subset

purrr::map2(distlist, distlist_subset,~{
  subsample <- intersect(colnames(.x), colnames(.y))
  as.data.frame(vegan::mantel(.x[subsample, subsample], .y[subsample, subsample])[c("statistic","signif","permutations")])
}) %>%
  bind_rows(.id="dist")
```


## Adonis & Betadisper

```{r Adonis}
# ADONIS is constructed heirarchially to marginalise techical variance then moving down the taxonomic ranks 

# Adonis test
metadata <- sample_data(ps2) %>%
  as("data.frame")
adonis_results <- distlist %>%
  purrr::map(function(x) {
    y <- as.dist(x[metadata$Sample_Name, metadata$Sample_Name])
    bind_rows(
    broom::tidy(adonis2(y~seqrun+psyllid_family+psyllid_genus+psyllid_spp, method="euclidean", data=metadata, 
                       permutations=999, by="terms")) %>% 
      mutate(test = paste(term[!term %in% c("Residual", "Total")], collapse="-")),
    #broom::tidy(adonis2(y~seqrun+hostplant_spp+psyllid_spp, method="euclidean", data=metadata, 
     #                  permutations=999, by="margin")) %>% 
    #  mutate(test = paste(term[!term %in% c("Residual", "Total")], collapse="-")),
    broom::tidy(adonis2(y~seqrun+hostplant_spp, method="euclidean", data=metadata,
                       permutations=999, by="terms")) %>% 
      mutate(test = paste(term[!term %in% c("Residual", "Total")], collapse="-"))
    )
})  %>%
  bind_rows(.id="dist")

# Check homogeneity
betadisper_results <- distlist %>%
  purrr::map(function(x) {
    y <- as.dist(x[metadata$Sample_Name, metadata$Sample_Name])
  bind_rows(
    as_tibble(permutest(vegan::betadisper(y, metadata$psyllid_spp))$tab, rownames="term") %>%
      mutate(test="psyllid_spp"),
    as_tibble(permutest(vegan::betadisper(y, metadata$hostplant_spp))$tab, rownames="term")  %>%
      mutate(test="hostplant_spp"),
  )
})  %>%
  bind_rows(.id="dist")

dir.create("output/beta")
write_csv(adonis_results, "output/beta/adonis_fulldata.csv")
write_csv(betadisper_results, "output/beta/betadisper_fulldata.csv")
```

## Same tree dissimilarities

Look at the similarities in the microbiome of the psyllid specimens collected from the same host plant
```{r Avg Dissimiarity}
hostplant_metadata <- metadata %>% mutate(ingroup = case_when(
  Sample_Name %in% c("94","107", "113","93","106", "112") ~ "fraxini-fraxinicola",
  Sample_Name %in% c("200big", "201big", "200small", "201small") ~ "apicalis-frodobagginsi",
  TRUE ~ "other"
  ))

broom::tidy(adonis2(distlist$Aitchison~ingroup + psyllid_spp, method="euclidean",
                   data=hostplant_metadata))

pairwise.adonis2(distlist$Aitchison~ingroup + psyllid_spp, method="euclidean",
                   data=hostplant_metadata)
```

## Barplot

```{r barplot}
# Plot tree
p <- ggtree(pruned.tree) + geom_tiplab(align=TRUE) + geom_nodelab(geom='label') +
    scale_x_continuous(expand=c(0, 0.1)) 

# Plot bar
ps3_bar <- ps3 %>%
  speedyseq::tax_glom(taxrank = "order") %>%           # agglomerate at Order level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  speedyseq::psmelt() %>%
  mutate(plotlabel = phylum) %>%
  mutate(plotlabel = case_when(
    Abundance >= 0.01 & phylum=="Proteobacteria" ~ paste0("P - ", order), # Change this to whatever taxrank we want
    Abundance >= 0.01 & !phylum=="Proteobacteria"~ phylum ,
    Abundance < 0.01 ~ "NA"
    )) %>%
  dplyr::na_if("NA") %>%
  dplyr::select(psyllid_spp, plotlabel, phylum, order, Abundance) %>%
  left_join(p$data %>%
              as_data_frame %>%
              dplyr::filter(isTip) %>%
              dplyr::select(y, label) %>%
              dplyr::rename(psyllid_spp=label)) 

gg.bar <- ggplot(ps3_bar, aes(x=y, y=Abundance, fill=plotlabel)) +
  geom_col()  + 
  coord_flip()+
  scale_fill_manual(values=colorRampPalette(brewer.pal(9, "Set1"))(length(unique(ps3_bar$plotlabel))-1), na.value="grey") + 
    base_theme  +
    theme(legend.position = "bottom",
      #panel.grid.major.x = element_line(colour="grey92", size=0.5, linetype="dashed"),
      strip.background = element_rect(fill = "grey92", 
                    colour = "black", size = 1),
      axis.text.y = element_blank(),
      axis.ticks.y=element_blank(),
      axis.title.y=element_blank()) +
  scale_y_continuous(expand=c(0,0), labels = scales::percent)+
  scale_x_continuous(expand=c(0,0)) +
  labs(x = NULL ,
       y = "Relative Abundance",
       fill = NULL)

# Make richness plots
gg.rich <-  div_table %>%
  group_by(psyllid_spp) %>%
  summarise(alpha=mean(alpha), pd=mean(pd)/10e+7, Shannon=mean(Shannon)) %>%
  ungroup() %>%
  pivot_longer(-psyllid_spp, names_to="measure",
               values_to = "value")  %>%
  left_join(lipa_out %>% 
              dplyr::select(psyllid_spp, alpha_pval, pd_pval, Shannon_pval) %>%
              pivot_longer(-psyllid_spp,
                           names_to="measure",
               values_to = "pval") %>%
              mutate(measure = str_remove(measure, "_pval"))) %>%
  left_join(p$data %>%
              as_data_frame %>%
              dplyr::filter(isTip) %>%
              dplyr::select(y, label) %>%
              dplyr::rename(psyllid_spp=label)) %>%
  mutate_at(vars(measure), funs(factor(., levels=c("alpha","Shannon","pd")))) %>%
  ggplot(aes(x=y, y=value, fill=pval<0.05)) + 
    geom_col() +
    facet_grid(~measure, scales="free") + 
    coord_flip()+
    base_theme  +
    theme(legend.position = "bottom",
    panel.grid.major.x = element_line(colour="grey92", size=0.5, linetype="dashed"),
   # strip.background = element_rect(fill = "grey92", 
    #              colour = "black", size = 1),
    axis.text.y = element_blank(),
    axis.ticks.y=element_blank(),
    axis.title.y=element_blank(),
    axis.text.x = element_text(angle=45, hjust=1)) +
    scale_fill_manual(values=c("darkgray", "darkred")) +
    scale_x_continuous(breaks=scales::pretty_breaks(n=1))+
  scale_y_continuous(expand=c(0,0))+
  scale_x_continuous(expand=c(0,0))

## Collection_hist
gg.spp <- sample_data(ps2) %>%
  as("matrix") %>%
  as.data.frame() %>%
  dplyr::rename(label = psyllid_spp) %>%
  group_by(label) %>%
  summarise(n_species = n()) %>%
  left_join(p$data %>%
  filter(isTip) %>% dplyr::select(c(label, y))) %>%
  filter(!is.na(y)) %>%
  ggplot(aes(x=y, y=1, fill=n_species)) +
    geom_tile() +
    geom_text(aes(label=n_species))+
    coord_flip() +
    theme_void() +
    theme(legend.position = "bottom") +
    scale_fill_distiller(palette = "Reds", direction=1)+
  scale_y_continuous(expand=c(0,0))+
  scale_x_continuous(expand=c(0,0))

# Make tree with no underscores in name
p2 <- p
p2$data$label <- str_replace(p2$data$label, "_", " ")

#Arrange
Fig1 <- p2 + gg.spp + gg.bar + gg.rich + plot_layout(nrow=1, widths=c(1,0.08,2,0.6))  

pdf(file="figs/Beta.pdf", width = 8, height = 11 , paper="a4")
  plot(Fig1)
try(dev.off(), silent=TRUE)
```

## Phylosymbiosis

### Prepare distance matrices
```{r prepare distances}
## Psyllid phylogeny cophenetic distance matrix
phylo.dist <- cophenetic(pruned.tree) %>%
   sqrt() %>%
  as.data.frame() %>%
  rownames_to_column("psyllid_spp.x") %>%
  pivot_longer(cols=-psyllid_spp.x,
               names_to="psyllid_spp.y",
               values_to = "dist") %>%
  right_join(sample_data(ps2) %>%
               as("matrix") %>%
               as.data.frame() %>%
              dplyr::select(Sample_Name, psyllid_spp) %>%
              dplyr::rename(psyllid_spp.x = psyllid_spp, Sample_Name.x = Sample_Name),
            by="psyllid_spp.x")%>%
  right_join(sample_data(ps2) %>%
               as("matrix") %>%
               as.data.frame() %>%
              dplyr::select(Sample_Name, psyllid_spp) %>%
              dplyr::rename(psyllid_spp.y = psyllid_spp, Sample_Name.y = Sample_Name),
            by="psyllid_spp.y") %>%
  filter(!is.na(dist)) %>%
  dplyr::select(-psyllid_spp.y, -psyllid_spp.x) %>%
  pivot_wider(names_from = Sample_Name.y, values_from = dist)  %>%
  column_to_rownames("Sample_Name.x") %>%
  as.matrix()

plant.tree <- read.tree("sample_data/plant_tree.nwk")
plant.dist <- cophenetic(plant.tree) %>%
  sqrt() %>%
  as.data.frame() %>%
  rownames_to_column("hostplant_spp.x") %>%
  pivot_longer(cols=-hostplant_spp.x,
               names_to="hostplant_spp.y",
               values_to = "dist") %>%
  right_join(sample_data(ps2) %>%
               as("matrix") %>%
               as.data.frame() %>%
              dplyr::select(Sample_Name, hostplant_spp) %>%
              dplyr::rename(hostplant_spp.x = hostplant_spp, Sample_Name.x = Sample_Name),
            by="hostplant_spp.x")%>%
  right_join(sample_data(ps2) %>%
               as("matrix") %>%
               as.data.frame() %>%
              dplyr::select(Sample_Name, hostplant_spp) %>%
              dplyr::rename(hostplant_spp.y = hostplant_spp, Sample_Name.y = Sample_Name),
            by="hostplant_spp.y") %>%
  filter(!is.na(dist))%>%
  dplyr::select(-hostplant_spp.y, -hostplant_spp.x) %>%
  pivot_wider(names_from = Sample_Name.y, values_from = dist) %>%
  column_to_rownames("Sample_Name.x") %>%
  as.matrix()

# Spatial distance matrix
envData <- sample_data(ps2) %>%
  as("matrix") %>%
  as.data.frame() %>%
  dplyr::select(long, lat) %>%
  mutate(long = as.numeric(long), lat=as.numeric(lat))%>%
  drop_na() 
  
spat.dist <- spDists(as.matrix(envData), longlat=TRUE) %>%
  as.data.frame() %>%
  magrittr::set_rownames(rownames(envData) %>% str_replace(pattern="\\_S(.*)$",replacement="") %>% make.unique()) %>%
  magrittr::set_colnames(rownames(envData) %>% str_replace(pattern="\\_S(.*)$",replacement="") %>% make.unique()) %>%
  rownames_to_column("SampleID") %>%
  mutate(SampleID = SampleID %>% str_replace(pattern="\\_S(.*)$",replacement="") %>% make.unique()) %>%
  column_to_rownames("SampleID") %>%
  as.matrix()

```

### Whole dataset

```{r phylosymbiosis}
set.seed(909)
dir.create("output/phylosymbiosis")

# Matrix correlations
#only use samples present in all
subsample <- Reduce(intersect, list(rownames(otu_table(ps2)), colnames(phylo.dist), colnames(plant.dist), colnames(spat.dist)))

# Mantel test
mantel_results <- distlist %>%
  purrr::map(function(x){
    run_mantel(x, dists = c("phylo.dist", "plant.dist", "spat.dist"),
               subsample = subsample, type  ="mantel", nboot=1000)
  }) %>%
  bind_rows(.id="dist")

write_csv(mantel_results %>%
            dplyr::select(-one_of("pval1","pval2")),#only keep two sided P values
          "output/phylosymbiosis/mantel_fulldata.csv")


# Partial Mantel Test
pmantel_results <- distlist %>%
  purrr::map(function(x){
    run_mantel(x, dists = c("phylo.dist", "plant.dist", "spat.dist"),
               subsample = subsample, type = "partial", nboot=1000)
  }) %>%
  bind_rows(.id="dist") 

write_csv(pmantel_results %>% dplyr::select(-one_of("pval1","pval2")),
          "output/phylosymbiosis/pmantel_fulldata.csv")

## Plot mantels
gg.mantels <- bind_rows(mantel_results, pmantel_results) %>%
              dplyr::mutate(dist1 = case_when(
                str_detect(dist1, "plant.dist") ~ "Hostplant phylogeny",
                str_detect(dist1, "phylo.dist") ~ "Psyllid phylogeny",                
                str_detect(dist1, "spat.dist") ~ "Spatial distance"                   
              )) %>%
  filter(dist == "Aitchison") %>%
  mutate(dist1 = factor(dist1, levels= c("Spatial distance","Hostplant phylogeny",  "Psyllid phylogeny"))) %>%
  mutate(type = type %>%  
           str_replace("mantel", "Mantel") %>%
           str_replace("partial_Mantel", "Partial Mantel")) %>%
              ggplot(aes(x=mantelr, y=dist1, colour=dist1)) + 
    geom_vline(xintercept = 0, colour="black", linetype=2) +
  geom_pointrange(aes(xmin=`llim.2.5%`, xmax=`ulim.97.5%`), size=1) +
  scale_color_manual(values=c("Hostplant phylogeny"="#b2df8a","Spatial distance"="#a6cee3", "Psyllid phylogeny"="#1f78b4"))+
  facet_wrap(~type, ncol=2) +
  labs( x = "Mantel R", y = NULL, colour=NULL) +
  base_theme +
  theme(legend.position = "none",
        panel.grid.major = element_line())

gg.mantels

pdf(file="figs/fig3_mantels.pdf",  width = 8, height = 4, paper="a4r")
  plot(gg.mantels)
try(dev.off(), silent=TRUE)
  
```

### Without Gammaproteobacteria

```{r phylosymbiosis no entero}
# Adonis test
metadata <- sample_data(ps2_subset) %>%
  as("data.frame")

adonis_results <- distlist_subset %>%
  purrr::map(function(x) {
    y <- as.dist(x[metadata$Sample_Name, metadata$Sample_Name])
    bind_rows(
    broom::tidy(adonis2(y~seqrun+psyllid_family+psyllid_genus+psyllid_spp, method="euclidean", data=metadata, 
                       permutations=999)) %>% 
      mutate(test = paste(term[!term %in% c("Residual", "Total")], collapse="-")),
    broom::tidy(adonis2(y~seqrun+hostplant_spp, method="euclidean", data=metadata,
                       permutations=999)) %>% 
      mutate(test = paste(term[!term %in% c("Residual", "Total")], collapse="-"))
    )
})  %>%
  bind_rows(.id="dist")

write_csv(adonis_results, "output/beta/adonis_subset.csv")

# Matrix correlations
#only use samples present in all
subsample <- Reduce(intersect, list(rownames(otu_table(ps2_subset)), colnames(phylo.dist), colnames(plant.dist), colnames(spat.dist)))

# Mantel test
mantel_results <- distlist_subset %>%
  purrr::map(function(x){
    run_mantel(x, dists=c("phylo.dist", "plant.dist", "spat.dist"),
               subsample=subsample, type="mantel")
  }) %>%
  bind_rows(.id="dist") 

write_csv(mantel_results %>% dplyr::select(-pval1, -pval2), "output/phylosymbiosis/mantel_subset.csv")

# Partial Mantel Test
pmantel_results <- distlist_subset %>%
  purrr::map(function(x){
    run_mantel(x, dists=c("phylo.dist", "plant.dist", "spat.dist"),
               subsample=subsample, type="partial")
  }) %>%
  bind_rows(.id="dist") 

write_csv(pmantel_results %>% dplyr::select(-pval1,-pval2), "output/phylosymbiosis/pmantel_subset.csv")
```

## PCA plots
```{r PCA}
#set distance
pca_dist <- distlist$Aitchison

#PCA 
r.pcx <- prcomp(pca_dist)
pc_samp <- data.frame(Sample_Name = rownames(r.pcx$x), r.pcx$x[, 1:2])%>%
  left_join(sample_data(ps2) %>%
              as("matrix") %>%
              as.data.frame(), by="Sample_Name")
# calculate percent variance explained for the axis labels
pc1 <- round(r.pcx$sdev[1]^2/sum(r.pcx$sdev^2),2)
pc2 <- round(r.pcx$sdev[2]^2/sum(r.pcx$sdev^2),2)
pc_ylab <- paste("PC1: ", pc1, sep="")
pc_xlab <- paste("PC2: ", pc2, sep="")

## colour by psyllid phylogeny
dend <- as.dendrogram(force.ultrametric(pruned.tree))
membership <- as.data.frame(cutree(dend, k=11)) %>%
  rownames_to_column("psyllid_spp") %>%
  magrittr::set_colnames(c("psyllid_spp", "cluster"))

pca_phylo <- pc_samp %>%
  left_join(membership)
gg.pca <- ggplot(data=pca_phylo, aes(x=PC2, y=PC1, colour=as.factor(cluster))) + 
  geom_point(alpha=0.5, size=3) + #, shape=21, colour="black"
  #geom_point(data=pc_otu,aes(PC1, PC2)) +
  theme_classic() +
  scale_colour_brewer(palette="Paired") +
  geom_hline(yintercept = 0, linetype=2, alpha=0.5) +  
  geom_vline(xintercept = 0, linetype=2, alpha=0.5) +
  xlab(pc_xlab) + 
  ylab(pc_ylab) +
  theme(legend.position = "none") +
  scale_y_reverse(position = "right") 
  #coord_fixed(ratio=pc2/pc1) # Scale plot by variance explained

p1 <- ggtree(pruned.tree) +
    scale_x_continuous(expand=c(0, 0.2)) + 
  theme_tree2() +
  theme(legend.position = "none") 

colours_p1 <- p1$data %>%
  left_join(membership %>%
  dplyr::rename(label = psyllid_spp)
    ) %>%
  mutate(new_label = label %>% str_replace_all("_", " "))

p1 <- p1 %<+% colours_p1  + 
  geom_tippoint(aes(colour=as.factor(cluster)))  + 
  geom_tiplab(aes(colour=as.factor(cluster), label=new_label))+
  scale_colour_brewer(palette="Paired") 

gg.psyllid_pca <- p1 + gg.pca + plot_annotation(title="Psyllid phylogenetic distance")
gg.psyllid_pca

pdf(file="figs/psyllid_pca.pdf",  width = 8, height = 11, paper="a4")
  plot(gg.psyllid_pca)
try(dev.off(), silent=TRUE)
  
# Colour by plant phylogeny
plant.tree <- read.tree("sample_data/plant_tree.nwk")
plant.tree  <- drop.tip(plant.tree, "Sophora_microphylla_-kowhai" )
plant.tree <- multi2di(plant.tree)

dend <- as.dendrogram(plant.tree )
#test <- color_branches(dend, k=12)
#plot(test)
membership <- as.data.frame(cutree(dend, k=12)) %>%
  rownames_to_column("hostplant_spp") %>%
  magrittr::set_colnames(c("hostplant_spp", "cluster"))

pca_phylo <- pc_samp %>%
  left_join(membership)
gg.pca <- ggplot(data=pca_phylo, aes(x=PC2, y=PC1, colour=as.factor(cluster))) + 
  geom_point(alpha=0.5, size=3) + #, shape=21, colour="black"
  #geom_point(data=pc_otu,aes(PC1, PC2)) +
  theme_classic() +
  scale_colour_brewer(palette="Paired") +
  geom_hline(yintercept = 0, linetype=2, alpha=0.5) +  
  geom_vline(xintercept = 0, linetype=2, alpha=0.5) +
  xlab(pc_xlab) + 
  ylab(pc_ylab) +
  theme(legend.position = "none") +
  scale_y_reverse(position = "right")
  #coord_fixed(ratio=pc2/pc1) # Scale plot by variance explained

p2 <- ggtree(plant.tree) +
  scale_x_continuous(expand=c(0, 70)) + 
  theme_tree2() +
  theme(legend.position = "none") 

colours_p2 <- p2$data %>%
  left_join(membership %>%
  dplyr::rename(label = hostplant_spp)
    )%>%
  mutate(new_label = label %>% str_replace_all("_", " "))

p2 <- p2 %<+% colours_p2  + 
  geom_tippoint(aes(colour=as.factor(cluster)))  + 
  geom_tiplab(aes(colour=as.factor(cluster), label=new_label))+
  scale_colour_brewer(palette="Paired") 

gg.plant_pca <- p2 + gg.pca + plot_annotation(title="Plant phylogenetic distance")
gg.plant_pca

pdf(file="figs/plant_pca.pdf",  width = 8, height = 11, paper="a4")
  plot(gg.plant_pca)
try(dev.off(), silent=TRUE)

# HC of spatial distance
dend <- hclust(as.dist(spat.dist), method="average")
#test <- color_branches(dend, k=12)
#plot(test)

membership <- as.data.frame(cutree(dend, k=12)) %>%
  rownames_to_column("Sample_Name") %>%
  magrittr::set_colnames(c("Sample_Name", "cluster"))

pca_phylo <- pc_samp %>%
  left_join(membership)
gg.pca <- ggplot(data=pca_phylo, aes(x=PC2, y=PC1, colour=as.factor(cluster))) + 
  geom_point(alpha=0.5, size=3) + #, shape=21, colour="black"
  #geom_point(data=pc_otu,aes(PC1, PC2)) +
  theme_classic() +
  scale_colour_brewer(palette="Paired") +
  geom_hline(yintercept = 0, linetype=2, alpha=0.5) +  
  geom_vline(xintercept = 0, linetype=2, alpha=0.5) +
  xlab(pc_xlab) + 
  ylab(pc_ylab) +
  theme(legend.position = "none") +
  scale_y_reverse(position = "right")
  #coord_fixed(ratio=pc2/pc1) # Scale plot by variance explained

p3 <- ggtree(as.phylo(dend) )+
  scale_x_continuous(expand=c(0, 400)) +
  theme_tree2() +
  theme(legend.position = "none") 

colours_p3 <- p3$data %>%
  left_join(membership %>%
  dplyr::rename(label = Sample_Name)
    )%>%
  mutate(new_label = label %>% str_replace_all("_", " "))

p3 <- p3 %<+% colours_p3  + 
  geom_tippoint(aes(colour=as.factor(cluster)))  + 
  #geom_tiplab(aes(colour=as.factor(cluster), label=new_label))+
  scale_colour_brewer(palette="Paired") 

gg.spat_pca <- p3 + gg.pca + plot_annotation(title="Spatial Distance")
gg.spat_pca

pdf(file="figs/spatial_pca.pdf",  width = 8, height = 11, paper="a4")
  plot(gg.spat_pca)
try(dev.off(), silent=TRUE)
```

# Cophylogeny

Run the scripts in the R subdirectory

```{bash submit paco jobs}
cd /group/pathogens/Alexp/Metabarcoding/Psyllid_microbiome/paco_para
dos2unix batch_submit_paco.sh
find $(/usr/bin/ls -d $PWD/*) -name 'paco_*' -type f | grep -v '.rds' > sequence_index.txt
joblength=$(cat sequence_index.txt | wc -l)
sbatch --array=1-$joblength batch_submit_paco.sh
```

# Microbiome 3 genera heatmap

```{r all microbe}
filterfun1 <- function(x){
  x[(x / sum(x)) < (1e-4)] <- 0 #1e-4 is 0.01% threshold
  return(x)
}
ps3_filt  <- transform_sample_counts(ps3, fun = filterfun1)%>%
  filter_taxa(function(x) mean(x) > 0, TRUE) #Drop missing taxa from table

print(paste0((ntaxa(ps3)-ntaxa(ps3_filt)), " taxa under threshold removed"))

#Get co-occurance matrix
coocur <- ps3_filt %>%
    subset_samples(psyllid_genus %in% c("Powellia", "Ctenarytaina", "Psylla")) %>%
    filter_taxa(function(x) mean(x) > 0, TRUE) %>%
    otu_table %>%
    as.matrix() %>% 
  apply(2, function(x) ifelse(x > 0, 1, 0))

colnames(coocur) <- colnames(coocur) %>%
           str_replace_all(" |-", "_")
rownames(coocur) <- rownames(coocur) %>% 
           str_replace_all(" |-", "_")

# H cophenetic distance
h_tree <- pruned.tree
h_tree$tip.label <- h_tree$tip.label %>%
           str_replace_all(" |-", "_")
h_tree <- drop.tip(h_tree, setdiff(h_tree$tip.label, rownames(coocur)))

# P cophenetic distance
s_tree <- phy_tree(ps3)
s_tree$tip.label <- s_tree$tip.label %>%
           str_replace_all(" |-", "_")
s_tree <- drop.tip(s_tree, setdiff(s_tree$tip.label, colnames(coocur)))

coocur <- coocur[h_tree$tip.label, s_tree$tip.label]

# Read in Paco runs 
paco_run_powellia <- readRDS("output/cophylogeny/paco_powellia_microbe_filtered_asym.rds")
paco_run_cten <- readRDS("output/cophylogeny/paco_ctenarytaina_microbe_filtered_asym.rds")
paco_run_psylla <- readRDS("output/cophylogeny/paco_psylla_microbe_filtered_asym.rds")

paco_run_powellia$gof
paco_run_cten$gof
paco_run_psylla$gof

z_trans <- function(x){
  (x - mean(x, na.rm=TRUE)) / sd(x, na.rm=TRUE)
}

# Get link importance
links <- do.call("list", mget(grep("paco_run_",names(.GlobalEnv),value=TRUE))) %>%
  purrr::map(function(x){
    genus_name <- rownames(x$H)[1] %>% str_remove("_.*$")
    data.frame(
      joint=names(x$jackknife),
      values=unname(x$jackknife),
      #upper=unname(x$jackknife$upper),
      genus = genus_name
    )
  }) %>%
  bind_rows() %>%
 separate(joint, into=c("Sample_Name", "OTU"), sep="-", extra="merge", remove=FALSE) %>%
  group_by(genus) %>%
  mutate(values = values^2)%>%
  mutate(mean_val = mean(values)) %>%
  mutate(signif = case_when(
    values < mean_val ~ 1,
    values > mean_val ~ 0
  ))%>%
  group_by(genus) %>%
  mutate(
    #st_dev = sd(values),
    # z_values =  values/st_dev
    z_values =  z_trans(values)
    ) %>%
  ungroup()

# Plot links
links %>%
  dplyr::rename(label = joint) %>%
  mutate(label = as.factor(label),
         label=reorder_within(label, values, genus))%>%
  #arrange(-values) %>%
  ggplot(aes(x=label, y=values, colour=as.factor(signif))) + #
  geom_point(show.legend = FALSE) +
  facet_wrap(~genus, scales = "free")+
  #geom_errorbar(aes(ymin=values, ymax=upper)) +
  geom_hline(aes(yintercept = mean_val), lty=3) +
  scale_x_reordered()+
  scale_color_manual(values=c("steelblue", "darkorange1"), name = "Significant") +
  theme_classic()+ 
  theme(axis.text.x = element_blank(), legend.position = "right")

#Get observed residuals of Procrustean superimposition 
paco_residuals <- do.call("list", mget(grep("paco_run_",names(.GlobalEnv),value=TRUE))) %>%
  purrr::map(function(x){
    res <- residuals_paco(x$proc, type = "interaction")
    genus_name <- rownames(x$H)[1] %>% str_remove("_.*$")
    data.frame(
      OTU=names(res),
      values=unname(res),
      genus=genus_name
    ) 
  })%>%
  bind_rows() 

# Plot residuals
ggplot(paco_residuals, aes(x=values))+
  geom_density(fill='grey70')+
  theme_bw()+
  facet_grid(genus~.) +
  xlab('Procrustes residuals')+
  ylab('Frequency')

# Parafit run
PF_run_powellia <- readRDS("output/cophylogeny/parafit_powellia_microbe_filtered.rds")
PF_run_cten <- readRDS("output/cophylogeny/parafit_ctenarytaina_microbe_filtered.rds")
PF_run_psylla <- readRDS("output/cophylogeny/parafit_psylla_microbe_filtered.rds")

PF_run_powellia$ParaFitGlobal
PF_run_powellia$p.global

PF_run_cten$ParaFitGlobal
PF_run_cten$p.global

PF_run_psylla$ParaFitGlobal
PF_run_psylla$p.global

# Joining isnt working here because they were made separately - so getting the rownames from cooccur no logner works
PF_links <- do.call("list", mget(grep("PF_run_",names(.GlobalEnv),value=TRUE))) %>%
  purrr::map(function(x){
    genus_name <- names(x$para.per.host[1]) %>% str_remove("_.*$")
    as.data.frame(x$link.table) %>%
      left_join(enframe(names(x$para.per.host), name = "Host", value="psyllid_spp") %>%
                  mutate(Host = as.numeric(Host))) %>%
      left_join(enframe(names(x$host.per.para), name = "Parasite", value="OTU") %>%
                  mutate(Parasite = as.numeric(Parasite))) %>%
      mutate(genus = genus_name)
  }) %>%
  bind_rows() %>%
  mutate(signif = case_when(
    p.F1 < 0.05 ~ 1,
    p.F1 > 0.05 ~ 0
  )) %>%
  group_by(genus) %>%
  mutate(z_values =  z_trans(F1.stat)) %>%
  ungroup()

# Proportion of significant links
PF_links %>%
  group_by(genus) %>%
  summarise(s = count(signif > 0), ns = count(signif == 0)) 

# Cophyloplot
coocur.lut <- which(coocur ==1, arr.ind=TRUE)
assoc <- cbind(rownames(coocur)[coocur.lut[,1]], colnames(coocur)[coocur.lut[,2]])

# Rotate the nodes using phytools
library(phytools)
obj <- cophylo(tr1=h_tree, tr2=s_tree, assoc=assoc, rotate=FALSE) 

tree1 <- obj[["trees"]][[1]]
p1 <- ggtree(tree1, ladderize=FALSE, aes(colour=links))
weights_p1 <- p1$data %>%
  left_join(links %>%
    group_by(Sample_Name) %>%
    summarise(values = mean(signif)) %>%
    #summarise(values = mean(z_values)) %>%
    dplyr::rename(label = Sample_Name)
    )  

## Get values for higher nodes
weights_p1 <- weights_p1 %>%
  left_join(data.frame(node=weights_p1$node, links = weights_p1$node %>% purrr::map_dbl(average_descendants, tree=tree1, df=weights_p1)))

p1 <- p1 %<+% weights_p1 +
  scale_color_gradient(low="steelblue", high="darkorange1") +
  #  scale_colour_gradient(high="steelblue", low="darkorange1", trans="log10") +
    theme(legend.position = "none")

# OTU tree
tree2 <- obj[["trees"]][[2]]

# make a clade label list
tax_groups <- as.data.frame(tax_table(ps3)) %>%
  rownames_to_column("label") %>%
  dplyr::mutate(label = label %>% str_replace_all("-", "_") %>%
                  str_replace_all(" ", "_")) %>%
  dplyr::select(label, genus) %>%
  filter(label %in% tree2$tip.label) %>%
  group_by(genus) 

group_name <- group_keys(tax_groups)  %>%
  mutate(group_name = genus %>% str_remove_all("\\[|\\]"))

cls <- tax_groups %>% 
  group_split() %>% 
  purrr::map(pull, label) %>%
  set_names(group_name$group_name)

newtree <- groupOTU(tree2, cls)
p2 <- ggtree(newtree, ladderize=TRUE, aes(colour=links)) 
weights_p2 <- p2$data %>%
  left_join(links %>% 
    group_by(OTU) %>%
    summarise(values = mean(signif)) %>%
    #summarise(values = mean(z_values)) %>%
    dplyr::rename(label = OTU)
    ) 

#Should be able to use castor to make this faster
weights_p2 <- weights_p2 %>%
  left_join(data.frame(node=weights_p2$node, links = weights_p2$node %>% purrr::map_dbl(average_descendants, tree=tree2, df=weights_p2)))

p2 <- p2 %<+% weights_p2  +  
  #scale_colour_gradient(high="steelblue", low="darkorange1", trans="log10") +
  scale_color_gradient(low="steelblue", high="darkorange1") +
  theme(legend.position = "none")

# Plot heatmaps
heatmap_dat <- obj$assoc %>%
  as_data_frame() %>%
  magrittr::set_colnames(c("label.x", "label.y")) %>%
  mutate(genus = label.x %>% str_remove("_.*$")) %>%
  left_join(links %>% 
              dplyr::select(label.x = Sample_Name, label.y = OTU, signif_paco=signif, val_paco = z_values)) %>%
  left_join(PF_links %>% 
              dplyr::select(label.x = psyllid_spp, label.y = OTU, signif_para=signif, val_para = z_values)) %>%
  left_join(p1$data %>% dplyr::select(label, y) %>% dplyr::rename(label.x = label), by="label.x") %>%
  left_join(p2$data %>% dplyr::select(label, y) %>% dplyr::rename(label.y = label), by="label.y") %>%
  rownames_to_column("assoc") %>%
  dplyr::rename(Sample_Name = label.x,
                OTU = label.y,
                pos_x = y.x,
                pos_y = y.y) %>%
  mutate(signif_paco = replace_na(signif_paco, 0),
         signif_para = replace_na(signif_para, 0)) %>%
  mutate(highlight = case_when(
    signif_paco==0 & signif_para==0  ~ "NS",
    signif_paco==0 & signif_para==1 ~ "para",
    signif_paco==1 & signif_para==0 ~ "paco",
    signif_paco==1 & signif_para==1 ~ "both"
  )) 

gg.heatmap <- heatmap_dat %>%
    mutate(OTU = OTU %>% str_replace_all("_", " "),
         Sample_Name =  Sample_Name %>% str_replace_all("_"," ")) %>%
  dplyr::select(Sample_Name, OTU, genus, highlight, pos_x, pos_y, val_para, val_paco) %>%
  dplyr::mutate(Sample_Name = factor(Sample_Name),
                OTU = factor(OTU),
                genus=factor(genus, levels=c("Psylla" ,"Powellia", "Ctenarytaina")))  %>%
   ggplot(aes(x= fct_reorder(Sample_Name, pos_x), y=fct_reorder(OTU, pos_y), fill=highlight )) +
  geom_tile() +
  theme_bw() +
  facet_grid(~genus, drop = TRUE, scales="free", space ="free")+
  theme(axis.text.x = element_text(angle=45, hjust=1),
        axis.title.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.title.y = element_blank(),
        legend.position = "none") +
  scale_y_discrete(expand=c(0,0))+
  scale_y_discrete(expand=c(0,0)) +
  #scale_fill_gradient(high="steelblue", low="darkorange1") 
  scale_fill_manual(values=c("NS"="steelblue","paco"="darkorange1","para"="#da2b91", "both"="#91da2b"), na.translate=FALSE) 

gg.heatmap

# Density plot of mismatch
density_dat <-  heatmap_dat %>%
  left_join(p2$data %>% dplyr::select(OTU = label, y)) %>%
  group_by(OTU, y) %>%
  summarise(sum_paco = sum(signif_paco, na.rm=TRUE), sum_para = sum(signif_para, na.rm=TRUE)) %>%
  mutate(total_sum = sum(sum_paco, sum_para, na.rm=TRUE)) %>%
  #summarise(total_sum = mean(val_paco, na.rm=TRUE)) %>%
  ungroup()

density_labels <- density_dat %>%
  left_join(tax_table(ps3) %>%
              as("matrix") %>%
              as_tibble(rownames="OTU") %>% 
              mutate(OTU = OTU %>% str_replace_all(" |-", "_")))%>%
  arrange(y) 
  
chunk = 50
n <- nrow(density_labels)
r  <- rep(1:ceiling(n/chunk),each=chunk)[1:n]
density_labels_chunked <- split(density_labels,r)

density_labels <- density_labels_chunked %>% 
  purrr::map(function(x){
    df <- x %>%
      mutate(zscore = (total_sum - mean(total_sum, na.rm=TRUE))/sd(total_sum, na.rm=TRUE)) %>%
      filter( total_sum > 3, zscore  > 3,) #, 
    if(nrow(df) > 0){
        out <- df %>%
        summarise(y = mean(y), total_sum  = max(total_sum), annot =  case_when(
        length(unique(genus)) == 1 ~ names(which.max(table(genus))),
        length(unique(genus)) > 1~  names(which.max(table(family)))))
        return(out)
    }
  }) %>%
  bind_rows() 

gg.density <- density_dat %>%
  filter(total_sum > 0) %>%
  ggplot(aes(x = y, y=total_sum, colour=total_sum)) +
  geom_point(size=0.01, alpha=1)+
  scale_color_gradient(low="steelblue", high="darkorange1") +
    geom_text(data=density_labels, aes(label = annot), hjust=0) +
  scale_x_continuous(expand=c(0,0)) +
  scale_y_continuous(expand=c(0,0)) +
  theme_void() +
  theme(legend.position = "none")+
  coord_flip()

gg.density

# Instead of density could i label lineages with the greatest value?

top <- wrap_elements(grid::textGrob('')) +(p1+ coord_flip() + scale_x_reverse(expand=c(0,0))+ scale_y_continuous(expand=c(0,0))) + wrap_elements(grid::textGrob('')) +plot_layout(widths=c(0.5,3, 0.1)) 

bottom <- p2+ scale_y_continuous(expand=c(0,0)) + gg.heatmap + gg.density + plot_layout(widths=c(0.5,3, 0.1))

gg.treemap <- top / bottom + plot_layout(heights=c(0.5,3))  

gg.treemap

pdf(file="figs/3genus_heatmaps.pdf",  width = 8, height = 11, paper="a4")
  plot(gg.treemap)
try(dev.off(), silent=TRUE)
  
# Make plot ranking links, and nodes

gg.host_links <- heatmap_dat %>%
  dplyr::rename(label = Sample_Name) %>%
  drop_na() %>%
  group_by(label) %>%
  summarise(signif_paco = sum(signif_paco), signif_para= sum(signif_para)) %>%
  pivot_longer(starts_with("signif_"),
               names_to="type",
               values_to="values") %>%
  mutate(type = type %>% str_remove("signif_")) %>%
    mutate(label = label %>% 
             str_replace_all("_", " ") %>%
             str_replace(" sp", " sp."))%>%
  mutate(label = as.factor(label))%>%
  filter(values > 0) %>%
  arrange(-values) %>%
  ggplot(aes(x=fct_reorder(label, values, sum ), y=values, fill=type)) +
  geom_col() +  
  scale_fill_manual(values=c("NS"="steelblue","paco"="darkorange1","para"="#da2b91", "both"="#91da2b"), na.translate=FALSE)+
  theme_classic()+ 
  theme(axis.text.y = element_text(face="italic"),
        legend.position = "bottom")+
  labs(y = "Number of Signficant links", x=NULL, title ="Psyllid taxa") + 
  coord_flip()

gg.sym_links <- heatmap_dat %>%
  dplyr::rename(label = OTU) %>%
  left_join(tax_groups %>% dplyr::rename(otu_genus = genus)) %>%
  drop_na() %>%
  group_by(otu_genus) %>%
  summarise(signif_paco = sum(signif_paco), signif_para= sum(signif_para)) %>%
  pivot_longer(starts_with("signif_"),
               names_to="type",
               values_to="values") %>%
  dplyr::rename(label = otu_genus) %>%
  dplyr::mutate(label = label %>% 
                  str_remove("\\/.*$") %>%
                  str_replace("NA_canariense", "Bradyrhizobium_canariense") %>%
                  str_remove("^s__")
                  ) %>%
  mutate(type = type %>% str_remove("signif_")) %>%
  mutate(label = as.factor(label),
         label=fct_reorder(label, values, sum ))%>%
  group_by(label) %>%
  mutate(total = sum(values)) %>%
  ungroup() %>%
   filter(total > 0) %>%
  top_n(80, total) %>%
  arrange(-values) %>%
  ggplot(aes(x=label, y=values, fill=type)) +
  geom_col() +  
  scale_fill_manual(values=c("NS"="steelblue","paco"="darkorange1","para"="#da2b91", "both"="#91da2b"), na.translate=FALSE)+
  theme_classic()+ 
  theme(axis.text.y = element_text(face="italic"),
        legend.position = "bottom")+
  labs(y = "Number of Signficant links", x=NULL, title ="Microbial genera") + 
  coord_flip()

gg.host_links + gg.sym_links

pdf(file="figs/3genus_fit_summary.pdf",  width = 8, height = 11, paper="a4")
  plot(gg.host_links + gg.sym_links)
try(dev.off(), silent=TRUE)

```

# Tanglegrams

## Psyllid ~ Carsonella 

```{r carsonella cophylo}
#Flag top abundance carsonella by sample
top_carson <- ps2 %>%
  transform_sample_counts(function (x) x/sum(x)) %>%
  speedyseq::psmelt() %>%
  filter(psyllid_spp %in% pruned.tree$tip.label) %>%
  filter(genus=="Candidatus Carsonella") %>%
  group_by(Sample) %>%  
  filter(Abundance > 0) %>%
  top_n(1, wt=Abundance) %>%
  mutate(top = TRUE) 

coocur <- ps2 %>%
  transform_sample_counts(function (x) x/sum(x)) %>%
  speedyseq::psmelt() %>%
    filter(psyllid_spp %in% pruned.tree$tip.label) %>%
  left_join(top_carson) %>%
  filter(genus=="Candidatus Carsonella", top==TRUE) %>%
  dplyr::select(OTU, psyllid_spp, SampleID, Abundance) %>%
  mutate(OTU = OTU %>%
           str_replace_all(" |-", "_")) %>%
  dplyr::group_by(OTU, psyllid_spp) %>%
    summarise(Abundance = sum(Abundance)) %>%
  pivot_wider(id_cols = psyllid_spp,
              names_from = OTU,
              values_from=Abundance,
              values_fill = list(Abundance = 0))  %>%
  column_to_rownames("psyllid_spp") %>%
    as.matrix() %>% 
  apply(2, function(x) ifelse(x > 0, 1, 0)) 

# H cophenetic distance
h_tree <- pruned.tree
h_tree$tip.label <- h_tree$tip.label %>%
           str_replace_all(" |-", "_")
h_tree <- drop.tip(h_tree, setdiff(h_tree$tip.label, rownames(coocur)))
h_dist <- sqrt(cophenetic(h_tree))

# P cophenetic distance
s_tree <- phy_tree(ps3)
s_tree$tip.label <- s_tree$tip.label %>%
           str_replace_all(" |-", "_")
s_tree <- drop.tip(s_tree, setdiff(s_tree$tip.label, colnames(coocur)))
s_dist <- sqrt(cophenetic(s_tree)/1e+6 ) ##convert to Mya so integers are small enough for PACO

coocur <- coocur[h_tree$tip.label, s_tree$tip.label]

# prepare paco data
D <- prepare_paco_data(H=h_dist, P=s_dist, HP=coocur)

# Add pcord
D <- add_pcoord(D, correction='none') 
p_host <- ggplot(D$H_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
    geom_point() +
    theme_bw() +
  ggtitle("H PCA")
p_para <- ggplot(D$P_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2))  +
    geom_point() +
    theme_bw()+
  ggtitle("P PCA")
    
plot(p_host + p_para)

# run paco
paco_run <- PACo(D, nperm=999, seed=909, method='quasiswap', symmetric=FALSE)

#print overall significance
print(paco_run$gof)

# Get interaction-specific cophylogenetic contributions using jacknifing
paco_run <- paco_links(paco_run)
links <- data.frame(joint=names(paco_run$jackknife), #losing sv50 here?
                    values=unname(paco_run$jackknife)#,
                    #upper=unname(paco_run$jackknife)
                    ) %>%
  mutate(OTU = joint) %>%
 separate(OTU, into=c("Sample_Name", "OTU"), sep="-", extra="merge") %>%
  mutate(signif = case_when(
    values < mean(.$values) ~ 1,
    values > mean(.$values) ~ 0
  ))

# Plot links
links %>%
  dplyr::rename(label = joint) %>%
  mutate(label = as.factor(label),
         label=fct_reorder(label, values, sum))%>%
  arrange(-values) %>%
  ggplot(aes(x=label, y=values, colour=as.factor(signif))) +
  geom_point(show.legend = FALSE) +
  #geom_errorbar(aes(ymin=values, ymax=upper)) +
  geom_hline(yintercept = mean(links$values)) +
  scale_color_manual(values=c("steelblue", "darkorange1")) +
  theme_classic()+ 
  theme(axis.text.x = element_blank())

dir.create("output/cophylogeny/psyllid_carsonella")
write_csv(links, "output/cophylogeny/psyllid_carsonella/psyllid_carsonella_links.csv")
write_csv(links %>% 
    group_by(OTU) %>%
    summarise(values = mean(values)), "output/cophylogeny/psyllid_carsonella/carsonella_weights.csv")
write_csv(links %>% 
    group_by(Sample_Name) %>%
    summarise(values = mean(values)), "output/cophylogeny/psyllid_carsonella/psyllid_weights.csv")

#Get observed residuals of Procrustean superimposition 
paco_residuals <- residuals_paco(paco_run$proc, type = "interaction")

# Visualise residuals
res <- data.frame(OTU=names(paco_residuals), values=unname(paco_residuals)) %>%
  separate(OTU, into=c("Sample_Name", "OTU"), sep="-", extra="merge") 
ggplot(res, aes(x=values))+
  geom_density(fill='grey70')+
  theme_bw()+
  xlab('Procrustes residuals')+
  ylab('Frequency')

# Parafit run
PF_run <- parafit(h_dist, s_dist,coocur, nperm=999, test.links=TRUE)
PF_run$ParaFitGlobal
PF_run$p.global

PF_links <- as.data.frame(PF_run$link.table)  %>%
      left_join(enframe(names(PF_run$para.per.host), name = "Host", value="psyllid_spp") %>%
                  mutate(Host = as.numeric(Host))) %>%
      left_join(enframe(names(PF_run$host.per.para), name = "Parasite", value="OTU") %>%
                  mutate(Parasite = as.numeric(Parasite)))%>%
  mutate(signif = case_when(
    p.F1 < 0.05 ~ 1,
    p.F1 > 0.05 ~ 0
  )) 

# Cophyloplot
coocur.lut <- which(coocur ==1, arr.ind=TRUE)
assoc <- cbind(rownames(coocur)[coocur.lut[,1]], colnames(coocur)[coocur.lut[,2]])

# Rotate the nodes using phytools
obj <- cophylo(tr1=h_tree, tr2=s_tree, assoc=assoc, rotate=TRUE) 

# psyllid_tree
tree1 <- obj[["trees"]][[1]]

p1 <- ggtree(tree1)
atmeto_node <- p1$data %>% 
  filter(str_detect(label, "Atmeto")) %>%
  pull(node)
root_node <- p1$data %>% 
  filter(str_detect(label, "Atmeto")) %>%
  pull(parent)

tree1 <- groupOTU(tree1, .node=atmeto_node) # Make the atmeto dotted to indicate outgroup was rescaled

p1 <- ggtree(tree1, ladderize=FALSE, aes(colour=links,linetype=group))
weights_p1 <- p1$data %>%
  left_join(links %>% 
    group_by(Sample_Name) %>%
    summarise(PA_values = mean(signif)) %>%
    dplyr::rename(label = Sample_Name)
    )%>%
  left_join(PF_links %>% 
    group_by(psyllid_spp) %>%
    summarise(PF_values = mean(signif)) %>%
    dplyr::rename(label = psyllid_spp)
    ) %>%
  mutate(values = PA_values + PF_values)

#Scale the atmetocranium and root nodes to be shorter
p1$data[p1$data$node %in% atmeto_node, "x"] <- max(p1$data$x)
p1$data[p1$data$node %in% root_node, "x"] <- 0.2 #root

p1$data$node[p1$data$node]

## Get values for higher nodes
weights_p1 <- weights_p1 %>%
  left_join(data.frame(node=weights_p1$node, links = weights_p1$node %>% purrr::map_dbl(average_descendants, tree=tree1, df=weights_p1)))
p1 <- p1 %<+% weights_p1 + geom_tippoint(aes(colour=links)) +
  scale_color_gradient(low="steelblue", high="darkorange1")  +
  theme(legend.position = "none")

# OTU tree
tree2 <- obj[["trees"]][[2]]
s_tree <- drop.tip(tree2, setdiff(tree2$tip.label, obj$assoc[,2]))
p2 <- ggtree(tree2 , ladderize=FALSE, aes(colour=links)) 
weights_p2 <- p2$data %>%
  left_join(links %>% 
    group_by(OTU) %>%
    summarise(PA_values = mean(signif)) %>%
    dplyr::rename(label = OTU)
    )%>%
  left_join(PF_links %>% 
    group_by(OTU) %>%
    summarise(PF_values = mean(signif)) %>%
    dplyr::rename(label = OTU)
    ) %>%
  mutate(values = PA_values + PF_values)

weights_p2 <- weights_p2 %>%
  left_join(data.frame(node=weights_p2$node, links = weights_p2$node %>% purrr::map_dbl(average_descendants, tree=tree2, df=weights_p2)))

p2 <- p2 %<+% weights_p2  + geom_tippoint(aes(colour=links)) + 
  scale_color_gradient(low="steelblue", high="darkorange1") +
  theme(legend.position = "none")

# Tanglegram 
tangle <- obj$assoc %>%
  as_data_frame() %>%
  magrittr::set_colnames(c("label.x", "label.y")) %>%
  left_join(links %>% 
              dplyr::select(label.x = Sample_Name, label.y = OTU, signif_paco=signif)) %>%
  left_join(PF_links %>% 
              dplyr::select(label.x = psyllid_spp, label.y = OTU, signif_para=signif)) %>%
  left_join(p1$data %>% dplyr::select(label, y) %>% dplyr::rename(label.x = label), by="label.x") %>%
  left_join(p2$data %>% dplyr::select(label, y) %>% dplyr::rename(label.y = label), by="label.y") %>%
  rownames_to_column("assoc") %>%
  rename_all(~str_replace(.x,pattern="\\.", replacement="_")) %>%
  pivot_longer(ends_with(c("_x", "_y")),
               names_to=c(".value", "tree"), 
               names_sep = "_"
              )  %>%
  mutate(signif_paco = replace_na(signif_paco, 0),
         signif_para = replace_na(signif_para, 0)) %>%
  mutate(signif = case_when(
    signif_paco==0 & signif_para==0  ~ "NS",
    signif_paco==0 & signif_para==1 ~ "para",
    signif_paco==1 & signif_para==0 ~ "paco",
    signif_paco==1 & signif_para==1 ~ "both"
  )) %>%
  mutate(tree = tree %>% 
           str_replace("x", "host")%>%
           str_replace("y", "microbe")) %>%
  filter(!is.na(label))%>% 
  group_by(tree) %>%
  mutate(y = y / max(y)) %>%
  mutate(label = label %>% str_replace_all("_", " ")) 


gg.tangle <- ggplot(tangle, aes(x=tree, y=y, group=assoc, colour=as.factor(signif))) +
    geom_line(alpha=0.8) +  
  geom_text(data = tangle %>% 
              filter(tree=="host"),
            aes(label=label),stat = 'unique', hjust=1, check_overlap = TRUE)+
  geom_text(data = tangle %>% 
              filter(tree=="microbe"),
            aes(label=label),stat = 'unique', hjust=0, check_overlap = TRUE)+
  scale_colour_manual(values=c("NS"="steelblue", "paco"="darkorange1", "para"="#da2b91", "both"="#91da2b"), na.translate=FALSE)+
    scale_x_discrete(expand = expansion(add=c(0.8,0.8))) + 
    theme_void() +
    scale_y_continuous(expand=c(0.005,0.005))+
    theme(legend.position = "bottom") +
  labs(colour="Significance:")

gg.carson_tangle <- p1 + gg.tangle + (p2 + scale_x_reverse()) #+ plot_layout(widths = c(2, 1, 2))
gg.carson_tangle 

pdf(file="figs/carsonella_tanglegram.pdf",  width = 8, height = 11, paper="a4")
  plot(gg.carson_tangle)
try(dev.off(), silent=TRUE)
```

## Psyllid ~ Sodalis

```{r Sodalis cophylo}
#Flag top abundance sodalis by sample
top_sodalis <- ps2 %>%
  transform_sample_counts(function (x) x/sum(x)) %>%
  speedyseq::psmelt() %>%
  filter(psyllid_spp %in% pruned.tree$tip.label) %>%
  filter(genus=="Sodalis") %>%
  group_by(Sample) %>%  
  filter(Abundance > 0) %>%
  top_n(1, wt=Abundance) %>%
  mutate(top = TRUE) 

coocur <- ps2 %>%
  transform_sample_counts(function (x) x/sum(x)) %>%
  speedyseq::psmelt() %>%
  filter(psyllid_spp %in% pruned.tree$tip.label) %>%
  left_join(top_sodalis) %>%
  filter(genus=="Sodalis", top==TRUE) %>%
  dplyr::select(OTU, psyllid_spp, SampleID, Abundance) %>%
  mutate(OTU = OTU %>%
           str_replace_all(" |-", "_")) %>%
  dplyr::group_by(OTU, psyllid_spp) %>%
    summarise(Abundance = sum(Abundance)) %>%
  pivot_wider(id_cols = psyllid_spp,
              names_from = OTU,
              values_from=Abundance,
              values_fill = list(Abundance = 0))  %>%
  column_to_rownames("psyllid_spp") %>%
    as.matrix() %>% 
  apply(2, function(x) ifelse(x > 0, 1, 0)) 

coocur <- coocur[ rowSums(coocur) > 0,colSums(coocur) > 0]

# H cophenetic distance
h_tree <- pruned.tree
h_tree$tip.label <- h_tree$tip.label %>%
           str_replace_all(" |-", "_")
h_tree <- drop.tip(h_tree, setdiff(h_tree$tip.label, rownames(coocur)))
h_dist <- sqrt(cophenetic(h_tree))

# P cophenetic distance
s_tree <- phy_tree(ps3)
s_tree$tip.label <- s_tree$tip.label %>%
           str_replace_all(" |-", "_")
s_tree <- drop.tip(s_tree, setdiff(s_tree$tip.label, colnames(coocur)))
s_dist <- sqrt(cophenetic(s_tree)/1e+6 ) ##convert to Mya so integers are small enough for PACO

#alternatively - sqrt(cophenetic(s_tree))
coocur <- coocur[h_tree$tip.label, s_tree$tip.label]

# prepare paco data
D <- prepare_paco_data(H=h_dist, P=s_dist, HP=coocur)
# Add pcord
D <- add_pcoord(D, correction='none') 

p_host <- ggplot(D$H_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
    geom_point() +
    theme_bw() +
  ggtitle("H PCA")

p_para <- ggplot(D$P_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2))  +
    geom_point() +
    theme_bw()+
  ggtitle("P PCA")
    
plot(p_host + p_para)

# run paco
paco_run <- PACo(D, nperm=999, seed=909, method='quasiswap', symmetric=TRUE)

#print overall significance
print(paco_run$gof)

# Get interaction-specific cophylogenetic contributions using jacknifing
paco_run <- paco_links(paco_run)
links <- data.frame(joint=names(paco_run$jackknife), #losing sv50 here?
                    values=unname(paco_run$jackknife)#,
                    #upper=unname(paco_run$jackknife$upper)
                    ) %>%
  mutate(OTU = joint) %>%
 separate(OTU, into=c("Sample_Name", "OTU"), sep="-", extra="merge") %>%
  mutate(signif = case_when(
    values < mean(.$values) ~ 1,
    values > mean(.$values) ~ 0
  ))

# Plot links
links %>%
  dplyr::rename(label = joint) %>%
  mutate(label = as.factor(label),
         label=fct_reorder(label, values, sum))%>%
  arrange(-values) %>%
  ggplot(aes(x=label, y=values, colour=as.factor(signif))) +
  geom_point(show.legend = FALSE) +
 # geom_errorbar(aes(ymin=values, ymax=upper)) +
  geom_hline(yintercept = mean(links$values)) +
  scale_color_manual(values=c("steelblue", "darkorange1")) +
  theme_classic()+ 
  theme(axis.text.x = element_blank())

dir.create("output/cophylogeny/psyllid_secondary")
write_csv(links, "output/cophylogeny/psyllid_secondary/psyllid_secondary_links.csv")
write_csv(links %>% 
    group_by(OTU) %>%
    summarise(values = mean(values)), "output/cophylogeny/psyllid_secondary/secondary_symbiont_weights.csv")
write_csv(links %>% 
    group_by(Sample_Name) %>%
    summarise(values = mean(values)), "output/cophylogeny/psyllid_secondary/psyllid_weights.csv")

#Get observed residuals of Procrustean superimposition 
paco_residuals <- residuals_paco(paco_run$proc, type = "interaction")
# Visualise residuals
res <- data.frame(OTU=names(paco_residuals), values=unname(paco_residuals)) %>%
  separate(OTU, into=c("Sample_Name", "OTU"), sep="-", extra="merge") 

ggplot(res, aes(x=values))+
  geom_density(fill='grey70')+
  theme_bw()+
  xlab('Procrustes residuals')+
  ylab('Frequency')

# Parafit run
PF_run <- parafit(h_dist, s_dist,coocur, nperm=999, test.links=TRUE)
PF_run$ParaFitGlobal
PF_run$p.global

PF_links <- as.data.frame(PF_run$link.table)  %>%
      left_join(enframe(names(PF_run$para.per.host), name = "Host", value="psyllid_spp") %>%
                  mutate(Host = as.numeric(Host))) %>%
      left_join(enframe(names(PF_run$host.per.para), name = "Parasite", value="OTU") %>%
                  mutate(Parasite = as.numeric(Parasite)))%>%
  mutate(signif = case_when(
    p.F1 < 0.05 ~ 1,
    p.F1 > 0.05 ~ 0
  )) 

# Cophyloplot
coocur.lut <- which(coocur ==1, arr.ind=TRUE)
assoc <- cbind(rownames(coocur)[coocur.lut[,1]], colnames(coocur)[coocur.lut[,2]])

# Rotate the nodes using phytools
library(phytools)
obj <- cophylo(tr1=h_tree, tr2=s_tree, assoc=assoc, rotate=TRUE) 

# psyllid_tree
tree1 <- obj[["trees"]][[1]]

p1 <- ggtree(tree1)
atmeto_node <- p1$data %>% 
  filter(str_detect(label, "Atmeto")) %>%
  pull(node)
root_node <- p1$data %>% 
  filter(str_detect(label, "Atmeto")) %>%
  pull(parent)

tree1 <- groupOTU(tree1, .node=atmeto_node) # Make the atmeto dotted to indicate outgroup was rescaled

p1 <- ggtree(tree1, ladderize=FALSE, aes(colour=links,linetype=group))
weights_p1 <- p1$data %>%
  left_join(links %>% 
    group_by(Sample_Name) %>%
    summarise(PA_values = mean(signif)) %>%
    dplyr::rename(label = Sample_Name)
    )%>%
  left_join(PF_links %>% 
    group_by(psyllid_spp) %>%
    summarise(PF_values = mean(signif)) %>%
    dplyr::rename(label = psyllid_spp)
    ) %>%
  mutate(values = PA_values + PF_values)

#Scale the atmetocranium and root nodes to be shorter
p1$data[p1$data$node %in% atmeto_node, "x"] <- max(p1$data$x)
p1$data[p1$data$node %in% root_node, "x"] <- 0.2 #root

p1$data$node[p1$data$node]

## Get values for higher nodes
weights_p1 <- weights_p1 %>%
  left_join(data.frame(node=weights_p1$node, links = weights_p1$node %>% purrr::map_dbl(average_descendants, tree=tree1, df=weights_p1)))
p1 <- p1 %<+% weights_p1 + geom_tippoint(aes(colour=links)) +
  scale_color_gradient(low="steelblue", high="darkorange1")  +
  theme(legend.position = "none")

# OTU tree
tree2 <- obj[["trees"]][[2]]
s_tree <- drop.tip(tree2, setdiff(tree2$tip.label, obj$assoc[,2]))
p2 <- ggtree(tree2 , ladderize=FALSE, aes(colour=links)) 
weights_p2 <- p2$data %>%
  left_join(links %>% 
    group_by(OTU) %>%
    summarise(PA_values = mean(signif)) %>%
    dplyr::rename(label = OTU)
    )%>%
  left_join(PF_links %>% 
    group_by(OTU) %>%
    summarise(PF_values = mean(signif)) %>%
    dplyr::rename(label = OTU)
    ) %>%
  mutate(values = PA_values + PF_values)

weights_p2 <- weights_p2 %>%
  left_join(data.frame(node=weights_p2$node, links = weights_p2$node %>% purrr::map_dbl(average_descendants, tree=tree2, df=weights_p2)))

p2 <- p2 %<+% weights_p2  + geom_tippoint(aes(colour=links)) + 
  scale_color_gradient(low="steelblue", high="darkorange1") +
  theme(legend.position = "none")

# Tanglegram 
tangle <- obj$assoc %>%
  as_data_frame() %>%
  magrittr::set_colnames(c("label.x", "label.y")) %>%
  left_join(links %>% 
              dplyr::select(label.x = Sample_Name, label.y = OTU, signif_paco=signif)) %>%
  left_join(PF_links %>% 
              dplyr::select(label.x = psyllid_spp, label.y = OTU, signif_para=signif)) %>%
  left_join(p1$data %>% dplyr::select(label, y) %>% dplyr::rename(label.x = label), by="label.x") %>%
  left_join(p2$data %>% dplyr::select(label, y) %>% dplyr::rename(label.y = label), by="label.y") %>%
  rownames_to_column("assoc") %>%
  rename_all(~str_replace(.x,pattern="\\.", replacement="_")) %>%
  pivot_longer(ends_with(c("_x", "_y")),
               names_to=c(".value", "tree"), 
               names_sep = "_"
              )  %>%
  mutate(signif_paco = replace_na(signif_paco, 0),
         signif_para = replace_na(signif_para, 0)) %>%
  mutate(signif = case_when(
    signif_paco==0 & signif_para==0  ~ "NS",
    signif_paco==0 & signif_para==1 ~ "para",
    signif_paco==1 & signif_para==0 ~ "paco",
    signif_paco==1 & signif_para==1 ~ "both"
  )) %>%
  mutate(tree = tree %>% 
           str_replace("x", "host")%>%
           str_replace("y", "microbe")) %>%
  filter(!is.na(label))%>% 
  group_by(tree) %>%
  mutate(y = y / max(y))%>%
  mutate(label = label %>% str_replace_all("_", " ")) 


gg.tangle <- ggplot(tangle, aes(x=tree, y=y, group=assoc, colour=as.factor(signif))) +
    geom_line(alpha=0.8) +  
  geom_text(data = tangle %>% 
              filter(tree=="host"),
            aes(label=label),stat = 'unique', hjust=1, check_overlap = TRUE)+
  geom_text(data = tangle %>% 
              filter(tree=="microbe"),
            aes(label=label),stat = 'unique', hjust=0, check_overlap = TRUE)+
  scale_colour_manual(values=c("NS"="steelblue", "paco"="darkorange1", "para"="#da2b91", "both"="#91da2b"), na.translate=FALSE)+
    scale_x_discrete(expand = expansion(add=c(0.8,0.8))) + 
    theme_void() +
    scale_y_continuous(expand=c(0.01,0.01))+
    theme(legend.position = "bottom") +
  labs(colour="Significance:")

gg.sodalis_tangle <- p1 + gg.tangle + (p2 + scale_x_reverse()) #+ plot_layout(widths = c(2, 1, 2))
gg.sodalis_tangle

pdf(file="figs/sodalis_tanglegram.pdf",  width = 8, height = 11, paper="a4")
  plot(gg.sodalis_tangle)
try(dev.off(), silent=TRUE)
```


## Psyllid ~ Arsenophonus

```{r Arsenophonus cophylo}
# Subset to top arsenophonus
top_arse <- ps2 %>%
  transform_sample_counts(function (x) x/sum(x)) %>%
  speedyseq::psmelt() %>%
  filter(psyllid_spp %in% pruned.tree$tip.label) %>%
  filter(genus=="Arsenophonus") %>%
  group_by(Sample) %>%  
  filter(Abundance > 0) %>%
  top_n(1, wt=Abundance) %>%
  mutate(top = TRUE) 

coocur <- ps2 %>%
  transform_sample_counts(function (x) x/sum(x)) %>%
  speedyseq::psmelt() %>%
  filter(psyllid_spp %in% pruned.tree$tip.label) %>%
  left_join(top_arse) %>%
  filter(genus=="Arsenophonus", top==TRUE) %>%
  dplyr::select(OTU, psyllid_spp, SampleID, Abundance) %>%
  mutate(OTU = OTU %>%
           str_replace_all(" |-", "_")) %>%
  dplyr::group_by(OTU, psyllid_spp) %>%
    summarise(Abundance = sum(Abundance)) %>%
  pivot_wider(id_cols = psyllid_spp,
              names_from = OTU,
              values_from=Abundance,
              values_fill = list(Abundance = 0))  %>%
  column_to_rownames("psyllid_spp") %>%
    as.matrix() %>% 
  apply(2, function(x) ifelse(x > 0, 1, 0)) 

coocur <- coocur[ rowSums(coocur) > 0,colSums(coocur) > 0]

# H cophenetic distance
h_tree <- pruned.tree
h_tree$tip.label <- h_tree$tip.label %>%
           str_replace_all(" |-", "_")
h_tree <- drop.tip(h_tree, setdiff(h_tree$tip.label, rownames(coocur)))
h_dist <- sqrt(cophenetic(h_tree))

# P cophenetic distance
s_tree <- phy_tree(ps3)
s_tree$tip.label <- s_tree$tip.label %>%
           str_replace_all(" |-", "_")
s_tree <- drop.tip(s_tree, setdiff(s_tree$tip.label, colnames(coocur)))
s_dist <- sqrt(cophenetic(s_tree)/1e+6 ) ##convert to Mya so integers are small enough for PACO

#alternatively - sqrt(cophenetic(s_tree))
coocur <- coocur[h_tree$tip.label, s_tree$tip.label]

# prepare paco data
D <- prepare_paco_data(H=h_dist, P=s_dist, HP=coocur)
# Add pcord
D <- add_pcoord(D, correction='none') 

p_host <- ggplot(D$H_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
    geom_point() +
    theme_bw() +
  ggtitle("H PCA")

p_para <- ggplot(D$P_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2))  +
    geom_point() +
    theme_bw()+
  ggtitle("P PCA")
    
plot(p_host + p_para)

# run paco
paco_run <- PACo(D, nperm=999, seed=909, method='quasiswap', symmetric=FALSE)

#print overall significance
print(paco_run$gof)

# Get interaction-specific cophylogenetic contributions using jacknifing
paco_run <- paco_links(paco_run)
links <- data.frame(joint=names(paco_run$jackknife),
                    values=unname(paco_run$jackknife)#,
                    #upper=unname(paco_run$jackknife$upper)
                    ) %>%
  mutate(OTU = joint) %>%
 separate(OTU, into=c("Sample_Name", "OTU"), sep="-", extra="merge") %>%
  mutate(signif = case_when(
    values < mean(.$values) ~ 1,
    values > mean(.$values) ~ 0
  ))

# Plot links
links %>%
  dplyr::rename(label = joint) %>%
  mutate(label = as.factor(label),
         label=fct_reorder(label, values, sum))%>%
  arrange(-values) %>%
  ggplot(aes(x=label, y=values, colour=as.factor(signif))) +
  geom_point(show.legend = FALSE) +
  geom_errorbar(aes(ymin=values, ymax=upper)) +
  geom_hline(yintercept = mean(links$values)) +
  scale_color_manual(values=c("steelblue", "darkorange1")) +
  theme_classic()+ 
  theme(axis.text.x = element_blank())

dir.create("output/cophylogeny/psyllid_secondary")
write_csv(links, "output/cophylogeny/psyllid_secondary/psyllid_secondary_links.csv")
write_csv(links %>% 
    group_by(OTU) %>%
    summarise(values = mean(values)), "output/cophylogeny/psyllid_secondary/secondary_symbiont_weights.csv")
write_csv(links %>% 
    group_by(Sample_Name) %>%
    summarise(values = mean(values)), "output/cophylogeny/psyllid_secondary/psyllid_weights.csv")

#Get observed residuals of Procrustean superimposition 
paco_residuals <- residuals_paco(paco_run$proc, type = "interaction")
# Visualise residuals
res <- data.frame(OTU=names(paco_residuals), values=unname(paco_residuals)) %>%
  separate(OTU, into=c("Sample_Name", "OTU"), sep="-", extra="merge") 

ggplot(res, aes(x=values))+
  geom_density(fill='grey70')+
  theme_bw()+
  xlab('Procrustes residuals')+
  ylab('Frequency')

# Parafit run
PF_run <- parafit(h_dist, s_dist,coocur, nperm=999, test.links=TRUE)
PF_run$ParaFitGlobal
PF_run$p.global

PF_links <- as.data.frame(PF_run$link.table)  %>%
      left_join(enframe(names(PF_run$para.per.host), name = "Host", value="psyllid_spp") %>%
                  mutate(Host = as.numeric(Host))) %>%
      left_join(enframe(names(PF_run$host.per.para), name = "Parasite", value="OTU") %>%
                  mutate(Parasite = as.numeric(Parasite)))%>%
  mutate(signif = case_when(
    p.F1 < 0.05 ~ 1,
    p.F1 > 0.05 ~ 0
  )) 

# Cophyloplot
coocur.lut <- which(coocur ==1, arr.ind=TRUE)
assoc <- cbind(rownames(coocur)[coocur.lut[,1]], colnames(coocur)[coocur.lut[,2]])

# Rotate the nodes using phytools
library(phytools)
obj <- cophylo(tr1=h_tree, tr2=s_tree, assoc=assoc, rotate=TRUE) 

# psyllid_tree
tree1 <- obj[["trees"]][[1]]

p1 <- ggtree(tree1)
atmeto_node <- p1$data %>% 
  filter(str_detect(label, "Atmeto")) %>%
  pull(node)
root_node <- p1$data %>% 
  filter(str_detect(label, "Atmeto")) %>%
  pull(parent)

tree1 <- groupOTU(tree1, .node=atmeto_node) # Make the atmeto dotted to indicate outgroup was rescaled

p1 <- ggtree(tree1, ladderize=FALSE, aes(colour=links,linetype=group))
weights_p1 <- p1$data %>%
  left_join(links %>% 
    group_by(Sample_Name) %>%
    summarise(PA_values = mean(signif)) %>%
    dplyr::rename(label = Sample_Name)
    )%>%
  left_join(PF_links %>% 
    group_by(psyllid_spp) %>%
    summarise(PF_values = mean(signif)) %>%
    dplyr::rename(label = psyllid_spp)
    ) %>%
  mutate(values = PA_values + PF_values)

#Scale the atmetocranium and root nodes to be shorter
p1$data[p1$data$node %in% atmeto_node, "x"] <- max(p1$data$x)
p1$data[p1$data$node %in% root_node, "x"] <- 0.2 #root

p1$data$node[p1$data$node]

## Get values for higher nodes
weights_p1 <- weights_p1 %>%
  left_join(data.frame(node=weights_p1$node, links = weights_p1$node %>% purrr::map_dbl(average_descendants, tree=tree1, df=weights_p1)))
p1 <- p1 %<+% weights_p1 + geom_tippoint(aes(colour=links)) +
  scale_color_gradient(low="steelblue", high="darkorange1")  +
  theme(legend.position = "none")

# OTU tree
tree2 <- obj[["trees"]][[2]]
s_tree <- drop.tip(tree2, setdiff(tree2$tip.label, obj$assoc[,2]))
p2 <- ggtree(tree2 , ladderize=FALSE, aes(colour=links)) 
weights_p2 <- p2$data %>%
  left_join(links %>% 
    group_by(OTU) %>%
    summarise(PA_values = mean(signif)) %>%
    dplyr::rename(label = OTU)
    )%>%
  left_join(PF_links %>% 
    group_by(OTU) %>%
    summarise(PF_values = mean(signif)) %>%
    dplyr::rename(label = OTU)
    ) %>%
  mutate(values = PA_values + PF_values)

weights_p2 <- weights_p2 %>%
  left_join(data.frame(node=weights_p2$node, links = weights_p2$node %>% purrr::map_dbl(average_descendants, tree=tree2, df=weights_p2)))

p2 <- p2 %<+% weights_p2  + geom_tippoint(aes(colour=links)) + 
  scale_color_gradient(low="steelblue", high="darkorange1") +
  theme(legend.position = "none")

# Tanglegram 
tangle <- obj$assoc %>%
  as_data_frame() %>%
  magrittr::set_colnames(c("label.x", "label.y")) %>%
  left_join(links %>% 
              dplyr::select(label.x = Sample_Name, label.y = OTU, signif_paco=signif)) %>%
  left_join(PF_links %>% 
              dplyr::select(label.x = psyllid_spp, label.y = OTU, signif_para=signif)) %>%
  left_join(p1$data %>% dplyr::select(label, y) %>% dplyr::rename(label.x = label), by="label.x") %>%
  left_join(p2$data %>% dplyr::select(label, y) %>% dplyr::rename(label.y = label), by="label.y") %>%
  rownames_to_column("assoc") %>%
  rename_all(~str_replace(.x,pattern="\\.", replacement="_")) %>%
  pivot_longer(ends_with(c("_x", "_y")),
               names_to=c(".value", "tree"), 
               names_sep = "_"
              )  %>%
  mutate(signif_paco = replace_na(signif_paco, 0),
         signif_para = replace_na(signif_para, 0)) %>%
  mutate(signif = case_when(
    signif_paco==0 & signif_para==0  ~ "NS",
    signif_paco==0 & signif_para==1 ~ "para",
    signif_paco==1 & signif_para==0 ~ "paco",
    signif_paco==1 & signif_para==1 ~ "both"
  )) %>%
  mutate(tree = tree %>% 
           str_replace("x", "host")%>%
           str_replace("y", "microbe")) %>%
  filter(!is.na(label))%>% 
  group_by(tree) %>%
  mutate(y = y / max(y))%>%
  mutate(label = label %>% str_replace_all("_", " ")) 


gg.tangle <- ggplot(tangle, aes(x=tree, y=y, group=assoc, colour=as.factor(signif))) +
    geom_line(alpha=0.8) +  
  geom_text(data = tangle %>% 
              filter(tree=="host"),
            aes(label=label),stat = 'unique', hjust=1, check_overlap = TRUE)+
  geom_text(data = tangle %>% 
              filter(tree=="microbe"),
            aes(label=label),stat = 'unique', hjust=0, check_overlap = TRUE)+
  scale_colour_manual(values=c("NS"="steelblue", "paco"="darkorange1", "para"="#da2b91", "both"="#91da2b"), na.translate=FALSE)+
    scale_x_discrete(expand = expansion(add=c(0.8,0.8))) + 
    theme_void() +
    scale_y_continuous(expand=c(0.005,0.005))+
    theme(legend.position = "bottom") +
  labs(colour="Significance:")
gg.arse_tangle <- p1 + gg.tangle + (p2 + scale_x_reverse()) #+ plot_layout(widths = c(2, 1, 2))
gg.arse_tangle

pdf(file="figs/arsenophonus_tanglegram.pdf",  width = 8, height = 11, paper="a4")
  plot(gg.arse_tangle)
try(dev.off(), silent=TRUE)
```

## Psyllid ~  hostplant

Tanglegram of all plants and  psyllids!
```{r hostplant-psyllid}
plant.tree <- read.tree("sample_data/plant_tree.nwk")
plant.tree  <- drop.tip(plant.tree , "Sophora_microphylla_-kowhai" )

#Prepare co-occurance matrix
coocur <- sample_data(ps2) %>%
  as_tibble() %>%
  dplyr::select(psyllid_spp, hostplant_spp) %>%
  filter(!is.na(psyllid_spp)) %>%
  unique() %>%
  mutate(psyllid_spp = psyllid_spp %>%
           str_replace_all(" |-", "_"),
         hostplant_spp = hostplant_spp %>%
           str_replace_all(" |-", "_"),
         presence = 1
         ) %>%
  pivot_wider(names_from = "hostplant_spp",
              values_from="presence",
              values_fill = list(presence = 0)) %>%
  column_to_rownames("psyllid_spp") %>%
  t()

# H cophenetic distance
h_tree <- plant.tree
h_tree$tip.label <- h_tree$tip.label %>%
           str_replace_all(" |-", "_")
h_tree <- drop.tip(h_tree, setdiff(h_tree$tip.label, rownames(coocur)))
h_dist <- sqrt(cophenetic(h_tree))

# P cophenetic distance
s_tree <- pruned.tree
s_tree$tip.label <- s_tree$tip.label %>%
           str_replace_all(" |-", "_")
s_tree <- drop.tip(s_tree, setdiff(s_tree$tip.label, colnames(coocur)))
s_dist <- sqrt(cophenetic(s_tree))

coocur <- coocur[h_tree$tip.label, s_tree$tip.label]

# prepare paco data
D <- prepare_paco_data(H=h_dist, P=s_dist, HP=coocur)
# Add pcord
D <- add_pcoord(D, correction='none') 

p_host <- ggplot(D$H_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
    geom_point() +
    theme_bw() +
  ggtitle("H PCA")

p_para <- ggplot(D$P_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2))  +
    geom_point() +
    theme_bw()+
  ggtitle("P PCA")
    
plot(p_host + p_para)

# run paco
paco_run <- PACo(D, nperm=999, seed=909, method='quasiswap', symmetric=FALSE)

#print overall significance
print(paco_run$gof)

# Get interaction-specific cophylogenetic contributions using leave-one-out jacknifing
paco_run <- paco_links(paco_run, .parallel = TRUE)

# get links
links <- data.frame(
  joint=names(paco_run$jackknife),
  values=unname(paco_run$jackknife)#,
  #upper=unname(paco_run$jackknife$upper)
  ) %>%
 mutate(OTU = joint) %>%
 separate(OTU, into=c("hostplant_spp", "psyllid_spp"), sep="-", extra="merge") %>%
  mutate(signif = case_when(
    values < mean(.$values) ~ 1,
    values > mean(.$values) ~ 0
  ))
# Plot links
links %>%
  dplyr::rename(label = joint) %>%
  mutate(label = as.factor(label),
         label=fct_reorder(label, values, sum))%>%
  arrange(-values) %>%
  ggplot(aes(x=label, y=values, colour=as.factor(signif))) +
  geom_point(show.legend = FALSE) +
 # geom_errorbar(aes(ymin=values, ymax=upper)) +
  geom_hline(yintercept = mean(links$values)) +
  scale_color_manual(values=c("steelblue", "darkorange1")) +
  theme_classic()+ 
  theme(axis.text.x = element_blank())

dir.create("output/cophylogeny/psyllid_hostplant")
write_csv(links, "output/cophylogeny/psyllid_hostplant/psyllid_hostplant_links.csv")
write_csv(links %>% 
    group_by(psyllid_spp) %>%
    summarise(values = mean(values)), "output/cophylogeny/psyllid_hostplant/psyllid_weights.csv")
write_csv(links %>% 
    group_by(hostplant_spp) %>%
    summarise(values = mean(values)), "output/cophylogeny/psyllid_hostplant/hostplant_weights.csv")

#Get observed residuals of Procrustean superimposition 
paco_residuals <- residuals_paco(paco_run$proc, type = "interaction")

# Visualise residuals
res <- data.frame(OTU=names(paco_residuals), values=unname(paco_residuals)) %>%
  separate(OTU, into=c("hostplant_spp", "psyllid_spp"), sep="-", extra="merge")

ggplot(res, aes(x=values))+
  geom_density(fill='grey70')+
  #facet_wrap(~psyllid_genus) +
  theme_bw()+
  xlab('Procrustes residuals')+
  ylab('Frequency')

# Parafit run
PF_run <- parafit(h_dist, s_dist, t(coocur), nperm=999, test.links=TRUE, silent=FALSE)
PF_run$ParaFitGlobal
PF_run$p.global

PF_links <- as.data.frame(PF_run$link.table)  %>%
      left_join(enframe(names(PF_run$para.per.host), name = "Host", value="hostplant_spp") %>%
                  mutate(Host = as.numeric(Host))) %>%
      left_join(enframe(names(PF_run$host.per.para), name = "Parasite", value="psyllid_spp") %>%
                  mutate(Parasite = as.numeric(Parasite)))%>%
  mutate(signif = case_when(
    p.F1 < 0.05 ~ 1,
    p.F1 > 0.05 ~ 0
  )) 

# Cophyloplot
coocur.lut <- which(coocur ==1, arr.ind=TRUE)
assoc <- cbind(rownames(coocur)[coocur.lut[,1]], colnames(coocur)[coocur.lut[,2]])

# Rotate the nodes using phytools
obj <- cophylo(tr1=h_tree, tr2=s_tree, assoc=assoc, rotate=TRUE) 

# Extract the goods for ggtree
# plant tree
tree1 <- obj[["trees"]][[1]]

p1 <- ggtree(tree1, ladderize=FALSE, aes(colour=links))
weights_p1 <- p1$data %>%
  left_join(links %>% 
    group_by(hostplant_spp) %>%
    summarise(PA_values = mean(signif)) %>%
    dplyr::rename(label = hostplant_spp)
    )%>%
  left_join(PF_links %>% 
    group_by(hostplant_spp) %>%
    summarise(PF_values = mean(signif)) %>%
    dplyr::rename(label = hostplant_spp)
    ) %>%
  mutate(values = PA_values + PF_values)

## Get values for higher nodes
weights_p1 <- weights_p1 %>%
  left_join(data.frame(node=weights_p1$node, links = weights_p1$node %>% purrr::map_dbl(average_descendants, tree=tree1, df=weights_p1)))
p1 <- p1 %<+% weights_p1 + geom_tippoint(aes(colour=links)) +
  scale_color_gradient(low="steelblue", high="darkorange1")  +
  theme(legend.position = "none")

# psyllid_tree
tree2 <- obj[["trees"]][[2]]
s_tree <- drop.tip(tree2, setdiff(tree2$tip.label, obj$assoc[,2]))

p2 <- ggtree(tree2)
atmeto_node <- p2$data %>% 
  filter(str_detect(label, "Atmeto")) %>%
  pull(node)
root_node <- p2$data %>% 
  filter(str_detect(label, "Atmeto")) %>%
  pull(parent)

tree2 <- groupOTU(tree2, .node=atmeto_node) # Make the atmeto dotted to indicate outgroup was rescaled

p2 <- ggtree(tree2 , ladderize=TRUE, aes(colour=links, linetype=group)) 
weights_p2 <- p2$data %>%
  left_join(links %>% 
    group_by(psyllid_spp) %>%
    summarise(PA_values = mean(signif)) %>%
    dplyr::rename(label = psyllid_spp)
    )%>%
  left_join(PF_links %>% 
    group_by(psyllid_spp) %>%
    summarise(PF_values = mean(signif)) %>%
    dplyr::rename(label = psyllid_spp)
    ) %>%
  mutate(values = PA_values + PF_values)

#Scale the atmetocranium and root nodes to be shorter
p2$data[p2$data$node %in% atmeto_node, "x"] <- max(p2$data$x)
p2$data[p2$data$node %in% root_node, "x"] <- 0.2 #root

p2$data$node[p2$data$node]

weights_p2 <- weights_p2 %>%
  left_join(data.frame(node=weights_p2$node, links = weights_p2$node %>% purrr::map_dbl(average_descendants, tree=tree2, df=weights_p2)))

p2 <- p2 %<+% weights_p2  + geom_tippoint(aes(colour=links)) + 
  scale_color_gradient(low="steelblue", high="darkorange1") +
  theme(legend.position = "none")

# Tanglegram 
tangle <- obj$assoc %>%
  as_data_frame() %>%
  magrittr::set_colnames(c("label.x", "label.y")) %>%
  left_join(links %>% 
              dplyr::select(label.x = hostplant_spp, label.y = psyllid_spp, signif_paco=signif)) %>%
  left_join(PF_links %>% 
              dplyr::select(label.x = hostplant_spp, label.y = psyllid_spp, signif_para=signif)) %>%
  left_join(p1$data %>% dplyr::select(label, y) %>% dplyr::rename(label.x = label), by="label.x") %>%
  left_join(p2$data %>% dplyr::select(label, y) %>% dplyr::rename(label.y = label), by="label.y") %>%
  rownames_to_column("assoc") %>%
  rename_all(~str_replace(.x,pattern="\\.", replacement="_")) %>%
  pivot_longer(ends_with(c("_x", "_y")),
               names_to=c(".value", "tree"), 
               names_sep = "_"
              )  %>%
  mutate(signif_paco = replace_na(signif_paco, 0),
         signif_para = replace_na(signif_para, 0)) %>%
  mutate(signif = case_when(
    signif_paco==0 & signif_para==0  ~ "NS",
    signif_paco==0 & signif_para==1 ~ "para",
    signif_paco==1 & signif_para==0 ~ "paco",
    signif_paco==1 & signif_para==1 ~ "both"
  )) %>%
  mutate(tree = tree %>% 
           str_replace("x", "host")%>%
           str_replace("y", "microbe")) %>%
  filter(!is.na(label))%>% 
  group_by(tree) %>%
  mutate(y = y / max(y))%>%
  mutate(label = label %>% str_replace_all("_", " ")) 

gg.tangle <- ggplot(tangle, aes(x=factor(tree, levels=c("microbe", "host")), y=y, group=assoc, colour=as.factor(signif))) +
    geom_line(alpha=0.8) +  
  geom_text(data = tangle %>% 
              filter(tree=="host"),
            aes(label=label),stat = 'unique', hjust=0, check_overlap = TRUE)+
  geom_text(data = tangle %>% 
              filter(tree=="microbe"),
            aes(label=label),stat = 'unique', hjust=1, check_overlap = TRUE)+
  scale_colour_manual(values=c("NS"="steelblue", "paco"="darkorange1", "para"="#da2b91", "both"="#91da2b"), na.translate=FALSE)+
    scale_x_discrete(expand = expansion(add=c(0.8,0.8))) + 
    scale_y_continuous(expand=c(0.005,0.005))+
    theme_void() +
    theme(legend.position = "bottom") +
  labs(colour="Significance")

gg.plant_tangle <- p2 + gg.tangle + (p1 + scale_x_reverse()) #+ plot_layout(widths = c(2, 1, 2))
gg.plant_tangle

pdf(file="figs/plant_tanglegram.pdf",  width = 8, height = 11, paper="a4")
  plot(gg.plant_tangle)
try(dev.off(), silent=TRUE)
```

# Powellia

## Powellia ~ Carsonella

```{r Powellia carsonella}
#Flag top abundance carsonella by sample
top_carson <- ps2 %>%
  transform_sample_counts(function (x) x/sum(x)) %>%
  speedyseq::psmelt() %>%
  filter(psyllid_spp %in% pruned.tree$tip.label) %>%
  filter(genus=="Candidatus Carsonella") %>%
  group_by(Sample) %>%  
  filter(Abundance > 0) %>%
  top_n(1, wt=Abundance) %>%
  mutate(top = TRUE) 

coocur <- ps2 %>%
  subset_samples(psyllid_genus == "Powellia") %>%
  filter_taxa(function(x) mean(x) > 0, TRUE) %>%
  transform_sample_counts(function (x) x/sum(x)) %>%
  speedyseq::psmelt() %>%
  filter(psyllid_spp %in% pruned.tree$tip.label) %>%
  left_join(top_carson) %>%
  filter(genus=="Candidatus Carsonella", top==TRUE) %>%
  dplyr::select(OTU, psyllid_spp, SampleID, Abundance) %>%
  mutate(OTU = OTU %>%
           str_replace_all(" |-", "_")) %>%
  dplyr::group_by(OTU, psyllid_spp) %>%
    summarise(Abundance = sum(Abundance)) %>%
  pivot_wider(id_cols = psyllid_spp,
              names_from = OTU,
              values_from=Abundance,
              values_fill = list(Abundance = 0))  %>%
  column_to_rownames("psyllid_spp") %>%
    as.matrix() %>% 
  apply(2, function(x) ifelse(x > 0, 1, 0)) 

# H cophenetic distance
h_tree <- pruned.tree
h_tree$tip.label <- h_tree$tip.label %>%
           str_replace_all(" |-", "_")
h_tree <- drop.tip(h_tree, setdiff(h_tree$tip.label, rownames(coocur)))
h_dist <- sqrt(cophenetic(h_tree))

# P cophenetic distance
s_tree <- phy_tree(ps3)
s_tree$tip.label <- s_tree$tip.label %>%
           str_replace_all(" |-", "_")
s_tree <- drop.tip(s_tree, setdiff(s_tree$tip.label, colnames(coocur)))
s_dist <- sqrt(cophenetic(s_tree)/1e+6 ) ##convert to Mya so integers are small enough for PACO

coocur <- coocur[h_tree$tip.label, s_tree$tip.label]

# prepare paco data
D <- prepare_paco_data(H=h_dist, P=s_dist, HP=coocur)
# Add pcord
D <- add_pcoord(D, correction='none') 

p_host <- ggplot(D$H_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
    geom_point() +
    theme_bw() +
  ggtitle("H PCA")

p_para <- ggplot(D$P_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2))  +
    geom_point() +
    theme_bw()+
  ggtitle("P PCA")
    
plot(p_host + p_para)

# run paco
paco_run <- PACo(D, nperm=999, seed=909, method='quasiswap', symmetric=FALSE)

#print overall significance
print(paco_run$gof)

# Get interaction-specific cophylogenetic contributions using jacknifing
paco_run <- paco_links(paco_run)
links <- data.frame(joint=names(paco_run$jackknife), 
                    values=unname(paco_run$jackknife)#, 
                    #upper=unname(paco_run$jackknife$upper)
                    ) %>%
  mutate(OTU = joint) %>%
 separate(OTU, into=c("Sample_Name", "OTU"), sep="-", extra="merge") %>%
  mutate(signif = case_when(
    values < mean(.$values) ~ 1,
    values > mean(.$values) ~ 0
  ))

# Plot links
links %>%
  dplyr::rename(label = joint) %>%
  mutate(label = as.factor(label),
         label=fct_reorder(label, values, sum))%>%
  arrange(-values) %>%
  ggplot(aes(x=label, y=values, colour=as.factor(signif))) +
  geom_point(show.legend = FALSE) +
  #geom_errorbar(aes(ymin=values, ymax=upper)) +
  geom_hline(yintercept = mean(links$values)) +
  scale_color_manual(values=c("steelblue", "darkorange1")) +
  theme_classic()+ 
  theme(axis.text.x = element_blank())

dir.create("output/cophylogeny/trioza_carsonella")
write_csv(links, "output/cophylogeny/trioza_carsonella/trioza_carsonella_links.csv")
write_csv(links %>% 
    group_by(OTU) %>%
    summarise(values = mean(values)), "output/cophylogeny/trioza_carsonella/carsonella_weights.csv")
write_csv(links %>% 
    group_by(Sample_Name) %>%
    summarise(values = mean(values)), "output/cophylogeny/trioza_carsonella/psyllid_weights.csv")

#Get observed residuals of Procrustean superimposition 
paco_residuals <- residuals_paco(paco_run$proc, type = "interaction")
# Visualise residuals
res <- data.frame(OTU=names(paco_residuals), values=unname(paco_residuals)) %>%
  separate(OTU, into=c("Sample_Name", "OTU"), sep="-", extra="merge") 

ggplot(res, aes(x=values))+
  geom_density(fill='grey70')+
  theme_bw()+
  xlab('Procrustes residuals')+
  ylab('Frequency')

# Parafit run
PF_run <- parafit(h_dist, s_dist, t(coocur), nperm=999, test.links=TRUE, silent=FALSE)
PF_run$ParaFitGlobal
PF_run$p.global

PF_links <- as.data.frame(PF_run$link.table)  %>%
      left_join(enframe(names(PF_run$para.per.host), name = "Host", value="psyllid_spp") %>%
                  mutate(Host = as.numeric(Host))) %>%
      left_join(enframe(names(PF_run$host.per.para), name = "Parasite", value="OTU") %>%
                  mutate(Parasite = as.numeric(Parasite)))%>%
  mutate(signif = case_when(
    p.F1 < 0.05 ~ 1,
    p.F1 > 0.05 ~ 0
  )) 

# Cophyloplot
coocur.lut <- which(coocur ==1, arr.ind=TRUE)
assoc <- cbind(rownames(coocur)[coocur.lut[,1]], colnames(coocur)[coocur.lut[,2]])

# Rotate the nodes using phytools
obj <- cophylo(tr1=h_tree, tr2=s_tree, assoc=assoc, rotate=TRUE) 

# psyllid_tree
tree1 <- obj[["trees"]][[1]]

p1 <- ggtree(tree1)
atmeto_node <- p1$data %>% 
  filter(str_detect(label, "Atmeto")) %>%
  pull(node)
root_node <- p1$data %>% 
  filter(str_detect(label, "Atmeto")) %>%
  pull(parent)

tree1 <- groupOTU(tree1, .node=atmeto_node) # Make the atmeto dotted to indicate outgroup was rescaled

p1 <- ggtree(tree1, ladderize=FALSE, aes(colour=links,linetype=group))
weights_p1 <- p1$data %>%
  left_join(links %>% 
    group_by(Sample_Name) %>%
    summarise(PA_values = mean(signif)) %>%
    dplyr::rename(label = Sample_Name)
    )%>%
  left_join(PF_links %>% 
    group_by(psyllid_spp) %>%
    summarise(PF_values = mean(signif)) %>%
    dplyr::rename(label = psyllid_spp)
    ) %>%
  mutate(values = PA_values + PF_values)

#Scale the atmetocranium and root nodes to be shorter
p1$data[p1$data$node %in% atmeto_node, "x"] <- max(p1$data$x)
p1$data[p1$data$node %in% root_node, "x"] <- 0.2 #root

p1$data$node[p1$data$node]

## Get values for higher nodes
weights_p1 <- weights_p1 %>%
  left_join(data.frame(node=weights_p1$node, links = weights_p1$node %>% purrr::map_dbl(average_descendants, tree=tree1, df=weights_p1)))
p1 <- p1 %<+% weights_p1 + geom_tippoint(aes(colour=links)) +
  scale_color_gradient(low="steelblue", high="darkorange1")  +
  theme(legend.position = "none")

# OTU tree
tree2 <- obj[["trees"]][[2]]
s_tree <- drop.tip(tree2, setdiff(tree2$tip.label, obj$assoc[,2]))
p2 <- ggtree(tree2 , ladderize=TRUE, aes(colour=links)) 
weights_p2 <- p2$data %>%
  left_join(links %>% 
    group_by(OTU) %>%
    summarise(PA_values = mean(signif)) %>%
    dplyr::rename(label = OTU)
    )%>%
  left_join(PF_links %>% 
    group_by(OTU) %>%
    summarise(PF_values = mean(signif)) %>%
    dplyr::rename(label = OTU)
    ) %>%
  mutate(values = PA_values + PF_values)

weights_p2 <- weights_p2 %>%
  left_join(data.frame(node=weights_p2$node, links = weights_p2$node %>% purrr::map_dbl(average_descendants, tree=tree2, df=weights_p2)))

p2 <- p2 %<+% weights_p2  + geom_tippoint(aes(colour=links)) + 
  scale_color_gradient(low="steelblue", high="darkorange1") +
  theme(legend.position = "none")

# Tanglegram 
tangle <- obj$assoc %>%
  as_data_frame() %>%
  magrittr::set_colnames(c("label.x", "label.y")) %>%
  left_join(links %>% 
              dplyr::select(label.x = Sample_Name, label.y = OTU, signif_paco=signif)) %>%
  left_join(PF_links %>% 
              dplyr::select(label.x = psyllid_spp, label.y = OTU, signif_para=signif)) %>%
  left_join(p1$data %>% dplyr::select(label, y) %>% dplyr::rename(label.x = label), by="label.x") %>%
  left_join(p2$data %>% dplyr::select(label, y) %>% dplyr::rename(label.y = label), by="label.y") %>%
  rownames_to_column("assoc") %>%
  rename_all(~str_replace(.x,pattern="\\.", replacement="_")) %>%
  pivot_longer(ends_with(c("_x", "_y")),
               names_to=c(".value", "tree"), 
               names_sep = "_"
              )  %>%
  mutate(signif_paco = replace_na(signif_paco, 0),
         signif_para = replace_na(signif_para, 0)) %>%
  mutate(signif = case_when(
    signif_paco==0 & signif_para==0  ~ "NS",
    signif_paco==0 & signif_para==1 ~ "para",
    signif_paco==1 & signif_para==0 ~ "paco",
    signif_paco==1 & signif_para==1 ~ "both"
  )) %>%
  mutate(tree = tree %>% 
           str_replace("x", "host")%>%
           str_replace("y", "microbe")) %>%
  filter(!is.na(label))%>% 
  group_by(tree) %>%
  mutate(y = y / max(y))%>%
  mutate(label = label %>% str_replace_all("_", " ")) 


gg.tangle <- ggplot(tangle, aes(x=tree, y=y, group=assoc, colour=as.factor(signif))) +
    geom_line(alpha=0.8) +  
  geom_text(data = tangle %>% 
              filter(tree=="host"),
            aes(label=label),stat = 'unique', hjust=1, check_overlap = TRUE)+
  geom_text(data = tangle %>% 
              filter(tree=="microbe"),
            aes(label=label),stat = 'unique', hjust=0, check_overlap = TRUE)+
  scale_colour_manual(values=c("NS"="steelblue", "paco"="darkorange1", "para"="#da2b91", "both"="#91da2b"), na.translate=FALSE)+
    scale_x_discrete(expand = expansion(add=c(0.8,0.8))) + 
    theme_void() +
    scale_y_continuous(expand=c(0.005,0.005))+
    theme(legend.position = "bottom") +
  labs(colour="Significance:")

gg.trioza_carson_tangle <- p1 + gg.tangle + (p2 + scale_x_reverse()) 
gg.trioza_carson_tangle

pdf(file="figs/powellia_carsonella_tanglegram.pdf",  width = 8, height = 11, paper="a4")
  plot(gg.trioza_carson_tangle)
try(dev.off(), silent=TRUE)
```

## Powellia ~ hostplant

```{r trioza hostplant psyllid}
plant.tree <- read.tree("sample_data/plant_tree.nwk")
plant.tree  <- drop.tip(plant.tree , "Sophora_microphylla_-kowhai" )

#Prepare co-occurance matrix
coocur <- sample_data(ps2) %>%
  as_tibble() %>%
  filter(psyllid_genus == "Powellia") %>%
  dplyr::select(psyllid_spp, hostplant_spp) %>%
  filter(!is.na(psyllid_spp)) %>%
  unique() %>%
  mutate(psyllid_spp = psyllid_spp %>%
           str_replace_all(" |-", "_"),
         hostplant_spp = hostplant_spp %>%
           str_replace_all(" |-", "_"),
         presence = 1
         ) %>%
  pivot_wider(names_from = "hostplant_spp",
              values_from="presence",
              values_fill = list(presence = 0)) %>%
  column_to_rownames("psyllid_spp") %>%
  t()

# H cophenetic distance
h_tree <- plant.tree
h_tree$tip.label <- h_tree$tip.label %>%
           str_replace_all(" |-", "_")
h_tree <- drop.tip(h_tree, setdiff(h_tree$tip.label, rownames(coocur)))
h_dist <- sqrt(cophenetic(h_tree))

# P cophenetic distance
s_tree <- pruned.tree
s_tree$tip.label <- s_tree$tip.label %>%
           str_replace_all(" |-", "_")
s_tree <- drop.tip(s_tree, setdiff(s_tree$tip.label, colnames(coocur)))
s_dist <- sqrt(cophenetic(s_tree))

coocur <- coocur[h_tree$tip.label, s_tree$tip.label]
# prepare paco data
D <- prepare_paco_data(H=h_dist, P=s_dist, HP=coocur)
# Add pcord
D <- add_pcoord(D, correction='none')  

p_host <- ggplot(D$H_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2)) +
    geom_point() +
    theme_bw() +
  ggtitle("H PCA")

p_para <- ggplot(D$P_PCo[,c('Axis.1', 'Axis.2')] %>% as.data.frame, aes(Axis.1, Axis.2))  +
    geom_point() +
    theme_bw()+
  ggtitle("P PCA")
    
plot(p_host + p_para)

# run paco
paco_run <- PACo(D, nperm=999, seed=909, method='quasiswap', symmetric=FALSE) #Symetric - is one meant to track the evolution of another?

#print overall significance
print(paco_run$gof)

# Procrustes diagnostic plots
plot(paco_run$proc)
plot(paco_run$proc, kind=2)

# Get interaction-specific cophylogenetic contributions using jacknifing
paco_run <- paco_links(paco_run)
links <- data.frame(
  joint=names(paco_run$jackknife),
  values=unname(paco_run$jackknife)#,
  #upper=unname(paco_run$jackknife$upper)
  ) %>%
 mutate(OTU = joint) %>%
 separate(OTU, into=c("hostplant_spp", "psyllid_spp"), sep="-", extra="merge") %>%
  mutate(signif = case_when(
    values < mean(.$values) ~ 1,
    values > mean(.$values) ~ 0
  ))

# Plot links
links %>%
  dplyr::rename(label = joint) %>%
  mutate(label = as.factor(label),
         label=fct_reorder(label, values, sum))%>%
  arrange(-values) %>%
  ggplot(aes(x=label, y=values, colour=as.factor(signif))) +
  geom_point(show.legend = FALSE) +
  #geom_errorbar(aes(ymin=values, ymax=upper)) +
  geom_hline(yintercept = mean(links$values)) +
  scale_color_manual(values=c("steelblue", "darkorange1")) +
  theme_classic()+ 
  theme(axis.text.x = element_text(angle=45, hjust=1))

dir.create("output/cophylogeny/trioza_hostplant")
write_csv(links, "output/cophylogeny/trioza_hostplant/psyllid_hostplant_links.csv")
write_csv(links %>% 
    group_by(psyllid_spp) %>%
    summarise(values = mean(values)), "output/cophylogeny/trioza_hostplant/psyllid_weights.csv")
write_csv(links %>% 
    group_by(hostplant_spp) %>%
    summarise(values = mean(values)), "output/cophylogeny/trioza_hostplant/hostplant_weights.csv")

#Get observed residuals of Procrustean superimposition 
paco_residuals <- residuals_paco(paco_run$proc, type = "interaction")

# Visualise residuals
res <- data.frame(OTU=names(paco_residuals), values=unname(paco_residuals)) %>%
  separate(OTU, into=c("hostplant_spp", "psyllid_spp"), sep="-", extra="merge")

ggplot(res, aes(x=values))+
  geom_density(fill='grey70')+
  theme_bw()+
  xlab('Procrustes residuals')+
  ylab('Frequency')

# Parafit run
PF_run <- parafit(h_dist, s_dist, t(coocur), nperm=999, test.links=TRUE, silent=TRUE)
PF_run$ParaFitGlobal
PF_run$p.global


PF_links <- as.data.frame(PF_run$link.table)  %>%
      left_join(enframe(names(PF_run$para.per.host), name = "Host", value="hostplant_spp") %>%
                  mutate(Host = as.numeric(Host))) %>%
      left_join(enframe(names(PF_run$host.per.para), name = "Parasite", value="psyllid_spp") %>%
                  mutate(Parasite = as.numeric(Parasite)))%>%
  mutate(signif = case_when(
    p.F1 < 0.05 ~ 1,
    p.F1 > 0.05 ~ 0
  )) 

# Cophyloplot
coocur.lut <- which(coocur ==1, arr.ind=TRUE)
assoc <- cbind(rownames(coocur)[coocur.lut[,1]], colnames(coocur)[coocur.lut[,2]])

# Rotate the nodes using phytools
obj <- cophylo(tr1=h_tree, tr2=s_tree, assoc=assoc, rotate=TRUE) 

# Extract the goods for ggtree
# plant tree
tree1 <- obj[["trees"]][[1]]

p1 <- ggtree(tree1, ladderize=FALSE, aes(colour=links))
weights_p1 <- p1$data %>%
  left_join(links %>% 
    group_by(hostplant_spp) %>%
    summarise(PA_values = mean(signif)) %>%
    dplyr::rename(label = hostplant_spp)
    )%>%
  left_join(PF_links %>% 
    group_by(hostplant_spp) %>%
    summarise(PF_values = mean(signif)) %>%
    dplyr::rename(label = hostplant_spp)
    ) %>%
  mutate(values = PA_values + PF_values)

## Get values for higher nodes
weights_p1 <- weights_p1 %>%
  left_join(data.frame(node=weights_p1$node, links = weights_p1$node %>% purrr::map_dbl(average_descendants, tree=tree1, df=weights_p1)))
p1 <- p1 %<+% weights_p1 + geom_tippoint(aes(colour=links)) +
  scale_color_gradient(low="steelblue", high="darkorange1")  +
  theme(legend.position = "none")

# psyllid_tree
tree2 <- obj[["trees"]][[2]]
s_tree <- drop.tip(tree2, setdiff(tree2$tip.label, obj$assoc[,2]))

p2 <- ggtree(tree2 , ladderize=TRUE, aes(colour=links)) 
weights_p2 <- p2$data %>%
  left_join(links %>% 
    group_by(psyllid_spp) %>%
    summarise(PA_values = mean(signif)) %>%
    dplyr::rename(label = psyllid_spp)
    )%>%
  left_join(PF_links %>% 
    group_by(psyllid_spp) %>%
    summarise(PF_values = mean(signif)) %>%
    dplyr::rename(label = psyllid_spp)
    ) %>%
  mutate(values = PA_values + PF_values)

#Scale the atmetocranium and root nodes to be shorter
p2$data[p2$data$node %in% atmeto_node, "x"] <- max(p2$data$x)
p2$data[p2$data$node %in% root_node, "x"] <- 0.2 #root

p2$data$node[p2$data$node]

weights_p2 <- weights_p2 %>%
  left_join(data.frame(node=weights_p2$node, links = weights_p2$node %>% purrr::map_dbl(average_descendants, tree=tree2, df=weights_p2)))

p2 <- p2 %<+% weights_p2  + geom_tippoint(aes(colour=links)) + 
  scale_color_gradient(low="steelblue", high="darkorange1") +
  theme(legend.position = "none")

# Tanglegram 
tangle <- obj$assoc %>%
  as_data_frame() %>%
  magrittr::set_colnames(c("label.x", "label.y")) %>%
  left_join(links %>% 
              dplyr::select(label.x = hostplant_spp, label.y = psyllid_spp, signif_paco=signif)) %>%
  left_join(PF_links %>% 
              dplyr::select(label.x = hostplant_spp, label.y = psyllid_spp, signif_para=signif)) %>%
  left_join(p1$data %>% dplyr::select(label, y) %>% dplyr::rename(label.x = label), by="label.x") %>%
  left_join(p2$data %>% dplyr::select(label, y) %>% dplyr::rename(label.y = label), by="label.y") %>%
  rownames_to_column("assoc") %>%
  rename_all(~str_replace(.x,pattern="\\.", replacement="_")) %>%
  pivot_longer(ends_with(c("_x", "_y")),
               names_to=c(".value", "tree"), 
               names_sep = "_"
              )  %>%
  mutate(signif_paco = replace_na(signif_paco, 0),
         signif_para = replace_na(signif_para, 0)) %>%
  mutate(signif = case_when(
    signif_paco==0 & signif_para==0  ~ "NS",
    signif_paco==0 & signif_para==1 ~ "para",
    signif_paco==1 & signif_para==0 ~ "paco",
    signif_paco==1 & signif_para==1 ~ "both"
  )) %>%
  mutate(tree = tree %>% 
           str_replace("x", "host")%>%
           str_replace("y", "microbe")) %>%
  filter(!is.na(label))%>% 
  group_by(tree) %>%
  mutate(y = y / max(y))%>%
  mutate(label = label %>% str_replace_all("_", " ")) 

gg.tangle <- ggplot(tangle, aes(x=factor(tree, levels=c("microbe", "host")), y=y, group=assoc, colour=as.factor(signif))) +
    geom_line(alpha=0.8) +  
  geom_text(data = tangle %>% 
              filter(tree=="host"),
            aes(label=label),stat = 'unique', hjust=0, check_overlap = TRUE)+
  geom_text(data = tangle %>% 
              filter(tree=="microbe"),
            aes(label=label),stat = 'unique', hjust=1, check_overlap = TRUE)+
  scale_colour_manual(values=c("NS"="steelblue", "paco"="darkorange1", "para"="#da2b91", "both"="#91da2b"), na.translate=FALSE)+
    scale_x_discrete(expand = expansion(add=c(0.8,0.8))) + 
    scale_y_continuous(expand=c(0.005,0.005))+
    theme_void() +
    theme(legend.position = "bottom") +
  labs(colour="Significance")
gg.powellia_tangle <- p2 + gg.tangle + (p1 + scale_x_reverse()) 
gg.powellia_tangle

pdf(file="figs/powellia_plant_tanglegram.pdf",  width = 8, height = 11, paper="a4")
  plot(gg.powellia_tangle)
try(dev.off(), silent=TRUE)
```


# Map of colleciton locations

```{r Collection map}
#Plot on map to confirm points
envData <- sample_data(ps2) %>%
  as_data_frame() %>%
  dplyr::select(Sample_Name,psyllid_spp, lat, long) %>%
  tibble::column_to_rownames("Sample_Name") %>%
  drop_na()

xlim <- c(165,180)
ylim <- c(-50,-30)

nz <- map(database= "world", region= "New Zealand", fill=TRUE, xlim=xlim,
  ylim=ylim) #, mar=c(0,0,0,0)

p1 <- ggtree(pruned.tree, ladderize=TRUE)
map_data <- p1$data%>%
  left_join(envData %>% dplyr::mutate(label =psyllid_spp ))


col <- colorRampPalette(brewer.pal(12, "Paired"))(length(unique(map_data$psyllid_spp)))

gg.nzmap <- ggplot(fortify(nz), aes(y=lat, x=long, group=group)) + 
  geom_polygon(fill="lightgrey", color="#7f7f7f") +
  geom_point(data=map_data, aes(x=long, y=lat, color=psyllid_spp), alpha=.5, size=3, inherit.aes = FALSE) + 
    geom_segment(data=map_data %>%
  mutate(y = scales::rescale(y, to = ylim )), aes(x=min(xlim), y=y, xend= long, yend=lat, color=psyllid_spp), alpha=.2, inherit.aes = FALSE) +
    theme_classic() +
    coord_fixed(ylim =ylim, xlim=xlim) +
    scale_x_continuous(expand = c(0,0)) + 
    scale_colour_manual(values=col) +
    theme(legend.position = "none") +
    scale_y_continuous(position = "right") +
  xlab("Longitude") + 
  ylab("Lattitude")

gg.nzmap 

#print(gg.nzmap, vp = viewport(width = .7, height = .7, angle = 35))
p1 <- p1 %<+% map_data +
  geom_tiplab(align=TRUE, aes(color=psyllid_spp), offset=0.1, hjust=1) +
  scale_x_continuous(expand=c(0, 0))+ 
  scale_colour_manual(values=col) 

gg.phylomap <- p1 + gg.nzmap

pdf(file="figs/phylomap.pdf",  width = 15, height = 15, paper="a4r")
  plot(gg.phylomap)
try(dev.off(), silent=TRUE)
```

# Sessioninfo

```{r sessioninfo, eval=TRUE}
devtools::session_info()
```

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see https://www.gnu.org/licenses/