Install & Load packages

#Set required packages
.cran_packages <- c("ggplot2", 
                    "gridExtra",
                    "tidyverse", 
                    "scales", 
                    "stringdist", 
                    "patchwork",
                    "seqinr",
                    "viridis", 
                    "ape", 
                    "data.table",
                    "RColorBrewer",
                    "vegan",
                    "geiger")
.bioc_packages <- c("dada2",
                    "phyloseq", 
                    "DECIPHER",
                    "Biostrings",
                    "ShortRead",
                    "Biostrings",
                    "ALDEx2")

.inst <- .cran_packages %in% installed.packages()
if(any(!.inst)) {
   install.packages(.cran_packages[!.inst])
}
.inst <- .bioc_packages %in% installed.packages()
if(any(!.inst)) {
  if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
  BiocManager::install(.bioc_packages[!.inst], ask = F)
}

#Load all packages
sapply(c(.cran_packages,.bioc_packages), require, character.only = TRUE)
##      ggplot2    gridExtra    tidyverse       scales   stringdist    patchwork 
##         TRUE         TRUE         TRUE         TRUE         TRUE         TRUE 
##       seqinr      viridis          ape   data.table RColorBrewer        vegan 
##         TRUE         TRUE         TRUE         TRUE         TRUE         TRUE 
##       geiger        dada2     phyloseq     DECIPHER   Biostrings    ShortRead 
##         TRUE         TRUE         TRUE         TRUE         TRUE         TRUE 
##   Biostrings       ALDEx2 
##         TRUE         TRUE
#Install and load github packages
devtools::install_github("alexpiper/seqateurs")
library(seqateurs)
devtools::install_github("tobiasgf/lulu")
library(lulu)
devtools::install_github("mikemc/speedyseq")
library(speedyseq)
devtools::install_github('ggloor/CoDaSeq/CoDaSeq')
library(CoDaSeq)

options(stringsAsFactors = FALSE)

Quality Control

Split files by seqrun

## R
samdf <- read.csv("sample_data/sample_info.csv")
run1 <- samdf %>%
  filter(seqrun==1) %>%
  pull(SampleID) %>%
  as.character()
write_lines(run1,path="Run1.txt")

run2 <- samdf %>%
  filter(seqrun==2) %>%
  pull(SampleID) %>%
  as.character()
write_lines(run2,path="Run2.txt")

run3 <- samdf %>%
  filter(seqrun==3) %>%
  pull(SampleID) %>%
  as.character()
write_lines(run3,path="Run3.txt")
## BASH
mkdir run_1
cat Run1.txt | while read i; do
payload=$(ls | grep $i)
echo $payload
mv $payload run_1
done

mkdir run_2
cat Run2.txt | while read i; do
payload=$(ls | grep $i)
echo $payload
mv $payload run_2
done

mkdir run_3
cat Run3.txt | while read i; do
payload=$(ls | grep $i)
echo $payload
mv $payload run_3
done

Sequencer ID & fastq structure for each run

run_1 - @M01895:3:000000000-AFED3:1:1101:11856:1005 1:N:0:89
run_2 - @M00598:140:000000000-ALTW6:1:1104:15177:9425 2:N:0:120
run_3 - @M00933:9:000000000-BFR4B:1:1101:13542:1780 1:N:0:86

Sequence quality control

library(ShortRead)
runs <- dir("data/", pattern="run_")

for (i in seq(along=runs)){
path <- paste0("data/", runs[i])

#Plot number of reads
dat <- as.data.frame(countLines(dirPath=path, pattern=".fastq")) %>%
  rownames_to_column()  %>%
   `colnames<-`(c("Sample", "Reads")) %>%
  filter(str_detect(Sample,"R1"))

#Plot pooling
gg.pooling <- ggplot(data=dat, aes(x=Sample,y=Reads),stat="identity") + 
  geom_bar(aes(fill=Reads),stat="identity")  + 
  scale_fill_viridis(name = "Reads", begin=0.1) + 
  theme(axis.text.x = element_text(angle=90, hjust=1), plot.title=element_text(hjust = 0.5), plot.subtitle =element_text(hjust = 0.5))+ 
  geom_hline(aes(yintercept = mean(Reads)))  +
  xlab("sample name")+
  ylab("Number of reads") + 
  labs(title= paste0("Pooling for : ", runs[i]), subtitle = paste0("Total Reads: ", sum(dat$Reads), " Average reads: ",  sprintf("%.0f",mean(dat$Reads))," Standard deviation: ", sprintf("%.0f",sd(dat$Reads)))) +
  coord_flip()

plot(gg.pooling)
}

Trim primers

DADA2 requires Non-biological nucleotides i.e. primers, adapters, linkers, etc to be removed. Prior to begining this workflow, samples were demultiplexed and illumina adapters were removed by the MiSeq software, however primer sequences still remain in the reads and must be removed prior to use with the DADA2 algorithm.

Primer sequences:

16S F CCTACGGGNGGCWGCAG
16S R GACTACHVGGGTATCTAATCC

All reads were trimmed to 300bp, then primers were removed. All reads for which primers were not detected were removed using the maxlength function.

#Install bbmap
seqateurs::bbmap_install()

#Loop over runs - Maxlength set to remove any untrimmed reads
runs <- dir("data/", pattern="run_")

for (i in seq(along=runs)){
  path <- paste0("data/",runs[i])
  
  #Trim forward primers - Set a maxlength to remove all those that werent trimemd
  fastqFs <- sort(list.files(path, pattern="_R1", full.names = TRUE))
  fastqRs <- sort(list.files(path, pattern="_R2_", full.names = TRUE))
  
  bbtools_trim(install="bin/bbmap", fwd=fastqFs,rev=fastqRs, primers=c("CCTACGGGNGGCWGCAG","GACTACHVGGGTATCTAATCC"), copyundefined=TRUE, outpath="trimmed",ktrim="l", ordered=TRUE, mink=FALSE, hdist=2, overwrite=TRUE, samelength=TRUE, forcetrimright = 300, maxlength = 285)
}

Plot read quality & lengths

runs <- dir("data/", pattern="run_")
readcounts <- vector("list", length=length(runs))

for (i in seq(along=runs)){
 path <- paste0("data/",runs[i],"/trimmed" )

  filtFs <- sort(list.files(path, pattern="_R1_", full.names = TRUE))
  filtRs <- sort(list.files(path, pattern="_R2_", full.names = TRUE))
  p1 <- plotQualityProfile(filtFs, aggregate = TRUE) + ggtitle(paste0(runs[i]," Forward Reads")) 
  p2 <- plotQualityProfile(filtRs, aggregate = TRUE) + ggtitle(paste0(runs[i]," Reverse Reads"))
  
  #output plots
  dir.create("output/figures/")
  pdf(paste0("output/figures/",runs[i],"_prefilt_quality.pdf"), width = 11, height = 8 , paper="a4r")
  plot(p1+p2)
  dev.off()
  
  #Get lengths
  readcounts[[i]] <- cbind(width(readFastq(file.path(path, fastqFs))), width(readFastq(file.path(path, fastqRs))))
}

The max expected error function is used as the primary quality filter, and all reads containing N bases were removed

In order to reduce the amount of reverse reads violating the MaxEE filter, the reverse reads were truncated at 200 to remove the quality crash that is typical of illumina sequencers

Total amplicon = 465bp Sequencing = 2x300bp = 600bp Primers = 17bp + 21bp = 38bp Read overlap = 600 - 465 - 38 = 97bp

reverse should potentiall be reduced further - (from 230 to 200)

runs <- dir("data/", pattern="run_")
filtered_out <- vector("list", length=length(runs))
readlengths <- vector("list", length=length(runs))

for (i in 1:length(runs)){
  path <- paste0("data/",runs[i],"/trimmed/") 
  filtpath <- file.path(path, "filtered")
  dir.create(filtpath)
  fastqFs <- sort(list.files(path, pattern="R1_001.*"))
  fastqRs <- sort(list.files(path, pattern="R2_001.*"))
  
  if(length(fastqFs) != length(fastqRs)) stop(paste0("Forward and reverse files for ",runs[i]," do not match."))
  
  filtered_out[[i]] <- (filterAndTrim(fwd=file.path(path, fastqFs), filt=file.path(filtpath, fastqFs),
                                      rev=file.path(path, fastqRs), filt.rev=file.path(filtpath, fastqRs),
                                      maxEE=c(2,3),truncQ = 0,truncLen=c(280,200), maxN = 0,  rm.phix=TRUE, compress=TRUE, verbose=TRUE))
  
  # post filtering plot
  filtFs <- sort(list.files(filtpath, pattern="R1_001.*", full.names = TRUE))
  filtRs <- sort(list.files(filtpath, pattern="R2_001.*", full.names = TRUE))
  p1 <- plotQualityProfile(filtFs, aggregate = TRUE) + ggtitle(paste0(runs[i]," Forward Reads")) 
  p2 <- plotQualityProfile(filtRs, aggregate = TRUE) + ggtitle(paste0(runs[i]," Reverse Reads"))
  
  #output plots
  dir.create("output/figures/")
  pdf(paste0("output/figures/",runs[i],"_postfilt_quality.pdf"), width = 11, height = 8 , paper="a4r")
  plot(p1+p2)
  dev.off()
  
  #Get lengths post filter
  readlengths[[i]] <- cbind(width(readFastq(file.path(filtFs))), width(readFastq(file.path(filtRs))))
}
print(filtered_out)

Sequence processing

Infer sequence variants for each run

runs <- dir("data/", pattern="run_")
set.seed(100)

for (i in seq(along=runs)){
 path <- paste0("data/",runs[i],"/trimmed/" )
  filtpath <- file.path(path, "filtered")
  
  filtFs <- list.files(filtpath, pattern="R1_001.*", full.names = TRUE)
  filtRs <- list.files(filtpath, pattern="R2_001.*", full.names = TRUE)
  
  # Learn error rates from samples
  errF <- learnErrors(filtFs, multithread=TRUE, randomize=TRUE)
  errR <- learnErrors(filtRs, multithread=TRUE, randomize=TRUE)
  
  ##Print error plots to see how well the algorithm modelled the errors in the different runs
  print(plotErrors(errF, nominalQ=TRUE)+ ggtitle(paste0(runs[i]," Forward Reads")))
  print(plotErrors(errR, nominalQ=TRUE)+ ggtitle(paste0(runs[i]," Reverse Reads")))
  
  #Error inference and merger of reads - Using pseudo pooling for increased sensitivity
  dadaFs <- dada(filtFs, err=errF, multithread=TRUE, pool="pseudo")
  dadaRs <- dada(filtRs, err=errR, multithread=TRUE, pool="pseudo")
  mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE, minOverlap = 20, trimOverhang = TRUE )
  
  # Construct sequence table
  seqtab <- makeSequenceTable(mergers)
 saveRDS(seqtab, paste0("output/rds/", runs[i], "_seqtab.rds"))
}

Merge Runs, Remove Chimeras

Now that the sequence tables are created for each run, they need to be merged into a larger table representing the entire study. Looking at the length of the sequences, we see some off target amplification. There are 3 large peaks, the first one at 280, second at 402bp, and third at 427. The 427bp peak contains the majority of the sequences and is the expected size, while the 402bp peak contains wolbachia which have a 24bp deletion. We will cut the sequences to between 400 and 450, which should take into account any length variation, and remove the 280bp peak (which is the length of the forward read and most likely artefactual). Following this, chimeric sequences are identified and removed using removeBimeraDenovo

set.seed(606)
seqtabs <- list.files("output/rds/", pattern="seqtab.rds", full.names = TRUE)
st.all <- mergeSequenceTables(tables=seqtabs)

#Remove chimeras
seqtab.nochim <- removeBimeraDenovo(st.all, method="consensus", multithread=TRUE, verbose=TRUE)

#Check output of chimera removal
print(paste(sum(seqtab.nochim)/sum(st.all),"of the abundance remaining after chimera removal"))

#Check complexity
hist(seqComplexity(seqtab.nochim), 100)

#Look at seqlengths
plot(table(nchar(getSequences(seqtab.nochim))))

#cut to expected size
#seqtab.nochim <- seqtab.nochim[,nchar(colnames(seqtab.nochim)) %in% 400:435]
#seqtab_cut <- seqtab.nochim[,nchar(colnames(seqtab.nochim)) %in% 400:435]
# Count hpw many this removed

print(paste(sum(seqtab.nochim)/sum(st.all),"of the abundance remaining after cutting"))

#Fix names -removing read name, sample number etcc
rownames(seqtab.nochim) <- rownames(seqtab.nochim) %>% 
  str_split_fixed("_",n=Inf) %>%
    as_tibble() %>%
  unite(col=SampleID, c("V1","V2"),sep="_") %>%
  pull(SampleID)

dir.create("output/rds/")
saveRDS(seqtab.nochim, "output/rds/seqtab_nocut.rds")

# summarise cleanup
cleanup <- st.all %>%
  as.data.frame() %>%
  pivot_longer( everything(),
    names_to = "OTU",
    values_to = "Abundance") %>%
  group_by(OTU) %>%
  summarise(Abundance = sum(Abundance)) %>%
  mutate(length  = nchar(OTU)) %>%
  mutate(type = case_when(
    !OTU %in% getSequences(seqtab.nochim) ~ "Chimera",
    TRUE ~ "Real"
  )) 

# Output length distribution plots
gg.abundance <- ggplot(cleanup, aes(x=length, y=Abundance, fill=type))+
              geom_bar(stat="identity") + 
              ggtitle("Abundance of sequences") +
              geom_vline(xintercept = 400, colour="red") +
              geom_vline(xintercept = 435, colour="red") 

gg.unique <- ggplot(cleanup, aes(x=length, fill=type))+
            geom_histogram(binwidth = 1) + 
            ggtitle("Number of unique sequences")+
              geom_vline(xintercept = 400, colour="red") +
              geom_vline(xintercept = 435, colour="red") 

plot(gg.abundance / gg.unique)

pdf(paste0("output/logs/seqtab_length_dist.pdf"), width = 11, height = 8 , paper="a4r")
  plot(gg.abundance / gg.unique)
try(dev.off(), silent=TRUE)

Assign taxonomy with IDTAXA

We will use the IDTAXA algorithm of Murali et al 2018 - https://doi.org/10.1186/s40168-018-0521-5 Folllowing phylum to genus assignment with IDTAXA, we will also use exact matching with a reference database to assign to species level.

#seqtab_final <- readRDS("output/rds/seqtab_final.rds")
seqtab_final <- readRDS("output/rds/seqtab_nocut.rds")

# Load trainingset
load("reference/SILVA_SSU_r138_2019.RData")

# Create a DNAStringSet from the ASVs
dna <- DNAStringSet(getSequences(seqtab_final)) 

# Assign taxonomy
library(DECIPHER)
ids <- IdTaxa(dna, trainingSet, processors=8, threshold = 60, verbose=TRUE)  

saveRDS(ids, "ids_nocut.rds")
# Output plot of ids
pdf(paste0("figs/idtaxa.pdf"), width = 11, height = 8 , paper="a4r")
  plot(ids)
try(dev.off(), silent=TRUE)

#Convert the output object of class "Taxa" to a matrix analogous to the output from assignTaxonomy
tax <- t(sapply(ids, function(x) {
    taxa <- paste0(x$taxon,"_", x$confidence)
    taxa[startsWith(taxa, "unclassified_")] <- NA
    taxa
  })) %>%
  purrr::map(unlist) %>%
  stringi::stri_list2matrix(byrow=TRUE, fill=NA) %>%
  magrittr::set_colnames(c("root", "domain", "phylum", "class", "order", "family", "genus")) %>%
  magrittr::set_rownames(getSequences(seqtab_final)) %>%
  as.data.frame()  %>%
  mutate_all(str_replace,pattern="(?:.(?!_))+$", replacement="") %>%
  #seqateurs::na_to_unclassified() %>% #Propagate high order ranks to unassigned ASV'
  magrittr::set_rownames(getSequences(seqtab_final)) %>%
  as.matrix()

# Write taxonomy table to disk
saveRDS(tax, "output/rds/tax_IdTaxa_nocut.rds") 

#Add species using exact matching
exact <- assignSpecies(seqtab_final, "reference/silva_species_assignment_v138.fa.gz", allowMultiple =TRUE, tryRC = TRUE, verbose = FALSE)

exact <- exact %>%
  as.data.frame(stringsAsFactors=FALSE) %>%
  rownames_to_column("otu") %>%
  mutate(species =  case_when(!is.na(Species) ~  paste0(Genus,"_",Species)))

tax_exact <- tax %>%
  as.data.frame(stringsAsFactors=FALSE) %>%
  rownames_to_column("otu") %>%
  left_join(exact %>% select(otu, species), by="otu") %>%
  magrittr::set_rownames(.$otu) %>%
  select(-otu) %>%
  na_to_unclassified() %>% #Propagate high order ranks to unassigned ASV'
  as.matrix()

saveRDS(tax_exact, "output/rds/tax_IdTaxaExact_nocut.rds") 

Compare assignment to blast top hit

seqtab_final <- readRDS("output/rds/seqtab_nocut.rds")
tax <- readRDS("output/rds/tax_IdTaxaExact_nocut.rds")

seqs <- insect::char2dna(colnames(seqtab_final))
names(seqs) <- colnames(seqtab_final)

out <- blast_top_hit(query=seqs, db="reference/silva_species_assignment_v138.fa.gz", threshold=60 )
saveRDS(out, "output/rds/blast_top_hit.rds")

# Blast counts only the aligned subsections when calculating % identity
# Therefore we need to set a minimum and maximum alignment length allowed
out <- readRDS("output/rds/blast_top_hit.rds")
hist(out$length)
allowed_lengths <- 220:500

joint <- out %>% 
  dplyr::select(OTU = qseqid, acc, blastspp = Species, pident, length, evalue) %>%
  left_join(tax %>% 
              seqateurs::unclassified_to_na(rownames=FALSE) %>%
              mutate(lowest = seqateurs::lowest_classified(.)), by="OTU") %>%
  dplyr::filter(length %in% allowed_lengths) 

#Write out comparison between BLAST and Heiarchial assignment
write_csv(joint, "output/csv/tax_assignment_comparison.csv")

gg.tophit <- joint %>%
  dplyr::select(pident, rank = lowest) %>%
  mutate(rank = factor(rank, levels = c("root", "domain", "phylum", "class", "order", "family", "genus", "species"))) %>%
  ggplot(aes(x=pident, fill=rank))+ 
  geom_histogram(colour="black", binwidth = 1, position = "stack") + 
  labs(title = "Top hit identity distribution",
       x = "BLAST top hit % identity",
       y = "OTUs") + 
  scale_x_continuous(breaks=seq(60,100,2)) +
  scale_fill_brewer(name = "Taxonomic \nAssignment", palette = "Spectral")

gg.tophit

pdf(paste0("output/figs/top_hit_tax_assignment.pdf"), width = 11, height = 8 , paper="a4r")
  gg.tophit
try(dev.off(), silent=TRUE)

Make Phyloseq object

Following taxonomic assignment, the sequence table and taxonomic table are merged into a single phyloseq object alongside the sample info csv.

# Load data
seqtab <- t(readRDS("output/rds/seqtab_nocut.rds"))
#seqtab <- readRDS("output/rds/seqtab_curated.rds")
tax <- readRDS("output/rds/tax_IdTaxaExact_nocut.rds")

seqs <- DNAStringSet(rownames(seqtab))
names(seqs) <- seqs
samdf <- read_csv("sample_data/Sample_info3.csv") %>%
  as.data.frame(stringsAsFactors=FALSE) %>%
  magrittr::set_rownames(.$SampleID) 

# Make phyloseq object
ps <- phyloseq(tax_table(tax),
               sample_data(samdf),
               otu_table(t(seqtab), taxa_are_rows = FALSE),
               refseq(seqs))

if(nrow(seqtab) > nrow(sample_data(ps))){warning("Warning: All samples not included in phyloseq object, check sample names match the sample metadata")}

#Rename taxa
#taxa_names(ps) <- paste0("SV", seq(ntaxa(ps)),"-",tax_table(ps)[,8])

##save phyloseq object
saveRDS(ps, "output/rds/ps.rds")

#Output tables of results
dir.create("output/csv")
dir.create("output/otu_tables/unfiltered/", recursive = TRUE)

##Export raw csv
speedyseq::psmelt(ps) %>%
  dplyr::filter(Abundance > 0) %>%
  write.csv(file = "output/otu_tables/rawdata.csv")

# Export species level summary
seqateurs::summarise_taxa(ps, "species", "SampleID") %>%
  spread(key="SampleID", value="totalRA") %>%
  write.csv(file = "output/otu_tables/unfiltered/spp_sum.csv")

# Export genus level summary
seqateurs::summarise_taxa(ps, "genus", "SampleID") %>%
  spread(key="SampleID", value="totalRA") %>%
  write.csv(file = "output/otu_tables/unfiltered/gen_sum.csv")

#Check carsonella presence
cars <- speedyseq::psmelt(ps) %>%
  filter(Abundance > 0) %>%
  group_by(psyllid_spp) %>%
  summarise(n = count(genus=="Candidatus Carsonella", na.rm = TRUE))

test <- speedyseq::psmelt(ps) %>%
  filter(Abundance > 0) %>%
  filter(genus=="Candidatus Carsonella")

Taxon Filtering

Remove all non-bacterial taxa, mitochondria, chloroplast and cyanobacteria. We also remove all taxa contained within the blank sample from other samples? - Check these

get_taxa_unique(ps, "domain")
ntaxa(ps) # Check the number of taxa prior to removal
ps0 <- ps %>%
  subset_taxa(
    domain == "Bacteria" & 
    family  != "Mitochondria" &
    order   != "Chloroplast" &
    phylum != "Cyanobacteria"
  )
#Check taxa were removed
ntaxa(ps0)
get_taxa_unique(ps0, "phylum")
get_taxa_unique(ps0, "class")
get_taxa_unique(ps0, "family")

Remove outlier Samples

Detecting and potentially removing samples outliers (those samples with underlying data that do not conform to experimental or biological expectations) can be useful for minimizing technical variance. This can be caused by a number of reasons, including low-reads assigned to that sample. In this case we remove all samples below 1000 reads, as these include all samples contributing to lower than usual ASV counts.

Rarefaction curves are useful to assess sensitivity of sample size to observed alpha-diversity estimates.

## Remove mocks
rm_mocks <- c("mockA_S51", "MockEven_S193", "Mock_S192", "PCRctrl_S191", "MockStaggered_S194", "PCRctrl_S191", "91_S167")

#check mocks
ps1 <- ps0 %>% 
  subset_samples(!sample_names(ps0) %in% rm_mocks) %>% #Remove mocks
  filter_taxa(function(x) mean(x) > 0, TRUE) #Drop missing taxa from table 

message((nsamples(ps0) - nsamples(ps1)), " outlier samples dropped")

#Plot rarefaction curve
out <- rarecurve(otu_table(ps1), step=100)

rare <- lapply(out, function(x){
  b <- as.data.frame(x)
  b <- data.frame(OTU = b[,1], count = rownames(b))
  b$count <- as.numeric(gsub("N", "",  b$count))
  return(b)
})
names(rare) <- sample_names(ps1)

rare <- map_dfr(rare, function(x){
  z <- data.frame(x)
  return(z)
}, .id = "sample")

# read threshold for sample removal
threshold = 1000

gg.rare <- ggplot(data = rare)+
  geom_line(aes(x = count, y = OTU, group=sample), alpha=0.5)+
  geom_point(data = rare %>% 
               group_by(sample) %>% 
               top_n(1, count),
             aes(x = count, y = OTU, colour=(count > threshold))) +
  scale_x_continuous(labels =  scales::label_number_si()) +
  geom_vline(xintercept=threshold, linetype="dashed") +
  labs(colour = "Sample kept?") +
  xlab("Sequence reads") +
  ylab("Observed ASV's")

#Write out figure
pdf(file="figs/rarefaction.pdf", width = 11, height = 8 , paper="a4r")
  plot(gg.rare)
try(dev.off(), silent=TRUE)

#Remove all samples under the minimum read threshold 
ps2 <- prune_samples(sample_sums(ps1)>=threshold, ps1) 
ps2 <- filter_taxa(ps2, function(x) mean(x) > 0, TRUE) #Drop missing taxa from table
message(nsamples(ps1) - nsamples(ps2), " Samples and ", ntaxa(ps1) - ntaxa(ps2), " taxa under read threshold Dropped")

Compare distances pre and post filtering

seqtab_final <- readRDS("output/rds/seqtab_nocut.rds")
#Get pre_filt distances
pre_filt <- as.matrix(zCompositions::cmultRepl(seqtab_final, method="BL", output="p-counts"))
pre_filt_dist <- as.matrix(vegdist(CoDaSeq::codaSeq.clr(pre_filt), method="euclidean"))

# Get post_filt distances
otutab <- otu_table(ps2) %>%
  as("matrix")

#otutab <- t(seqtab)
post_filt <- as.matrix(zCompositions::cmultRepl(otutab, method="BL", output="p-counts"))
post_filt_dist <- as.matrix(vegdist(CoDaSeq::codaSeq.clr(post_filt), method="euclidean"))

#Subset to only common samples
subsample <- intersect(colnames(pre_filt_dist), colnames(post_filt_dist))
as.data.frame(vegan::mantel(pre_filt_dist[subsample, subsample], post_filt_dist[subsample, subsample])[c("statistic","signif","permutations")])

Output fastas

#Write out fasta and align with silva online
seqs <- refseq(ps2)
writeXStringSet(seqs, "output/curated_asv.fasta", width=1000)

Align with SINA

#Create a virtual environment
module load Python/3.8.2-GCCcore-9.3.0 
virtualenv ~/SINA
source ~/SINA/bin/activate

#Install sina
wget https://github.com/epruesse/SINA/releases/download/v1.7.0/sina-1.7.0-linux.tar.gz
tar xf sina-1.7.0-linux.tar.gz
cd sina-1.7.0-linux

#Get referenceDB
wget https://www.arb-silva.de/fileadmin/silva_databases/release_138/ARB_files/SILVA_138_SSURef_NR99_05_01_20_opt.arb.gz


#Run sina
~/sina-1.7.0-linux/sina --search --add-relatives=5 --meta-fmt=header --fields=acc \
-i /group/pathogens/Alexp/Metabarcoding/Psyllid_microbiome/output/curated_asv.fasta \
-r /group/pathogens/Alexp/Metabarcoding/Psyllid_microbiome/reference/SILVA_138_SSURef_NR99_05_01_20_opt.arb \
-o /group/pathogens/Alexp/Metabarcoding/Psyllid_microbiome/output/aligned_asv.fasta

Create phylogenetic tree with fasttreee

module load FastTree
FastTree -gtr -cat 20 -quote -nt /group/pathogens/Alexp/Metabarcoding/Psyllid_microbiome/output/aligned_asv.fasta > /group/pathogens/Alexp/Metabarcoding/Psyllid_microbiome/output/sina_tree 

Date tree with PATHd8

# Date usign congruify
#tree <- read.tree("arb-silva.de_2020-07-28_id860849/arb-silva.de_2020-07-28_id860849.tree")
tree <- read.tree("output/sina_tree")
tree$tip.label <- tree$tip.label %>%
  str_split_fixed("\\[", n=Inf) %>%
  as.data.frame() %>%
  unite(V1, V2, col="name") %>%
  mutate(name = name %>%
           str_remove("_align.*$") %>%
           str_remove("^.*_acc=") %>%
           str_remove("\\]$")) %>%
  pull(name)

tree_pruned <- drop.tip(tree, tree$tip.label[duplicated(tree$tip.label)])

#Reference tree
ref_tree <- read.tree("bin/Dated trees/Bacteria_16S_SILVA_97sim_FastTree_PATHd8.tre")

ref_tree$tip.label <- ref_tree$tip.label %>% 
  str_extract("^(.*?)\\.") %>%
  str_remove("\\.$")

#2881  congruent tips
table(tree$tip.label %in% ref_tree$tip.label)

ref_tree_pruned <- drop.tip(ref_tree, c(ref_tree$tip.label[!ref_tree$tip.label %in% tree$tip.label],ref_tree$tip.label[duplicated(ref_tree$tip.label)]))

table(tree_pruned$tip.label %in% ref_tree_pruned$tip.label)

res <- congruify.phylo(reference=ref_tree_pruned, target=tree_pruned, scale="PATHd8")

tree2 <- drop.tip(res$phy, res$phy$tip.label[!res$phy$tip.label %in% names(refseq(ps2))])
write.tree(tree2, "output/phytree.nwk")

Merge technical replicates

With only 16 samples replicated, and only on the first 2 sequencing runs i dont think there is a way to explicitly take into account replicate variability, therefore all replicates were merged

This can be done with DADA2 - mergeSequenceTables(st1, st2, st3, repeats = “sum”

# Merge replicates
ps.merged <- ps2 %>%
    merge_samples(group = "Sample_Name", fun="sum")


#This loses the sample metadata - Need to add it agian
sample_data(ps.merged) <- sample_data(ps2) %>%
  as("matrix") %>%
  as.data.frame() %>%
  filter(!duplicated(Sample_Name)) %>%
  magrittr::set_rownames(.$Sample_Name)

seqs <- refseq(ps2)
tree <- read.tree("output/phytree.nwk")

#make new phyloseq object
ps2 <- phyloseq(tax_table(ps.merged),
               sample_data(ps.merged),
               otu_table(otu_table(ps.merged), taxa_are_rows = FALSE),
               refseq(seqs),
               phy_tree(tree))


saveRDS(ps2, "output/rds/ps2.rds")
---
title: "Psyllid microbiome"
subtitle: "Bioinformatic sequence processing"
author: "Alexander Piper"
date: "`r Sys.Date()`"
output:
  html_document:
    includes:
      after_body: footer.html
    highlighter: null
    theme: "flatly"
    code_download: true
    toc: true
    toc_float:
      collapsed: false
      smooth_scroll: true
    df_print: paged
  pdf_document: default
editor_options: 
  chunk_output_type: console
---


```{r setup, include=FALSE}
# Knitr global setup - change eval to true to run code
library(knitr)
knitr::opts_chunk$set(echo = TRUE, eval=FALSE, message=FALSE,error=FALSE,fig.show = "hold", fig.keep = "all")
opts_chunk$set(dev = 'png')
```

# Install & Load packages 

```{r Load packages, eval=TRUE, warning=FALSE, message=FALSE, error=FALSE} 
#Set required packages
.cran_packages <- c("ggplot2", 
                    "gridExtra",
                    "tidyverse", 
                    "scales", 
                    "stringdist", 
                    "patchwork",
                    "seqinr",
                    "viridis", 
                    "ape", 
                    "data.table",
                    "RColorBrewer",
                    "vegan",
                    "geiger")
.bioc_packages <- c("dada2",
                    "phyloseq", 
                    "DECIPHER",
                    "Biostrings",
                    "ShortRead",
                    "Biostrings",
                    "ALDEx2")

.inst <- .cran_packages %in% installed.packages()
if(any(!.inst)) {
   install.packages(.cran_packages[!.inst])
}
.inst <- .bioc_packages %in% installed.packages()
if(any(!.inst)) {
  if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
  BiocManager::install(.bioc_packages[!.inst], ask = F)
}

#Load all packages
sapply(c(.cran_packages,.bioc_packages), require, character.only = TRUE)

#Install and load github packages
devtools::install_github("alexpiper/seqateurs")
library(seqateurs)
devtools::install_github("tobiasgf/lulu")
library(lulu)
devtools::install_github("mikemc/speedyseq")
library(speedyseq)
devtools::install_github('ggloor/CoDaSeq/CoDaSeq')
library(CoDaSeq)

options(stringsAsFactors = FALSE)
```

# Quality Control

## Split files by seqrun
```{r Split files}
## R
samdf <- read.csv("sample_data/sample_info.csv")
run1 <- samdf %>%
  filter(seqrun==1) %>%
  pull(SampleID) %>%
  as.character()
write_lines(run1,path="Run1.txt")

run2 <- samdf %>%
  filter(seqrun==2) %>%
  pull(SampleID) %>%
  as.character()
write_lines(run2,path="Run2.txt")

run3 <- samdf %>%
  filter(seqrun==3) %>%
  pull(SampleID) %>%
  as.character()
write_lines(run3,path="Run3.txt")
```


```{bash split files}
## BASH
mkdir run_1
cat Run1.txt | while read i; do
payload=$(ls | grep $i)
echo $payload
mv $payload run_1
done

mkdir run_2
cat Run2.txt | while read i; do
payload=$(ls | grep $i)
echo $payload
mv $payload run_2
done

mkdir run_3
cat Run3.txt | while read i; do
payload=$(ls | grep $i)
echo $payload
mv $payload run_3
done

```

Sequencer ID & fastq structure for each run

    run_1 - @M01895:3:000000000-AFED3:1:1101:11856:1005 1:N:0:89
    run_2 - @M00598:140:000000000-ALTW6:1:1104:15177:9425 2:N:0:120
    run_3 - @M00933:9:000000000-BFR4B:1:1101:13542:1780 1:N:0:86

## Sequence quality control
```{r QC}
library(ShortRead)
runs <- dir("data/", pattern="run_")

for (i in seq(along=runs)){
path <- paste0("data/", runs[i])

#Plot number of reads
dat <- as.data.frame(countLines(dirPath=path, pattern=".fastq")) %>%
  rownames_to_column()  %>%
   `colnames<-`(c("Sample", "Reads")) %>%
  filter(str_detect(Sample,"R1"))

#Plot pooling
gg.pooling <- ggplot(data=dat, aes(x=Sample,y=Reads),stat="identity") + 
  geom_bar(aes(fill=Reads),stat="identity")  + 
  scale_fill_viridis(name = "Reads", begin=0.1) + 
  theme(axis.text.x = element_text(angle=90, hjust=1), plot.title=element_text(hjust = 0.5), plot.subtitle =element_text(hjust = 0.5))+ 
  geom_hline(aes(yintercept = mean(Reads)))  +
  xlab("sample name")+
  ylab("Number of reads") + 
  labs(title= paste0("Pooling for : ", runs[i]), subtitle = paste0("Total Reads: ", sum(dat$Reads), " Average reads: ",  sprintf("%.0f",mean(dat$Reads))," Standard deviation: ", sprintf("%.0f",sd(dat$Reads)))) +
  coord_flip()

plot(gg.pooling)
}
```

## Trim primers

DADA2 requires Non-biological nucleotides i.e. primers, adapters, linkers, etc to be removed. Prior to begining this workflow, samples were demultiplexed and illumina adapters were removed by the MiSeq software, however primer sequences still remain in the reads and must be removed prior to use with the DADA2 algorithm.

Primer sequences:

    16S F CCTACGGGNGGCWGCAG
    16S R GACTACHVGGGTATCTAATCC

All reads were trimmed to 300bp, then primers were removed. All reads for which primers were not detected were removed using the maxlength function.

```{r trim primers,message=FALSE}
#Install bbmap
seqateurs::bbmap_install()

#Loop over runs - Maxlength set to remove any untrimmed reads
runs <- dir("data/", pattern="run_")

for (i in seq(along=runs)){
  path <- paste0("data/",runs[i])
  
  #Trim forward primers - Set a maxlength to remove all those that werent trimemd
  fastqFs <- sort(list.files(path, pattern="_R1", full.names = TRUE))
  fastqRs <- sort(list.files(path, pattern="_R2_", full.names = TRUE))
  
  bbtools_trim(install="bin/bbmap", fwd=fastqFs,rev=fastqRs, primers=c("CCTACGGGNGGCWGCAG","GACTACHVGGGTATCTAATCC"), copyundefined=TRUE, outpath="trimmed",ktrim="l", ordered=TRUE, mink=FALSE, hdist=2, overwrite=TRUE, samelength=TRUE, forcetrimright = 300, maxlength = 285)
}
```

## Plot read quality & lengths

```{r QA plot}
runs <- dir("data/", pattern="run_")
readcounts <- vector("list", length=length(runs))

for (i in seq(along=runs)){
 path <- paste0("data/",runs[i],"/trimmed" )

  filtFs <- sort(list.files(path, pattern="_R1_", full.names = TRUE))
  filtRs <- sort(list.files(path, pattern="_R2_", full.names = TRUE))
  p1 <- plotQualityProfile(filtFs, aggregate = TRUE) + ggtitle(paste0(runs[i]," Forward Reads")) 
  p2 <- plotQualityProfile(filtRs, aggregate = TRUE) + ggtitle(paste0(runs[i]," Reverse Reads"))
  
  #output plots
  dir.create("output/figures/")
  pdf(paste0("output/figures/",runs[i],"_prefilt_quality.pdf"), width = 11, height = 8 , paper="a4r")
  plot(p1+p2)
  dev.off()
  
  #Get lengths
  readcounts[[i]] <- cbind(width(readFastq(file.path(path, fastqFs))), width(readFastq(file.path(path, fastqRs))))
}
```

The max expected error function is used as the primary quality filter, and all reads containing N bases were removed

In order to reduce the amount of  reverse reads violating the MaxEE filter, the reverse reads were truncated at 200 to remove the quality crash that is typical of illumina sequencers

Total amplicon = 465bp 
Sequencing = 2x300bp = 600bp
Primers = 17bp + 21bp = 38bp
Read overlap = 600 - 465 - 38 = 97bp

reverse should potentiall be reduced further - (from 230 to 200)

```{r filter and trim}
runs <- dir("data/", pattern="run_")
filtered_out <- vector("list", length=length(runs))
readlengths <- vector("list", length=length(runs))

for (i in 1:length(runs)){
  path <- paste0("data/",runs[i],"/trimmed/") 
  filtpath <- file.path(path, "filtered")
  dir.create(filtpath)
  fastqFs <- sort(list.files(path, pattern="R1_001.*"))
  fastqRs <- sort(list.files(path, pattern="R2_001.*"))
  
  if(length(fastqFs) != length(fastqRs)) stop(paste0("Forward and reverse files for ",runs[i]," do not match."))
  
  filtered_out[[i]] <- (filterAndTrim(fwd=file.path(path, fastqFs), filt=file.path(filtpath, fastqFs),
                                      rev=file.path(path, fastqRs), filt.rev=file.path(filtpath, fastqRs),
                                      maxEE=c(2,3),truncQ = 0,truncLen=c(280,200), maxN = 0,  rm.phix=TRUE, compress=TRUE, verbose=TRUE))
  
  # post filtering plot
  filtFs <- sort(list.files(filtpath, pattern="R1_001.*", full.names = TRUE))
  filtRs <- sort(list.files(filtpath, pattern="R2_001.*", full.names = TRUE))
  p1 <- plotQualityProfile(filtFs, aggregate = TRUE) + ggtitle(paste0(runs[i]," Forward Reads")) 
  p2 <- plotQualityProfile(filtRs, aggregate = TRUE) + ggtitle(paste0(runs[i]," Reverse Reads"))
  
  #output plots
  dir.create("output/figures/")
  pdf(paste0("output/figures/",runs[i],"_postfilt_quality.pdf"), width = 11, height = 8 , paper="a4r")
  plot(p1+p2)
  dev.off()
  
  #Get lengths post filter
  readlengths[[i]] <- cbind(width(readFastq(file.path(filtFs))), width(readFastq(file.path(filtRs))))
}
print(filtered_out)
```

# Sequence processing

## Infer sequence variants for each run

```{r Learn error rates }
runs <- dir("data/", pattern="run_")
set.seed(100)

for (i in seq(along=runs)){
 path <- paste0("data/",runs[i],"/trimmed/" )
  filtpath <- file.path(path, "filtered")
  
  filtFs <- list.files(filtpath, pattern="R1_001.*", full.names = TRUE)
  filtRs <- list.files(filtpath, pattern="R2_001.*", full.names = TRUE)
  
  # Learn error rates from samples
  errF <- learnErrors(filtFs, multithread=TRUE, randomize=TRUE)
  errR <- learnErrors(filtRs, multithread=TRUE, randomize=TRUE)
  
  ##Print error plots to see how well the algorithm modelled the errors in the different runs
  print(plotErrors(errF, nominalQ=TRUE)+ ggtitle(paste0(runs[i]," Forward Reads")))
  print(plotErrors(errR, nominalQ=TRUE)+ ggtitle(paste0(runs[i]," Reverse Reads")))
  
  #Error inference and merger of reads - Using pseudo pooling for increased sensitivity
  dadaFs <- dada(filtFs, err=errF, multithread=TRUE, pool="pseudo")
  dadaRs <- dada(filtRs, err=errR, multithread=TRUE, pool="pseudo")
  mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE, minOverlap = 20, trimOverhang = TRUE )
  
  # Construct sequence table
  seqtab <- makeSequenceTable(mergers)
 saveRDS(seqtab, paste0("output/rds/", runs[i], "_seqtab.rds"))
}
```

## Merge Runs, Remove Chimeras

Now that the sequence tables are created for each run, they need to be merged into a larger table representing the entire study. 
Looking at the length of the sequences, we see some off target amplification. There are 3 large peaks, the first one at 280, second at 402bp, and third at 427. The 427bp peak contains the majority of the sequences and is the expected size, while the 402bp peak contains wolbachia which have a 24bp deletion. We will cut the sequences to between 400 and 450, which should take into account any length variation, and remove the 280bp peak (which is the length of the forward read and most likely artefactual). Following this, chimeric sequences are identified and removed using removeBimeraDenovo

```{r merge runs and remove chimeras}
set.seed(606)
seqtabs <- list.files("output/rds/", pattern="seqtab.rds", full.names = TRUE)
st.all <- mergeSequenceTables(tables=seqtabs)

#Remove chimeras
seqtab.nochim <- removeBimeraDenovo(st.all, method="consensus", multithread=TRUE, verbose=TRUE)

#Check output of chimera removal
print(paste(sum(seqtab.nochim)/sum(st.all),"of the abundance remaining after chimera removal"))

#Check complexity
hist(seqComplexity(seqtab.nochim), 100)

#Look at seqlengths
plot(table(nchar(getSequences(seqtab.nochim))))

#cut to expected size
#seqtab.nochim <- seqtab.nochim[,nchar(colnames(seqtab.nochim)) %in% 400:435]
#seqtab_cut <- seqtab.nochim[,nchar(colnames(seqtab.nochim)) %in% 400:435]
# Count hpw many this removed

print(paste(sum(seqtab.nochim)/sum(st.all),"of the abundance remaining after cutting"))

#Fix names -removing read name, sample number etcc
rownames(seqtab.nochim) <- rownames(seqtab.nochim) %>% 
  str_split_fixed("_",n=Inf) %>%
    as_tibble() %>%
  unite(col=SampleID, c("V1","V2"),sep="_") %>%
  pull(SampleID)

dir.create("output/rds/")
saveRDS(seqtab.nochim, "output/rds/seqtab_nocut.rds")

# summarise cleanup
cleanup <- st.all %>%
  as.data.frame() %>%
  pivot_longer( everything(),
    names_to = "OTU",
    values_to = "Abundance") %>%
  group_by(OTU) %>%
  summarise(Abundance = sum(Abundance)) %>%
  mutate(length  = nchar(OTU)) %>%
  mutate(type = case_when(
    !OTU %in% getSequences(seqtab.nochim) ~ "Chimera",
    TRUE ~ "Real"
  )) 

# Output length distribution plots
gg.abundance <- ggplot(cleanup, aes(x=length, y=Abundance, fill=type))+
              geom_bar(stat="identity") + 
              ggtitle("Abundance of sequences") +
              geom_vline(xintercept = 400, colour="red") +
              geom_vline(xintercept = 435, colour="red") 

gg.unique <- ggplot(cleanup, aes(x=length, fill=type))+
            geom_histogram(binwidth = 1) + 
            ggtitle("Number of unique sequences")+
              geom_vline(xintercept = 400, colour="red") +
              geom_vline(xintercept = 435, colour="red") 

plot(gg.abundance / gg.unique)

pdf(paste0("output/logs/seqtab_length_dist.pdf"), width = 11, height = 8 , paper="a4r")
  plot(gg.abundance / gg.unique)
try(dev.off(), silent=TRUE)
```

# Assign taxonomy with IDTAXA

We will use the IDTAXA algorithm of Murali et al 2018 - https://doi.org/10.1186/s40168-018-0521-5 Folllowing phylum to genus assignment with IDTAXA, we will also use exact matching with a reference database to assign to species level.

```{r IDTAXA}
#seqtab_final <- readRDS("output/rds/seqtab_final.rds")
seqtab_final <- readRDS("output/rds/seqtab_nocut.rds")

# Load trainingset
load("reference/SILVA_SSU_r138_2019.RData")

# Create a DNAStringSet from the ASVs
dna <- DNAStringSet(getSequences(seqtab_final)) 

# Assign taxonomy
library(DECIPHER)
ids <- IdTaxa(dna, trainingSet, processors=8, threshold = 60, verbose=TRUE)  

saveRDS(ids, "ids_nocut.rds")
# Output plot of ids
pdf(paste0("figs/idtaxa.pdf"), width = 11, height = 8 , paper="a4r")
  plot(ids)
try(dev.off(), silent=TRUE)

#Convert the output object of class "Taxa" to a matrix analogous to the output from assignTaxonomy
tax <- t(sapply(ids, function(x) {
    taxa <- paste0(x$taxon,"_", x$confidence)
    taxa[startsWith(taxa, "unclassified_")] <- NA
    taxa
  })) %>%
  purrr::map(unlist) %>%
  stringi::stri_list2matrix(byrow=TRUE, fill=NA) %>%
  magrittr::set_colnames(c("root", "domain", "phylum", "class", "order", "family", "genus")) %>%
  magrittr::set_rownames(getSequences(seqtab_final)) %>%
  as.data.frame()  %>%
  mutate_all(str_replace,pattern="(?:.(?!_))+$", replacement="") %>%
  #seqateurs::na_to_unclassified() %>% #Propagate high order ranks to unassigned ASV'
  magrittr::set_rownames(getSequences(seqtab_final)) %>%
  as.matrix()

# Write taxonomy table to disk
saveRDS(tax, "output/rds/tax_IdTaxa_nocut.rds") 

#Add species using exact matching
exact <- assignSpecies(seqtab_final, "reference/silva_species_assignment_v138.fa.gz", allowMultiple =TRUE, tryRC = TRUE, verbose = FALSE)

exact <- exact %>%
  as.data.frame(stringsAsFactors=FALSE) %>%
  rownames_to_column("otu") %>%
  mutate(species =  case_when(!is.na(Species) ~  paste0(Genus,"_",Species)))

tax_exact <- tax %>%
  as.data.frame(stringsAsFactors=FALSE) %>%
  rownames_to_column("otu") %>%
  left_join(exact %>% select(otu, species), by="otu") %>%
  magrittr::set_rownames(.$otu) %>%
  select(-otu) %>%
  na_to_unclassified() %>% #Propagate high order ranks to unassigned ASV'
  as.matrix()

saveRDS(tax_exact, "output/rds/tax_IdTaxaExact_nocut.rds") 
```


# Compare assignment to blast top hit
```{r Compare to blast top hit}
seqtab_final <- readRDS("output/rds/seqtab_nocut.rds")
tax <- readRDS("output/rds/tax_IdTaxaExact_nocut.rds")

seqs <- insect::char2dna(colnames(seqtab_final))
names(seqs) <- colnames(seqtab_final)

out <- blast_top_hit(query=seqs, db="reference/silva_species_assignment_v138.fa.gz", threshold=60 )
saveRDS(out, "output/rds/blast_top_hit.rds")

# Blast counts only the aligned subsections when calculating % identity
# Therefore we need to set a minimum and maximum alignment length allowed
out <- readRDS("output/rds/blast_top_hit.rds")
hist(out$length)
allowed_lengths <- 220:500

joint <- out %>% 
  dplyr::select(OTU = qseqid, acc, blastspp = Species, pident, length, evalue) %>%
  left_join(tax %>% 
              seqateurs::unclassified_to_na(rownames=FALSE) %>%
              mutate(lowest = seqateurs::lowest_classified(.)), by="OTU") %>%
  dplyr::filter(length %in% allowed_lengths) 

#Write out comparison between BLAST and Heiarchial assignment
write_csv(joint, "output/csv/tax_assignment_comparison.csv")

gg.tophit <- joint %>%
  dplyr::select(pident, rank = lowest) %>%
  mutate(rank = factor(rank, levels = c("root", "domain", "phylum", "class", "order", "family", "genus", "species"))) %>%
  ggplot(aes(x=pident, fill=rank))+ 
  geom_histogram(colour="black", binwidth = 1, position = "stack") + 
  labs(title = "Top hit identity distribution",
       x = "BLAST top hit % identity",
       y = "OTUs") + 
  scale_x_continuous(breaks=seq(60,100,2)) +
  scale_fill_brewer(name = "Taxonomic \nAssignment", palette = "Spectral")

gg.tophit

pdf(paste0("output/figs/top_hit_tax_assignment.pdf"), width = 11, height = 8 , paper="a4r")
  gg.tophit
try(dev.off(), silent=TRUE)

```

# Make Phyloseq object

Following taxonomic assignment, the sequence table and taxonomic table are merged into a single phyloseq object alongside the sample info csv.

```{r create PS, eval = FALSE}
# Load data
seqtab <- t(readRDS("output/rds/seqtab_nocut.rds"))
#seqtab <- readRDS("output/rds/seqtab_curated.rds")
tax <- readRDS("output/rds/tax_IdTaxaExact_nocut.rds")

seqs <- DNAStringSet(rownames(seqtab))
names(seqs) <- seqs
samdf <- read_csv("sample_data/Sample_info3.csv") %>%
  as.data.frame(stringsAsFactors=FALSE) %>%
  magrittr::set_rownames(.$SampleID) 

# Make phyloseq object
ps <- phyloseq(tax_table(tax),
               sample_data(samdf),
               otu_table(t(seqtab), taxa_are_rows = FALSE),
               refseq(seqs))

if(nrow(seqtab) > nrow(sample_data(ps))){warning("Warning: All samples not included in phyloseq object, check sample names match the sample metadata")}

#Rename taxa
#taxa_names(ps) <- paste0("SV", seq(ntaxa(ps)),"-",tax_table(ps)[,8])

##save phyloseq object
saveRDS(ps, "output/rds/ps.rds")

#Output tables of results
dir.create("output/csv")
dir.create("output/otu_tables/unfiltered/", recursive = TRUE)

##Export raw csv
speedyseq::psmelt(ps) %>%
  dplyr::filter(Abundance > 0) %>%
  write.csv(file = "output/otu_tables/rawdata.csv")

# Export species level summary
seqateurs::summarise_taxa(ps, "species", "SampleID") %>%
  spread(key="SampleID", value="totalRA") %>%
  write.csv(file = "output/otu_tables/unfiltered/spp_sum.csv")

# Export genus level summary
seqateurs::summarise_taxa(ps, "genus", "SampleID") %>%
  spread(key="SampleID", value="totalRA") %>%
  write.csv(file = "output/otu_tables/unfiltered/gen_sum.csv")

#Check carsonella presence
cars <- speedyseq::psmelt(ps) %>%
  filter(Abundance > 0) %>%
  group_by(psyllid_spp) %>%
  summarise(n = count(genus=="Candidatus Carsonella", na.rm = TRUE))

test <- speedyseq::psmelt(ps) %>%
  filter(Abundance > 0) %>%
  filter(genus=="Candidatus Carsonella")
```


## Taxon Filtering

Remove all non-bacterial taxa, mitochondria, chloroplast and cyanobacteria.
We also remove all taxa contained within the blank sample from other samples? - Check these

```{r taxon-cleaning}
get_taxa_unique(ps, "domain")
ntaxa(ps) # Check the number of taxa prior to removal
ps0 <- ps %>%
  subset_taxa(
    domain == "Bacteria" & 
    family  != "Mitochondria" &
    order   != "Chloroplast" &
    phylum != "Cyanobacteria"
  )
#Check taxa were removed
ntaxa(ps0)
get_taxa_unique(ps0, "phylum")
get_taxa_unique(ps0, "class")
get_taxa_unique(ps0, "family")
```

## Remove outlier Samples

Detecting and potentially removing samples outliers (those samples with underlying data that do not conform to experimental or biological expectations) can be useful for minimizing technical variance. This can be caused by a number of reasons, including low-reads assigned to that sample. In this case we remove all samples below 1000 reads, as these include all samples contributing to lower than usual ASV counts.

Rarefaction curves are useful	to	assess	sensitivity	of	sample	size	to	observed	alpha-diversity estimates.
```{r remove outliers}
## Remove mocks
rm_mocks <- c("mockA_S51", "MockEven_S193", "Mock_S192", "PCRctrl_S191", "MockStaggered_S194", "PCRctrl_S191", "91_S167")

#check mocks
ps1 <- ps0 %>% 
  subset_samples(!sample_names(ps0) %in% rm_mocks) %>% #Remove mocks
  filter_taxa(function(x) mean(x) > 0, TRUE) #Drop missing taxa from table 

message((nsamples(ps0) - nsamples(ps1)), " outlier samples dropped")

#Plot rarefaction curve
out <- rarecurve(otu_table(ps1), step=100)

rare <- lapply(out, function(x){
  b <- as.data.frame(x)
  b <- data.frame(OTU = b[,1], count = rownames(b))
  b$count <- as.numeric(gsub("N", "",  b$count))
  return(b)
})
names(rare) <- sample_names(ps1)

rare <- map_dfr(rare, function(x){
  z <- data.frame(x)
  return(z)
}, .id = "sample")

# read threshold for sample removal
threshold = 1000

gg.rare <- ggplot(data = rare)+
  geom_line(aes(x = count, y = OTU, group=sample), alpha=0.5)+
  geom_point(data = rare %>% 
               group_by(sample) %>% 
               top_n(1, count),
             aes(x = count, y = OTU, colour=(count > threshold))) +
  scale_x_continuous(labels =  scales::label_number_si()) +
  geom_vline(xintercept=threshold, linetype="dashed") +
  labs(colour = "Sample kept?") +
  xlab("Sequence reads") +
  ylab("Observed ASV's")

#Write out figure
pdf(file="figs/rarefaction.pdf", width = 11, height = 8 , paper="a4r")
  plot(gg.rare)
try(dev.off(), silent=TRUE)

#Remove all samples under the minimum read threshold 
ps2 <- prune_samples(sample_sums(ps1)>=threshold, ps1) 
ps2 <- filter_taxa(ps2, function(x) mean(x) > 0, TRUE) #Drop missing taxa from table
message(nsamples(ps1) - nsamples(ps2), " Samples and ", ntaxa(ps1) - ntaxa(ps2), " taxa under read threshold Dropped")
```


## Compare distances pre and post filtering

```{r compare dists}
seqtab_final <- readRDS("output/rds/seqtab_nocut.rds")
#Get pre_filt distances
pre_filt <- as.matrix(zCompositions::cmultRepl(seqtab_final, method="BL", output="p-counts"))
pre_filt_dist <- as.matrix(vegdist(CoDaSeq::codaSeq.clr(pre_filt), method="euclidean"))

# Get post_filt distances
otutab <- otu_table(ps2) %>%
  as("matrix")

#otutab <- t(seqtab)
post_filt <- as.matrix(zCompositions::cmultRepl(otutab, method="BL", output="p-counts"))
post_filt_dist <- as.matrix(vegdist(CoDaSeq::codaSeq.clr(post_filt), method="euclidean"))

#Subset to only common samples
subsample <- intersect(colnames(pre_filt_dist), colnames(post_filt_dist))
as.data.frame(vegan::mantel(pre_filt_dist[subsample, subsample], post_filt_dist[subsample, subsample])[c("statistic","signif","permutations")])
```

### Output fastas

```{r Save fastas}
#Write out fasta and align with silva online
seqs <- refseq(ps2)
writeXStringSet(seqs, "output/curated_asv.fasta", width=1000)
```

## Align with SINA

```{bash align}
#Create a virtual environment
module load Python/3.8.2-GCCcore-9.3.0 
virtualenv ~/SINA
source ~/SINA/bin/activate

#Install sina
wget https://github.com/epruesse/SINA/releases/download/v1.7.0/sina-1.7.0-linux.tar.gz
tar xf sina-1.7.0-linux.tar.gz
cd sina-1.7.0-linux

#Get referenceDB
wget https://www.arb-silva.de/fileadmin/silva_databases/release_138/ARB_files/SILVA_138_SSURef_NR99_05_01_20_opt.arb.gz


#Run sina
~/sina-1.7.0-linux/sina --search --add-relatives=5 --meta-fmt=header --fields=acc \
-i /group/pathogens/Alexp/Metabarcoding/Psyllid_microbiome/output/curated_asv.fasta \
-r /group/pathogens/Alexp/Metabarcoding/Psyllid_microbiome/reference/SILVA_138_SSURef_NR99_05_01_20_opt.arb \
-o /group/pathogens/Alexp/Metabarcoding/Psyllid_microbiome/output/aligned_asv.fasta
```

## Create phylogenetic tree with fasttreee
```{bash fastree}
module load FastTree
FastTree -gtr -cat 20 -quote -nt /group/pathogens/Alexp/Metabarcoding/Psyllid_microbiome/output/aligned_asv.fasta > /group/pathogens/Alexp/Metabarcoding/Psyllid_microbiome/output/sina_tree 
```

## Date tree with PATHd8

```{r PATHd8}
# Date usign congruify
#tree <- read.tree("arb-silva.de_2020-07-28_id860849/arb-silva.de_2020-07-28_id860849.tree")
tree <- read.tree("output/sina_tree")
tree$tip.label <- tree$tip.label %>%
  str_split_fixed("\\[", n=Inf) %>%
  as.data.frame() %>%
  unite(V1, V2, col="name") %>%
  mutate(name = name %>%
           str_remove("_align.*$") %>%
           str_remove("^.*_acc=") %>%
           str_remove("\\]$")) %>%
  pull(name)

tree_pruned <- drop.tip(tree, tree$tip.label[duplicated(tree$tip.label)])

#Reference tree
ref_tree <- read.tree("bin/Dated trees/Bacteria_16S_SILVA_97sim_FastTree_PATHd8.tre")

ref_tree$tip.label <- ref_tree$tip.label %>% 
  str_extract("^(.*?)\\.") %>%
  str_remove("\\.$")

#2881  congruent tips
table(tree$tip.label %in% ref_tree$tip.label)

ref_tree_pruned <- drop.tip(ref_tree, c(ref_tree$tip.label[!ref_tree$tip.label %in% tree$tip.label],ref_tree$tip.label[duplicated(ref_tree$tip.label)]))

table(tree_pruned$tip.label %in% ref_tree_pruned$tip.label)

res <- congruify.phylo(reference=ref_tree_pruned, target=tree_pruned, scale="PATHd8")

tree2 <- drop.tip(res$phy, res$phy$tip.label[!res$phy$tip.label %in% names(refseq(ps2))])
write.tree(tree2, "output/phytree.nwk")
```


## Merge technical replicates

With only 16 samples replicated, and only on the first 2 sequencing runs i dont think there is a way to explicitly take into account replicate variability, therefore all replicates were merged

This can be done with DADA2 - mergeSequenceTables(st1, st2, st3, repeats = "sum"

```{r replicates}
# Merge replicates
ps.merged <- ps2 %>%
    merge_samples(group = "Sample_Name", fun="sum")


#This loses the sample metadata - Need to add it agian
sample_data(ps.merged) <- sample_data(ps2) %>%
  as("matrix") %>%
  as.data.frame() %>%
  filter(!duplicated(Sample_Name)) %>%
  magrittr::set_rownames(.$Sample_Name)

seqs <- refseq(ps2)
tree <- read.tree("output/phytree.nwk")

#make new phyloseq object
ps2 <- phyloseq(tax_table(ps.merged),
               sample_data(ps.merged),
               otu_table(otu_table(ps.merged), taxa_are_rows = FALSE),
               refseq(seqs),
               phy_tree(tree))


saveRDS(ps2, "output/rds/ps2.rds")
```

 

Copyright (C) 2020 Alexander M Piper

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see https://www.gnu.org/licenses/

alexander.piper@agriculture.vic.gov.au