Objectif¶
Est il possible de déterminer le sexe du veau à venir par une méthode plus facile et plus tôt dans la période de gestation.
Idée : séquençage de l'ADN circulant après une simple prise de sang et détection de séquences originaires du chromosome Y
Partenaire¶
Dominique Rocha, GABI
Mat & Meth¶
data¶
séquençage de 31 échantillons de vache en gestation ayant donné naissance à des mâles ou à des femelles.
séquençage 2x151pb mais avec une forte variabilité en nombre de séquence et en taille:
- entre 26 589 séquences et 840 178 séquences (moyenne à 342 051)
- entre 37 et 151pb (taille largement majoritaire)
rapport fastqc R1 : report_R1.html
rapport fastqc R2 : report_R2.html
version :FastQC_v0.11.7 MultiQC-v1.7
génome de référence à utiliser : https://www.ncbi.nlm.nih.gov/assembly/GCA_000003205.6/
méthodes¶
1. masquage du génome de référence¶
téléchargement des régions répétées : https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/003/205/GCA_000003205.6_Btau_5.0.1/GCA_000003205.6_Btau_5.0.1_rm.out.gz
module load bioinfo/bedtools2-2.29.0 zcat ref_bwa1/GCA_000003205.6_Btau_5.0.1_rm.out.gz |tail -n +4 | awk '{print $5"\t"$6"\t"$7}' > ref_bwa1/repeat.bed bedtools maskfasta -soft -fi ref_bwa1/GCA_000003205.6_Btau_5.0.1_genomic.fna -bed ref_bwa1/repeat.bed -fo ref_bwa1/GCA_000003205.6_Btau_5.0.1_genomic.mask.fna
2. alignement¶
alignement des séquences sur le génome de référence masqué
module load bioinfo/bwa-0.7.17 module load bioinfo/samtools-1.10 bwa index ref_bwa1/GCA_000003205.6_Btau_5.0.1_genomic.mask.fna mkdir bwa1 c=1 for R1 in data/*_R1_*.fastq.gz do sample=`echo $R1 | cut -f 1 -d "_" | cut -f 2 -d '/'` R2=`echo $R1 | sed 's/_R1_/_R2_/'` job_ID=`sbatch -J bwa1_$sample -o LOG/%x.out -e LOG/%x.err --cpus-per-task=8 --mem=10G --wrap="bwa mem -t8 -R '@RG\tID:$c\tSM:$sample\tPL:Illumina\tPU:PU\tLB:LB' -o bwa1/$sample.sam ref_bwa1/GCA_000003205.6_Btau_5.0.1_genomic.mask.fna $R1 $R2" | awk '{print $NF}'` sbatch -J bwa1_seff_$sample -o LOG/%x.out -e LOG/%x.err --dependency=afterok:$job_ID --wrap="seff $job_ID > bwa1/$sample.seff" job_ID2=`sbatch -J sort_bam_$sample -o LOG/%x.out -e LOG/%x.err --cpus-per-task=8 --mem=10G --dependency=afterok:$job_ID --wrap="samtools view -Sh bwa1/$sample.sam -b | samtools sort -@ 8 -o bwa1/$sample.bam "|awk '{print $NF}'` sbatch -J bwa1_bamIdx_$sample -o LOG/%x.out -e LOG/%x.err --dependency=afterok:$job_ID2 --wrap="samtools index bwa1/$sample.bam " let c=$c+1 done
flagstat pour compter le nombre de lectures alignées
for i in bwa1/*.bam do sample=`basename $i | sed 's/\.bam//'` sbatch -J bwa1_flagstat_$sample -o LOG/%x.out -e LOG/%x.err --wrap="samtools flagstat $i > bwa1/`basename $i | sed 's/\.bam/\.flagstat/'`" sbatch -J bwa1_flagstat_chrY_$sample -o LOG/%x.out -e LOG/%x.err --wrap="samtools view -hb $i CM001061.2 | samtools flagstat - > bwa1/`basename $i | sed 's/\.bam/\.chrY_flagstat/'`" sbatch -J bwa1_flagstat_properly_only_chrY_$sample -o LOG/%x.out -e LOG/%x.err --wrap="samtools view -h -f2 $i CM001061.2 | grep -v XA:Z |samtools view -Shb - | samtools flagstat - > bwa1/`basename $i | sed 's/\.bam/\.properly_onlyChrY_flagstat/'`" sbatch -J bwa1_flagstat_properly_q30_only_chrY_$sample -o LOG/%x.out -e LOG/%x.err --wrap="samtools view -h -f2 -q 30 $i CM001061.2 | grep -v XA:Z |samtools view -Shb - | samtools flagstat - > bwa1/`basename $i | sed 's/\.bam/\.q30_properly_onlyChrY_flagstat/'`" done # fichier bilan echo -e "#sample_name\tCPU_time\tMEM_used\tnb_reads\tnb_read_mapped\tnb_read_properly_mapped\tnb_read_mapped_on_chrY\tpercent_read_mapped_on_chrY\tnb_read_properly_mapped_only_on_chrY\tpercent_read_properly_mapped_only_on_chrY\tnb_read_mapq30_properly_mapped_only_on_chrY" > bwa1/nb_aln.tsv for i in bwa1/*.bam do sample=`basename $i | sed 's/\.bam//'` cpu=`grep "CPU Utilized:" bwa1/$sample.seff | awk '{print $NF}'` mem=`grep "Memory Utilized:" bwa1/$sample.seff | awk '{print $3" "$4}'` tot_read=`grep read1 bwa1/$sample.flagstat | awk '{print $1*2}'` tot_mapped=`grep -m1 mapped bwa1/$sample.flagstat | awk '{print $1}'` tot_properly_paired=`grep "properly paired" bwa1/$sample.flagstat | awk '{print $1}'` tot_mapped_chrY=`grep -m1 mapped bwa1/$sample.chrY_flagstat | awk '{print $1}'` percent_mapped_chrY=`echo $tot_mapped $tot_mapped_chrY | awk '{print $2*100/$1}'` tot_mapped_prop_only_chrY=`grep -m1 mapped bwa1/$sample.properly_onlyChrY_flagstat | awk '{print $1}'` percent_mapped_prop_only_chrY=`echo $tot_mapped_chrY $tot_mapped_prop_only_chrY | awk '{print $2*100/$1}'` tot_mapped_q30_prop_only_chrY=`grep -m1 mapped bwa1/$sample.q30_properly_onlyChrY_flagstat | awk '{print $1}'` echo -e "$sample\t$cpu\t$mem\t$tot_read\t$tot_mapped\t$tot_properly_paired\t$tot_mapped_chrY\t$percent_mapped_chrY\t$tot_mapped_prop_only_chrY\t$percent_mapped_prop_only_chrY\t$tot_mapped_q30_prop_only_chrY" >> bwa1/nb_aln.tsv done
Pour les séquences proprement pairées et alignées uniquement sur le chromosome Y avec une mapping quality >= 30, réalignement blast pour avoir une sortie "alignement"
module load bioinfo/ncbi-blast-2.7.1+ mkdir ref_blast ln -s ../ref_bwa1/GCA_000003205.6_Btau_5.0.1_genomic.mask.fna ref_blast/GCA_000003205.6_Btau_5.0.1_genomic.mask.fna sbatch -J index_blast -o LOG/%x.out -e LOG/%x.err --mem=10G --wrap="makeblastdb -in ref_blast/GCA_000003205.6_Btau_5.0.1_genomic.mask.fna -dbtype nucl" mkdir blast # extraction des reads for i in bwa1/*.bam do echo $i samtools view -h -f2 -q 30 $i CM001061.2 | grep -v XA:Z |samtools view -Shb - | samtools fasta - > blast/`basename $i | sed 's/\.bam/\.fasta/'` done # alignement for i in blast/*.fasta do sbatch -J blast_`basename $i |sed 's/.fasta//'` -o LOG/%x.out -e LOG/%x.err --wrap="blastn -query $i -out `echo $i | sed 's/\.fasta/\.blast/'` -db ref_blast/GCA_000003205.6_Btau_5.0.1_genomic.mask.fna" done
Résultats¶
#sample_name | CPU_time | MEM_used | nb_reads | nb_read_mapped | nb_read_properly_mapped | nb_read_mapped_on_chrY | percent_read_mapped_on_chrY | nb_read_properly_mapped_only_on_chrY | percent_read_properly_mapped_only_on_chrY | nb_read_mapq30_properly_mapped_only_on_chrY |
Emy10 | 00:23:56 | 5.62 GB | 1680356 | 1668864 | 1548578 | 586 | 0.0351137 | 399 | 68.0887 | 16 |
Emy11 | 00:12:40 | 5.74 GB | 635726 | 652803 | 526810 | 145 | 0.0222119 | 87 | 60 | 2 |
Emy12 | 00:06:45 | 5.29 GB | 354868 | 362510 | 274570 | 89 | 0.024551 | 40 | 44.9438 | 0 |
Emy13 | 00:08:37 | 5.79 GB | 450848 | 343499 | 320264 | 254 | 0.0739449 | 67 | 26.378 | 6 |
Emy14 | 00:22:33 | 6.76 GB | 1401822 | 1081970 | 1043656 | 880 | 0.0813331 | 286 | 32.5 | 10 |
Emy15 | 00:26:29 | 6.48 GB | 1022108 | 1097508 | 779532 | 509 | 0.0463778 | 264 | 51.8664 | 10 |
Emy16 | 00:21:32 | 6.44 GB | 569334 | 598064 | 406594 | 158 | 0.0264186 | 70 | 44.3038 | 0 |
Emy17 | 00:10:53 | 6.01 GB | 367904 | 414962 | 305324 | 164 | 0.0395217 | 86 | 52.439 | 0 |
Emy18 | 00:10:42 | 6.11 GB | 410440 | 417769 | 375546 | 82 | 0.0196281 | 50 | 60.9756 | 2 |
Emy19 | 00:38:21 | 7.59 GB | 1110116 | 1280700 | 926940 | 434 | 0.0338877 | 220 | 50.6912 | 6 |
Emy1 | 00:33:00 | 7.07 GB | 1550226 | 1701654 | 1305580 | 385 | 0.022625 | 189 | 49.0909 | 8 |
Emy20 | 00:02:56 | 4.92 GB | 188702 | 181417 | 174984 | 42 | 0.0231511 | 23 | 54.7619 | 2 |
Emy21 | 00:02:25 | 4.65 GB | 147720 | 131763 | 126294 | 83 | 0.0629919 | 20 | 24.0964 | 0 |
Emy22 | 00:01:42 | 4.63 GB | 79774 | 69844 | 63568 | 40 | 0.0572705 | 18 | 45 | 2 |
Emy23 | 00:12:29 | 6.02 GB | 443806 | 413871 | 318000 | 220 | 0.0531567 | 105 | 47.7273 | 4 |
Emy24 | 00:25:16 | 6.43 GB | 1042714 | 1040747 | 797926 | 296 | 0.0284411 | 137 | 46.2838 | 2 |
Emy25 | 00:24:36 | 6.73 GB | 801218 | 783511 | 603822 | 371 | 0.047351 | 157 | 42.3181 | 8 |
Emy26 | 00:08:29 | 5.53 GB | 390056 | 383760 | 308664 | 105 | 0.0273609 | 36 | 34.2857 | 0 |
Emy27 | 00:03:08 | 4.84 GB | 201608 | 200321 | 186110 | 45 | 0.0224639 | 25 | 55.5556 | 0 |
Emy28 | 00:15:32 | 5.76 GB | 1001470 | 1012655 | 890862 | 219 | 0.0216263 | 112 | 51.1416 | 6 |
Emy29 | 00:26:00 | 6.00 GB | 1589610 | 1673287 | 1363234 | 381 | 0.0227696 | 263 | 69.0289 | 11 |
Emy2 | 00:01:00 | 1.38 MB | 53178 | 50349 | 47714 | 18 | 0.0357505 | 9 | 50 | 0 |
Emy30 | 00:08:01 | 5.43 GB | 434296 | 342089 | 302992 | 333 | 0.0973431 | 79 | 23.7237 | 13 |
Emy31 | 00:01:58 | 4.69 GB | 429156 | 145136 | 141392 | 223 | 0.153649 | 66 | 29.5964 | 12 |
Emy3 | 00:15:46 | 6.27 GB | 599506 | 635532 | 486484 | 202 | 0.0317844 | 78 | 38.6139 | 1 |
Emy4 | 00:15:49 | 6.21 GB | 573952 | 650761 | 380622 | 185 | 0.0284283 | 82 | 44.3243 | 2 |
Emy5 | 00:08:51 | 5.70 GB | 491172 | 511635 | 375054 | 134 | 0.0261905 | 54 | 40.2985 | 2 |
Emy6 | 00:16:26 | 6.05 GB | 682338 | 804165 | 377532 | 257 | 0.0319586 | 110 | 42.8016 | 2 |
Emy7 | 00:17:35 | 6.12 GB | 749392 | 835071 | 532060 | 238 | 0.0285006 | 128 | 53.7815 | 4 |
Emy8 | 00:06:12 | 5.37 GB | 270596 | 246936 | 204308 | 162 | 0.065604 | 74 | 45.679 | 8 |
Emy9 | 00:28:43 | 6.53 GB | 1483164 | 1436478 | 1328500 | 365 | 0.0254094 | 221 | 60.5479 | 6 |