【6.2.4】CHOPCHOP的部署

一、ChopChop

1.1 安装软件

cd /data/user/sam/project/crispr/lib
git clone https://bitbucket.org/valenlab/chopchop.git

cd chopchop

# 安装沙箱
virtualenv --no-site-packages venv

# 启动沙箱
source venv/bin/activate

pip install biopython==1.76
pip install pandas
pip install scipy
pip install numpy
pip install argparse
pip install scikit-learn==0.18.1

pip install mysql-python==1.2.3

1.2 下载数据库

基因组位置:

cd /data/database/homo
wget -c https://chopchop.cbu.uib.no/genomes/hg38.1.ebwt --no-check-certificate;
wget -c --no-check-certificate https://chopchop.cbu.uib.no/genomes/hg38.2.ebwt;
wget -c --no-check-certificate https://chopchop.cbu.uib.no/genomes/hg38.2bit;
wget -c --no-check-certificate https://chopchop.cbu.uib.no/genomes/hg38.3.ebwt;
wget -c --no-check-certificate https://chopchop.cbu.uib.no/genomes/hg38.4.ebwt;
wget -c --no-check-certificate https://chopchop.cbu.uib.no/genomes/hg38.rev.1.ebwt;
wget -c --no-check-certificate https://chopchop.cbu.uib.no/genomes/hg38.rev.2.ebwt

1.3 下载基因组(这一步可以不用,因为上面已经下载数据库了)

cd /data/database/homo

wget -c http://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz
gzip -d hg19.fa.gz
/data/software/tools/bowtie-1.0.1/bowtie-build -f hg19.fa hg19

wget -c http://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.2bit

cd /data/database/genome/hg38
wget -c http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz

1.4 gtfToGenePred

见上一篇博文

1.5 配置config

修改程序路径

vim config.json

{
  "PATH": {
    "PRIMER3": "/home/sam/project/crispr/lib/chopchop/primer3_core",
    "BOWTIE": "/home/sam/project/crispr/lib/chopchop/bowtie/bowtie",
    "TWOBITTOFA": "/home/sam/project/crispr/lib/chopchop/twoBitToFa",
    "TWOBIT_INDEX_DIR": "/data/database/homo",
    "BOWTIE_INDEX_DIR": "/data/database/homo",
    "ISOFORMS_INDEX_DIR": "/your/full/path/to/ebwt_transcriptome_folder_and_2bit_of_genome",
    "ISOFORMS_MT_DIR": "/your/full/path/to/vienna_MT_folder",
    "GENE_TABLE_INDEX_DIR": "/data/database/homo/genepred"
  },
  "THREADS": 1
}

二、使用和测试

source /home/sam/project/crispr/lib/chopchop/venv/bin/activate

/home/sam/project/crispr/lib/chopchop/chopchop.py -G hg38 -o temp -Target chr10:1000000-1001000

/home/sam/project/crispr/lib/chopchop/chopchop.py -G hg19 -o temp -Target NM_144906

cd /data/user/sam/project/crispr/lib/chopchop/tests

/data/user/sam/project/crispr/lib/chopchop/chopchop.py -G hg38 -o TRAC -Target TRAC  >trac.tsv

2.1 具体例子

cd /home/sam/project/crispr/lib/chopchop/tests

/home/sam/project/crispr/lib/chopchop/chopchop.py -G hg38 -o CD52 -Target CD52  --PAM  NAG 

/data/user/sam/project/crispr/lib/chopchop/chopchop.py -G hg38 -o CD52 -Target CD52  --PAM  NAG  

/data/user/sam/project/crispr/lib/chopchop/chopchop.py -G hg38 -o CD52 --PAM NAG -Target CD52/sequence.fa --fasta

/home/sam/project/crispr/lib/chopchop/chopchop.py -G hg38 -o CD52 -Target CD52 --PAM NGG -g 17 --scoringMethod DOENCH_2016 --consensusUnion

三、参数说明

source /home/sam/project/crispr/lib/chopchop/venv/bin/activate

/data/user/sam/project/crispr/lib/chopchop/chopchop.py -G $(species)  -o $(target) -Target $(target) --target $(region) --targetDownstreamPromoter $(DownstreamPromoter)  --targetUpstreamPromoter $(UpstreamPromoter)   --PAM $(PAM) --filterGCmin $(filterGCmin) --filterGCmax $(filterGCmax)  -filterSelfCompMax $(filterSelfCompMax) --maxMismatches $(maxMismatches)  --scoringMethod $(scoringMethod) --guideSize $(guideSize) >result.txt


 /data/user/sam/project/crispr/lib/chopchop/chopchop.py -G hg38 -o CD52   --PAM  NAG -Target CD52 -filterSelfCompMax  0 --filterGCmin 10 --filterGCmax 80  -filterSelfCompMax 0 --maxMismatches 3 --scoringMethod DOENCH_2016

--consensusUnion 加上这个参数,就意味着在根据多个转录本(isoform)计算共有序列时取并集(union);默认值为 False,即取交集(intersection)。

--PAM NAG >result.tsv

说明:

1. Supports RefSeq/ENSEMBL/gene name or genomic coordinates as input.
2. "-Target", "--targets", type=str, help="Target genes or regions", required=True
"-G", "--genome", default="danRer7", metavar="GENOME", help="The genome to search."
"-T", "--MODE", default=1, type=int, choices=[1, 2, 3, 4], help="Set mode (int): default is Cas9 = 1, Talen = 2, Cpf1 = 3, Nickase = 4"

"-t", "--target", default="CODING", dest="targetRegion", help="Target the whole gene CODING/WHOLE/UTR5/UTR3/SPLICE   / PROMOTER. Default is CODING.")


"-TDP", "--targetDownstreamPromoter", default=200, type=int, help="how many bp to target downstream of TSS"
"-TUP", "--targetUpstreamPromoter", default=200, type=int, help="how many bp to target upstream of TSS"

"-e", "--exon", help="Comma separated list of exon indices. Only find sites in this subset. ", metavar="EXON_NUMBER", dest="exons")

"-consensusUnion", "--consensusUnion", default=False, action="store_true", help="When calculating consensus sequence from multiple isoforms default uses intersection. This option specifies union of isoforms.")


"-filterGCmin", "--filterGCmin", default=0, type=int, help="Minimum required GC percentage. Default is 0.")
"-filterGCmax", "--filterGCmax", default=100, type=int, help="Maximum allowed GC percentage. Default is 100.")
-filterSelfCompMax FILTERSELFCOMPMAX, --filterSelfCompMax FILTERSELFCOMPMAX
                        Maximum acceptable Self-complementarity score. Default
                        is -1, no filter.
                        
    
-g GUIDE_SIZE, --guideSize GUIDE_SIZE
                        The size of the guide RNA.
 
"-M", "--PAM", type=str, help="The PAM motif."
"-F", "--fasta", default=False, action="store_true", help="Use FASTA file as input rather than gene or genomic region.")

"-v", "--maxMismatches", default=3, type=int, choices=[0, 1, 2, 3], metavar="MAX_MISMATCHES", help="The number of mismatches to check across the sequence.")

"-m", "--maxOffTargets", metavar="MAX_HITS", help="The maximum number of off targets allowed."


-scoringMethod {XU_2015,DOENCH_2014,DOENCH_2016,MORENO_MATEOS_2015,CHARI_2015,G_20,KIM_2018,ALL}, --scoringMethod {XU_2015,DOENCH_2014,DOENCH_2016,MORENO_MATEOS_2015,CHARI_2015,G_20,KIM_2018,ALL}
                        Scoring used for Cas9 and Nickase. Default is G_20

"-repairPredictions", "--repairPredictions", default=None, type=str,
                        choices=['mESC', 'U2OS', 'HEK293', 'HCT116', 'K562'], help="Use inDelphi from Shen et al 2018 to predict repair profiles for every guideRNA, this will make .repProfile and .repStats files")
                                    
 -isoforms, --isoforms
                        Search for offtargets on the transcriptome.

参考资料

药企,独角兽,苏州。团队长期招人,感兴趣的都可以发邮件聊聊:tiehan@sina.cn
个人公众号,比较懒,很少更新,可以在上面提问题,如果回复不及时,可发邮件给我: tiehan@sina.cn