Question

我正在尝试在将数据批量处理为Neo4j之前将数据解析为csv。我在关系中使用了大量数据，想知道是否有人可以帮助加快下面的交易。它是基因组数据，有2000个样本，每个样本每个染色体最多有3.5m个变异体，可能相当于约40m个变异体，每个样本最多有两行，因此总共约有12,000,000,000行。目前，每个样品大约要花20分钟。有没有人对如何改善这一点有任何建议：

import sys
import time
import datetime
import numpy as np
import allel
import zarr
import numcodecs
import os
import pandas
import csv
import math
import dask.array as da
vcf_directory = '/media/user/Seagate Backup Plus Drive/uk_alspac/'
zarr_path = vcf_directory + 'chroms.zarr'
callset = zarr.open_group(zarr_path, mode='r')
samples_fn = '/media/user/Seagate Backup Plus Drive/uk_alspac/phenotype_data/EGAZ00001016605_UK10K_ALSPAC_Phenotype_Data_August_2013_1867samples.txt'
panel = pandas.DataFrame.from_csv(samples_fn, sep='\t')
chrom_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,'X']
i = 0
j = 0
seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'
with open('/media/user/Seagate Backup Plus Drive/uk_alspac/Import_files/sample_variants.csv', 'w') as csvfile:
filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
filewriter.writerow([':START_ID(Sample)','type', 'hapA', 'hapB','genotype', 'seqtech', 'read_depth', 'phase_set','GP0','GP1','GP2','PL0','PL1','PL2', ':END_ID(Variant)'])
for chrom in chrom_list:
    print(chrom)
    datetime_object = datetime.datetime.now()
    print(datetime_object)
    sampledata = callset[chrom]['samples']
    samples = list(sampledata)
    variants = allel.VariantChunkedTable(callset[chrom]['variants'], names=['AC','AF_AFR', 'AF_AMR', 'AF_ASN', 'AF_EUR', 'AF_MAX', 'CGT', 'CLR', 'CSQ', 'DP', 'DP4', 'ESP_MAF', 'FILTER_LowQual', 'FILTER_MinHWE', 'FILTER_MinVQSLOD', 'FILTER_PASS', 'HWE', 'ICF', 'ID', 'IS', 'PC2', 'PCHI2', 'POS', 'PR', 'QCHI2', 'QUAL', 'REF', 'ALT', 'INDEL', 'SHAPEIT', 'SNP_ID', 'TYPE', 'UGT', 'VQSLOD', 'dbSNPmismatch', 'is_snp', 'numalt', 'svlen'], index='POS')
    pos = variants['POS'][:]
    SNPid = variants['ID'][:]
    ref = variants['REF'][:]
    alt = variants['ALT'][:]
    dp = variants['DP'][:]
    ac = variants['AC'][:]
    vartype = variants['TYPE'][:]
    svlen = variants['svlen'][:]
    qual = variants['QUAL'][:]
    vq = variants['VQSLOD'][:]
    numalt = variants['numalt'][:]
    csq = variants['CSQ'][:]
    vcfv = 'VCFv4.1'
    refv = 'https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.13/'
    calldata = callset[chrom]['calldata']
    dpz = calldata['DP']
    psz = calldata['PS']
    plz = calldata['PL']
    gpz = calldata['GP']
    gtz = calldata['GT']
    i = 0
    j = 0
    seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'
    for i in range(50):
        print("Chrom " + str(chrom) + "sample" + str(i))
        print(datetime_object)
        gt = gtz[:,i].tolist()
        hap1, hap2 = zip(*gt)
        dp = dpz[:, i]
        ps = psz[:, i]
        pl = plz[:, i]
        gp = gpz[:, i]
        subject = samples[i]
        for j in range(len(pos)): 
            h1 = int(hap1[j])
            h2 = int(hap2[j])
            read_depth = int(dp[j])
            ps1 = int(ps[j])
            PL0 = int(pl[j][0])
            PL1 = int(pl[j][1])
            PL2 = int(pl[j][2])        
            GP0 = float(gp[j][0])
            GP1 = float(gp[j][1])
            GP2 = float(gp[j][2])

            if h1 == 0 and h2 == 0:
                filewriter.writerow([subject,"Homozygous",h1 ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])
            elif h1 == 0 and h2 > 0:  
                filewriter.writerow([subject,"Heterozygous - Haplotype A",h1 ,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])
                filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])
            elif h1 > 0 and h2 == 0:
                filewriter.writerow([subject,"Heterozygous - Haplotype A",h1, '', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])
                filewriter.writerow([subject,"Heterozygous - Haplotype B",'' ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])
            elif h1 == h2 and h1 > 0:
                filewriter.writerow([subject,"Homozygous",h1, h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])     
            else:
                filewriter.writerow([subject,"Heterozygous - Haplotype A",h1,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])
                filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])

这一切都是临时完成的，我还是python的新手，所以任何提示都将不胜感激。

加快迭代python

0 个答案: