在 Python 中将文本文件解析为 CSV

12: IBD08; ANALYSIS AND CHARACTERISATION OF THE FAECAL MICROBIAL DEGRADOME IN INFLAMMATORY BOWEL DISEASE Identifiers: BioSample: SAMEA3914946; SRA: ERS1102080 Organism: Homo sapiens Attributes: /sample name="ERS1102080" /collection date="2011" /environment biome="Intestine" /environment feature="Colon" /environment material="Faecal" /geographic location (country and/or sea)="United Kingdom" /host body product="Faeces" /host disease status="Healthy" /human gut environmental package="human-gut" /investigation type="metagenome" /latitude (raw)="51??31'03.3" /longitude (raw)="0??10'25.2" /project name="IBD gut" /sequencing method="Illumina Miseq" Description: Multi 'omic analysis of the gut microbiome in IBD Accession: SAMEA3914946 ID: 5788180 2: qiita_sid_833:833.Sweden.IBD.102A; 833.Sweden.IBD.102A Identifiers: BioSample: SAMEA3924619; SRA: ERS1111753 Organism: gut metagenome Attributes: /sample name="ERS1111753" /sex="male" /age="3.9" /age group="2.0" /age unit="years" /altitude="0" /anonymized name="Sweden.IBD.102A" /antibiotics="definite_no" /assigned from geo="False" /barcodesequence="CTGCTATTCCTC" /body habitat="UBERON:feces" /body product="UBERON:feces" /tissue="UBERON:feces" /breed="Great_Dane" /breed grouping="Working" /collection date="1/30/12" /collection timestamp="1/30/12" /common name="gut metagenome" /geographic location="Sweden: GAZ" /depth="0" /disease="IBD" /dna extracted="True" /elevation="13.02" /emp status="NOT_EMP" /environment biome="ENVO:urban biome" /environment feature="ENVO:animal-associated habitat" /env matter="ENVO:feces" /experiment center="Texas A&M" /experiment design description="Fecal samples from dogs of various breeds, places of origin, and severity of bowel disorder were sequencing to obtain a dog gut metagenome." 
/experiment title="suchodolski_dog_ibd" /gender specific="M" /has extracted data="True" /has physical specimen="True" /histo="both" /host="domestic dog" /host="Canis lupus familiaris" /host subject id="Sweden.IBD.102A" /host taxonomy ID="9615" /illumina technology="HiSeq" /latitude="60.13" /library construction protocol="This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers F515 and R806 were developed against the V4 region of the 16S rRNA, both bacteria and archaea, which we determined would yield optimal community clustering with reads of this length The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions." /linker="GT" /linkerprimersequence="GTGCCAGCMGCCGCGGTAA" /longitude="18.64" /pcr primers="FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT" /physical location="CCME" /physical specimen location="Texas A&M" /physical specimen remaining="False" /platform="Illumina" /platformchemistry="HiSeq_V4" /pool name="R.K.1.20.12" /primer plate="1" /public="False" /required sample info status="completed" /run center="CCME" /run date="1/30/12" /run prefix="Suchodolski_dog_ibd" /sample size="0.1, gram" /sample center="Texas A&M" /sample plate="IBD1" /sequencing meth="sequencing by synthesis" /size grouping="large" /study center="Texas A&M" /target gene="16S rRNA" /target subfragment="V4" /title="Suchodolski_dog_ibd" /total mass="54.0" /weight group="5.0" /weight kg="54.0" /well id="H6" Description: IBD1_Sweden_IBD_102A_H6_R.K.1.20.12 Accession: SAMEA3924619 ID: 5507372

1条回答

网友

1楼 · 发布于 2024-06-09 02:29:14

您的两个示例具有非常不同的字段，但您仍然可以创建一个CSV，其中包含您需要的所有字段，如下所示：

from itertools import groupby, takewhile, ifilter
import re
import csv

# Columns taken from "<name>: <value>" lines inside each record.
sub_headings = ['Identifiers', 'Organism']
# Fixed columns written after the sub-heading columns and before the
# alphabetically sorted attribute columns.
top_fields = ['Description', 'id', 'Accession', 'AccessionID']

# A record starts with a line such as "12: IBD08; ...".
RE_RECORD_START = re.compile(r'\d+: ')
# An attribute line such as '    /sample size="0.1, gram"'.
# The same pattern is used for both passes so the CSV header and the row
# keys can never disagree (DictWriter rejects unknown keys).
RE_ATTRIBUTE_LINE = re.compile(r'    /(.*?)="(.*?)"')
# 'Accession: SAMN00030407\tID: 30407'
RE_ACCESSION = re.compile(r'Accession: (.*?)\tID: (.*?)$')


def collect_attribute_fields(lines):
    """Return the sorted, de-duplicated attribute names found in *lines*.

    Only lines shaped like '    /name="value"' contribute a name.
    """
    names = set()
    for line in lines:
        match = RE_ATTRIBUTE_LINE.match(line)
        if match:
            names.add(match.group(1))
    return sorted(names)


def iter_records(lines):
    """Yield ``(heading, body_lines)`` for every record in *lines*.

    Blank lines are dropped.  Anything before the first heading is
    ignored, and a heading that has no body lines produces no record.
    Body lines are yielded unmodified (including their line endings).
    """
    heading = None
    body = []
    for line in lines:
        if not line.strip():
            continue
        if RE_RECORD_START.match(line):
            if heading is not None and body:
                yield heading, body
            heading = line.strip()
            body = []
        elif heading is not None:
            body.append(line)
    if heading is not None and body:
        yield heading, body


def parse_record(heading, lines):
    """Build the CSV row dict for one record.

    *heading* is stored under ``'id'``; *lines* are the record's non-blank
    body lines.  Lines that do not have the expected shape are skipped
    instead of aborting the whole conversion.
    """
    row = {'id': heading}

    for line_number, line in enumerate(lines):
        for sub_heading in sub_headings:
            if line.startswith(sub_heading):
                # partition() yields '' rather than raising when the colon
                # is missing.
                row[sub_heading] = line.partition(':')[2].strip()

        if line.startswith('Attributes:'):
            following = lines[line_number + 1:]
            for attribute in takewhile(lambda x: x.startswith('    /'), following):
                match = RE_ATTRIBUTE_LINE.match(attribute)
                if match:
                    name, value = match.groups()
                    row[name] = value

        if line.startswith('Description:'):
            # Blank lines were already removed, so the description text is
            # the line directly after the marker; use that line only.
            if line_number + 1 < len(lines):
                row['Description'] = lines[line_number + 1].strip()

        if line.startswith('Accession:'):
            match = RE_ACCESSION.match(line)
            if match:
                accession, accession_id = match.groups()
                row.update({'Accession': accession, 'AccessionID': accession_id})

    return row


def convert(input_path='projects.txt', output_path='output.csv'):
    """Convert the record text file at *input_path* to a CSV at *output_path*.

    The input is read twice: first to learn every attribute name that is
    used (they become columns), then to write one row per record.
    """
    with open(input_path) as f_projects:
        attribute_fields = top_fields + collect_attribute_fields(f_projects)

    # 'wb' is the mode the csv module expects on Python 2, which this
    # script targets.
    with open(input_path) as f_projects, open(output_path, 'wb') as f_output:
        csv_output = csv.DictWriter(f_output, fieldnames=sub_headings + attribute_fields)
        csv_output.writeheader()

        for heading, lines in iter_records(f_projects):
            csv_output.writerow(parse_record(heading, lines))


if __name__ == '__main__':
    convert()

这将生成一个相当稀疏的输出CSV，如下所示：

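Identifiers,Organism,Description,id,Accession,AccessionID,age,age group,age unit,...,weight group,weight kg,well id

（原帖此处的示例输出在转载时丢失，只剩下占位符；以上仅为根据代码和问题中的两条示例记录推导出的表头示意，中间的属性列已省略。某条记录中没有出现的属性，其对应的单元格为空。）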

已在 Python 2.7.12 上测试（代码从 itertools 导入了 ifilter，并以 'wb' 模式打开 CSV 输出文件，因此只能在 Python 2 下运行）。

相关问题更多 >

编程相关推荐

热门问题

热门文章