从文本文件中提取一组行

def get_features(ID, lines=None):
    """Return one ``[ID, position, residue]`` row per amino acid.

    Work in progress: the entry text is only scanned for lines containing
    ``SQ`` (each match is printed); the returned rows are still built from
    a hard-coded sample sequence, not from the scanned text.

    Args:
        ID: Accession used to build the download URL, e.g. ``'A0R4Q6'``.
        lines: Optional iterable of ``bytes`` lines to scan instead of
            downloading the entry. When ``None`` the entry is fetched.

    Returns:
        A list of ``[ID, position, residue]`` lists, positions 1-based.
    """
    if lines is None:
        # set and open link to uniprot website
        link = "https://www.uniprot.org/uniprot/{}.txt".format(ID)
        # Read everything inside the ``with`` so the connection is closed
        # even if reading fails part-way.
        with urllib.request.urlopen(link) as file:
            lines = file.readlines()

    # find amino acid sequence
    # NOTE: the iterator is deliberately not advanced with next() here;
    # doing so inside the loop consumed every other line, so the SQ header
    # could be skipped.
    for line in lines:
        if b'SQ' in line:
            print(line)
            # unsure how to extract more than 1 line
            # additionally, the number of lines that
            # I will need will be variable, depending on the protein length

    # this is what I think the extracted lines put into a string will look like
    aaSeq = 'MTQMLTRPDV\tDLVNGMFYAD\tGGAREAYRWM\tRANEPVFRDR\tNGLAAATTYQ\tAVLDAERNPE\nLFSSTGGIRP\tDQPGMPYMID'

    # remove \t and \n characters (raw string so \s reaches the regex engine
    # without relying on an invalid string escape)
    ActualSeq = re.sub(r'\s+', '', aaSeq)
    print(ActualSeq)

    # now iterate through the string to create dataframe?
    featureList = [[ID, p, i] for p, i in enumerate(ActualSeq, start=1)]
    return featureList


if __name__ == "__main__":
    seq = get_features('A0R4Q6')
    print(seq)

1条回答

网友

1楼 · 发布于 2024-05-23 23:07:12

要获得您请求的确切输出，请尝试以下操作：

def get_features(ID, lines=None):
    """Return one ``[ID, position, residue]`` row per sequence character.

    The entry text is scanned for a line starting with ``SQ   ``. The
    lines that follow and start with five spaces are treated as sequence
    data: all whitespace is removed and the pieces are concatenated.
    Collection stops at the first line that matches neither pattern.

    Args:
        ID: Accession used to build the download URL, e.g. ``'A0R4Q6'``.
        lines: Optional iterable of ``bytes`` lines to parse instead of
            downloading the entry. When ``None`` the entry is fetched.

    Returns:
        A list of ``[ID, position, residue]`` lists, positions 1-based.
        Empty when no sequence section is found.
    """
    if lines is None:
        # Set and open link to uniprot website
        link = "https://www.uniprot.org/uniprot/{}.txt".format(ID)
        # Read inside the ``with`` so the HTTP response is always closed.
        with urllib.request.urlopen(link) as response:
            lines = response.readlines()

    # Find amino acid sequence
    in_sequence = False
    chunks = []
    for line in lines:
        if line.startswith(b'SQ   '):
            in_sequence = True
        elif in_sequence and line.startswith(b'     '):
            # split()/join drops the spaces between groups and the newline.
            chunks.append(''.join(line.decode("utf-8").split()))
        else:
            # Any other line ends the sequence section; indented lines seen
            # later must not be mistaken for sequence data.
            in_sequence = False

    # Collect the pieces once instead of growing a string with ``+=``.
    sequence = ''.join(chunks)

    # Enumerate items
    return [[ID, position, residue]
            for position, residue in enumerate(sequence, start=1)]


# Build the per-residue rows for one accession and show them one per line.
seq = get_features('A0R4Q6')

for row in seq:
    print(row)

它将打印以下内容：

['A0R4Q6', 1, 'M']
['A0R4Q6', 2, 'T']
['A0R4Q6', 3, 'Q']
['A0R4Q6', 4, 'M']
['A0R4Q6', 5, 'L']
...

相关问题更多 >

编程相关推荐

热门问题

热门文章