使用 Python 进行网页抓取时，BeautifulSoup 得到的内容与 Chrome 检查元素（Inspect）中看到的不一致

import string

import requests
from bs4 import BeautifulSoup


def getSequence():
    """Prompt for a protein name and try to scrape its sequence from NCBI.

    The name is searched on the NCBI protein page, the text between the first
    ``<dd>``/``</dd>`` pair found in the result is used as the accession, and
    the accession page is then fetched and scanned for "ff_line" markers.

    Returns the list built in the second step, or None when the input is
    empty or the second step raises (a message is printed in that case).
    """
    searchProt = input("Enter a Protein Name!:")
    if searchProt == '':
        return None

    searchString = "https://www.ncbi.nlm.nih.gov/protein/?term=" + searchProt
    page = requests.get(searchString)
    soup = str(BeautifulSoup(page.text, 'html.parser'))

    # Slice out whatever sits between the first <dd> ... </dd> pair located
    # after the first occurrence of "a" in the serialized page.
    accIndex = soup.find("a")
    accessionStart = soup.find('<dd>', accIndex) + 4
    accessionEnd = soup.find('</dd>', accessionStart)
    accession = soup[accessionStart:accessionEnd]

    newSearchString = "https://www.ncbi.nlm.nih.gov/protein/" + accession
    try:
        newPage = requests.get(newSearchString)  # This is where it fails
        newSoup = BeautifulSoup(newPage.text, 'html.parser')
        aaList = []
        spaceCount = newSoup.count("ff_line")
        print(spaceCount)
        for _ in range(spaceCount):
            startIndex = newSoup.find("ff_line")
            startIndex = newSoup.find(">", startIndex) + 2
            nextAA = newSoup[startIndex]
            # Collect consecutive lowercase letters starting at startIndex.
            while nextAA in string.ascii_lowercase:
                aaList.append(nextAA)
                startIndex += 1
                nextAA = newSoup[startIndex]
        return aaList
    except:
        print("Please Enter a Valid Protein")

1条回答

网友

1楼 · 发布于 2024-04-26 14:20:33

下面这段代码使用 Selenium 提取你想要的蛋白质序列。我在你的原始代码基础上做了修改，以得到你想要的结果：

from bs4 import BeautifulSoup
from selenium import webdriver
import requests

# Module-level Firefox WebDriver instance; getSequence() uses it to load the
# record page and read the resulting page source.
driver = webdriver.Firefox()

def getSequence():
    """Prompt for a protein name and return its sequence scraped from NCBI.

    The name is searched on the NCBI protein page with ``requests``; the text
    of the first ``<dd>...</dd>`` pair in the result is taken as the
    accession. The accession page is then loaded through the module-level
    Selenium ``driver`` and the text of every element with class ``ff_line``
    is concatenated with spaces removed.

    Returns:
        The concatenated sequence string (empty if the page has no
        ``ff_line`` elements), or None when the input is blank, no accession
        could be found, or the browser failed to load the page. In the last
        two cases "Please Enter a Valid Protein" is printed.

    Raises:
        requests.RequestException: if the initial search request fails or
            times out.
    """
    # Imported locally so the function carries its own dependency; selenium
    # is already required by this script.
    from selenium.common.exceptions import WebDriverException

    searchProt = input("Enter a Protein Name!:").strip()
    if searchProt == '':
        return None

    # Let requests build the query string so spaces and special characters
    # in the user's input are URL-encoded; the timeout avoids hanging forever.
    page = requests.get(
        "https://www.ncbi.nlm.nih.gov/protein/",
        params={"term": searchProt},
        timeout=30,
    )
    soup = str(BeautifulSoup(page.text, 'html.parser'))

    # str.find returns -1 when the marker is missing; without this check a
    # missing <dd> would silently yield a garbage accession and URL.
    accessionStart = soup.find('<dd>')
    accessionEnd = -1
    if accessionStart != -1:
        accessionEnd = soup.find('</dd>', accessionStart + 4)
    if accessionEnd == -1:
        print("Please Enter a Valid Protein")
        return None

    accession = soup[accessionStart + 4:accessionEnd].strip()
    if not accession:
        print("Please Enter a Valid Protein")
        return None

    newSearchString = "https://www.ncbi.nlm.nih.gov/protein/" + accession
    # The record page is loaded in a real browser rather than with requests,
    # presumably because the sequence is inserted by JavaScript -- confirm.
    try:
        driver.get(newSearchString)
        html = driver.page_source
    except WebDriverException:
        # Only browser/navigation failures are treated as "invalid protein";
        # programming errors and Ctrl-C are no longer swallowed.
        print("Please Enter a Valid Protein")
        return None

    newSoup = BeautifulSoup(html, "lxml")
    ff_tags = newSoup.find_all(class_="ff_line")
    return "".join(tag.text.strip().replace(" ", "") for tag in ff_tags)

# Always shut the browser down, even if scraping raises, so that no Firefox
# process is left running after the script ends.
try:
    sequence = getSequence()
    print(sequence)
finally:
    driver.quit()

为输入“p53”生成以下输出：

^{pr2}$

相关问题更多 >

编程相关推荐

热门问题

热门文章