使用Selenium和BeautifulSoup提取NCBI RefSeq和提交的GenBank组装号

0 投票
1 回答
108 浏览
提问于 2025-04-13 20:14

我之前发过一篇帖子(标题:使用物种和菌株名称提取GenBank登录号的困难,使用BeautifulSoup或Selenium进行网页抓取)。在那篇帖子的基础上,我正在尝试使用Python中的Selenium和BeautifulSoup从网页中提取NCBI RefSeq和提交的GenBank组装登录号。但是,我遇到了一个问题:当某个基因组只有一个组装时,之前的代码就不起作用了,因为此时搜索会直接打开另一个不同的页面(组装详情页,而不是结果列表页)。

为了解决这个问题,我尝试了另一种方法:

这段代码

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Script: look up the NCBI RefSeq and submitted GenBank assembly accessions
# for one organism/strain by loading the NCBI Assembly search page in Chrome.
# It first tries the structured "assembly-widget" table and then falls back to
# scanning the visible text of the main content area.

# Define the search term
search_term = "Streptomyces anthocyanicus NBC 01687"

# Open a Chrome browser
driver = webdriver.Chrome()

# Initialise everything the later parsing step reads, so that a timeout while
# loading the page cannot leave these names undefined.
assembly_info = ""
genbank_assembly = None
refseq_assembly = None

try:
    # Construct the search URL for assembly
    search_url = f"https://www.ncbi.nlm.nih.gov/assembly/?term={search_term.replace(' ', '+')}"

    # Navigate to the search URL
    driver.get(search_url)

    try:
        # Wait for the main content to be visible; until() either returns the
        # element or raises TimeoutException, so no None check is needed.
        main_content = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.ID, "maincontent"))
        )

        # Keep the visible text for the text-based fallback below
        assembly_info = main_content.text

        # Extract GenBank and RefSeq assembly IDs if the assembly widget is present
        try:
            assembly_table = driver.find_element(By.CLASS_NAME, "assembly-widget")
            for row in assembly_table.find_elements(By.TAG_NAME, "tr"):
                cells = row.find_elements(By.TAG_NAME, "td")
                if len(cells) != 3:
                    continue
                label = cells[1].text.strip()
                assembly_id = cells[2].text.strip()
                if label == "NCBI RefSeq assembly":
                    refseq_assembly = assembly_id
                elif label == "Submitted GenBank assembly":
                    genbank_assembly = assembly_id
        except NoSuchElementException:
            print("Assembly information widget not found.")

    except TimeoutException:
        print("Elements not found or timed out waiting for them to appear.")

finally:
    # Close the browser even if something unexpected was raised above
    driver.quit()

# Fallback: scan the page text, where each label is expected to be followed by
# its accession on the next line. Pairing each line with its successor means a
# label on the very last line is simply ignored instead of raising IndexError.
text_refseq = None
text_genbank = None
lines = assembly_info.split("\n")
for line, next_line in zip(lines, lines[1:]):
    if "NCBI RefSeq assembly" in line:
        text_refseq = next_line.strip()
    elif "Submitted GenBank assembly" in line:
        text_genbank = next_line.strip()

# Prefer the widget values; use the text scan only for what is still missing,
# so each accession is printed once.
refseq_assembly = refseq_assembly or text_refseq
genbank_assembly = genbank_assembly or text_genbank

# Print the assembly IDs if found
if refseq_assembly:
    print("NCBI RefSeq assembly:", refseq_assembly)
if genbank_assembly:
    print("Submitted GenBank assembly:", genbank_assembly)

输出是

Assembly information widget not found.
NCBI RefSeq assembly: GCF_036226945.1
Submitted GenBank assembly: GCA_036226945.1

但是,这段代码读取的是整个页面的文本,然后我再从这些文本中解析出RefSeq和GenBank登录号。

不过,我觉得这不是一个好的方法,应该有更好的方式来实现这个目标。

我找到的另一种方法是

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# Script: open the NCBI Assembly search page for one organism/strain and print
# the text of the parent of every element whose text contains a given label.

# Define the search term
search_term = "Streptomyces anthocyanicus NBC 01687"

# Text that is actually searched for on the page. The found/not-found messages
# report this label rather than search_term, so they match the XPath query.
target_label = "NCBI RefSeq assembly"

# Open a Chrome browser
driver = webdriver.Chrome()

try:
    # Construct the search URL for assembly
    search_url = f"https://www.ncbi.nlm.nih.gov/assembly/?term={search_term.replace(' ', '+')}"

    # Navigate to the search URL
    driver.get(search_url)

    # Find elements whose own text contains the label
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{target_label}')]")

    if elements:
        print(f"Text '{target_label}' found on the webpage.")
        for element in elements:
            # ".." selects the parent; use "following-sibling::*[1]" for the
            # next sibling or "../.." for the grandparent instead.
            parent_element = element.find_element(By.XPATH, "..")
            # Print the text content of the parent element
            print("Parent element:")
            print(parent_element.text)
    else:
        print(f"Text '{target_label}' not found on the webpage.")

except Exception as e:
    # Top-level boundary of the script: report the failure instead of crashing
    print("An error occurred:", e)

finally:
    # Quit the browser
    driver.quit()

但我想沿用之前那篇帖子(标题:使用物种和菌株名称提取GenBank登录号的困难,使用BeautifulSoup或Selenium进行网页抓取)中的方法来处理这个页面,这样我就可以在一个脚本中收集所有信息。

有人能给我建议一个合适的代码来实现这个吗?请帮帮我。

提前谢谢你!

1 个回答

1

我把你的代码简化了一下,得到了这段可以正常运行的代码。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Script: print the NCBI RefSeq and submitted GenBank assembly accessions shown
# on the NCBI Assembly page that the search for one organism/strain opens.

# Define the search term
search_term = "Streptomyces anthocyanicus NBC 01687"

# Labels of the <dt> entries whose following <dd>/<span> holds the accession
assembly_labels = ("NCBI RefSeq assembly", "Submitted GenBank assembly")

# Open a Chrome browser
driver = webdriver.Chrome()

# Accessions that were found, keyed by label
accessions = {}

try:
    # Construct the search URL for assembly
    search_url = f"https://www.ncbi.nlm.nih.gov/assembly/?term={search_term.replace(' ', '+')}"

    # Navigate to the search URL
    driver.get(search_url)

    wait = WebDriverWait(driver, 10)

    # Look each label up on its own, so that a missing RefSeq entry does not
    # prevent the GenBank accession from being reported (and vice versa).
    for label in assembly_labels:
        xpath = f"//dt[text()='{label}']//following::dd/span"
        try:
            element = wait.until(EC.visibility_of_element_located((By.XPATH, xpath)))
        except TimeoutException:
            print(f"{label}: element not found or timed out waiting for it to appear.")
            continue
        accessions[label] = element.text
        print(f"{label}:", accessions[label])

finally:
    # Close the browser even if an unexpected error was raised above
    driver.quit()

它会输出

NCBI RefSeq assembly: GCF_036226945.1      
Submitted GenBank assembly: GCA_036226945.1

撰写回答