使用Selenium和BeautifulSoup提取NCBI RefSeq和提交的GenBank组装号
接着之前的帖子《使用物种和菌株名称、通过网页抓取(BeautifulSoup或Selenium)提取GenBank登录号的困难》,我正在尝试使用Python中的Selenium和BeautifulSoup从网页中提取NCBI RefSeq和提交的GenBank组装登录号。但是,我遇到了一个问题:之前的代码在处理只有一个组装的基因组时不起作用,因为这种情况下打开的是一个不同的页面。
为了解决这个问题,我尝试了另一种方法,代码如下:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
# Look up a strain on the NCBI Assembly site and print its RefSeq / GenBank
# assembly IDs, first from the "assembly-widget" table rows and then from
# the plain text of the page's main content.

# Define the search term
search_term = "Streptomyces anthocyanicus NBC 01687"
# Construct the search URL for assembly (spaces become '+' in the query string)
search_url = f"https://www.ncbi.nlm.nih.gov/assembly/?term={search_term.replace(' ', '+')}"
# Start with empty text so the text-based parsing below still runs (and simply
# finds nothing) when the page times out, instead of hitting an unbound name.
assembly_info = ""
# Open a Chrome browser
driver = webdriver.Chrome()
try:
    # Navigate to the search URL
    driver.get(search_url)
    try:
        # Wait for the main content to be visible and keep its text
        main_content = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.ID, "maincontent"))
        )
        assembly_info = main_content.text
        # Extract GenBank and RefSeq assembly IDs if the assembly widget is present
        try:
            assembly_table = driver.find_element(By.CLASS_NAME, "assembly-widget")
            for row in assembly_table.find_elements(By.TAG_NAME, "tr"):
                cells = row.find_elements(By.TAG_NAME, "td")
                # Only three-cell rows are read: label in cell 1, ID in cell 2
                if len(cells) != 3:
                    continue
                label = cells[1].text.strip()
                assembly_id = cells[2].text.strip()
                if label == "NCBI RefSeq assembly":
                    print("NCBI RefSeq assembly:", assembly_id)
                elif label == "Submitted GenBank assembly":
                    print("Submitted GenBank assembly:", assembly_id)
        except NoSuchElementException:
            print("Assembly information widget not found.")
    except TimeoutException:
        print("Elements not found or timed out waiting for them to appear.")
finally:
    # Close the browser even if navigation or scraping raised unexpectedly;
    # the text parsing below no longer needs the driver.
    driver.quit()

# Initialize variables to store assembly IDs
genbank_assembly = None
refseq_assembly = None
# Walk the text as (label line, following line) pairs; pairing stops before
# the last line, so a label on the final line cannot index past the end.
lines = assembly_info.split("\n")
for label_line, value_line in zip(lines, lines[1:]):
    if "NCBI RefSeq assembly" in label_line:
        refseq_assembly = value_line.strip()
    elif "Submitted GenBank assembly" in label_line:
        genbank_assembly = value_line.strip()
# Print the assembly IDs if found
if refseq_assembly:
    print("NCBI RefSeq assembly:", refseq_assembly)
if genbank_assembly:
    print("Submitted GenBank assembly:", genbank_assembly)
输出是
Assembly information widget not found.
NCBI RefSeq assembly: GCF_036226945.1
Submitted GenBank assembly: GCA_036226945.1
但是,这段代码读取的是整个页面的文本,然后我再从中解析、提取RefSeq和GenBank登录号。
不过,我觉得这不是一个好的方法,应该有更好的方式来实现这个目标。
我找到的另一种方法是
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
# Open the NCBI Assembly search for a strain, find every element whose text
# contains a given label, and print the text of each match's parent element.

# Define the search term
search_term = "Streptomyces anthocyanicus NBC 01687"
# Label text that is searched for on the page. The status messages below
# report this label, because it (not the search term) is what the XPath matches.
label_text = "NCBI RefSeq assembly"
# Open a Chrome browser
driver = webdriver.Chrome()
try:
    # Construct the search URL for assembly
    search_url = f"https://www.ncbi.nlm.nih.gov/assembly/?term={search_term.replace(' ', '+')}"
    # Navigate to the search URL
    driver.get(search_url)
    # Find elements whose own text contains the label
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{label_text}')]")
    if elements:
        print(f"Text '{label_text}' found on the webpage.")
        for element in elements:
            # ".." selects the parent; use "../.." for the grandparent or
            # "following-sibling::*[1]" for the next sibling instead.
            parent_element = element.find_element(By.XPATH, "..")
            # Print the text content of the parent element
            print("Parent element:")
            print(parent_element.text)
    else:
        print(f"Text '{label_text}' not found on the webpage.")
except Exception as e:
    # Top-level boundary of the script: report the failure and fall through
    # to the cleanup below.
    print("An error occurred:", e)
finally:
    # Quit the browser
    driver.quit()
但我想把对这种页面的处理整合到之前那篇帖子(《使用物种和菌株名称、通过网页抓取(BeautifulSoup或Selenium)提取GenBank登录号的困难》)的方法中,这样我就可以在一个脚本中收集所有信息。
有人能给我建议一个合适的代码来实现这个吗?请帮帮我。
提前谢谢你!
1 个回答
1
我把你的代码简化了一下,得到了这段可以正常运行的代码。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
# Open the NCBI Assembly search for a strain and print the RefSeq and GenBank
# assembly IDs located via the <dt> label text on the page.

# Define the search term
search_term = "Streptomyces anthocyanicus NBC 01687"
# Construct the search URL for assembly
search_url = f"https://www.ncbi.nlm.nih.gov/assembly/?term={search_term.replace(' ', '+')}"
# Open a Chrome browser
driver = webdriver.Chrome()
try:
    # Navigate to the search URL
    driver.get(search_url)
    wait = WebDriverWait(driver, 10)
    # Extract GenBank and RefSeq assembly IDs. Both lookups share one handler,
    # so a timeout on the first one skips the second as well.
    try:
        refseq_assembly = wait.until(
            EC.visibility_of_element_located(
                (By.XPATH, "//dt[text()='NCBI RefSeq assembly']//following::dd/span")
            )
        ).text
        print("NCBI RefSeq assembly:", refseq_assembly)
        genbank_assembly = wait.until(
            EC.visibility_of_element_located(
                (By.XPATH, "//dt[text()='Submitted GenBank assembly']//following::dd/span")
            )
        ).text
        print("Submitted GenBank assembly:", genbank_assembly)
    except TimeoutException:
        print("Elements not found or timed out waiting for them to appear.")
finally:
    # Close the browser on every path, including exceptions other than
    # TimeoutException, so no Chrome process is left running.
    driver.quit()
它会输出
NCBI RefSeq assembly: GCF_036226945.1
Submitted GenBank assembly: GCA_036226945.1