简易爬虫

# Walk the govinfo Public Law (PLAW) sitemaps and record every package whose
# MODS metadata carries the statute citation "106 Stat. 4845".
# NOTE(review): `requests` and `BeautifulSoup` are used but not imported in
# this snippet; the surrounding script is assumed to import them -- confirm.

# global variables
start = 'https://www.govinfo.gov/wssearch/getContentDetail?packageId='
dash = '-'
urlSitemap = "https://www.govinfo.gov/sitemap/PLAW_sitemap_index.xml"

# The sitemap index lists one <loc> per per-year PLAW sitemap.
sitemap_index_response = requests.get(urlSitemap)
print(sitemap_index_response)
sitemap_index = BeautifulSoup(sitemap_index_response.content)

for loc in sitemap_index.findAll('loc'):
    sitemap_response = requests.get(loc.text)
    print(sitemap_response)
    sitemap = BeautifulSoup(sitemap_response.content)

    for pls in sitemap.findAll('loc'):
        # The package id is everything from "PLAW" to the end of the URL.
        package_id = pls.text[pls.text.find('PLAW'):]

        # Fetch the detail record once (the original requested it twice) and
        # decode it as JSON. eval() on a response body executes whatever the
        # server sends and also fails on JSON literals such as true/false/null.
        detail_response = requests.get(start + package_id)
        print(detail_response)
        detail = detail_response.json()

        # [2:] drops the first two characters of the link -- presumably a
        # leading "//" on a protocol-relative URL; confirm against the API.
        mods_link = detail['download']['modslink'][2:]
        print(mods_link)
        url2 = 'https://' + mods_link

        try:
            mods_response = requests.get(url2)
            print(mods_response)
            mods = BeautifulSoup(mods_response.content)
            references = mods.findAll('identifier', {'type': 'Statute citation'})
            for sec in references:
                if sec.text == "106 Stat. 4845":
                    # Was `Print(...)`: a NameError that the bare except
                    # swallowed, so matches were never written to the file.
                    print(dash * 20)
                    print(sec.text)
                    print(dash * 20)
                    with open('sec313info.txt', 'a') as sec313:
                        sec313.write("\n")
                        sec313.write(pls.text + '\n')
        except Exception:
            # Best-effort per URL: report and move on, but let
            # KeyboardInterrupt/SystemExit stop the run.
            print('error at: ' + url2)

1条回答

网友

1楼 · 发布于 2024-05-15 23:10:55

不知道我为什么在这上面花了这么长时间，但我确实做到了。你的代码真的很难读懂。所以我先把它分成两部分：从站点地图获取链接，以及其余的处理。我还把一些片段拆分成了不同的函数。在我的机器上，这大约每秒检查 2 个网址，看起来是正常的。这样做更好的原因如下（这一点你可以和我争论）：

每次写入后不必重新打开和关闭输出文件
删除了相当一部分不需要的代码
给你的变量起了更好的名字（这并不能提高速度，但请这样做，尤其是当你需要帮助的时候）
最重要的是……一旦你把这一切都拆分开，就可以很清楚地看到，拖慢你的是等待请求返回，这在网页抓取中相当常见；你可以考虑使用多线程来避免等待。一旦采用多线程，拆分代码的好处可能也会变得更加明显。

# Collects the <loc> entries of one sitemap document.
def get_links(s):
    """Download the document at URL ``s`` and return its ``loc`` elements.

    The response text is parsed with BeautifulSoup's "lxml" parser; the
    return value is whatever ``findAll('loc')`` yields for that document.
    """
    response = requests.get(s)
    return BeautifulSoup(response.text, "lxml").findAll('loc')

# Resolves a sitemap link to its MODS document and searches that document for
# the statute citation "106 Stat. 4845".
def scrapey(link):
    """Check whether the package behind ``link`` cites "106 Stat. 4845".

    The package id is taken as everything from "PLAW" to the end of ``link``.
    Its content-detail record is requested first; the ``download.modslink``
    field of that JSON (minus its first two characters) is then fetched and
    scanned for ``identifier`` elements of type "Statute citation".

    Args:
        link: URL text of a sitemap ``loc`` entry.

    Returns:
        The URL of the MODS document when the citation is present,
        otherwise ``False`` (also ``False`` when anything after the first
        request fails).
    """
    link_id = link[link.find("PLAW"):]
    r = requests.get('https://www.govinfo.gov/wssearch/getContentDetail?packageId={}'.format(link_id))
    print(r.url)
    try:
        r = requests.get("https://{}".format(r.json()["download"]["modslink"][2:]))
        print(r.url)
        soup = BeautifulSoup(r.text, "lxml")
        references = soup.findAll('identifier', {'type': 'Statute citation'})
        if any(ref.text == "106 Stat. 4845" for ref in references):
            return r.url
        return False
    except Exception:
        # Best-effort per link, but no longer a bare `except:` -- that also
        # caught KeyboardInterrupt/SystemExit, so Ctrl-C could not stop a
        # long scrape.
        print("bah" + r.url)
        return False


# Build the flat list of package URLs: the sitemap index yields one URL per
# sub-sitemap, and each sub-sitemap yields the <loc> entries to scan.
# The two intermediate iterables are lazy, so each sub-sitemap is only
# downloaded while `links` is being assembled.
sitemap_links_el = get_links("https://www.govinfo.gov/sitemap/PLAW_sitemap_index.xml")
sitemap_links = (index_entry.text for index_entry in sitemap_links_el)
nlinks_el = (get_links(sitemap_url) for sitemap_url in sitemap_links)
links = [loc.text for loc_elements in nlinks_el for loc in loc_elements]



# Scan every collected link and append the URL of each hit to output.txt,
# one per line. The file is opened once for the whole run.
with open("output.txt", "a") as f:
    for link in links:
        url = scrapey(link)
        if url is not False:
            print("found on: {}".format(url))
            f.write("{}\n".format(url))
            continue
        print("no find")

相关问题更多 >

编程相关推荐

热门问题

热门文章

简易刮板

相关问题 更多 >

编程相关推荐

热门问题

热门文章

相关问题更多 >