如何在不保存文件的情况下,直接使用抓取到的 PDF 数据?

2024-05-16 11:10:47 发布

您现在位置:Python中文网/ 问答频道 /正文

我想在从网站上抓取一个 PDF 之后直接使用它的内容,而不把文件实际保存到我的计算机上。

这是我到现在为止所拥有的

import libgen_scraper

if __name__ == '__main__':
    # Demo driver: search for a title, then download the first hit's PDF
    # straight into memory (nothing is written to disk).
    query = 'life of pi'
    qlist = libgen_scraper.scraper(query)

    # get_content() indexes the first result, so stop with a readable
    # message instead of a traceback when the search found nothing.
    if not qlist:
        raise SystemExit('No results found for %r' % query)

    context = {'Content': libgen_scraper.get_content(qlist)}
    print(context['Content'])

libgen_scraper.py

from bs4 import BeautifulSoup as bs
import requests

# if __name__ == '__main__':
def _first_href(node):
    """Return the ``href`` of the first ``<a href=...>`` below *node*, or ``None``."""
    if node is None:
        return None
    anchor = node.find('a', href=True)
    if anchor is None:
        return None
    return anchor['href']


def scraper(search_entry):
    """Search libgen.is for *search_entry* and resolve a download link per hit.

    The result rows are read from the third ``<table>`` of the search page.
    For every row whose title cell contains the query (case-insensitively),
    the first mirror page is fetched and the first link found inside an
    ``<h2>`` on that page is taken as the direct download link.

    Args:
        search_entry: Free-text query, e.g. ``'life of pi'``.

    Returns:
        A list of dicts with the keys ``"Name"``, ``"Author"`` and ``"Link"``.
        ``"Name"`` is the lower-cased title text starting at the position
        where the query matched. Rows that are malformed or for which no
        download link can be resolved are skipped; an empty list is returned
        when the page has no result table.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            response from the search page or a mirror page.
    """
    # Let requests build the query string so that spaces, '&', '#', etc. in
    # the query are encoded correctly.
    response = requests.get(
        'http://libgen.is/search.php',
        params={
            'req': search_entry,
            'lg_topic': 'libgen',
            'open': 0,
            'view': 'simple',
            'res': 25,
            'phrase': 1,
            'column': 'def',
        },
        timeout=10,
    )
    response.raise_for_status()
    page = bs(response.content, "html.parser")

    tables = page.findAll('table')
    if len(tables) < 3:
        # No third table means there is no result listing to read.
        return []

    # The cell HTML is lower-cased before matching, so the query must be too.
    needle = search_entry.lower()

    books_list = []
    # The first <tr> is the header row; every other row describes one book.
    for row in tables[2].findAll('tr')[1:]:
        cells = row.findAll('td')
        if len(cells) < 10:
            # Cells 1 (author), 2 (title) and 9 (first mirror) are required.
            continue

        title_html = str(cells[2].findAll('a')).lower()
        start = title_html.find(needle)
        if start == -1:
            continue
        # Keep the text from the match up to the next tag.
        name = title_html[start:].partition('<')[0]

        # Only the first author link is reported; fall back to the cell text
        # when the author is not wrapped in a link.
        author_anchor = cells[1].find('a')
        author_node = cells[1] if author_anchor is None else author_anchor
        author = author_node.get_text()

        # Read href attributes through the parser instead of slicing the
        # tag's string form: this does not depend on attribute order and
        # yields '&' rather than the HTML-escaped '&amp;'.
        mirror_link = _first_href(cells[9])
        if mirror_link is None:
            continue

        mirror_response = requests.get(mirror_link, timeout=10)
        mirror_response.raise_for_status()
        mirror_page = bs(mirror_response.content, "html.parser")

        download_link = None
        for heading in mirror_page.findAll('h2'):
            download_link = _first_href(heading)
            if download_link is not None:
                break
        if download_link is None:
            continue

        books_list.append(
            {"Name": name, "Author": author, "Link": download_link}
        )
    return books_list

def get_content(books_list, timeout=10):
    """Download the first book in *books_list* and return its raw bytes.

    Nothing is written to disk. The returned bytes can be wrapped in
    ``io.BytesIO`` to obtain a file-like object for libraries that expect an
    open file (for example a PDF parser, or a Django ``FileResponse`` /
    ``ContentFile``).

    Args:
        books_list: Sequence of book dicts; the first one must carry the
            download URL under the ``'Link'`` key.
        timeout: Seconds to wait for the server before giving up, passed to
            ``requests.get``.

    Returns:
        The response body as ``bytes``.

    Raises:
        IndexError: If *books_list* is empty.
        ValueError: If the first book has no usable ``'Link'`` value.
        requests.RequestException: On network failure, timeout, or a non-2xx
            response.
    """
    if not books_list:
        raise IndexError('books_list is empty: there is no book to download')

    link = books_list[0].get('Link')
    if not link:
        raise ValueError('the first book has no download link')
    print(link)

    book_download_page = requests.get(link, timeout=timeout)
    # Fail loudly instead of returning an HTML error page as if it were the
    # requested file.
    book_download_page.raise_for_status()
    return book_download_page.content

当我运行main函数时,我得到如下输出:

b'%PDF-1.4\r%\xe2\xe3\xcf\xd3\r\n5808 0 obj <</Linearized 1/L 840379/O 5811/E 5260/N 228/T 724170/H [ 491 553]>>\rendobj\r \r\nxref\r\n5808 9\r\n0000000016 00000 n\r\n0000001322 00000 n\r\n0000000491 00000 n\r\n0000001564 00000 n\r\n0000001779 00000 n\r\n0000002343 00000 n\r\n0000002584 00000 n\r\n0000002662 00000 n\r\n0000001044 00000 n\r\ntrailer\r\n<</Size 5817/Prev 724157/XRefStm 1044/Root 5809 0 R/Info 538 0 R/ID[<c2de00be364728e0af9d14041382a7a7><c95e9185073e8847a653474bd238ce3b>]>>\r\nstartxref\r\n0\r\n%%EOF\r\n \r\n5810 0 obj<</Length 461/Filter/FlateDecode/C 1396/L 1380/S 1347>>stream\r\nx\xdab```b``\xfe\x0c"Y_1\xf00 \x00\x0fP\x8c\x89\x81\x85\x81\xe3\x00\x03\x99\xa0#\xf5]M\x89S\xf0S\xdb\xd2e\x0f}\xd3\xd8\x8b\xd4fw\xb6\xaf\xeb`\x8d\xab\x14\\\xe3\xe0\xb2!\'5\xb3\xe0\xbb\xa1\'\xcb\xb3\xa8\xa2\x03\xb2%\xc7\xd2Yk\x0b\xb6^\xc9?\xfe0f\xb9r\xc2\xb6\xc3\xfeOl\xc3\x96\xcc\r[z\xc1\xbf\x8c\xdfA9$h\xf1*\xcdC\xa1\xaa\x87e\xa2Cod\x7f\xeed[x\\\xe7\x8bh\xd2\xe4\xb5\xb5O\xef\xd8.\xfc\xef\xbc\xe5s\xb4\x9bNf\xeb\xd6\x087\xd9*1\xdb\xe8\xe5\xf7\x9b\x97~\x8e\xdc\xaeS\xa9\xceVb\xfa-\xcd,+\xf3\x99m\xfeR\x93\xd4\xa71)\xea\xff}\x17\xa5e\xb0\x9e\xc8\xdb\xba%w\xd9\xf9\xc4\xaf\xcd\xc9O_\x14\xab\xbd\xd0\x9fr\xc0]\xf4PfyMS\xda{\x7f\xf5\xc3\x05\xa1\xb7\xcb\xbf\x1d\xa8fyl\x16\xaa\x9b\x16^\x94[*\x90b\xb6;e\xe9\xcc\x8c\xd2\x0f\xae\xec\x11e\xc7\xf6\xe6\x89\x05\x95\xab^\x8a\x9f\xd6\x90\xc8v\xd1\xaf\xbc?B\x94\xc3_\xd5\xac\xb8do\xa2J\x89\xac\x90\x9e\xa2\xe0\t\xee\xc6\xd9R\x13C\xf9\x0cy\xc5\x1a\x18\x8c\\\xe2\x82\xa6\xfa\x95o\xab\xcch\xfdW\xb6\xf4W\xce\xf2\x90\xf2\xf0\'\x91KO\xc4\x86\xd48\x84\xc5\x85\x89=\xb6\x9a\xf4\x9fk\xc3>\xfeK\x81\xb2\x89.\x91\x0f\xbb\xc4\x12\x0b\xf8\x0f\x1e`Wu\x0c\x9e\xd2\x94h\xe6o\xc1\xf0\xd3\x90\xf1\x08\xf3\x85?\xe2\x86"\xf2F\x1d\xff\xc9\x05L\x95\xcf\xe7\r\x11\xc40\nh\x00F\xf3#-\xf2#R\x00\xb3B1\x03\x83\x1f\x03\x0f\xc7\xc4g\\\x06\x0cs\x1b\x80<\xc6?\x0c\\\xaf\x03\x18\xc0e-\x83\x1d\x90\x7f\x9c\x81\xf3\xc6q \xdb\x94\x81Q\xa8\x13\xa2\x17 
\xc0\x00\xa7a\xe79\r\nendstream\rendobj\r5816 0 obj<</Size 5808/Length 123/Filter/FlateDecode/DecodeParms<</Columns 4/Predictor 12>>/W[1 2 1]/Type/XRef/Index[539 5269]>>stream\r\nx\xda\xec\xd11\x01\x000\x0c\xc3\xb0\xa4\xfc)\x0c\xcfh\x8d\xc2\xfe\xea\x11\x00{\xa67\x93\xa4XD\x8f\x06\x9e\xc3sx\x0e\xcf\xe19<\x87\xe7\xf0\x1c\x9e\xc3sx\x0e\xcf\xe19<\x87\xe7\xf0\xdcsx\x0e\xcf\xe19<\x87\xe7\xf0\x1c\x9e\xc3sx\x0e\xcf\xe19<\x87\xe7\xf0\x1c\x9e{\x0e\xcf\xe19<\x87\xe7\xf0\x1c\x9e\xc3sx\x0e\xcf\xe19<\x87\xe7\xf0\x1c\x9e\xc3s\xcf\xe19<\xc7/O\x80\x01\x00\x86\xe0^m\r\nendstream\rendobj\r5809 0 obj<</Pages 510 0 R/Type/Catalog/PageLabels 508 0 R/StructTreeRoot 539 0 R/Metadata 537 0 R/PieceInfo<</MarkedPDF<</LastModified(D:20040403002334)>>>>/LastModified(D:20040403002334)/MarkInfo<</Marked true/LetterspaceFlags 0>>>>\rendobj\r5811 0 obj<</Contents 5815 0 R/Type/Page/Parent 512 0 R/Rotate 0/MediaBox[0 0 612 792]/CropBox[0 0 612 792]/Resources<</Font<</TT0 5812 0 R>>/ProcSet[/PDF/Text]/ExtGState<</GS0 5814 0 R>>>>/StructParents 0>>\rendobj\r5812 0 obj<</Type/Font/Encoding/WinAnsiEncoding/BaseFont/TimesNewRomanPSMT/FirstChar 32/LastChar 151/Subtype/TrueType/FontDescriptor 5813 0 R/Widths[250 333 408 0 0 0 0 180 333 333 0 0 250 333 250 278 500 500 500 500 500 500 500 500 500 500 278 278 564 564 564 444 0 722 667 667 722 611 556 722 722 333 389 722 611 889 722 722 556 722 667 556 611 722 722 944 722 722 611 333 0 333 469 0 0 444 500 444 500 444 333 500 500 278 278 500 278 778 500 500 500 500 333 389 278 500 500 722 500 500 444 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000]>>\rendobj\r5813 0 obj<</Type/FontDescriptor/FontBBox[-568 -307 2028 1007]/FontName/TimesNewRomanPSMT/Flags 34/StemV 82/CapHeight 656/XHeight 0/Ascent 891/Descent -216/ItalicAngle 0/FontFamily(Times New Roman)/FontStretch/Normal/FontWeight 400>>\rendobj\r5814 0 obj<</Type/ExtGState/SA false/OP false/SM 0.02/op false/OPM 1>>\rendobj\r5815 0 obj<</Length 
2526/Filter/FlateDecode>>stream\r\nH\x89\x9cW]\x8f\xdb8\x12|\x9f_A\xccKl`\xacH\xb2eY\xc0b\x81\xd9$\xc8\xe5\xb0\xc1\x0e6\xc6} \xc9\x03-\xd163\x8a\xa8%)\xfb\xbc\xbf\xfe\xaaII\xb6<\xb6\xf7r\xc8\x83eg\xd4lvWUW\xbf~b?\xfd\xf4\xfa\xe3\x9b\x0foY\xc8~\xfe\xf9\x97\xb7o\xd8]\xc86w\x11\x93\xec\xee\xf5\xfbOx6w\xbf,\xef^/\x97!\x8b\xd8r}\x17\x06a\x18N\xf1\x983\xf7\x98\xb0\xe5\x1e\xef\xb2\xa5aQH\x9f\x7f\xe2\xdbR\xb3(\xc6gH\x1fY\xc8\xd2p\x11dI\x96\xcd\xd9\xf2\xfb\xdd\xe7\xd1c\x9e\x97\\~\x1f/Fl\xad4\xfb7\xaf*\xf6\x91k+\xcaW\xe3\xf9\xc8\xb0_\xe5Z0\xb5fO\x92\x8d\xbf.\xff~\xf7ny\xf7\xee#2{}\xcc6\xea\xb2\x1d&\xe7\xd2\xa2\x8c\x06\xc7\xcf\xb3$\x88\xe6H\x9b\x8e\x1f\xb1\xf1\xf2\xdb\xa5\x90\xf1\xe5\x90t\xc9\x98\xe2N\xe81\x9a\xbe\x8c\xbe\x88\x82\xe9<\x8c"\x17\xfd\xfe$yiX\xa5,\xfb\xd6\x18\xcb8\xd3\x82\x17|U\n\xc6\xab\x82\x89j\xc37\x9229\xc99\r\xa6qB\xcf\x05\x8aTm\xf0\xf2N\x94\x0fL\xdaW\xe3dd\xc6\x93h\x840kY\x89\xf2\xc0\xec^\x1a+\nV"\x92\xdd\xd2\x81\x07\xae\xab/q\x9c\xba\x92\xb9\xb4g}\xda\xf4\xb8g\x93\xf6\x84I\x14D\xb3,\xcb\xfcA\xf4\x1es\xdd\xa8\xcb\x83\xc4\xb18\x83\xeb\xc9Z\xd8|\x8b\x03\x8cU\xfa\xc0\x0e\xaaa9\xaf\xa8?\xd6\x1d\xe0\xab\x11Gm\xe2\xe1\xbcO\x9c\xfd\xd1H+\x98\xd9\xf3\xb2T{\xb6\xdf\xaaR<\xb0Uc\xfb\x00\xac\x90\x86\xce\x93\xc6\xb0c\xb6\xc7"\xd3\xa3\xcf\x96\xc2\x9ef;R\x8d\xd5r\xb3\xb5\x01\x1b\xd6YV\xccn\xf1i5/\xa4\x95\xcaU\x82#\xf9\xbe\xc4>\xe1y\x9bo4k\xf3\x1d\xd1\xed\x10\xa6\xa9r\xa1-G\x9c\x9d\xd0<\x97\xfe\xc5\x16\xe1\xfd\xdb\x19\xbd\x9e\x05\xd3\xa4\xbf\xed\xe1\x81\xd1U8u\xa7\x10,\xd7\xa2\x90\xd4\xe4\x15Z\xb4\x15\xecR3|\x12\x93x\x1e\x9c]\x0e\x14\xa0T\xe8=j\xca\xc4\xd4\xb2\xaa\x84\xa6\xbb>\x0b\xd6\'t^\xa8(\n\xfat\xa4u\x98\x13{\xa5\xed\xf6\x00\xa8\xe5\x82\x1ah\x1e\x18\xa74\x15\xda\xbb\xdf\xca|\xcb>0\xcb\x11\xd4*\xb6\x12\xecw\xb5\x92\x95Q\x95Ow\xe2\x02\x0eA\xf2F7F\xa1\x8d\xef\x9b\xb2\x94\xa8\xd0\x11\x91K\xcd\x81R\x1c@i?V\xb9\x14\x95%6\xcb\xe3\xd5\xbd\\\x9c\x96\x10\x1dX\xc4\xf8\xd6\x83]\xe8\x07\xf6Q\xa1hoe\xfe\xec\x18\xf2$+\xe0O;a\x90\x95g\x01\x89\x03\xff\x0b\xc8\xf8\xb8\xc3\xec-/\x1dV\x80
;\x0e\xd6h\xc6-3\x82\xb3\\5u\t\x8c\xef%(\xe4\x10y9i\'rQ\x12\x84\x8b.a@\xa4)Uc\x98i\xf4N\xeex\xe9\xf1\xb6R\x07Ci2^\xecP\x87F\x0b\xa7p\xc7\xa8\xd11j\x12\xf9\x94]\xd8\x01\x0e6Z\xed\xab\xa66\xc1=\xa3\xb0\xa8\xc0\x86ka\xd9\xa3\xdd+U<\x1c\x910?\xde\xdf\x03{\x1a\xcc\x16=\x16\x96h\xc8\xa7\xa6*\xf8\x81-\x1d\xbf\x055\xcc\xb0/\xa3_UU\x80"\xe3\xab\xf2:\xfd\x11yM\x16\xb3 \x9d\x87\xf3\xf8\xa6\xbc\xce\xae\xcb\xeb\x8bk\x0c\xa2\xa7a\x90!z\xe6g\xc7\xfd#\xd4i\xe5k\xaf\x95S\xad1Ph\xb7Z5\x9b-\x90\xe3\xa5\x0c\xe2Zq\xd2\x01v\xb9\xa3Y/\\q\xab\x03D\xd8FW\x86\x89\xdcX\xbc\x9a?

我打算把这段代码整合到 Django 项目中,所以如果 Django 有现成的方法就再好不过了;不过通用的 Python 解决方案也可以。

非常感谢


Tags: search, mirror, tag, link, h2, content, find, author