如何使用Mechanize从Google的搜索结果中获取图像

2024-04-20 10:53:52 发布

您现在位置:Python中文网/ 问答频道 /正文

我试图解析谷歌图像搜索结果的HTML,并获得图像的原始链接

到目前为止,我成功地编写了一个Python代码,使用Python的Mechanize和BeautifulSoup获取Google搜索的HTML

查看Google搜索结果的HTML源代码,我发现Google把经过双重编码的原始图像URL存储在class为rg_meta的div中,但我从Mechanize收到的HTML中不包含任何这样的class。事实上,Mechanize返回的是一个完全不同的网页

我知道谷歌的图像搜索API,但我需要用这种方式解析HTML。我做错了什么?我可以把Mechanize伪装成Chrome或其他浏览器吗?

这是我尝试的一个片段。它没有返回任何内容:

import urllib
import mechanize
from bs4 import BeautifulSoup
from urlparse import urlparse

# Question snippet (Python 2): fetch a Google Images result page with
# mechanize and look for the <div class="rg_meta"> elements in it.
search = "cars"
browser = mechanize.Browser()
# Send HTTPS requests through this proxy address.
browser.set_proxies({"https": "10.0.2.88:3128"})
# NOTE(review): presumably turns off mechanize's robots.txt handling --
# confirm against the mechanize documentation.
browser.set_handle_robots(False)
# Replace the request headers with a single bare "Mozilla" User-agent.
browser.addheaders = [('User-agent','Mozilla')]

# The search term is concatenated into the query string as-is (no URL quoting).
html = browser.open("https://www.google.co.in/search?&source=lnms&tbm=isch&sa=X&q="+search+"&oq="+search)
htmltext=html.read()
print htmltext    
# NOTE: img_urls and formatted_images are initialised but never used in this snippet.
img_urls = []
formatted_images = []
soup = BeautifulSoup(htmltext)
#results = soup.findAll("a")
# Collect every <div class="rg_meta">; the question reports that the HTML
# received through mechanize contains no such class.
results = soup.findAll("div", { "class" : "rg_meta" })
print results

Tags: 图像 import div browser search html google results
3条回答
import mechanize
br = mechanize.Browser()
br.open(<yoursitehere>)
images = re.findall("src=\"[^\"]{8,240}", br.response().read()) 
for i in images: print i
br.close()

您需要对结果进行一些过滤,并根据特定站点的HTML修改RE

要使用requests和beautifulsoup库抓取全分辨率图像URL,需要用正则表达式(regex)从页面源代码中提取数据

基本解释:

# find all <script> tags (the result is used by the regex below, so keep it):
all_script_tags = soup.select('script')
# match images data via regex:
matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

# NOTE: matched_images_data is a plain str, not a JSON document. Passing it
# through json.dumps() and then json.loads() returns the very same string, so
# no JSON round trip is needed before running the next regex on it.

# match desired images (full res size) via regex:
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
                                                    matched_images_data)
# Extract and decode them using bytes() and decode():
for fixed_full_res_image in matched_google_full_resolution_images:
    original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
    original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')

代码如下;在线IDE中的完整示例(full example in the online IDE)还会把图像下载到文件夹:

import requests, lxml, re, json
from bs4 import BeautifulSoup


# Browser-like User-Agent sent with the search request.
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

# Query-string parameters appended to the Google search URL.
params = {
    "q": "pexels cat",  # search query
    "tbm": "isch",  # NOTE(review): presumably selects the image-search tab -- confirm
    "hl": "en",
    "ijn": "0",
}

# requests applies no timeout unless one is passed, so a stalled connection
# would block the script forever; give up after 30 seconds instead.
html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
# Parsed result page; get_images_data() reads this module-level name.
soup = BeautifulSoup(html.text, 'lxml')


def _decode_unicode_escapes_twice(raw_url):
    """Return *raw_url* after two passes of ``unicode-escape`` decoding.

    The URLs are extracted from the ``str()`` of a list, and a list repr
    doubles every backslash of its string items. The first pass undoes that
    doubling; the second pass decodes the escape sequences themselves.
    """
    # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
    decoded_once = bytes(raw_url, 'ascii').decode('unicode-escape')
    return bytes(decoded_once, 'ascii').decode('unicode-escape')


def get_images_data():
    """Print metadata, thumbnail URLs and full-resolution URLs of the results.

    Reads the module-level ``soup`` object and writes everything to stdout.
    Nothing is returned.
    """
    print('\nGoogle Images Metadata:')
    for google_image in soup.select('.isv-r.PNCib.MSM1fd.BUooTd'):
        title_link = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')
        source_tag = google_image.select_one('.fxgdke')
        # select_one() returns None when nothing matches; skip an incomplete
        # result card instead of crashing on it.
        if title_link is None or source_tag is None:
            continue
        title = title_link['title']
        source = source_tag.text
        link = title_link['href']
        print(f'{title}\n{source}\n{link}\n')

    all_script_tags = soup.select('script')

    # https://regex101.com/r/48UZhY/4
    # The result is a plain str (not a JSON document), so it is searched
    # directly; a json.dumps()/json.loads() round trip would return it unchanged.
    matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    # https://regex101.com/r/pdZOnW/3
    matched_google_image_data = re.findall(r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data)

    # The regexes below run against the repr of the list above. That repr is
    # what makes the "'" alternative in the full-resolution pattern and the
    # double decoding necessary, so it is kept on purpose.
    image_data_repr = str(matched_google_image_data)

    # https://regex101.com/r/NnRg27/1
    thumbnail_pattern = r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]'
    # Use the match list directly: joining on ', ' and splitting again would
    # turn "no matches" into [''] and print a blank line.
    matched_google_images_thumbnails = re.findall(thumbnail_pattern, image_data_repr)

    print('Google Image Thumbnails:')  # in order
    for fixed_google_image_thumbnail in matched_google_images_thumbnails:
        print(_decode_unicode_escapes_twice(fixed_google_image_thumbnail))

    # removing previously matched thumbnails for easier full resolution image matches.
    removed_matched_google_images_thumbnails = re.sub(thumbnail_pattern, '', image_data_repr)

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
                                                       removed_matched_google_images_thumbnails)

    # NOTE: despite the heading, this function only prints the URLs; it does
    # not download anything.
    print('\nDownloading Google Full Resolution Images:')  # in order
    for fixed_full_res_image in matched_google_full_resolution_images:
        print(_decode_unicode_escapes_twice(fixed_full_res_image))



# Script entry point: run the scraper as soon as this file is executed.
get_images_data()


# Output:
'''
Google Images Metadata:
9,000+ Best Cat Photos · 100% Free Download · Pexels Stock Photos
pexels.com
https://www.pexels.com/search/cat/
...

Google Image Thumbnails:
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR2cZsuRkkLWXOIsl9BZzbeaCcI0qav7nenDvvqi-YSm4nVJZYyljRsJZv6N5vS8hMNU_w&usqp=CAU
...

Downloading Google Full Resolution Images:
https://images.pexels.com/photos/1170986/pexels-photo-1170986.jpeg?cs=srgb&dl=pexels-evg-culture-1170986.jpg&fm=jpg
https://images.pexels.com/photos/3777622/pexels-photo-3777622.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
...
'''

或者,您可以使用SerpApi提供的Google Images API来实现相同的功能。这是一个提供免费套餐的付费API

这种情况的不同之处在于,您不必使用正则表达式来匹配和提取页面源代码中所需的数据,相反,您只需迭代结构化JSON即可更快地获得所需的数据

要集成以实现目标的代码:

import os, json # json for pretty output
from serpapi import GoogleSearch

def get_google_images():
    """Run a "pexels cat" image search through ``GoogleSearch`` and print it.

    The API key is read from the ``API_KEY`` environment variable. The
    ``images_results`` entry of the response dict is printed as indented JSON
    with non-ASCII characters left unescaped. Nothing is returned.
    """
    response = GoogleSearch({
        "api_key": os.getenv("API_KEY"),
        "engine": "google",
        "q": "pexels cat",
        "tbm": "isch",
    }).get_dict()

    image_results = response['images_results']
    pretty = json.dumps(image_results, indent=2, ensure_ascii=False)
    print(pretty)


# Script entry point: perform the search and print the results when executed.
get_google_images()

# Output:
'''
[
...
  {
    "position": 100, # img number
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRR1FCGhFsr_qZoxPvQBDjVn17e_8bA5PB8mg&usqp=CAU",
    "source": "pexels.com",
    "title": "Close-up of Cat · Free Stock Photo",
    "link": "https://www.pexels.com/photo/close-up-of-cat-320014/",
    "original": "https://images.pexels.com/photos/2612982/pexels-photo-2612982.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500",
    "is_product": false
  }
]
'''

另外,我写了一篇更深入的博文,介绍如何抓取Google Images,以及如何在抓取搜索引擎时降低被屏蔽的几率(how to reduce the chance of being blocked while web scraping search engines)

Disclaimer, I work for SerpApi.

感谢您的尝试,但我必须切换到urllib2来解决此问题, 下面的代码正在解析google搜索页面的图像链接

# Python 2 snippet (urllib2 + PyQt). `search`, `form`, `QtGui`, `BeautifulSoup`
# and the urllib/urllib2/time imports are expected to come from the
# surrounding program; they are not defined here.
search = search.replace(" ", "%20")
site = "http://www.google.co.in/search?q=" + search + "&tbm=isch&tbs=isz:l"
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
# processEvents() is called between the slow steps, presumably to keep the Qt
# GUI responsive while the page is fetched and parsed.
QtGui.qApp.processEvents()
req = urllib2.Request(site, headers=hdr)

try:
    QtGui.qApp.processEvents()
    page = urllib2.urlopen(req)
except urllib2.HTTPError as e:
    print(e.fp.read())
    # Without a response there is nothing to parse; re-raise instead of
    # failing a few lines later with a NameError on `page`.
    raise
QtGui.qApp.processEvents()
content = page.read()
soup = BeautifulSoup(content)
linkarray = soup.find_all(attrs={"class": "rg_meta"})
total = len(linkarray)
for i, divs in enumerate(linkarray, 1):
    # Each rg_meta element's markup is split on '%3B' (a URL-encoded ';');
    # the third piece is treated as the doubly URL-encoded image URL.
    refer_url = str(divs).split('%3B')
    QtGui.qApp.processEvents()
    try:
        url = urllib.unquote(refer_url[2]).decode('utf8')
        url = urllib.unquote(url).decode('utf8')
    except (IndexError, UnicodeError):
        # Skip elements that lack the expected piece or do not decode as UTF-8.
        continue
    # To download instead of display, write each url to a file and pass that
    # file to a downloader such as wget or aria2c.
    form.textBrowser.append(url)
    # Integer progress percentage (Python 2 `/` on ints is floor division).
    form.progressBar.setProperty("value", i*100/total)
    time.sleep(0.05)
print("All good, you can download now")

相关问题 更多 >