Python网页抓取,使用Beautiful Soup在《商业周刊》上查找公司成立年份和地点
我的目标是从一个csv文件中提取公司名称,然后抓取该公司的成立年份和所在国家。例如,从以下公司中,我想得到“1989”和“爱尔兰”。
我已经在这个问题上花了不少时间,也参考了很多 StackOverflow 上的帖子,但还是没能解决。下面是我的主文件,它运行正常,唯一奇怪的是表头似乎没有被识别,所以我只能用表头的第一个字母来取第一列,这一点我可以接受。我的问题是:我的网页抓取文件(贴在主文件之后)没有找到并返回我想要的值。
from BeautifulSoup import BeautifulSoup
import csv
import urllib
import urllib2
import business_week_test
input_csv = "sample.csv"
output_csv = "BUSINESS_WEEK.csv"
def main():
    """Look up every company listed in ``input_csv`` and save the results.

    The first line of the input file is treated as a header and skipped.
    For each remaining row the company name is passed to
    ``business_week_test.bwt`` and the returned value is written to the
    LOCATION column of ``output_csv``. YEAR_FOUNDED is declared as an output
    column but nothing is assigned to it here, so it is left blank.
    """
    with open(input_csv, "rb") as infile:
        # NOTE: ("COMPANY_NAME") is a plain string, not a one-element tuple,
        # so DictReader uses each character as a field name and the first
        # column ends up under the key "C" (hence row["C"] below).
        input_fields = ("COMPANY_NAME")
        reader = csv.DictReader(infile, fieldnames=input_fields)
        with open(output_csv, "wb") as outfile:
            output_fields = ("COMPANY_NAME", "LOCATION", "YEAR_FOUNDED")
            writer = csv.DictWriter(outfile, fieldnames=output_fields)
            writer.writerow(dict((h, h) for h in output_fields))
            # Skip the header line; the default keeps an empty input file
            # from raising StopIteration.
            next(reader, None)
            # Iterate the reader directly so that the last row is processed
            # too (a look-ahead loop would stop one row early).
            for row in reader:
                search_term = row["C"]
                num_words_in_comp_name = len(search_term.split())
                result = business_week_test.bwt(search_term,
                                                num_words_in_comp_name)
                # Write only keys that are real output columns: DictWriter
                # rejects unknown keys by default, and the reader's row dict
                # carries the single-character keys described above.
                writer.writerow({
                    "COMPANY_NAME": search_term,
                    "LOCATION": result,
                })


if __name__ == "__main__":
    main()
这是网页抓取文件:
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup
def bwt(article, length):
    """Fetch the Businessweek private-company snapshot page for a company.

    The first ``length`` whitespace-separated words of ``article`` are joined
    with "%20" and appended as the ``privcapId`` query value of the snapshot
    URL. Any number of words is supported.

    Args:
        article: Company name used as the search term.
        length: Number of leading words of ``article`` to put in the URL.

    Returns:
        The string "NA" when there are no words to search for, when the
        server answers 400 or 404, or when the page has no
        ``<div id="bodyContent">``. Otherwise the ``.p`` attribute of that
        div as returned by BeautifulSoup.

    Raises:
        urllib2.HTTPError: For any HTTP error other than 400 and 404.
    """
    base = ("http://investing.businessweek.com/research/stocks/private/"
            "snapshot.asp?privcapId=")
    words = article.split()[:max(length, 0)]
    if not words:
        # Nothing to look up; use the same marker as a page that is not found.
        return "NA"
    link = base + "%20".join(words)

    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Google Chrome')]
    # Open the page once and reuse the response, instead of probing for
    # errors with one request and downloading with a second one.
    try:
        resource = opener.open(link)
    except urllib2.HTTPError as err:
        if err.code in (400, 404):
            return "NA"
        raise
    try:
        data = resource.read()
    finally:
        # Release the connection even if reading fails.
        resource.close()

    soup = BeautifulSoup(data)
    body = soup.find('div', id="bodyContent")
    if body is None:
        # The expected container is missing from the page.
        return "NA"
    return body.p
1 个回答
1
这里有一段示例代码,用来获取“A&P Group Limited”公司的位置和成立年份:
import urllib2
from BeautifulSoup import BeautifulSoup
LINK = "http://investing.businessweek.com/research/stocks/private/snapshot.asp?privcapId=1716794"
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Google Chrome')]
soup = BeautifulSoup(opener.open(LINK))
location = soup.find('div', {'itemprop': 'address'}).findAll('p')[-1].text
founded = soup.find('span', {'itemprop': "foundingDate"}).text
print location, founded
输出结果是:
United Kingdom 1971
希望这对你有帮助。