For loop: iterates fine through about half of the items in a list, but then quits early?


I am currently scraping 911 speeches from http://www.millercenter.org/ and storing each one as a text file. All of my HTML-processing code works; the problem comes when I iterate (in the for loop at the end) over the URLs stored in the list every_link. processURL() stops partway through my links, so I end up with only about half of the speeches I need. The for loop also seems to skip through every_link more or less at random: speeches by George W. Bush and Harry Truman get cut out, while speeches by presidents like Grover Cleveland come through fine. What is wrong with my for loop?

import urllib2,sys,os
from bs4 import BeautifulSoup,NavigableString
from string import punctuation as p
from multiprocessing import Pool
import re, nltk
import requests
reload(sys)
sys.setdefaultencoding('utf8')

os.chdir('U:/Fall 2015/ENGL 3XX')

#==============================================================================
# Scraping and cleaning one speech from Obama to show the method works
#==============================================================================

obama_4427_url = 'http://www.millercenter.org/president/obama/speeches/speech-4427'
obama_4427 = urllib2.urlopen(obama_4427_url).read()
obama_4427 = BeautifulSoup(obama_4427)

# find the speech itself within the HTML
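# note: BeautifulSoup's find() signature is find(name, attrs, recursive, ...), so the
# {'class': 'displaytext'} dict below is received as the `recursive` argument rather than
# acting as an attribute filter; only the id='transcript' lookup constrains the match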
obama_4427 = obama_4427.find('div',{'id': 'transcript'},{'class': 'displaytext'})

# obama_4427.text removes extraneous markup (e.g. '<br/>')
obama_4427 = obama_4427.text.lower()

# for further text analysis, remove punctuation
punctuation = re.compile('[{}]+'.format(re.escape(p)))

# obama_4427_nopunct = [line.decode('utf-8').strip() for line in obama_4427_html.readlines()]

obama_4427 = punctuation.sub('', obama_4427)
obama_4427 = obama_4427.replace('—',' ')
obama_4427 = obama_4427.replace('transcript','')

# divide obama_4427 into individual words
words = obama_4427.split(' ')

#==============================================================================
# Cleaning links begins below, so that we can process all 911 speeches through processURL()
#==============================================================================

url = 'http://www.millercenter.org/president/speeches'
url2 = 'http://www.millercenter.org'

conn = urllib2.urlopen(url)
html = conn.read()

miller_center_soup = BeautifulSoup(html)
links = miller_center_soup.find_all('a')

linklist = [tag.get('href') for tag in links if tag.get('href') is not None]

# remove all items in list that don't contain 'speeches'
linkslist = [_ for _ in linklist if re.search('speeches',_)]
del linkslist[0:2]

# concatenate 'http://www.millercenter.org' with end of speech links
every_link_dups = [url2 + end_link for end_link in linkslist]

# remove duplicates
seen = set()
every_link = [] # no duplicates array
for l in every_link_dups:
    if l not in seen:
        every_link.append(l)
        seen.add(l)

# list of presidents (print(len(set(presidents))) = 43 total)
presidents_dups = [l[l.find('president/')+len('president/'):] for l in every_link if 'president' in l]
presidents_dups = [l[0:l.find('/')] for l in presidents_dups]
set2 = set()
presidents = []
for l in presidents_dups:
    if l not in set2:
        presidents.append(l)
        set2.add(l)

presidents = sorted(presidents)

# the following two lines - now commented out - were used to identify duplicates in the original every_link array
# import collections
# print [l for l, count in collections.Counter(every_link).items() if count > 1]

# define a function to clean & store speeches from 'every_link' repository

def processURL(l):
    open_url = urllib2.urlopen(l).read()
    item_soup = BeautifulSoup(open_url)
    item_div = item_soup.find('div',{'id':'transcript'},{'class':'displaytext'})
    item_str = item_div.text.lower()
    item_str_processed = punctuation.sub('',item_str)
    item_str_processed_final = item_str_processed.replace('—',' ')

    splitlink = l.split("/")
    president = splitlink[4]
    speech_num = splitlink[-1].split("-")[1]
    filename = "{0}_{1}".format(president, speech_num)

    return filename, item_str_processed_final # returning a tuple

# right now, this loop only works for 423 speeches - where are the remaining ones?
for l in every_link:
    filename, content = processURL(l) # tuple unpacking
    with open(filename, 'w') as f:
        f.write(content)
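
To narrow down which links fail and why, a throwaway diagnostic version of this loop can catch the exception for each link instead of letting one bad URL stop the whole run. This is only a debugging sketch, reusing every_link and processURL() exactly as defined above: it records each failing URL together with the error (for example, a urllib2.HTTPError/URLError raised by urlopen(), or an AttributeError if find() returned None and .text was called on it) so the skipped speeches can be inspected afterwards.

failed_links = []

for l in every_link:
    try:
        filename, content = processURL(l)
    except Exception as e:
        # keep going: remember which link failed and what the error was
        failed_links.append((l, repr(e)))
        continue
    with open(filename, 'w') as f:
        f.write(content)

print '{0} of {1} links failed:'.format(len(failed_links), len(every_link))
for link, err in failed_links:
    print link, err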

Tags: the, in, import, url, for, link, find, item