<p>这是一个基本的抓取脚本(scraper),它把每条标题拆分为公司名称和所招聘的职位。</p>
<pre class="lang-py prettyprint-override"><code>import requests
from bs4 import BeautifulSoup
import re
from pprint import pprint
def make_soup(url: str, timeout: float = 10.0) -> "BeautifulSoup":
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a host that
            never answers.

    Returns:
        The document parsed with the built-in ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    # Send a desktop Firefox user-agent string with the request.
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0'}
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on error statuses instead of parsing an error page.
    res.raise_for_status()
    return BeautifulSoup(res.text, 'html.parser')
# Matches the verb phrase that separates the company from the offer, e.g.
# " is hiring", " hiring", " is looking for".
#   - "is" is optional and brings its own whitespace, so a single space before
#     the verb is enough ("Acme hiring engineers").
#   - "for" is consumed only as a whole word, so "forward" stays intact.
#   - The lookahead requires whitespace or end-of-title after the phrase, so
#     titles that end right after the verb are still recognised.
_HIRING_RE = re.compile(
    r'\s+(?:is\s+)?(?:hiring|seeking|looking)(?:\s+for\b)?(?=\s|$)',
    flags=re.IGNORECASE,
)


def extract_jobs(soup: "BeautifulSoup") -> list:
    """Split job-post titles into the hiring company and what it wants.

    Every element matching the ``.storylink`` CSS selector is inspected.
    Titles that do not contain "hiring", "seeking" or "looking" as a
    separate word (optionally preceded by "is" and followed by "for") are
    skipped.

    Args:
        soup: Parsed page; only ``soup.select`` and each element's ``.text``
            are used.

    Returns:
        A list of ``{'company': ..., 'wants': ...}`` dicts in page order.
        ``company`` is the text before the verb phrase; ``wants`` is the
        title-cased text after it and is empty when the title ends with the
        verb phrase.
    """
    jobs = []
    # NOTE(review): the selector depends on the site's markup -- confirm the
    # "storylink" class is still what the page uses.
    for el in soup.select('.storylink'):
        title = el.text.strip()
        m = _HIRING_RE.search(title)
        if not m:
            continue
        jobs.append({
            'company': title[:m.start()].strip(),
            'wants': title[m.end():].strip().title(),
        })
    return jobs
# Script entry: fetch the jobs page, extract the company/offer pairs from its
# titles, and pretty-print the resulting list.
jobs_page_url = 'https://news.ycombinator.com/jobs'
listings = extract_jobs(make_soup(jobs_page_url))
pprint(listings)
</code></pre>
<p>输出:</p>
<pre><code> {'company': 'Mino Games (YC W11)', 'wants': 'Game Developers In Montreal'},
{'company': 'BuildZoom (YC W13)', 'wants': '– Help Us Un-Break Construction'},
{'company': 'Streak – CRM for Gmail (YC S11)', 'wants': 'In Vancouver'},
{'company': 'ZeroCater (YC W11)', 'wants': 'A Director Of Engineer In Sf'},
{'company': 'UpCodes (YC S17)', 'wants': 'Engineers To Automate Compliance For Architects'},
{'company': 'Tech Nonprofit Upsolve (YC W19)', 'wants': 'A Software Engineer'},
...
</code></pre>