我在抓取网页时,想从下面的 script 标签中提取一些信息,例如职位名称、公司规模和所属行业。有没有简单且可复用的方法?因为我需要抓取很多页面。
我试过把它转换成字符串,去掉前后多余的部分,然后使用 eval,但它报了语法错误:
<script>
// Page-level configuration blob. Assigns window.gdGlobals only when it is not
// already set (|| fallback); the fallback is a one-element array wrapping a
// single object literal.
// NOTE: this is a JavaScript literal, not JSON: most keys are single-quoted,
// string values mix single and double quotes, and two arrays below begin with
// a leading comma. Those constructs are not accepted by a strict JSON parser.
window.gdGlobals = window.gdGlobals ||
[{
'analyticsId': "UA-2595786-1",
'analyticsUrl': "/jobview/jobs/joblisting/Teradata Corporation",
'deferredScriptType': "text/x-deferred-js",
'accessDeniedRedirectUrl': '',
'locale': 'en-US',
'env': 'prod',
// Device flags and numeric type ids.
'device': {
'handheld': false,
'tablet': false,
'deviceTypeId': 1,
'platformTypeId': 3,
'viewTypeId': 4
},
// Page identity; 'group' and 'domain' are read by the statements that follow this literal.
'page': {
'domain': "www.glassdoor.com",
'domainId': 1,
'domainSuffix': "",
'group': "JobListing",
'guid': '000001675b5fa6a78fa0ac61fe36e9b6',
'flex': true,
'section': "job-listing",
'type': "job-listing:job-listing",
'id': "job-listing:job-listing",
'med': '',
'src' : '',
'content' : '',
'campaign' : '',
'term' : '',
'state' : 'locked',
'untranslatedUrl' : 'https://www.glassdoor.com/job-listing/service-management-business-analyst-teradata-JV_IC1147311_KO0,35_KE36,44.htm?jl=3032410565'
},
'user': {
'guid': '2917b9da-9897-4a3b-90c6-da49238a5924',
'ipAddr': "70.95.16.113",
'locale': "en_US",
'country': "US",
'ipLocationId': "1147311",
'ipLocationType': "C",
'elligibleForAppBoy': false
},
'vendor': {
'fbReqPerms': ""
},
// The only key in this literal written with double quotes.
'search': {
"rawKeyword":""
},
// Employer details (size, sector, industry, name, location). The last three
// members use leading-comma style, which is still a valid object literal.
'employer' : {
'size' : "10000--1",
'sector' : "Information Technology",
'sectorId' : "10013",
'industry' : "Computer Hardware & Software",
'industryId' : "200060",
'name':"Teradata",
'id' : "14638"
,'location': "San Diego"
,'locationId': "1147311"
,'locationType': "C"
},
// Job details. The has* flags are numbers (0/1), and 'expired' is the
// string 'false', not the boolean false.
'job' : {
'jobTitle' : "Service Management Business Analyst",
'city' : "",
'state' : "",
'country' : "",
'id': "3032410565",
'jobSource': "6938",
'hasPostalAddress': 0,
'hasOccupationalCategory': 1,
'hasSalaryCurrency': 1,
'hasGeoCoordinates': 1,
'category' : "10014",
'expired' : 'false'
},
'test' : {
// The comma before the first string is an elision: in JavaScript index 0 of
// this array is an empty slot and the strings start at index 1.
'planoutIdList': [
, "jobViewDomain.exp_jobViewDomain_catchall"
, "savedJobsDomain.non_user_saved_jobs_catchall"
, "urgency.2018_10_15_badgeDiversity"
, "easyApplyDomain.exp_easyApplyDomain_catchall"
, "jobDetailsDomain.exp_jobDetailsDomain_catchall"
, "serpDomain.exp_serpDomain_catchall"
, "serpDomain.reviseFacetCounts_2018_11_06"
, "jxGlobalDomain.2018_11_20_exp_userReg"
, "myJobsDomain.2018_09_06_myJobsJAFilters"
, "urgency.urgency_catchall"
, "jobs-view.extractedFields"
, "jxGlobalDomain.exp_jxGlobalDomain_catchall"
, "jx_global.2018_06_25_xToSerpUrgencyBadge"
, "jobs-ux-dk-2.redirectToHome"
, "jobAlertDomain.exp_jobAlertDomain_catchAll"
],
// Same leading-comma elision as planoutIdList: index 0 is an empty slot.
'planoutTreatmentList': [
, "jobViewDomainDefaultTreatment"
, "savedJobsDefaultTreatment"
, "badges_sevenByThree"
, "easyApplyDefaultTreatment"
, "jobDetailsDefaultTreatment"
, "serpDefaultTreatment"
, "reviseFacetCounts_on"
, "userReg_control"
, "myJobsJAFilters_on"
, "urgency_default"
, "false"
, "jxGlobalDefaultTreatment"
, "xToSerpUrgencyBadge_on"
, "savedJobsRedirect-false"
, "jobAlertDefaultTreatment"
]
},
// Empty object.
'staticList' : {
}
}];
// Installs window.getGdGlobals unless one is already defined (|| fallback).
// The fallback returns element 0 of gdGlobals. gdGlobals is referenced as a
// bare identifier, so it must be resolvable as a global when this is called.
window.getGdGlobals = window.getGdGlobals ||
function() {
return gdGlobals[0];
};
// Reuse window.GD if it exists, otherwise start from an empty object.
// NOTE: GD is assigned without var/let/const, i.e. as an implicit global.
GD = window.GD || {};
// Keep an existing GD.pageInfo object, or create an empty one.
GD.pageInfo = GD.pageInfo || {};
// Copy the page group and domain out of the object returned by getGdGlobals().
GD.pageInfo.pageGroup = getGdGlobals().page.group;
GD.domain = getGdGlobals().page.domain;
</script>
假设脚本文本保存在名为 script 的 Python 变量中,运行后会得到如下结果(原回答中的代码和输出在此处缺失):
基于正则表达式的方法非常脆弱,因此在实际使用中,您可能需要采用比本例更稳健的做法。
相关问题 更多 >
编程相关推荐