如何从URL中删除HTML、CSS和Javascript代码

2024-04-19 21:15:23 发布

您现在位置:Python中文网/ 问答频道 /正文

我想去掉某个网址对应页面中的所有代码,只保留文本。我尝试用下面的代码来实现,但没有得到想要的结果,因为输出中仍然包含 JavaScript 代码。

# Asker's first attempt: download the page at `url` and print its text
# using BeautifulSoup. According to the question, the output still
# contains JavaScript code.
req = urllib.request.Request(url)
page = urllib.request.urlopen(req)
# Raw response body as returned by the server.
html = page.read()
# Parse with the "html.parser" backend.
soup = BeautifulSoup(html, "html.parser")
print(soup.html.string)
# Print the repr of every string yielded by soup.stripped_strings.
for string in soup.stripped_strings:
    print("string: " + repr(string))

我得到的一个例子是:

(原文此处的示例输出在转载时丢失)

如何去掉该 URL 对应页面中的所有代码?

编辑I:

如果我使用 lxml.html:

# Asker's second attempt: download the page at `url` and print the
# result of lxml's text_content() on the parsed document. According to
# the question, the printed text still contains the page's inline
# JavaScript.
req = urllib.request.Request(url)
page = urllib.request.urlopen(req)
# Raw response body as returned by the server.
html = page.read()
import lxml.html
# Parse the response body into an lxml HTML document.
document = lxml.html.document_fromstring(html)
print(document.text_content())

它不起作用,我得到的输出是:

        BBC - Homepage            window.bbcredirection={geo:true}  






      bbcRequireMap = {"jquery-1":"http://static.bbci.co.uk/frameworks/jquery/0.4.1/sharedmodules/jquery-1.7.2", "jquery-1.4":"http://static.bbci.co.uk/frameworks/jquery/0.4.1/sharedmodules/jquery-1.4", "jquery-1.9":"http://static.bbci.co.uk/frameworks/jquery/0.4.1/sharedmodules/jquery-1.9.1", "jquery-1.12":"http://static.bbci.co.uk/frameworks/jquery/0.4.1/sharedmodules/jquery-1.12.0.min", "jquery-2.2":"http://static.bbci.co.uk/frameworks/jquery/0.4.1/sharedmodules/jquery-2.2.0.min", "istats-1":"//nav.files.bbci.co.uk/nav-analytics/0.1.0-43/js/istats-1", "swfobject-2":"http://static.bbci.co.uk/frameworks/swfobject/0.1.10/sharedmodules/swfobject-2", "demi-1":"http://static.bbci.co.uk/frameworks/demi/0.10.1/sharedmodules/demi-1", "gelui-1":"http://static.bbci.co.uk/frameworks/gelui/0.9.13/sharedmodules/gelui-1", "cssp!gelui-1/overlay":"http://static.bbci.co.uk/frameworks/gelui/0.9.13/sharedmodules/gelui-1/overlay.css", "relay-1":"http://static.bbci.co.uk/frameworks/relay/0.2.6/sharedmodules/relay-1", "clock-1":"http://static.bbci.co.uk/frameworks/clock/0.1.9/sharedmodules/clock-1", "canvas-clock-1":"http://static.bbci.co.uk/frameworks/clock/0.1.9/sharedmodules/canvas-clock-1", "cssp!clock-1":"http://static.bbci.co.uk/frameworks/clock/0.1.9/sharedmodules/clock-1.css", "jssignals-1":"http://static.bbci.co.uk/frameworks/jssignals/0.3.6/modules/jssignals-1", "jcarousel-1":"http://static.bbci.co.uk/frameworks/jcarousel/0.1.10/modules/jcarousel-1", "bump-3":"//emp.bbci.co.uk/emp/bump-3/bump-3", "ads":"http://static.bbci.co.uk/wwhp/1.126.0/modules/ads", "app":"http://static.bbci.co.uk/wwhp/1.126.0/modules/app", "compiled":"http://static.bbci.co.uk/wwhp/1.126.0/modules/compiled", "definejs":"http://static.bbci.co.uk/wwhp/1.126.0/modules/definejs", "homepage":"http://static.bbci.co.uk/wwhp/1.126.0/modules/homepage", "lib/core":"http://static.bbci.co.uk/wwhp/1.126.0/modules/lib/core", "lib/module/base":"http://static.bbci.co.uk/wwhp/1.126.0/modules/lib/module/base", 
"lib/module/manager":"http://static.bbci.co.uk/wwhp/1.126.0/modules/lib/module/manager", "lib/timeInterval":"http://static.bbci.co.uk/wwhp/1.126.0/modules/lib/timeInterval", "lib/util":"http://static.bbci.co.uk/wwhp/1.126.0/modules/lib/util", "modules/header":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/header", "modules/images":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/images", "modules/media":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/media", "modules/video":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/video", "modules/video/dataProvider":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/video/dataProvider", "modules/video/player":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/video/player", "modules/video/playlist":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/video/playlist", "modules/video/playlistBuilder":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/video/playlistBuilder", "modules/weather":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/weather", "mvt_tasks":"http://static.bbci.co.uk/wwhp/1.126.0/modules/mvt_tasks", "vendor/bower/cookie-monster/cookie-monster":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/cookie-monster/cookie-monster", "vendor/bower/fastclick/fastclick":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/fastclick/fastclick", "vendor/bower/happens/index":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/happens/index", "vendor/bower/html5shiv/html5shiv":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/html5shiv/html5shiv", "vendor/bower/imager.js/Imager":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/imager.js/Imager", "vendor/bower/jquery/jquery":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/jquery/jquery", "vendor/bower/js-breakpoints/breakpoints":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/js-breakpoints/breakpoints", 
"vendor/bower/modernizr/modernizr":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/modernizr/modernizr", "vendor/bower/moment/moment":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/moment/moment", "vendor/bower/promise-polyfill/Promise":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/promise-polyfill/Promise", "vendor/bower/slick.js/slick":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/slick.js/slick", "vendor/bower/slick.js/slick.min":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/slick.js/slick.min", "vendor/bower/squire/Squire":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/squire/Squire", "vendor/bower/underscore/underscore":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/underscore/underscore", "vendor/pre-built/bbc-video-experience/continuousPlay/module":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/pre-built/bbc-video-experience/continuousPlay/module"}; require({ baseUrl: 'http://static.bbci.co.uk/', paths: bbcRequireMap, waitSeconds: 30 });    /*<![CDATA[*/ if (typeof bbccookies_flag === 'undefined') { bbccookies_flag = 'ON'; } showCTA_flag = true; cta_enabled = (showCTA_flag && (bbccookies_flag === 'ON')); (function(){var m="ckns_policy",q="Thu, 01 Jan 1970 00:00:00 GMT",i={ads:true,personalisation:true,performance:true,necessary:true};function c(u){if(c.cache[u]){return c.cache[u]}var t=u.split("/"),v=[""];do{v.unshift((t.join("/")||"/"));t.pop()}while(v[0]!=="/");c.cache[u]=v;return v}c.cache={};function a(u){if(a.cache[u]){return a.cache[u]}var v=u.split("."),t=[];while(v.length&&"|co.uk|com|".indexOf("|"+v.join(".")+"|")===-1){if(v.length){t.push(v.join("."))}v.shift()}c.cache[u]=t;return t}a.cache={};function s(t,y,u){var E=[""].concat(a(window.location.hostname)),B=c(window.location.pathname),D="",w,C;for(var x=0,A=E.length;x<A;x++){w=E[x];for(var 
v=0,z=B.length;v<z;v++){C=B[v];D=t+"="+y+";"+(w?"domain="+w+";":"")+(C?"path="+C+";":"")+(u?"expires="+u+";":"");bbccookies.set(D,true)}}}window.bbccookies={POLICY_REFRESH_DATE_MILLIS:new Date(2015,4,21,0,0,0,0).getTime(),POLICY_EXPIRY_COOKIENAME:"ckns_policy_exp",_setEverywhere:s,cookiesEnabled:function(){var t="ckns_testcookie"+Math.floor(Math.random()*100000);this.set(t+"=1");if(this.get().indexOf(t)>-1){e(t);return true}return false},get:function(){return document.cookie},getCrumb:function(t){if(!t){return null}return decodeURIComponent(document.cookie.replace(new RegExp("(?:(?:^|.*;)\\s*"+encodeURIComponent(t).replace(/[\-\.\+\*]/g,"\\$&")+"\\s*\\=\\s*([^;]*).*$)|^.*$"),"$1"))||null},policyRequiresRefresh:function(){var u=new Date();u.setHours(0);u.setMinutes(0);u.setSeconds(0);u.setMilliseconds(0);if(bbccookies.POLICY_REFRESH_DATE_MILLIS<=u.getTime()){var t=bbccookies.getCrumb(bbccookies.POLICY_EXPIRY_COOKIENAME);if(t){t=new Date(parseInt(t));t.setYear(t.getFullYear()-1);return bbccookies.POLICY_REFRESH_DATE_MILLIS>=t.getTime()}else{return true}}else{return false}},_setPolicy:function(t){return f.apply(this,arguments)},readPolicy:function(){return l.apply(this,arguments)},_deletePolicy:function(){s(m,"",q)},_isConfirmed:function(){return n()!==null},_acceptsAll:function(){var t=l();return t&&!(j(t).indexOf("0")>-1)},_getCookieName:function(){return b.apply(this,arguments)},_showPrompt:function(){var t=((!this._isConfirmed()||this.policyRequiresRefresh())&&window.cta_enabled&&this.cookiesEnabled()&&!window.bbccookies_disable);return(window.orb&&window.orb.fig)?t&&(window.orb.fig("no")||window.orb.fig("ck")):t},setDefaultCookiesSingleDomain:function(){f.apply(this,[])},_getPolicy:this.readPolicy};function b(u){var t=(""+u).match(/^([^=]+)(?==)/);return(t&&t.length?t[0]:"")}function j(t){return""+(t.ads?1:0)+(t.personalisation?1:0)+(t.performance?1:0)}function f(x){if(typeof x==="undefined"){x=i}if(typeof arguments[0]==="string"){var 
u=arguments[0],w=arguments[1];if(u==="necessary"){w=true}x=l();x[u]=w}else{if(typeof arguments[0]==="object"){x.necessary=true}}var v=new Date();v.setYear(v.getFullYear()+1);bbccookies.set(m+"="+j(x)+";domain=bbc.co.uk;path=/;expires="+v.toUTCString()+";");bbccookies.set(m+"="+j(x)+";domain=bbc.com;path=/;expires="+v.toUTCString()+";");bbccookies.set(m+"="+j(x)+";domain=bbci.co.uk;path=/;expires="+v.toUTCString()+";");var t=new Date(v.getTime());t.setMonth(t.getMonth()+1);bbccookies.set(bbccookies.POLICY_EXPIRY_COOKIENAME+"="+v.getTime()+";domain=bbc.co.uk;path=/;expires="+t.toUTCString()+";");bbccookies.set(bbccookies.POLICY_EXPIRY_COOKIENAME+"="+v.getTime()+";domain=bbc.com;path=/;expires="+t.toUTCString()+";");bbccookies.set(bbccookies.POLICY_EXPIRY_COOKIENAME+"="+v.getTime()+";domain=bbci.co.uk;path=/;expires="+t.toUTCString()+";");return x}function o(t){if(t===null){return null}var u=t.split("");return{ads:!!+u[0],personalisation:!!+u[1],performance:!!+u[2],necessary:true}}function n(){var t=new RegExp("(?:^|; ?)"+m+"=(\\d\\d\\d)($|;)"),u=document.cookie.match(t);if(!u){return null}return u[1]}function l(t){var u=o(n());if(!u){u=i}if(t){return u[t]}else{return u}}function e(t){return document.cookie=t+"=;expires="+q+";"}var g=!(window.bbccookies_flag==="ON"&&!bbccookies._acceptsAll()&&!window.bbccookies_disable);var 
k={},d={"personalisation":"ckps_.+|X-AB-iplayer-.+|ACTVTYMKR|BBC_EXAMPLE_COOKIE|BBCIplayer|BBCiPlayerM|BBCIplayerSession|BBCMediaselector|BBCPostcoder|bbctravel|CGISESSID|ed|food-view|forceDesktop|h4|IMRID|locserv|MyLang|myloc|NTABS|ttduserPrefs|V5|WEATHER|BBCScienceDiscoveryPlaylist_.+|bitratePref|correctAnswerCount|genreCookie|highestQuestionScore|incorrectAnswerCount|longestStreak|MSCSProfile|programmes-oap-expanded|quickestAnswer|score|servicePanel|slowestAnswer|totalTimeForAllFormatted|v|BBCwords|score|correctAnswerCount|highestQuestionScore|hploc|BGUID|BBCWEACITY|mstouch|myway|BBCNewsCustomisation|cbbc_anim|cbeebies_snd|bbcsr_usersx|cbeebies_rd|BBC-Latest_Blogs|zh-enc|pref_loc|m|bbcEmp.+|recs-.+|_lvd2|_lvs2|tick|_fcap_CAM1|_rcc2","performance":"ckpf_.+|optimizely.*|BBCLiveStatsClick|id|_em_.+|cookies_enabled|mbox|mbox-admin|mc_.+|omniture_unique|s_.+|sc_.+|adpolicyAdDisplayFrequency|s1|ns_session|ns_cookietest|ns_ux|NO-SA|tr_pr1|gvsurvey|bbcsurvey|si_v|sa_labels|obuid|mm_.+|mmid|mmcore.+|mmpa.+","ads":"ckad_.+|rsi_segs|c","necessary":"ckns_.+|BBC-UID|blq\\.dPref|SSO2-UID|BBC-H2-User|rmRpDetectReal|bbcComSurvey|IDENTITY_ENV|IDENTITY|IDENTITY-HTTPS|IDENTITY_SESSION|BBCCOMMENTSMODULESESSID|bbcBump.+|IVOTE_VOTE_HISTORY|pulse|BBCPG|BBCPGstat|ecos\\.dt"};function r(){var x=document.cookie.replace(/; +/g,";").split(";"),u,v=[];for(var w=0,t=x.length;w<t;w++){u=x[w];v.push(bbccookies._getCookieName(u))}return v}function h(w){var v=JSON.stringify(w);if(typeof(k[v])!=="undefined"){return k[v]}var u="";for(var t in w){if(w.hasOwnProperty(t)&&d[t]){if(w[t]===true){u+=(u?"|":"")+d[t]}}}k[v]=new RegExp("^("+(u?u:".*")+")$","i");return k[v]}bbccookies.getPolicyExpiryDateTime=function(){return bbccookies.POLICY_EXPIRY_COOKIENAME};bbccookies.purge=function(){var u=bbccookies.readPolicy(),w=r(),x;for(var v=0,t=w.length;v<t;v++){if(!bbccookies.isAllowed(w[v],u)){x=new Date();x.setTime(0);x=x.toUTCString();s(w[v],"deleted",x)}}};function 
p(){if(g){return}bbccookies.purge();contentLoaded(window,bbccookies.purge);if(window.addEventListener){window.addEventListener("beforeunload",bbccookies.purge,false)}else{if(window.attachEvent){window.attachEvent("onbeforeunload",bbccookies.purge)}else{window.onbeforeunload=bbccookies.purge}}}bbccookies.set=function(u,t){if(g){return document.cookie=u}var v=bbccookies._getCookieName(u);if(t||bbccookies.isAllowed(v)){return document.cookie=u}return null};bbccookies.isAllowed=function(v){var u=bbccookies.readPolicy();var t=h(u);return t.test(v)};p()})();
/*!
 * contentloaded.js
 *
 * Author: Diego Perini (diego.perini at gmail.com)

Tags: modules http return if var static function jquery
2条回答

尝试使用 lxml.html:

import lxml.html

# Download the page at `url`; the `with` block closes the HTTP response
# as soon as the body has been read.
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as page:
    html = page.read()

document = lxml.html.document_fromstring(html)

# text_content() concatenates the text of *every* element, including the
# source inside <script> and <style>. Drop those elements first so only
# the human-readable text is left. xpath() returns a list, so removing
# nodes while looping over it is safe; drop_tree() keeps the tail text
# that follows the removed element.
for element in document.xpath("//script | //style"):
    element.drop_tree()

print(document.text_content())

如果您的目标是删除<script>标记(或任何其他特定类型的标记),您可以这样做:

# Download the page at `url`; the `with` block closes the HTTP response
# as soon as the body has been read.
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as page:
    html = page.read()
soup = BeautifulSoup(html, "html.parser")

# Replace every <script> element (tag and contents) with a single space.
# find_all() returns a list, so the tree can be modified while looping.
# replace_with() is the current name of the deprecated replaceWith().
# Pass a list such as ["script", "style"] to strip other tag types too.
for script in soup.find_all("script"):
    script.replace_with(" ")

上面的代码会把所有的 <script> 标记替换为一个空格。例如,您可以先去掉所有的脚本标记,然后像之前一样从剩余的标记中提取文本。

相关问题 更多 >