1

I scraped the content of a website with Python and the requests library, and I tried to strip out all the HTML and JavaScript.

from lxml.html.clean import Cleaner
import lxml.html as html
text = html.document_fromstring(r.text).text_content()
cleaner = Cleaner(kill_tags=['noscript', 'img', 'a', 'h1'], remove_tags=['p'], style=True)
text = cleaner.clean_html(text)
text = ' '.join(text.split())

But I'm still getting a lot of stuff that looks like this (*** replacing identifying info):

var STYLEID = \'9\', STATICURL = \'static/\', IMGDIR = \'comiis_xzs19lou\', VERHASH = \'wRx\', charset = \'gbk\',  cookiepre = \'bdRb_a91d_\', cookiedomain = \'.***\', cookiepath = \'/\', showusercard = \'1\', attackevasive = \'0\', disallowfloat = \'login|newthread\', creditnotice = \'****\', defaultstyle = \'\', REPORTURL = \'***==\', SITEURL = \'http://****/\', JSPATH = \'data/cache/\', CSSPATH = \'data/cache/style_\', DYNAMICURL = \'\'; body{background:#EBDFC5;}

and

var fid = parseInt(\'12\'), tid = parseInt(\'2474591\'); zoomstatus = parseInt(1);var imagemaxwidth = \'600\';var aimgcount = new Array(); #framesbO97X { margin:0px !important;border:0px !important;}#portal_block_1251 { margin:0px !important;border:0px !important;}#portal_block_1251 .dxb_bc { margin:0px !important;}#frameJo3fOn { margin:0px !important;border:0px !important;}#portal_block_1252 { margin:0px !important;border:0px !important;}#portal_block_1252 .dxb_bc { margin:0px !important;}#framerU9V5m { margin:0px !important;border:0px !important;}#portal_block_1253 { margin:0px !important;border:0px !important;}#portal_block_1253 .dxb_bc { margin:0px !important;}#frameRCK99J { margin:10px 0px 5px !important;}#frameJ7YGtB { margin:0px !important;border:0px !important;}#portal_block_1255 { margin:0px !important;border:0px !important;}#portal_block_1255 .dxb_bc { margin:0px !important;}#framerVqc7m { margin:0px !important;border:0px !important;}#portal_block_1256 { margin:0px !important;border:0px !important;}#portal_block_1256 .dxb_bc { margin:0px !important;}#frameFW5eQe { margin:0px !important;border:0px !important;}#frameKBtrA1 { margin:0px !important;border:0px !important;}#portal_block_1257 { margin:0px !important;border:0px !important;}#portal_block_1257 .dxb_bc { margin:0px !important;}#portal_block_1258 { margin:0px !important;border:0px !important;}#portal_block_1258 .dxb_bc { margin:0px !important;}

and

function succeedhandle_followmod(url, msg, values) { var fObj = $(\'followmod_\'+values[\'fuid\']); if(values[\'type\'] == \'add\') { fObj.innerHTML = \***\'; fObj.href = \'home.php?mod=spacecp&ac=follow&op=del&fuid=\'+values[\'fuid\']; } else if(values[\'type\'] == \'del\') { fObj.innerHTML = \***\'; fObj.href = \'home.php?mod=spacecp&ac=follow&op=add&hash=6a62d013&fuid=\'+values[\'fuid\']; } } _***(null, $C("t_f", null, "td"), "", "***", "***"); var rel_tid = "2474591"; var rel_title = "%C9%BD%B6%AB%CA%AF%BB%AF%BE%AD%C0%ED%B7%EB%B6%AB%C7%E0%B4%FE%B2%B6%EE%BF%D1%BA%C6%DA%BC%E4%CB%C0%CD%F6%A1%AA%A1%AA%CA%C7%B7%F1%B1%BB%C9%BD%B6%AB%CA%A1%BC%EC%B2%EC%D4%BA%BC%EC%B2%EC%B3%A4%CE%E2%C5%F4%B7%C9%C3%F0%BF%DA"; var rel_reltid = "0"; var rel_prepos = ""; var my_siteid = "7149150"; var rel_uid = "0"; var rel_views = "3909"; var rel_replies = "11"; var rel_page = "1"; var rel_show = "0"; _attachEvent(window, \'load\', getForbiddenFormula, document); function getForbiddenFormula() { var toGetForbiddenFormulaFIds = function () { ajaxget(\'plugin.php?id=cloudsearch&formhash=6a62d013\'); }; var a = document.body.getElementsByTagName(\'a\'); for(var i = 0;i document.documentElement.clientWidth) { $(\'***').style.cssFloat = \'right\'; $(\'***').style.left = \'auto\'; $(\'***\').style.right = 0; } else { $(\'***\').style.cssFloat = \'left\'; $(\'***\').style.left = (qrleft) + \'px\'; $(\'***\').style.right = \'auto\'; } } _attachEvent(window, \'scroll\', function () { ***; }) _attachEvent(window, \'load\', function() { ***; }, document); #scrolltop { display: none; } ul#navmenu ul { display: none; position: absolute; left: -233px; bottom: 5px; } ul#navmenu li:hover ul ul, ul#navmenu li.iehover ul ul, { display: none; } ul#navmenu li:hover ul, ul#navmenu ul li:hover ul, ul#navmenu ul ul li:hover ul, ul#navmenu li.iehover ul, ul#navmenu ul li.iehover ul, ul#navmenu ul ul li.iehover ul { display: block; } #jz52top a {margin: 6px 0;} #jz52top { visibility: visible; right: 10px; } 
#jz52topa { visibility: hidden;} #jz52top, #jz52top a { border: none;} #jz52top { position: fixed; bottom: 40px; display: block; width: 40px; background: none repeat scroll 0% 0% transparent; border: 0px #cdcdcd solid; border-radius: 3px; border-top: 0; cursor: pointer; } #jz52top:hover { text-decoration: none; } #jz52top a { display: block; width: 40px; height: 40px; padding: 0; line-height: 12px; text-align: center; color: #787878; text-decoration: none; background: #00a398 url(\'source/plugin/jz52_top/template/jz52top.png\') no-repeat 0 0; border-top: 0px #cdcdcd solid; } a.jz52topa:hover { background-position: -40px 0px !important;} a.replyfast { background-position: 0 -40px !important; } a.replyfast:hover { background-position: -40px -40px !important;} a.returnlist { background-position: 0 -80px !important; } a.returnlist:hover { background-position: -40px -80px !important;} a.returnboard { background-position: -80px -240px !important; } a.returnboard:hover { background-position: -120px -240px !important;} a.jzqr { background-position: 0 -120px !important; } a.jzqr:hover { background-position: -40px -120px !important;} a.jzwx { background-position: 0 -320px !important; } a.jzwx:hover { background-position: -40px -320px !important;} a.jzkf { background-position: -80px 0px !important; } a.jzkf:hover { background-position: -120px -0px !important;} a.jzfx { background-position: -80px -40px !important; } a.jzfx:hover { background-position: -120px -40px !important;} .jzfxn { background: #fff !important; width: 231px !important; height: 260px !important; } a.jzlast { background-position: -80px -80px !important; } a.jzlast:hover { background-position: -120px -80px !important;} a.jznext { background-position: -80px -120px !important; } a.jznext:hover { background-position: -120px -120px !important;} a.jzsct { background-position: 0px -160px !important; } a.jzsct:hover { background-position: -40px -160px !important;} a.jzscb { background-position: -80px -160px 
!important; } a.jzscb:hover { background-position: -120px -160px !important;} a.jzqqq { background-position: 0px -200px !important; } a.jzqqq:hover { background-position: -40px -200px !important;} a.jzwo { background-position: -80px -200px !important; } a.jzwo:hover { background-position: -120px -200px !important;} a.jzzdy { background-position: 0px -240px !important; } a.jzzdy:hover { background-position: -40px -240px !important;} a.jzfbzt { background-position: 0px -280px !important; } a.jzfbzt:hover { background-position: -40px -280px !important;} #jzqrn { background: #fff !important; width: 231px !important; height: 260px !important; } #jzqrn { border: 1px solid rgb(210, 210, 210); } #jzqrn p { font-size: 15px; padding-bottom: 15px; text-align: center; color: #999; font-family: Microsoft YaHei; } #jzwon { background: #fff !important; width: 231px !important; height: 260px !important; } #jzwon { border: 1px solid rgb(210, 210, 210); } #jzfxn { border: 1px solid rgb(210, 210, 210); } #jzfxn h3 { height: 23px; background: none repeat scroll 0% 0% rgb(250, 250, 250); border-bottom: 1px solid rgb(236, 236, 236); padding: 10px 0px 0px 10px; } #jzfxn .bdsharebuttonbox { padding: 13px 0px 0px 20px; } #jzfxn .bdsharebuttonbox a, #jzfxn .bdsharebuttonbox .bds_more { float: left; font-size: 12px; padding-left: 25px; line-height: 16px; text-align: left; height: 16px; background: url("***") no-repeat scroll 0px 0px ; background-repeat: no-repeat; cursor: pointer; margin: 6px 6px 6px 0px; text-indent: 0; overflow: hidden; width: 68px; } #jzfxn .bdsharebuttonbox .bds_qzone { background-position: 0px -52px !important; } #jzfxn .bdsharebuttonbox .bds_tsina { background-position: 0px -104px !important; } #jzfxn .bdsharebuttonbox .bds_tqq { background-position: 0px -260px !important; } #jzfxn .bdsharebuttonbox .bds_renren { background-position: 0px -208px !important; } #jzfxn .bdsharebuttonbox .bds_tqf { background-position: 0px -364px !important; } #jzfxn .bdsharebuttonbox 
.bds_tieba { background-position: 0px -728px !important; } #jzfxn .bdsharebuttonbox .bds_sqq { background-position: 0px -2652px !important; } #jzfxn .bdsharebuttonbox .bds_hi { background-position: 0px -416px !important; } #jzfxn .bdsharebuttonbox .bds_isohu { background-position: 0px -3016px !important; } #jzfxn .bdsharebuttonbox .bds_weixin { background-position: 0px -1612px !important; } #jzfxn .bdsharebuttonbox .bds_t163 { background-position: 0px -832px !important; } #jzfxn .bdsharebuttonbox .bds_tsohu { background-position: 0px -520px !important; } #jzfxn .bdsharebuttonbox .bds_baidu { background-position: 0px -2600px !important; } #jzfxn .bdsharebuttonbox .bds_qq { background-position: 0px -624px !important; } #jz52top a b { visibility: hidden; font-weight: normal; } // JavaScript Document function goTopEx(){ var obj=document.getElementById("goTopBtn"); function getScrollTop(){ return document.documentElement.scrollTop || document.body.scrollTop; } function setScrollTop(value){ if(document.documentElement.scrollTop){ document.documentElement.scrollTop=value; }else{ document.body.scrollTop=value; } } window.onscroll=function(){getScrollTop()>0?obj.style.display="":obj.style.display="none"; var h=document.body.scrollHeight - getScrollTop() - obj.offsetTop - obj.offsetHeight; obj.style.bottom=0+"px";

Any idea how to get rid of all of that?

2 Answers 2

1

You need to remove the <script> tags if you want to get rid of the JavaScript. <noscript> tags only hold fallback content (an image or text) that is displayed when the browser has scripts disabled.

cleaner = Cleaner(kill_tags=['script', 'noscript', 'img', 'a', 'h1'], remove_tags=['p'], style=True)
Sign up to request clarification or add additional context in comments.

Comments

0

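My mistake, it was an error in my code: I called text_content() before running the cleaner, so the script and style contents had already been flattened into plain text by the time the cleaner saw them. Thanks for the hint about 'script', too! Correct code, which cleans the parsed document first and only then extracts the text: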

from lxml.html.clean import Cleaner
import lxml.html as html
text = html.document_fromstring(r.text)
cleaner = Cleaner(kill_tags=['script', 'noscript', 'img', 'a', 'h1'], remove_tags=['p'], style=True)
cleaner(text)
text = text.text_content()
text = ' '.join(text.split())

2 Comments

Remember, Stack Overflow isn't a forum, so don't treat answers as replies. You should only post your own answer if you're going to accept it as correct.
Yes I will, but I can only accept my own answer after 48 hours.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.