2
0
mirror of https://github.com/searx/searx synced 2025-08-29 05:18:06 +00:00

Replace every run of whitespace with a single space in HTML text

This commit is contained in:
Cqoicebordel 2015-01-30 21:00:49 +01:00
parent a3d444ab85
commit 52a57ee045

View File

@@ -119,6 +119,8 @@ class HTMLTextExtractor(HTMLParser):
def html_to_text(html):
    """Return the text that HTMLTextExtractor produces for *html*.

    Whitespace is normalised before parsing: every run of whitespace
    (spaces, tabs, newlines, ...) is collapsed to a single space, and
    leading/trailing whitespace is dropped.

    :param html: markup to convert; must be a string, since it is split
        with its ``split()`` method.
    :returns: whatever ``HTMLTextExtractor.get_text()`` yields after the
        normalised markup has been fed to the extractor.
    """
    # split() without a separator already treats '\n' as whitespace, so a
    # separate newline-to-space replacement beforehand is unnecessary.
    normalised = ' '.join(html.split())
    extractor = HTMLTextExtractor()
    extractor.feed(normalised)
    return extractor.get_text()