return page as valid html and strip text

This commit is contained in:
hippoz 2021-04-14 01:15:12 +03:00
parent 8fa1ecacae
commit 731f74ff3c
2 changed files with 34 additions and 9 deletions

View file

@ -1,19 +1,20 @@
from bottle import route, run, template, request from bottle import route, run, template, request
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
import time import time
import urllib import urllib.request
import urllib.parse
targets = { targets = {
"html2_0": { "html2_0": {
"elements": [ "elements": [
"div", "a", "address", "b", "base", "blockquote", "body", "br", "cite", "code", "dd", "dir", "dl", "dt", "em", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", "img", "input", "isindex", "kbd", "li", "listing", "menu", "meta", "nextid", "ol", "option", "p", "plaintext", "pre", "samp", "select", "strong", "textarea", "title", "tt", "ul", "var", "xmp" "a", "address", "b", "base", "blockquote", "body", "br", "cite", "code", "dd", "dir", "dl", "dt", "em", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", "img", "input", "isindex", "kbd", "li", "listing", "menu", "meta", "nextid", "ol", "option", "p", "plaintext", "pre", "samp", "select", "strong", "textarea", "title", "tt", "ul", "var", "xmp"
# "link" # "link"
], ],
"flatten_elements": [ "flatten_elements": [
"cite", "span", "address", "samp", "p", "div" "div", "center", "article", "main", "footer", "header", "section"
], ],
"allowed_attributes": { "allowed_attributes": {
"a": ["href", "methods", "name", "rel", "rev", "title", "urn" ], "a": ["href", "methods", "rel", "rev", "urn" ], # name, title
"base": ["href"], "base": ["href"],
"dir": ["compact"], "dir": ["compact"],
"dl": ["compact"], "dl": ["compact"],
@ -46,6 +47,10 @@ def parse_for_target(src, target, targets_dict):
final_html = "" final_html = ""
# Unwrap the content from certain elements like div, so that we don't get really big nests of divs and stuff
for x in body.find_all(flatten_elements, recursive=True):
x.unwrap()
for x in body.find_all(recursive=True): for x in body.find_all(recursive=True):
# Remove the element if its not in our allowlist # Remove the element if its not in our allowlist
if (x.name not in target_tags): if (x.name not in target_tags):
@ -60,12 +65,17 @@ def parse_for_target(src, target, targets_dict):
# Remove elements that have no content and no attributes # Remove elements that have no content and no attributes
if (len(x.get_text(strip=True)) == 0 and x.attrs == {}): if (len(x.get_text(strip=True)) == 0 and x.attrs == {}):
x.extract() x.extract()
continue
for x in body.find_all(flatten_elements, recursive=True): # Strip the text content of all elements for better looking pages in the absence of css
x.unwrap() x.string = x.text.strip("\n\r ")
for t in body: for t in body:
final_html += str(t) # Add the element to the final html string, but before that, strip it of any leading and trailing whitespace
s = str(t)
st = s.strip()
if s != st and st == "":
st = " "
final_html += st
return final_html return final_html
@ -91,6 +101,10 @@ def index():
total_time_taken = round(render_page_time_taken + fetch_time_taken, 3) total_time_taken = round(render_page_time_taken + fetch_time_taken, 3)
print(f"* process page finished - total_time_taken: { total_time_taken }s") print(f"* process page finished - total_time_taken: { total_time_taken }s")
return parsed_page
return template("templates/page.html", {
"page_content": parsed_page,
"page_title": "viewing page: " + request.query.url
})
run(host='localhost', port=8080) run(host='localhost', port=8080)

11
app/templates/page.html Normal file
View file

@ -0,0 +1,11 @@
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{{ page_title }}</title>
</head>
<body>
{{ !page_content }}
</body>
</html>