From 731f74ff3cc9d8ab14f08e5bd7b851eac4584d98 Mon Sep 17 00:00:00 2001 From: hippoz Date: Wed, 14 Apr 2021 01:15:12 +0300 Subject: [PATCH] return page as valid html and strip text --- app/app.py | 32 +++++++++++++++++++++++--------- app/templates/page.html | 11 +++++++++++ 2 files changed, 34 insertions(+), 9 deletions(-) create mode 100644 app/templates/page.html diff --git a/app/app.py b/app/app.py index 986946f..d8296c6 100644 --- a/app/app.py +++ b/app/app.py @@ -1,19 +1,20 @@ from bottle import route, run, template, request from bs4 import BeautifulSoup, Tag import time -import urllib +import urllib.request +import urllib.parse targets = { "html2_0": { "elements": [ - "div", "a", "address", "b", "base", "blockquote", "body", "br", "cite", "code", "dd", "dir", "dl", "dt", "em", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", "img", "input", "isindex", "kbd", "li", "listing", "menu", "meta", "nextid", "ol", "option", "p", "plaintext", "pre", "samp", "select", "strong", "textarea", "title", "tt", "ul", "var", "xmp" + "a", "address", "b", "base", "blockquote", "body", "br", "cite", "code", "dd", "dir", "dl", "dt", "em", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", "img", "input", "isindex", "kbd", "li", "listing", "menu", "meta", "nextid", "ol", "option", "p", "plaintext", "pre", "samp", "select", "strong", "textarea", "title", "tt", "ul", "var", "xmp" # "link" ], "flatten_elements": [ - "cite", "span", "address", "samp", "p", "div" + "div", "center", "article", "main", "footer", "header", "section" ], "allowed_attributes": { - "a": ["href", "methods", "name", "rel", "rev", "title", "urn" ], + "a": ["href", "methods", "rel", "rev", "urn" ], # name, title "base": ["href"], "dir": ["compact"], "dl": ["compact"], @@ -46,6 +47,10 @@ def parse_for_target(src, target, targets_dict): final_html = "" + # Unwrap the content from certain elements like div, so that we don't get really big nests of divs and stuff + for x in 
body.find_all(flatten_elements, recursive=True): + x.unwrap() + for x in body.find_all(recursive=True): # Remove the element if its not in our allowlist if (x.name not in target_tags): @@ -60,12 +65,17 @@ def parse_for_target(src, target, targets_dict): # Remove elements that have no content and no attributes if (len(x.get_text(strip=True)) == 0 and x.attrs == {}): x.extract() - - for x in body.find_all(flatten_elements, recursive=True): - x.unwrap() + continue + # Strip the text content of all elements for better looking pages in the absence of css + x.string = x.text.strip("\n\r ") for t in body: - final_html += str(t) + # Add the element to the final html string, but before that, strip it of any leading whitespace + s = str(t) + st = s.strip() + if s != st and st == "": + st = " " + final_html += st return final_html @@ -91,6 +101,10 @@ def index(): total_time_taken = round(render_page_time_taken + fetch_time_taken, 3) print(f"* process page finished - total_time_taken: { total_time_taken }s") - return parsed_page + + return template("templates/page.html", { + "page_content": parsed_page, + "page_title": "viewing page: " + request.query.url + }) run(host='localhost', port=8080) \ No newline at end of file diff --git a/app/templates/page.html b/app/templates/page.html new file mode 100644 index 0000000..2debaae --- /dev/null +++ b/app/templates/page.html @@ -0,0 +1,11 @@ + + + + + + {{ page_title }} + + + {{ !page_content }} + + \ No newline at end of file