from bottle import route, run, template, request
from bs4 import BeautifulSoup, NavigableString
import time
import urllib.request

targets = {
    "html2_0": {
        "elements": [
            "a", "address", "b", "base", "blockquote", "body", "br", "cite",
            "code", "dd", "dir", "dl", "dt", "em", "form", "h1", "h2", "h3",
            "h4", "h5", "h6", "head", "hr", "html", "i", "img", "input",
            "isindex", "kbd", "li", "listing", "menu", "meta", "nextid", "ol",
            "option", "p", "plaintext", "pre", "samp", "select", "strong",
            "textarea", "title", "tt", "ul", "var", "xmp"
            # "link"
        ],
        "flatten_elements": [
            "div", "center", "article", "main", "footer", "header", "section"
        ],
        "allowed_attributes": {
            "a": ["href", "methods", "rel", "rev", "urn"],  # name, title
            "base": ["href"],
            "dir": ["compact"],
            "dl": ["compact"],
            "form": ["action", "enctype", "method"],
            "html": ["version"],
            "img": ["align", "alt", "ismap", "src"],
            "input": ["align", "checked", "maxlength", "name", "size", "src",
                      "type", "value"],
            "link": ["href", "methods", "rel", "rev", "title", "urn"],
            "menu": ["compact"],
            "meta": ["content", "http-equiv", "name"],
            "nextid": ["n"],
            "ol": ["compact"],
            "option": ["selected", "value"],
            "pre": ["width"],
            "select": ["multiple", "name", "size"],
            "textarea": ["cols", "name", "rows"],
            "ul": ["compact"]
        }
    }
}

# Void elements never have text content, so the "empty element" check below
# must not remove them.
VOID_ELEMENTS = ("base", "br", "hr", "img", "input", "isindex", "meta", "nextid")


def parse_for_target(src, target, targets_dict):
    current_target = targets_dict[target]
    target_tags = current_target["elements"]
    allowed_attrs = current_target["allowed_attributes"]
    flatten_elements = current_target["flatten_elements"]

    html = BeautifulSoup(src, "html.parser")
    # Fall back to the whole document if the page has no explicit <body>.
    body = html.find("body") or html
    final_html = ""

    # Unwrap purely structural containers like <div>, so we don't end up
    # with deep nests of elements the target profile cannot render.
    for x in body.find_all(flatten_elements, recursive=True):
        x.unwrap()

    for x in body.find_all(recursive=True):
        # Remove the element if it is not in our allowlist.
        if x.name not in target_tags:
            x.extract()
            continue

        # Strip all attributes aside from the ones in the allowlist.
        new_attrs = {}
        for key, value in list(x.attrs.items()):
            if (x.name in allowed_attrs) and (key in allowed_attrs[x.name]):
                new_attrs[key] = value
        x.attrs = new_attrs

        # Remove non-void elements that have no content and no attributes.
        if (x.name not in VOID_ELEMENTS
                and len(x.get_text(strip=True)) == 0
                and x.attrs == {}):
            x.extract()
            continue

        # Strip surrounding whitespace from text content for better looking
        # pages in the absence of CSS. Only do this when the tag's sole child
        # is a text node; assigning .string on a tag with nested children
        # would flatten its markup.
        if len(x.contents) == 1 and isinstance(x.contents[0], NavigableString):
            x.string = x.contents[0].strip("\n\r ")

    for t in body:
        # Append each top-level node to the final HTML string, but first strip
        # it of surrounding whitespace, collapsing whitespace-only text nodes
        # to a single space.
        s = str(t)
        st = s.strip()
        if s != st and st == "":
            st = " "
        final_html += st

    return final_html


@route('/page/html20/view')
def index():
    print("* process page begin - target: html20(html2_0); via: http")

    print(" --> fetch page begin")
    fetch_start_time = time.time()
    chosen_page_content = urllib.request.urlopen(request.query.url)
    fetch_end_time = time.time()
    fetch_time_taken = round(fetch_end_time - fetch_start_time, 3)
    print(f" --> fetch page finished - time_taken: {fetch_time_taken}s")

    print(" --> render page begin")
    render_page_start_time = time.time()
    parsed_page = parse_for_target(chosen_page_content, "html2_0", targets)
    render_page_end_time = time.time()
    render_page_time_taken = round(render_page_end_time - render_page_start_time, 3)
    print(f" --> render page finished - time_taken: {render_page_time_taken}s")

    total_time_taken = round(render_page_time_taken + fetch_time_taken, 3)
    print(f"* process page finished - total_time_taken: {total_time_taken}s")

    return template("templates/page.html", {
        "page_content": parsed_page,
        "page_title": "viewing page: " + request.query.url
    })


run(host='localhost', port=8080)