diff --git a/app/app.py b/app/app.py
new file mode 100644
index 0000000..986946f
--- /dev/null
+++ b/app/app.py
@@ -0,0 +1,96 @@
+from bottle import route, run, request
+from bs4 import BeautifulSoup
+import time
+import urllib.request
+
+targets = {
+    "html2_0": {
+        "elements": [
+            "div", "a", "address", "b", "base", "blockquote", "body", "br", "cite", "code", "dd", "dir", "dl", "dt", "em", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", "img", "input", "isindex", "kbd", "li", "listing", "menu", "meta", "nextid", "ol", "option", "p", "plaintext", "pre", "samp", "select", "strong", "textarea", "title", "tt", "ul", "var", "xmp"
+            # "link"
+        ],
+        "flatten_elements": [
+            "cite", "span", "address", "samp", "p", "div"
+        ],
+        "allowed_attributes": {
+            "a": ["href", "methods", "name", "rel", "rev", "title", "urn"],
+            "base": ["href"],
+            "dir": ["compact"],
+            "dl": ["compact"],
+            "form": ["action", "enctype", "method"],
+            "html": ["version"],
+            "img": ["align", "alt", "ismap", "src"],
+            "input": ["align", "checked", "maxlength", "name", "size", "src", "type", "value"],
+            "link": ["href", "methods", "rel", "rev", "title", "urn"],
+            "menu": ["compact"],
+            "meta": ["content", "http-equiv", "name"],
+            "nextid": ["n"],
+            "ol": ["compact"],
+            "option": ["selected", "value"],
+            "pre": ["width"],
+            "select": ["multiple", "name", "size"],
+            "textarea": ["cols", "name", "rows"],
+            "ul": ["compact"]
+        }
+    }
+}
+
+def parse_for_target(src, target, targets_dict):
+    current_target = targets_dict[target]
+    target_tags = current_target["elements"]
+    allowed_attrs = current_target["allowed_attributes"]
+    flatten_elements = current_target["flatten_elements"]
+
+    html = BeautifulSoup(src, "html.parser")
+    body = html.find("body")
+
+    final_html = ""
+
+    for x in body.find_all(recursive=True):
+        # Remove the element if it's not in our allowlist
+        if (x.name not in target_tags):
+            x.extract()
+            continue
+        # Strip all attributes aside from the ones we added into the allowlist
+        new_attrs = {}
+        for key, value in list(x.attrs.items()):
+            if (x.name in allowed_attrs) and (key in allowed_attrs[x.name]):
+                new_attrs[key] = value
+        x.attrs = new_attrs
+        # Remove elements that have no content and no attributes
+        if (len(x.get_text(strip=True)) == 0 and x.attrs == {}):
+            x.extract()
+
+    for x in body.find_all(flatten_elements, recursive=True):
+        x.unwrap()
+
+    for t in body:
+        final_html += str(t)
+
+    return final_html
+
+
+@route('/page/html20/view')
+def index():
+    print("* process page begin - target: html20(html2_0); via: http")
+
+    print(" --> fetch page begin")
+    fetch_start_time = time.time()
+    chosen_page_content = urllib.request.urlopen(request.query.url)
+    fetch_end_time = time.time()
+    fetch_time_taken = round(fetch_end_time - fetch_start_time, 3)
+    print(f" --> fetch page finished - time_taken: { fetch_time_taken }s")
+
+    print(" --> render page begin")
+    render_page_start_time = time.time()
+    parsed_page = parse_for_target(chosen_page_content, "html2_0", targets)
+    render_page_end_time = time.time()
+    render_page_time_taken = round(render_page_end_time - render_page_start_time, 3)
+    print(f" --> render page finished - time_taken: { render_page_time_taken }s")
+
+    total_time_taken = round(render_page_time_taken + fetch_time_taken, 3)
+
+    print(f"* process page finished - total_time_taken: { total_time_taken }s")
+    return parsed_page
+
+run(host='localhost', port=8080)
\ No newline at end of file
diff --git a/app/requirements.txt b/app/requirements.txt
new file mode 100644
index 0000000..705db43
--- /dev/null
+++ b/app/requirements.txt
@@ -0,0 +1,2 @@
+beautifulsoup4
+bottle
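
For reference, a minimal, hypothetical way to exercise the new endpoint once the app is running locally (python app/app.py): the route added above is /page/html20/view and it expects the page to simplify in the url query parameter. The target URL below is a placeholder, not part of the change.

    import urllib.parse
    import urllib.request

    # Page to strip down to the HTML 2.0 allowlist (placeholder value).
    page = "https://example.com/"
    query = urllib.parse.urlencode({"url": page})

    # Ask the locally running app to fetch and rewrite the page.
    with urllib.request.urlopen(f"http://localhost:8080/page/html20/view?{query}") as resp:
        print(resp.read().decode("utf-8", errors="replace"))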