Basic page simplification (not for production use! This has an SSRF vulnerability and the code is sloppy!)

This commit is contained in:
hippoz 2021-04-12 17:57:04 +03:00
parent aacb6745f6
commit ab7f7882a1
Signed by: hippoz
GPG key ID: 7C52899193467641
2 changed files with 98 additions and 0 deletions

96
app/app.py Normal file
View file

@ -0,0 +1,96 @@
import time
import urllib
import urllib.request

from bottle import route, run, template, request
from bs4 import BeautifulSoup, Tag
# Per-target HTML simplification rules. Each target maps to:
#   elements           - tag allowlist; anything else is removed outright
#   flatten_elements   - tags whose wrapper is removed but contents kept
#   allowed_attributes - per-tag attribute allowlist (tags absent here keep none)
targets = {
    "html2_0": {
        "elements": [
            "div", "a", "address", "b", "base", "blockquote", "body", "br",
            "cite", "code", "dd", "dir", "dl", "dt", "em", "form",
            "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr",
            "html", "i", "img", "input", "isindex", "kbd", "li", "listing",
            "menu", "meta", "nextid", "ol", "option", "p", "plaintext", "pre",
            "samp", "select", "strong", "textarea", "title", "tt", "ul", "var",
            "xmp"
            # "link" deliberately excluded
        ],
        "flatten_elements": [
            "cite", "span", "address", "samp", "p", "div"
        ],
        "allowed_attributes": {
            "a":        ["href", "methods", "name", "rel", "rev", "title", "urn"],
            "base":     ["href"],
            "dir":      ["compact"],
            "dl":       ["compact"],
            "form":     ["action", "enctype", "method"],
            "html":     ["version"],
            "img":      ["align", "alt", "ismap", "src"],
            "input":    ["align", "checked", "maxlength", "name", "size", "src", "type", "value"],
            "link":     ["href", "methods", "rel", "rev", "title", "urn"],
            "menu":     ["compact"],
            "meta":     ["content", "http-equiv", "name"],
            "nextid":   ["n"],
            "ol":       ["compact"],
            "option":   ["selected", "value"],
            "pre":      ["width"],
            "select":   ["multiple", "name", "size"],
            "textarea": ["cols", "name", "rows"],
            "ul":       ["compact"]
        }
    }
}
def parse_for_target(src, target, targets_dict):
    """Reduce an HTML document to the subset allowed by *target*.

    Parses ``src`` (markup accepted by BeautifulSoup: str, bytes, or a
    file-like object), removes every element not in the target's tag
    allowlist, strips attributes not in its per-tag attribute allowlist,
    drops empty attribute-less elements, and unwraps the target's
    ``flatten_elements``. Returns the simplified markup as a string.

    Raises KeyError if ``target`` is not a key of ``targets_dict``.
    """
    spec = targets_dict[target]
    allowed_tags = spec["elements"]
    allowed_attrs = spec["allowed_attributes"]
    flatten_tags = spec["flatten_elements"]

    # Void elements are meaningful even with no text and no attributes
    # (e.g. a bare <br> or <hr>); never remove them as "empty".
    void_tags = {"br", "hr", "img", "input", "base", "meta", "isindex", "nextid"}

    html = BeautifulSoup(src, "html.parser")
    # Fall back to the whole document when there is no <body> (fragments,
    # malformed pages) instead of crashing on None.
    body = html.find("body") or html

    for tag in body.find_all(recursive=True):
        # Remove the element entirely if it is not in the allowlist.
        if tag.name not in allowed_tags:
            tag.extract()
            continue
        # Keep only allowlisted attributes for this tag.
        tag.attrs = {
            key: value
            for key, value in tag.attrs.items()
            if tag.name in allowed_attrs and key in allowed_attrs[tag.name]
        }
        # Drop elements that ended up with no content and no attributes,
        # except void elements, which carry meaning by themselves.
        if (tag.name not in void_tags
                and not tag.get_text(strip=True)
                and not tag.attrs):
            tag.extract()

    # Unwrap flatten targets: keep their children, drop the wrapper tag.
    for tag in body.find_all(flatten_tags, recursive=True):
        tag.unwrap()

    return "".join(str(child) for child in body)
@route('/page/html20/view')
def index():
    """Fetch the URL given in ``?url=`` and return it simplified to HTML 2.0.

    NOTE(security): this endpoint fetches a user-supplied URL. The scheme
    check below blocks file:// and friends, but requests can still reach
    internal hosts (SSRF) — see the warning at the top of this file.
    """
    url = request.query.url
    if not url.startswith(("http://", "https://")):
        return "error: only http:// and https:// URLs are supported"
    print("* process page begin - target: html20(html2_0); via: http")
    print(" --> fetch page begin")
    fetch_start_time = time.time()
    # Close the connection deterministically and bound the request so a
    # slow upstream cannot hold the worker forever.
    with urllib.request.urlopen(url, timeout=15) as response:
        chosen_page_content = response.read()
    fetch_end_time = time.time()
    fetch_time_taken = round(fetch_end_time - fetch_start_time, 3)
    print(f" --> fetch page finished - time_taken: { fetch_time_taken }s")
    print(" --> render page begin")
    render_page_start_time = time.time()
    parsed_page = parse_for_target(chosen_page_content, "html2_0", targets)
    render_page_end_time = time.time()
    render_page_time_taken = round(render_page_end_time - render_page_start_time, 3)
    print(f" --> render page finished - time_taken: { render_page_time_taken }s")
    total_time_taken = round(render_page_time_taken + fetch_time_taken, 3)
    print(f"* process page finished - total_time_taken: { total_time_taken }s")
    return parsed_page

run(host='localhost', port=8080)

2
app/requirements.txt Normal file
View file

@ -0,0 +1,2 @@
beautifulsoup4
bottle