From 731f74ff3cc9d8ab14f08e5bd7b851eac4584d98 Mon Sep 17 00:00:00 2001 From: hippoz Date: Wed, 14 Apr 2021 01:15:12 +0300 Subject: [PATCH] return page as valid html and strip text --- app/app.py | 32 +++++++++++++++++++++++--------- app/templates/page.html | 11 +++++++++++ 2 files changed, 34 insertions(+), 9 deletions(-) create mode 100644 app/templates/page.html diff --git a/app/app.py b/app/app.py index 986946f..d8296c6 100644 --- a/app/app.py +++ b/app/app.py @@ -1,19 +1,20 @@ from bottle import route, run, template, request from bs4 import BeautifulSoup, Tag import time -import urllib +import urllib.request +import urllib.parse targets = { "html2_0": { "elements": [ - "div", "a", "address", "b", "base", "blockquote", "body", "br", "cite", "code", "dd", "dir", "dl", "dt", "em", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", "img", "input", "isindex", "kbd", "li", "listing", "menu", "meta", "nextid", "ol", "option", "p", "plaintext", "pre", "samp", "select", "strong", "textarea", "title", "tt", "ul", "var", "xmp" + "a", "address", "b", "base", "blockquote", "body", "br", "cite", "code", "dd", "dir", "dl", "dt", "em", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", "img", "input", "isindex", "kbd", "li", "listing", "menu", "meta", "nextid", "ol", "option", "p", "plaintext", "pre", "samp", "select", "strong", "textarea", "title", "tt", "ul", "var", "xmp" # "link" ], "flatten_elements": [ - "cite", "span", "address", "samp", "p", "div" + "div", "center", "article", "main", "footer", "header", "section" ], "allowed_attributes": { - "a": ["href", "methods", "name", "rel", "rev", "title", "urn" ], + "a": ["href", "methods", "rel", "rev", "urn" ], # name, title "base": ["href"], "dir": ["compact"], "dl": ["compact"], @@ -46,6 +47,10 @@ def parse_for_target(src, target, targets_dict): final_html = "" + # Unwrap the content from certain elements like div, so that we don't get really big nests of divs and stuff + for x in 
body.find_all(flatten_elements, recursive=True): + x.unwrap() + for x in body.find_all(recursive=True): # Remove the element if its not in our allowlist if (x.name not in target_tags): @@ -60,12 +65,17 @@ def parse_for_target(src, target, targets_dict): # Remove elements that have no content and no attributes if (len(x.get_text(strip=True)) == 0 and x.attrs == {}): x.extract() - - for x in body.find_all(flatten_elements, recursive=True): - x.unwrap() + continue + # Strip the text content of all elements for better looking pages in the absence of css + x.string = x.text.strip("\n\r ") for t in body: - final_html += str(t) + # Add the element to the final html string, but before that, strip it of any leading whitespace + s = str(t) + st = s.strip() + if s != st and st == "": + st = " " + final_html += st return final_html @@ -91,6 +101,10 @@ def index(): total_time_taken = round(render_page_time_taken + fetch_time_taken, 3) print(f"* process page finished - total_time_taken: { total_time_taken }s") - return parsed_page + + return template("templates/page.html", { + "page_content": parsed_page, + "page_title": "viewing page: " + request.query.url + }) run(host='localhost', port=8080) \ No newline at end of file diff --git a/app/templates/page.html b/app/templates/page.html new file mode 100644 index 0000000..2debaae --- /dev/null +++ b/app/templates/page.html @@ -0,0 +1,11 @@ + + + + + + {{ page_title }} + + + {{ !page_content }} + + \ No newline at end of file