return page as valid html and strip text
This commit is contained in:
parent
8fa1ecacae
commit
731f74ff3c
2 changed files with 34 additions and 9 deletions
32
app/app.py
32
app/app.py
|
@ -1,19 +1,20 @@
|
||||||
from bottle import route, run, template, request
|
from bottle import route, run, template, request
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
import time
|
import time
|
||||||
import urllib
|
import urllib.request
|
||||||
|
import urllib.parse
|
||||||
|
|
||||||
targets = {
|
targets = {
|
||||||
"html2_0": {
|
"html2_0": {
|
||||||
"elements": [
|
"elements": [
|
||||||
"div", "a", "address", "b", "base", "blockquote", "body", "br", "cite", "code", "dd", "dir", "dl", "dt", "em", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", "img", "input", "isindex", "kbd", "li", "listing", "menu", "meta", "nextid", "ol", "option", "p", "plaintext", "pre", "samp", "select", "strong", "textarea", "title", "tt", "ul", "var", "xmp"
|
"a", "address", "b", "base", "blockquote", "body", "br", "cite", "code", "dd", "dir", "dl", "dt", "em", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", "img", "input", "isindex", "kbd", "li", "listing", "menu", "meta", "nextid", "ol", "option", "p", "plaintext", "pre", "samp", "select", "strong", "textarea", "title", "tt", "ul", "var", "xmp"
|
||||||
# "link"
|
# "link"
|
||||||
],
|
],
|
||||||
"flatten_elements": [
|
"flatten_elements": [
|
||||||
"cite", "span", "address", "samp", "p", "div"
|
"div", "center", "article", "main", "footer", "header", "section"
|
||||||
],
|
],
|
||||||
"allowed_attributes": {
|
"allowed_attributes": {
|
||||||
"a": ["href", "methods", "name", "rel", "rev", "title", "urn" ],
|
"a": ["href", "methods", "rel", "rev", "urn" ], # name, title
|
||||||
"base": ["href"],
|
"base": ["href"],
|
||||||
"dir": ["compact"],
|
"dir": ["compact"],
|
||||||
"dl": ["compact"],
|
"dl": ["compact"],
|
||||||
|
@ -46,6 +47,10 @@ def parse_for_target(src, target, targets_dict):
|
||||||
|
|
||||||
final_html = ""
|
final_html = ""
|
||||||
|
|
||||||
|
# Unwrap the content of certain wrapper elements such as div, so that we don't end up with deeply nested divs and similar containers
|
||||||
|
for x in body.find_all(flatten_elements, recursive=True):
|
||||||
|
x.unwrap()
|
||||||
|
|
||||||
for x in body.find_all(recursive=True):
|
for x in body.find_all(recursive=True):
|
||||||
# Remove the element if it's not in our allowlist
|
# Remove the element if it's not in our allowlist
|
||||||
if (x.name not in target_tags):
|
if (x.name not in target_tags):
|
||||||
|
@ -60,12 +65,17 @@ def parse_for_target(src, target, targets_dict):
|
||||||
# Remove elements that have no content and no attributes
|
# Remove elements that have no content and no attributes
|
||||||
if (len(x.get_text(strip=True)) == 0 and x.attrs == {}):
|
if (len(x.get_text(strip=True)) == 0 and x.attrs == {}):
|
||||||
x.extract()
|
x.extract()
|
||||||
|
continue
|
||||||
for x in body.find_all(flatten_elements, recursive=True):
|
# Strip the text content of all elements for better-looking pages in the absence of CSS
|
||||||
x.unwrap()
|
x.string = x.text.strip("\n\r ")
|
||||||
|
|
||||||
for t in body:
|
for t in body:
|
||||||
final_html += str(t)
|
# Add the element to the final HTML string, but first strip its leading and trailing whitespace (a whitespace-only node collapses to a single space)
|
||||||
|
s = str(t)
|
||||||
|
st = s.strip()
|
||||||
|
if s != st and st == "":
|
||||||
|
st = " "
|
||||||
|
final_html += st
|
||||||
|
|
||||||
return final_html
|
return final_html
|
||||||
|
|
||||||
|
@ -91,6 +101,10 @@ def index():
|
||||||
total_time_taken = round(render_page_time_taken + fetch_time_taken, 3)
|
total_time_taken = round(render_page_time_taken + fetch_time_taken, 3)
|
||||||
|
|
||||||
print(f"* process page finished - total_time_taken: { total_time_taken }s")
|
print(f"* process page finished - total_time_taken: { total_time_taken }s")
|
||||||
return parsed_page
|
|
||||||
|
return template("templates/page.html", {
|
||||||
|
"page_content": parsed_page,
|
||||||
|
"page_title": "viewing page: " + request.query.url
|
||||||
|
})
|
||||||
|
|
||||||
run(host='localhost', port=8080)
|
run(host='localhost', port=8080)
|
11
app/templates/page.html
Normal file
11
app/templates/page.html
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>{{ page_title }}</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
{{ !page_content }}
|
||||||
|
</body>
|
||||||
|
</html>
|
Loading…
Reference in a new issue