import time
import urllib
import urllib.request  # required: `import urllib` alone does not bind urllib.request

from bottle import route, run, template, request
from bs4 import BeautifulSoup, Tag
|
||
|
|
||
|
targets = {
|
||
|
"html2_0": {
|
||
|
"elements": [
|
||
|
"div", "a", "address", "b", "base", "blockquote", "body", "br", "cite", "code", "dd", "dir", "dl", "dt", "em", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", "img", "input", "isindex", "kbd", "li", "listing", "menu", "meta", "nextid", "ol", "option", "p", "plaintext", "pre", "samp", "select", "strong", "textarea", "title", "tt", "ul", "var", "xmp"
|
||
|
# "link"
|
||
|
],
|
||
|
"flatten_elements": [
|
||
|
"cite", "span", "address", "samp", "p", "div"
|
||
|
],
|
||
|
"allowed_attributes": {
|
||
|
"a": ["href", "methods", "name", "rel", "rev", "title", "urn" ],
|
||
|
"base": ["href"],
|
||
|
"dir": ["compact"],
|
||
|
"dl": ["compact"],
|
||
|
"form": ["action", "enctype", "method"],
|
||
|
"html": ["version"],
|
||
|
"img": ["align", "alt", "ismap", "src"],
|
||
|
"input": ["align", "checked", "maxlength", "name", "size", "src", "type", "value"],
|
||
|
"link": ["href", "methods", "rel", "rev", "title", "urn"],
|
||
|
"menu": ["compact"],
|
||
|
"meta": ["content", "http-equiv", "name"],
|
||
|
"nextid": ["n"],
|
||
|
"ol": ["compact"],
|
||
|
"option": ["selected", "value"],
|
||
|
"pre": ["width"],
|
||
|
"select": ["multiple", "name", "size"],
|
||
|
"textarea": ["cols", "name", "rows"],
|
||
|
"ul": ["compact"]
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
def parse_for_target(src, target, targets_dict):
|
||
|
current_target = targets_dict[target]
|
||
|
target_tags = current_target["elements"]
|
||
|
allowed_attrs = current_target["allowed_attributes"]
|
||
|
flatten_elements = current_target["flatten_elements"]
|
||
|
|
||
|
html = BeautifulSoup(src, "html.parser")
|
||
|
body = html.find("body")
|
||
|
|
||
|
final_html = ""
|
||
|
|
||
|
for x in body.find_all(recursive=True):
|
||
|
# Remove the element if its not in our allowlist
|
||
|
if (x.name not in target_tags):
|
||
|
x.extract()
|
||
|
continue
|
||
|
# Strip all attributes aside from the ones we added into the allowlist
|
||
|
new_attrs = {}
|
||
|
for key, value in list(x.attrs.items()):
|
||
|
if (x.name in allowed_attrs) and (key in allowed_attrs[x.name]):
|
||
|
new_attrs[key] = value
|
||
|
x.attrs = new_attrs
|
||
|
# Remove elements that have no content and no attributes
|
||
|
if (len(x.get_text(strip=True)) == 0 and x.attrs == {}):
|
||
|
x.extract()
|
||
|
|
||
|
for x in body.find_all(flatten_elements, recursive=True):
|
||
|
x.unwrap()
|
||
|
|
||
|
for t in body:
|
||
|
final_html += str(t)
|
||
|
|
||
|
return final_html
|
||
|
|
||
|
|
||
|
@route('/page/html20/view')
|
||
|
def index():
|
||
|
print("* process page begin - target: html20(html2_0); via: http")
|
||
|
|
||
|
print(" --> fetch page begin")
|
||
|
fetch_start_time = time.time()
|
||
|
chosen_page_content = urllib.request.urlopen(request.query.url)
|
||
|
fetch_end_time = time.time()
|
||
|
fetch_time_taken = round(fetch_end_time - fetch_start_time, 3)
|
||
|
print(f" --> fetch page finished - time_taken: { fetch_time_taken }s")
|
||
|
|
||
|
print(" --> render page begin")
|
||
|
render_page_start_time = time.time()
|
||
|
parsed_page = parse_for_target(chosen_page_content, "html2_0", targets)
|
||
|
render_page_end_time = time.time()
|
||
|
render_page_time_taken = round(render_page_end_time - render_page_start_time, 3)
|
||
|
print(f" --> render page finished - time_taken: { render_page_time_taken }s")
|
||
|
|
||
|
total_time_taken = round(render_page_time_taken + fetch_time_taken, 3)
|
||
|
|
||
|
print(f"* process page finished - total_time_taken: { total_time_taken }s")
|
||
|
return parsed_page
|
||
|
|
||
|
run(host='localhost', port=8080)
|