UnisKB/apps/common/util/fork.py

176 lines
7.6 KiB
Python
Raw Normal View History

2023-12-29 10:02:23 +00:00
import copy
import logging
2023-12-27 10:33:23 +00:00
import re
2023-12-29 10:02:23 +00:00
import traceback
2023-12-27 10:33:23 +00:00
from functools import reduce
from typing import List, Set
from urllib.parse import urljoin, urlparse, ParseResult, urlsplit, urlunparse
2024-03-21 06:49:21 +00:00
2023-12-27 10:33:23 +00:00
import html2text as ht
2024-03-21 06:49:21 +00:00
import requests
2023-12-27 10:33:23 +00:00
from bs4 import BeautifulSoup
2024-01-25 07:25:07 +00:00
# Module-level side effect: turn off urllib3's warnings for the whole process.
# Fork.fork() in this file calls requests.get(..., verify=False); presumably this
# is here to silence the certificate warning each such call would otherwise emit.
requests.packages.urllib3.disable_warnings()
2023-12-29 10:02:23 +00:00
class ChildLink:
    """A link discovered on a page: the target URL plus the element it came from."""

    def __init__(self, url, tag):
        # The tag is deep-copied so this object keeps its own snapshot of the
        # element, independent of whatever the caller does with the original.
        self.url, self.tag = url, copy.deepcopy(tag)
2023-12-27 10:33:23 +00:00
class ForkManage:
    """Depth-limited recursive crawl that starts at ``base_url``.

    Every fetched page is reported to a caller-supplied handler, and the
    caller-supplied ``exclude_link_url`` set records which URLs have already
    been visited so no page is fetched twice.
    """

    def __init__(self, base_url: str, selector_list: List[str]):
        """
        :param base_url: URL of the first page to fetch.
        :param selector_list: selectors forwarded unchanged to every ``Fork``.
        """
        self.base_url = base_url
        self.selector_list = selector_list

    def fork(self, level: int, exclude_link_url: Set[str], fork_handler):
        """Crawl from ``base_url``.

        :param level: number of link hops to follow; ``0`` fetches only the
            start page, a negative value fetches nothing.
        :param exclude_link_url: URLs (without trailing slash) to skip; it is
            mutated in place as pages are visited.
        :param fork_handler: called as ``fork_handler(child_link, response)``
            once for every page that is fetched.
        """
        self.fork_child(ChildLink(self.base_url, None), self.selector_list, level, exclude_link_url, fork_handler)

    @staticmethod
    def _visit_key(url: str) -> str:
        """Return ``url`` without a single trailing slash (the form stored in the visited set)."""
        return url[:-1] if url.endswith('/') else url

    @staticmethod
    def fork_child(child_link: "ChildLink", selector_list: List[str], level: int, exclude_link_url: Set[str],
                   fork_handler):
        """Fetch one link and recurse into the links found on it.

        :param child_link: link to fetch; its ``url`` is rewritten in place
            with the fragment removed.
        :param selector_list: selectors forwarded to ``Fork``.
        :param level: remaining depth; nothing happens when it is negative.
        :param exclude_link_url: visited set, mutated in place.
        :param fork_handler: called as ``fork_handler(child_link, response)``.
        """
        if level < 0:
            return
        child_link.url = remove_fragment(child_link.url)
        visit_key = ForkManage._visit_key(child_link.url)
        if visit_key in exclude_link_url:
            return
        # Mark before fetching so a page that links to itself is not re-entered.
        exclude_link_url.add(visit_key)
        response = Fork(child_link.url, selector_list).fork()
        fork_handler(child_link, response)
        for next_link in response.child_link_list:
            # Cheap pre-check; fork_child() checks again after dropping the fragment.
            if ForkManage._visit_key(next_link.url) not in exclude_link_url:
                ForkManage.fork_child(next_link, selector_list, level - 1, exclude_link_url, fork_handler)
2023-12-27 10:33:23 +00:00
def remove_fragment(url: str) -> str:
    """Return ``url`` with its ``#fragment`` part removed.

    The URL is parsed and re-assembled, so scheme, host, path, params and
    query are kept. The fragment is replaced with an empty string, which is
    the value the parsed-result fields use for "absent"; the previous ``None``
    only worked because the re-assembly step tests the field for truthiness.
    """
    return urlparse(url)._replace(fragment='').geturl()
2023-12-27 10:33:23 +00:00
class Fork:
    """Fetch a single page, make its links absolute and convert it to text.

    ``base_fork_url`` is the normalised page URL (no fragment, no trailing
    slash, query kept) and doubles as the prefix a link must start with to be
    reported as a child link. ``base_url`` is just ``scheme://host``.
    """

    # Seconds passed to ``requests.get`` as ``timeout`` so a server that never
    # answers cannot block the crawl forever.
    REQUEST_TIMEOUT = 30

    class Response:
        """Result of one fetch: converted content, child links, status code and message."""

        def __init__(self, content: str, child_link_list: List["ChildLink"], status, message: str):
            self.content = content
            self.child_link_list = child_link_list
            self.status = status
            self.message = message

        @staticmethod
        def success(html_content: str, child_link_list: List["ChildLink"]):
            """Build a status-200 response with an empty message."""
            return Fork.Response(html_content, child_link_list, 200, '')

        @staticmethod
        def error(message: str):
            """Build a status-500 response with no content and no child links."""
            return Fork.Response('', [], 500, message)

    def __init__(self, base_fork_url: str, selector_list: List[str]):
        """
        :param base_fork_url: URL of the page to fetch.
        :param selector_list: ``.class``, ``#id`` or tag-name selectors that
            limit which part of the page is converted; empty/None entries are
            dropped.
        """
        # urlsplit() separates the fragment, so it is dropped simply by not
        # re-adding it below.
        parsed = urlsplit(base_fork_url)
        # Normalise the path WITHOUT the query attached. Appending '/' to a
        # URL that still carries '?query' would put the slash inside the
        # query, and urljoin(..., '.') would then discard the last path segment.
        bare_url = urlunparse((parsed.scheme, parsed.netloc, parsed.path, '', '', ''))
        directory_url = bare_url if bare_url.endswith('/') else bare_url + '/'
        # urljoin(dir, '.') resolves dot-segments and always ends in '/', which is stripped.
        self.base_fork_url = urljoin(directory_url, '.')[:-1]
        if parsed.query:
            self.base_fork_url = self.base_fork_url + '?' + parsed.query
        self.selector_list = [selector for selector in (selector_list or []) if selector]
        self.urlparse = urlparse(self.base_fork_url)
        self.base_url = ParseResult(scheme=self.urlparse.scheme, netloc=self.urlparse.netloc, path='', params='',
                                    query='',
                                    fragment='').geturl()

    def get_child_link_list(self, bf: "BeautifulSoup"):
        """Return a ``ChildLink`` for every ``<a href>`` whose URL starts with ``base_fork_url``.

        :param bf: parsed document.
        """
        # The URL is data, not a pattern: escape it so '.', '?', '(' ... in the
        # URL are matched literally and cannot raise re.error.
        pattern = "^((?!(http:|https:|tel:/|#|mailto:|javascript:))|" + re.escape(self.base_fork_url) + "|/).*"
        result = []
        for link in bf.find_all(name='a', href=re.compile(pattern)):
            href = link.get('href')
            url = href if href.startswith(self.base_url) else self.base_url + href
            if url.startswith(self.base_fork_url):
                result.append(ChildLink(url, link))
        return result

    def get_content_html(self, bf: "BeautifulSoup"):
        """Return the HTML to convert: the whole document, or only the elements matching the selectors.

        All selectors are merged into ONE ``find_all`` call, so an element must
        satisfy every kind given, and a later selector of the same kind
        (class / id / tag name) replaces an earlier one.
        """
        if not self.selector_list:
            return str(bf)
        params = {}
        for selector in self.selector_list:
            if selector.startswith('.'):
                params['class_'] = selector.replace('.', '')
            elif selector.startswith("#"):
                params['id'] = selector.replace("#", "")
            else:
                params['name'] = selector
        return "\n".join(str(row) for row in bf.find_all(**params))

    @staticmethod
    def reset_url(tag, field, base_fork_url):
        """Rewrite ``tag[field]`` in place as an absolute URL without a trailing slash.

        The page URL is treated as a directory, so ``page`` on
        ``https://host/docs`` becomes ``https://host/docs/page``. Root-relative
        (``/x``) and protocol-relative (``//host/x``) values, and values that
        carry a query or fragment, are resolved by ``urljoin``; a value with a
        different scheme is left unchanged.

        :param tag: mapping-like element whose attribute is rewritten.
        :param field: attribute name, e.g. ``'href'`` or ``'src'``.
        :param base_fork_url: URL of the page the element belongs to.
        """
        field_value: str = tag[field]
        base = urlsplit(base_fork_url)
        # Directory form of the page URL; its query must not take part in the join.
        directory_url = urlunparse((base.scheme, base.netloc, base.path.rstrip('/') + '/', '', '', ''))
        result_url = urljoin(directory_url, field_value)
        tag[field] = result_url[:-1] if result_url.endswith('/') else result_url

    def reset_beautiful_soup(self, bf: "BeautifulSoup"):
        """Make every non-absolute ``href`` / ``src`` in the document absolute, in place, and return it."""
        not_absolute = re.compile('^(?!(http:|https:|tel:/|#|mailto:|javascript:)).*')
        for field in ('href', 'src'):
            for tag in bf.find_all(**{field: not_absolute}):
                self.reset_url(tag, field, self.base_fork_url)
        return bf

    @staticmethod
    def get_beautiful_soup(response):
        """Decode the response body and parse it with the ``html.parser`` backend.

        The header encoding is used unless it is missing or ``ISO-8859-1``, in
        which case the detected encoding is used. If the document declares a
        different ``<meta charset>``, the body is decoded again with it; when
        that charset is unknown or does not fit the bytes, the first parse is
        kept instead of failing the page.
        """
        encoding = response.encoding
        if encoding is None or encoding == 'ISO-8859-1':
            encoding = response.apparent_encoding
        # Detection can come back empty (e.g. empty body); fall back to UTF-8.
        encoding = encoding or 'utf-8'
        html_content = response.content.decode(encoding)
        beautiful_soup = BeautifulSoup(html_content, "html.parser")
        charset_list = [meta.attrs.get('charset') for meta in beautiful_soup.find_all('meta') if
                        meta.attrs is not None and 'charset' in meta.attrs]
        if charset_list:
            charset = charset_list[0]
            # Codec names are case-insensitive; only re-decode on a real difference.
            if charset and charset.lower() != encoding.lower():
                try:
                    html_content = response.content.decode(charset)
                except (LookupError, UnicodeDecodeError):
                    return beautiful_soup
                return BeautifulSoup(html_content, "html.parser")
        return beautiful_soup

    def fork(self):
        """Fetch ``base_fork_url`` and return a ``Fork.Response``.

        A non-200 status or any exception raised while fetching/decoding is
        logged and returned as ``Fork.Response.error``. Otherwise links are
        made absolute, child links are collected and the selected HTML is
        converted with ``html2text``.
        """
        try:
            headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
            }
            logging.getLogger("max_kb").info(f'fork:{self.base_fork_url}')
            # NOTE(review): verify=False disables TLS certificate checking; kept
            # because it is the existing behaviour, but it allows MITM on https pages.
            response = requests.get(self.base_fork_url, verify=False, headers=headers,
                                    timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                logging.getLogger("max_kb").error(f"url: {self.base_fork_url} code:{response.status_code}")
                return Fork.Response.error(f"url: {self.base_fork_url} code:{response.status_code}")
            bf = self.get_beautiful_soup(response)
        except Exception as e:
            logging.getLogger("max_kb_error").error(f'{str(e)}:{traceback.format_exc()}')
            return Fork.Response.error(str(e))
        bf = self.reset_beautiful_soup(bf)
        link_list = self.get_child_link_list(bf)
        content = self.get_content_html(bf)
        r = ht.html2text(content)
        return Fork.Response.success(r, link_list)
def handler(base_url, response: Fork.Response):
    """Debug callback for ``ForkManage.fork``: print the link URL, its tag text (if any) and the page content.

    Despite its name, ``base_url`` is the link object (it is read via ``.url`` and ``.tag``).
    """
    link_url = base_url.url
    # The start page has no originating tag, so there is no link text to show.
    link_text = base_url.tag.text if base_url.tag else None
    print(link_url, link_text, response.content)
2023-12-27 10:33:23 +00:00
2023-12-29 10:02:23 +00:00
# ForkManage('https://bbs.fit2cloud.com/c/de/6', ['.md-content']).fork(3, set(), handler)