3dd7d65ee7
Use an AI model to extract keywords from an article's content
70 lines
2.1 KiB
Python
70 lines
2.1 KiB
Python
import os
|
|
import requests
|
|
import xmltodict
|
|
import pprint
|
|
from article_list import article, article_list
|
|
import re
|
|
|
|
CLEANR = re.compile("<.*?>")
|
|
|
|
|
|
def download_xml(url: str, path: str, timeout: float = 30.0) -> bool:
    """
    Download xml from the given url and store it into a file.

    url: request url
    path: where to store the xml payload to
    timeout: seconds to wait for the server before giving up; without a
        timeout `requests.get` may block indefinitely on a stalled server

    Returns True when the payload was written, False when the server
    answered with a status code other than 200 (nothing is written then).
    Network failures, including timeouts, propagate as the exceptions
    raised by `requests.get`.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return False
    # NOTE: written with the platform default text encoding, the same one
    # a plain open(path, "r") uses when the file is read back.
    with open(path, "w") as f:
        f.write(response.text)
    return True
|
|
|
|
|
|
def read_xml_from_file(path: str) -> dict:
    """
    Load a local xml file and convert it into a dict structure.

    path: local xml file path

    Returns the result of `xmltodict.parse` applied to the file's text.
    """
    with open(path, "r") as xml_file:
        raw_xml = xml_file.read()
    return xmltodict.parse(raw_xml)
|
|
|
|
|
|
def _raw_text(value) -> str:
    """Flatten one xmltodict field value (None, str, dict or list) into plain text."""
    if value is None:
        # xmltodict yields None for an empty element such as <description/>.
        return ""
    if isinstance(value, str):
        return value
    if isinstance(value, dict):
        # An element carrying attributes becomes a dict whose text lives
        # under xmltodict's default "#text" key.
        return _raw_text(value.get("#text"))
    if isinstance(value, (list, tuple)):
        # A repeated element becomes a list; concatenate the parts.
        return "".join(_raw_text(part) for part in value)
    return str(value)


def _clean_field(item: dict, name: str, reserved_fields: list) -> str:
    """Return field `name` of `item` stripped and without markup, or "" if absent or not reserved."""
    value = item.get(name) if name in reserved_fields else None
    return re.sub(CLEANR, "", _raw_text(value).strip())


def xml_dict_to_article_list(
    xml_dict: dict, tagpath: list, reserved_fields: list = None
) -> article_list:
    """
    Translate an xml dict structure into an article_list.

    xml_dict: the payload of the xml dict
    tagpath: a list containing the iteration path from the root element to
        the item list, e.g. ["rss", "channel", "item"]
    reserved_fields: for each item in the item list, which fields will be
        preserved, e.g. "title", "description", "pubdate"..; defaults to
        ["title", "description"]

    Only "title" and "description" are used to build each article; a field
    that is missing, empty or not reserved becomes an empty string.
    Raises KeyError when a tag of `tagpath` is not present in `xml_dict`.
    """
    if reserved_fields is None:
        # Built per call instead of sharing one mutable default list.
        reserved_fields = ["title", "description"]

    node = xml_dict
    for tag in tagpath:
        node = node[tag]

    if node is None:
        # The final tag exists but is empty: there are no items.
        items = []
    elif isinstance(node, dict):
        # A single child element is parsed as one dict rather than a
        # one-element list; iterating it directly would walk its keys.
        items = [node]
    else:
        items = node

    alist = article_list()
    for item in items:
        title = _clean_field(item, "title", reserved_fields)
        description = _clean_field(item, "description", reserved_fields)
        alist.append(article(title, description))
    return alist
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry: fetch the yicai brief feed and print the parsed articles.
    url_root = os.getenv("RSSHUB_ROOT")
    if url_root is None:
        # Explicit check instead of `assert`, which is stripped under -O.
        raise SystemExit("environment variable RSSHUB_ROOT is not set")

    # Tolerate a trailing slash in the configured root.
    url = url_root.rstrip("/") + "/yicai/brief"
    filename = "exper/raw/yicai.brief.xml"

    if not download_xml(url=url, path=filename):
        if not os.path.exists(filename):
            raise SystemExit(f"failed to download {url} and {filename} does not exist")
        # Keep working from the previously downloaded copy.
        print(f"warning: failed to download {url}; using existing {filename}")

    xml_dict = read_xml_from_file(filename)
    # Named `articles` so the imported `article_list` class is not shadowed.
    articles = xml_dict_to_article_list(
        xml_dict, ["rss", "channel", "item"], ["title", "description"]
    )
    print(str(articles))
|