import os
import pprint
import re

import requests
import xmltodict

from article_list import article, article_list

# Non-greedy pattern matching anything that looks like an HTML/XML tag.
CLEANR = re.compile(r"<.*?>")


def download_xml(url: str, path: str, timeout: float = 30.0) -> bool:
    """
    Download xml from given url and store it into a file.

    url: request url
    path: where to store the xml payload
    timeout: seconds to wait for the server before requests raises a
        timeout error (without it a stalled server blocks forever)

    Returns True when the server answered with HTTP 200 and the payload was
    written, False for any other status code (nothing is written then).
    Network errors raised by requests are propagated to the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return False
    # Explicit encoding: the platform default may not be able to represent
    # the feed's characters, and must match what read_xml_from_file uses.
    with open(path, "w", encoding="utf-8") as f:
        f.write(response.text)
    return True


def read_xml_from_file(path: str) -> dict:
    """
    Read xml from a file and convert it into a dict struct.

    path: local xml file path (read as UTF-8 text)

    Returns the structure produced by xmltodict.parse.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = f.read()
    return xmltodict.parse(data)


def _to_text(value) -> str:
    """Flatten one parsed element value (None/str/dict/list) to plain text."""
    if value is None:
        # Missing field or empty element.
        return ""
    if isinstance(value, str):
        return value
    if isinstance(value, dict):
        # Element carrying attributes: the character data lives under "#text".
        return _to_text(value.get("#text"))
    if isinstance(value, (list, tuple)):
        # Repeated element: concatenate the text of every occurrence.
        return "".join(_to_text(part) for part in value)
    return str(value)


def _clean_text(value) -> str:
    """Return the stripped text of *value* with tag-like markup removed."""
    return CLEANR.sub("", _to_text(value).strip())


def xml_dict_to_article_list(
    xml_dict: dict,
    tagpath: list,
    reserved_fields: list = ("title", "description"),
) -> article_list:
    """
    Translate an xml dict struct into an article_list.

    xml_dict: the payload of the xml dict
    tagpath: a list containing the iteration path from the root element to the
        itemlist, e.g. ["rss", "channel", "item"]
    reserved_fields: for each item in the itemlist, which fields will be
        preserved, e.g. "title", "description", "pubdate"..

    Each item contributes one article built from its "title" and
    "description"; a field that is absent, empty, or not listed in
    reserved_fields becomes an empty string.

    Raises KeyError when a tag of tagpath is missing from xml_dict.
    """
    root = xml_dict
    for tag in tagpath:
        root = root[tag]

    # A feed with a single item is parsed as one mapping instead of a list of
    # mappings, and an empty element as None; normalise both to a list.
    if root is None:
        root = []
    elif isinstance(root, dict):
        root = [root]

    alist = article_list()
    for item in root:
        fields = {key: value for key, value in item.items() if key in reserved_fields}
        title = _clean_text(fields.get("title"))
        description = _clean_text(fields.get("description"))
        alist.append(article(title, description))
    return alist


def _main() -> None:
    """Download the yicai brief feed from RSSHUB_ROOT and print its articles."""
    url_root = os.getenv("RSSHUB_ROOT")
    if url_root is None:
        # Explicit check instead of assert, which is stripped under -O.
        raise SystemExit("environment variable RSSHUB_ROOT is not set")
    url = url_root + "/yicai/brief"
    filename = "exper/raw/yicai.brief.xml"
    # Make sure the target directory exists before writing into it.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    if not download_xml(url=url, path=filename):
        raise SystemExit("failed to download " + url)
    xml_dict = read_xml_from_file(filename)
    # Named `articles` so the imported article_list class is not shadowed.
    articles = xml_dict_to_article_list(
        xml_dict, ["rss", "channel", "item"], ["title", "description"]
    )
    print(str(articles))


if __name__ == "__main__":
    _main()