# smartrss/fetch_exper_data.py

import os
import requests
import xmltodict
import pprint
from article_list import article, article_list
import re

# Non-greedy pattern matching anything shaped like a markup tag ("<...>");
# used to strip tags out of feed text. Compiled without re.DOTALL, so "."
# does not match a newline and a tag spanning several lines is not matched.
CLEANR = re.compile("<.*?>")


def download_xml(url: str, path: str, timeout: float = 30.0) -> bool:
    """
    Download xml from given url and store into file
    url: request url
    path: where to store xml payload to
    timeout: seconds to wait for the server before giving up
    return: True when the payload was written to path, False when the
        server answered with a status code other than 200
    raises: requests.RequestException when the request itself fails
        (connection error, timeout, invalid url); OSError when path
        cannot be opened for writing
    """
    # requests never times out on its own; without an explicit timeout a
    # stalled server would block this call forever.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return False
    # Written in text mode with the platform default encoding, which is the
    # same way read_xml_from_file opens the file for reading.
    with open(path, "w") as f:
        f.write(response.text)
    return True


def read_xml_from_file(path: str) -> dict:
    """
    Load a local xml file and parse it into a dict struct
    path: local xml file path
    return: the dict produced by xmltodict.parse for the file content
    """
    with open(path, "r") as xml_file:
        raw_xml = xml_file.read()
    # Parse after the file has been closed; only the text is needed
    return xmltodict.parse(raw_xml)


def _field_to_text(value) -> str:
    """Flatten one xmltodict field value into a plain string."""
    if value is None:
        # An empty element (e.g. <title/>) is parsed as None
        return ""
    if isinstance(value, str):
        return value
    if isinstance(value, dict):
        # An element carrying attributes is parsed into a dict whose text
        # content is stored under the "#text" key
        return _field_to_text(value.get("#text"))
    # Repeated tags are parsed into a list; concatenate their texts
    return "".join(_field_to_text(part) for part in value)


def xml_dict_to_article_list(
    xml_dict: dict, tagpath: list, reserved_fields: "list | None" = None
) -> article_list:
    """
    Translate a xml dict struct into a article_list
    xml_dict: the payload of xml dict
    tagpath: a list contains the iteration path from the root element to the itemlist, e.g ["rss", "channel", "item"]
    reserved_fields: for each item in itemlist, which fields will be preserved, e.g "title", "description", "pubdate"..
        Defaults to ["title", "description"]. Only "title" and "description" are
        used to build an article; when one of them is not reserved or is absent
        from an item, it becomes an empty string.
    return: an article_list with one article per item, markup tags removed and
        surrounding whitespace stripped
    raises: KeyError when a tag of tagpath is missing from xml_dict
    """
    if reserved_fields is None:
        reserved_fields = ["title", "description"]

    root = xml_dict
    for tag in tagpath:
        root = root[tag]

    # xmltodict yields a list only for repeated tags: a single item is parsed
    # as a bare dict and an empty element as None, so normalise to a list.
    if root is None:
        root = []
    elif isinstance(root, dict):
        root = [root]

    alist = article_list()
    for item in root:
        if not isinstance(item, dict):
            # An item without child elements carries no title/description
            continue
        kept = {key: value for key, value in item.items() if key in reserved_fields}
        # Remove tags first, then strip, so whitespace that was wrapped
        # inside a tag does not survive at the edges.
        title = CLEANR.sub("", _field_to_text(kept.get("title"))).strip()
        description = CLEANR.sub("", _field_to_text(kept.get("description"))).strip()
        alist.append(article(title, description))
    return alist


if __name__ == "__main__":
    # Script entry: download the "/yicai/brief" feed from the service rooted at
    # $RSSHUB_ROOT, store it locally, parse it and print the resulting articles.
    url_root = os.getenv("RSSHUB_ROOT")
    # Explicit check instead of assert: asserts are removed under "python -O"
    if not url_root:
        raise SystemExit("RSSHUB_ROOT environment variable is not set")
    # Tolerate a trailing slash in the configured root
    url = url_root.rstrip("/") + "/yicai/brief"
    filename = "exper/raw/yicai.brief.xml"
    # Stop here on a failed download rather than parsing a stale or missing file
    if not download_xml(url=url, path=filename):
        raise SystemExit(f"failed to download {url}")
    xml_dict = read_xml_from_file(filename)
    # Named "articles" so the imported article_list class is not rebound
    articles = xml_dict_to_article_list(
        xml_dict, ["rss", "channel", "item"], ["title", "description"]
    )
    print(str(articles))