Added classes for aimodel and article/article_list; implemented a wrapper

to use aimodel to extract keywords from an article's content
2024-01-08 04:14:38 +00:00
parent 9d51bd91a8
commit 3dd7d65ee7
8 changed files with 442 additions and 386 deletions
@@ -2,6 +2,10 @@ import os
 import requests
 import xmltodict
 import pprint
+from article_list import article, article_list
+import re
+
+CLEANR = re.compile("<.*?>")


 def download_xml(url: str, path: str) -> bool:
@@ -30,8 +34,8 @@ def read_xml_from_file(path: str) -> dict:


 def xml_dict_to_article_list(
-    xml_dict: dict, tagpath: list, reserved_fields: list
-) -> list:
+    xml_dict: dict, tagpath: list, reserved_fields: list = ["title", "description"]
+) -> article_list:
    """
    Translate a xml dict struct into a article_list
    xml_dict: the payload of xml dict
@@ -41,11 +45,15 @@ def xml_dict_to_article_list(
    root = xml_dict
    for tag in tagpath:
        root = root[tag]
-    article_list = []
+    alist = article_list()
    for item in root:
        tmp_dict = {i: item[i] for i in item if i in reserved_fields}
-        article_list.append(tmp_dict)
-    return article_list
+        title = "".join(tmp_dict["title"]).strip()
+        title = re.sub(CLEANR, "", title)
+        description = "".join(tmp_dict["description"]).strip()
+        description = re.sub(CLEANR, "", description)
+        alist.append(article(title, description))
+    return alist


 if __name__ == "__main__":
@@ -56,7 +64,6 @@ if __name__ == "__main__":
    download_xml(url=url, path=filename)
    xml_dict = read_xml_from_file(filename)
    article_list = xml_dict_to_article_list(
-        xml_dict, ["rss", "channel", "item"], "title"
+        xml_dict, ["rss", "channel", "item"], ["title", "description"]
    )
-    pp = pprint.PrettyPrinter()
-    pp.pprint(article_list)
+    print(str(article_list))