Added new script to extract verb-subject pairs from tech posts

2024-05-10 04:22:06 +00:00
parent 3dd7d65ee7
commit f83d04fdd4
6 changed files with 15798 additions and 821 deletions
@@ -0,0 +1,212 @@
+import nltk
+import os
+from fetch_exper_data import download_xml, read_xml_from_file, xml_dict_to_article_list
+import spacy
+from bs4 import BeautifulSoup
+
+
def extract_verb_subject(sentence):
    """
    Pairs each verb in a text with the next noun or pronoun that follows it.

    This is a positional heuristic over NLTK part-of-speech tags, not a
    grammatical analysis: the word stored under "subject" is simply the first
    noun/pronoun token seen after the verb. If a second verb appears before
    any noun or pronoun, it replaces the pending one and the earlier verb
    produces no pair.

    The NLTK tokenizer and tagger resources must already be downloaded.

    Args:
        sentence: A string representing the text to be analyzed.

    Returns:
        A list of dictionaries in order of appearance, each with two keys:
            - verb: The lower-cased verb token.
            - subject: The lower-cased noun or pronoun token paired with it.
    """
    tokens = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(tokens)

    verb_subject_pairs = []
    pending_verb = None  # most recent verb still waiting for a noun/pronoun
    for token, pos_tag in pos_tags:
        if pos_tag.startswith("VB"):
            pending_verb = token.lower()
        elif pending_verb is not None and pos_tag.startswith(("NN", "PRP")):
            verb_subject_pairs.append(
                {"verb": pending_verb, "subject": token.lower()}
            )
            # Each verb is paired at most once.
            pending_verb = None

    return verb_subject_pairs
+
+
def test_ver1():
    """
    Manual demo for extract_verb_subject.

    Downloads the NLTK tagger and tokenizer resources, runs the extractor on
    a sample paragraph and prints the resulting list of pairs.
    """
    for resource in ("averaged_perceptron_tagger", "punkt"):
        nltk.download(resource)  # fetches the resource if it is not installed

    sample_text = "The Microsoft Surface organization exists to create iconic end-to-end experiences across hardware, software, and services that people love to use every day. We believe that products are a reflection of the people who build them, and that the right tools and infrastructure can complement the talent and passion of designers and engineers to deliver innovative products. Product level simulation models are routinely used in day-to-day decision making on design, reliability, and product features. The organization is also on a multi-year journey to deliver differentiated products in a highly efficient manner. Microsoft Azure HPC plays a vital role in enabling this vision. Below is an account of how we were able to do more with less by leveraging the power of simulation and Azure HPC. "

    # Prints a list of {"verb": ..., "subject": ...} dictionaries; the exact
    # pairs depend on the tags assigned by the downloaded tagger.
    print(extract_verb_subject(sample_text))
+
+
def extract_verb_object(sentence, parser=None):
    """
    Extracts verbs and their direct objects (or passive subjects) from a text
    using dependency parsing.

    Args:
        sentence: A string representing the text to be analyzed.
        parser: Optional callable that takes the text and returns an iterable
            of tokens exposing `pos_`, `text` and `children` (whose items
            expose `dep_` and `text`), e.g. a loaded spaCy pipeline. When
            omitted, the module-level `nlp` pipeline is used; that global is
            only created in the `__main__` block, so callers importing this
            module should pass `parser` explicitly.

    Returns:
        A list of dictionaries, where each dictionary has two keys:
            - verb: The lower-cased text of the verb token.
            - subject: The lower-cased text of a child of that verb whose
              dependency label is "dobj" or "nsubjpass". The key is named
              "subject" (not "object") and downstream code relies on it.
        A verb with several matching children yields one entry per child.

    Raises:
        NameError: If `parser` is omitted and no module-level `nlp` exists.
    """
    if parser is None:
        parser = nlp  # module-level pipeline loaded when run as a script
    doc = parser(sentence)

    wanted_deps = ("dobj", "nsubjpass")  # direct object or passive subject
    verb_object_pairs = []
    for token in doc:
        if token.pos_ != "VERB":
            continue
        verb_text = token.text.lower()
        for child in token.children:
            if child.dep_ in wanted_deps:
                verb_object_pairs.append(
                    {"verb": verb_text, "subject": child.text.lower()}
                )

    return verb_object_pairs
+
+
def test_ver2():
    """
    Manual demo for extract_verb_object on an active and a passive sentence.

    Prints the extracted pairs for each sentence. extract_verb_object reads
    the module-level `nlp` pipeline, which is created in the `__main__`
    block, so that pipeline must be loaded before calling this function.
    """
    sentence1 = "I submitted a proposal that can improve future research"
    sentence2 = "The proposal was submitted by me"
    # Expected shape: [{'verb': 'submitted', 'subject': 'proposal'}, ...];
    # the first sentence may also yield an "improve"/"research" pair. The
    # exact result depends on the loaded language model.
    # Only the "dobj" and "nsubjpass" dependency labels are considered.
    for sentence in (sentence1, sentence2):
        print(extract_verb_object(sentence))
+
+
def remove_tags(html):
    """
    Strips markup from an HTML string and returns its text content.

    Args:
        html: The HTML markup to clean.

    Returns:
        The document's text fragments, each stripped of surrounding
        whitespace and joined with single spaces. The contents of <style>
        and <script> elements are excluded.
    """
    document = BeautifulSoup(html, "html.parser")

    # Drop <style>/<script> elements together with everything inside them.
    for element in document.find_all(["style", "script"]):
        element.decompose()

    text_fragments = document.stripped_strings
    return " ".join(text_fragments)
+
+
def plural_to_singular(word):
    """
    Converts a noun to its singular form using NLTK's WordNet lemmatizer.

    Requires the NLTK "wordnet" corpus to be available.

    Args:
        word: The word to be converted (assumes it's a noun).

    Returns:
        The noun lemma produced by the lemmatizer. When the lemmatizer does
        not change the word, the result equals the input; None is never
        returned.
    """
    # The lemma already equals `word` when nothing changes, so no
    # comparison against the input is needed.
    return nltk.WordNetLemmatizer().lemmatize(word, pos="n")
+
+
def verb_to_base(verb):
    """
    Converts a verb to its base form (lemma) using NLTK's WordNet lemmatizer.

    Requires the NLTK "wordnet" corpus to be available.

    Args:
        verb: The verb to be converted.

    Returns:
        The verb lemma produced by the lemmatizer. When the lemmatizer does
        not change the word, the result equals the input.
    """
    # The lemma already equals `verb` when nothing changes, so no
    # comparison against the input is needed.
    return nltk.WordNetLemmatizer().lemmatize(verb, pos="v")
+
+
def contains_non_letters(string):
    """
    Reports whether any character of a string is not alphabetic.

    Alphabetic is decided by `str.isalpha`, so accented and other Unicode
    letters count as letters; digits, whitespace and punctuation do not.

    Args:
        string: The string to be checked.

    Returns:
        True if at least one character is not a letter, False otherwise
        (including for the empty string).
    """
    return any(not character.isalpha() for character in string)
+
+
def do_extract_for_single_source(url: str, pair_lists: dict) -> dict:
    """
    Extracts verb-object pairs from a single source URL and updates a dictionary
    containing counts of those pairs.

    The feed is downloaded to "exper/raw/<last URL path segment>", parsed,
    and the content of every article is stripped of HTML before extraction.
    A pair is skipped when its "subject" word is shorter than 3 characters or
    when either word contains a non-letter character.

    Args:
        url: The URL of the source to extract data from (assumed to be XML format).
        pair_lists: A dictionary that stores counts of verb-object pairs.
                    Keys are formatted as "{verb},{singular_subject}".
                    It is modified in place.

    Returns:
        The same `pair_lists` dictionary, with counts incremented based on
        extracted verb-object pairs from the source.
    """
    # Strip trailing slashes first so a URL ending in "/" still yields a
    # non-empty file name.
    filename = "exper/raw/" + url.rstrip("/").split("/")[-1]
    download_xml(url=url, path=filename)
    xml_dict = read_xml_from_file(filename)
    article_list = xml_dict_to_article_list(
        xml_dict, ["rss", "channel", "item"], ["title", "description"]
    )

    for article in article_list.list:
        for pair in extract_verb_object(remove_tags(article.content)):
            verb, subject = pair["verb"], pair["subject"]
            # Very short words are dropped.
            if len(subject) < 3:
                continue
            # Skip pairs where either word contains a non-letter character.
            if contains_non_letters(verb) or contains_non_letters(subject):
                continue
            # Normalise both words so inflected forms share one counter.
            key = "{0},{1}".format(
                verb_to_base(verb), plural_to_singular(subject)
            )
            pair_lists[key] = pair_lists.get(key, 0) + 1
    return pair_lists
+
+
if __name__ == "__main__":
    # Validate configuration before any download or model load. An explicit
    # check is used because `assert` is stripped under `python -O`.
    url_root = os.getenv("RSSHUB_ROOT")
    if url_root is None:
        raise SystemExit("RSSHUB_ROOT environment variable must be set")

    nltk.download("wordnet")
    # Module-level global: extract_verb_object reads `nlp` by name.
    nlp = spacy.load("en_core_web_sm")  # Load the English language model

    urllist = [
        url_root + "/cncf/blog",
        "https://cloudblog.withgoogle.com/rss",
        url_root + "/uber/blog",
    ]

    # Accumulate pair counts across all feeds.
    pair_lists = {}
    for source_url in urllist:
        pair_lists = do_extract_for_single_source(source_url, pair_lists)

    # Print "verb,subject,count" lines, most frequent pair first.
    ranked_pairs = sorted(
        pair_lists.items(), key=lambda item: item[1], reverse=True
    )
    for pair_key, count in ranked_pairs:
        print("{0},{1}".format(pair_key, count))