# smartrss/verb_extract.py

import nltk
import os
from fetch_exper_data import download_xml, read_xml_from_file, xml_dict_to_article_list
import spacy
from bs4 import BeautifulSoup


def extract_verb_subject(sentence):
    """
    Pair each verb in a text with the first noun or pronoun that follows it.

    The text is tokenized and POS-tagged with NLTK. The scan is purely
    positional: the most recent verb is remembered and paired with the next
    noun/pronoun token, so the "subject" reported here is simply the nominal
    that comes after the verb, not a grammatically resolved subject.

    Args:
        sentence: A string containing the text to analyze.

    Returns:
        A list of dictionaries, each with two keys:
            - verb: The lower-cased verb token.
            - subject: The lower-cased noun or pronoun found after that verb.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

    pairs = []
    pending_verb = None  # most recent verb still waiting for a noun/pronoun
    for word, tag in tagged_tokens:
        if tag.startswith("VB"):
            # A newer verb replaces any verb that has not been paired yet.
            pending_verb = word.lower()
            continue
        if pending_verb is None:
            continue
        if tag.startswith(("NN", "PRP")):
            pairs.append({"verb": pending_verb, "subject": word.lower()})
            pending_verb = None

    return pairs


def test_ver1():
    """Run the NLTK-based extractor on a sample paragraph and print the pairs."""
    # Fetch the POS tagger model and the "punkt" tokenizer data before
    # running the extractor.
    for resource in ("averaged_perceptron_tagger", "punkt"):
        nltk.download(resource)

    sample_text = "The Microsoft Surface organization exists to create iconic end-to-end experiences across hardware, software, and services that people love to use every day. We believe that products are a reflection of the people who build them, and that the right tools and infrastructure can complement the talent and passion of designers and engineers to deliver innovative products. Product level simulation models are routinely used in day-to-day decision making on design, reliability, and product features. The organization is also on a multi-year journey to deliver differentiated products in a highly efficient manner. Microsoft Azure HPC plays a vital role in enabling this vision. Below is an account of how we were able to do more with less by leveraging the power of simulation and Azure HPC. "

    # Prints a list of {"verb": ..., "subject": ...} dictionaries.
    print(extract_verb_subject(sample_text))


def _get_nlp():
    """Return the shared spaCy pipeline, loading it on first use."""
    global nlp
    try:
        return nlp
    except NameError:
        # `nlp` is only assigned when this file runs as a script; load the
        # same model here so the extractor also works when imported.
        nlp = spacy.load("en_core_web_sm")
        return nlp


def extract_verb_object(sentence):
    """
    Extracts verbs and their direct objects from a sentence using dependency parsing.

    Args:
        sentence: A string representing the sentence to be analyzed.

    Returns:
        A list of dictionaries, where each dictionary has two keys:
            - verb: The lower-cased text of a token tagged as a verb.
            - subject: The lower-cased text of a child of that verb whose
              dependency label is "dobj" (direct object) or "nsubjpass"
              (passive subject). The key is named "subject" even though it
              usually holds the object; callers in this file read
              pair["subject"], so the name is kept.
    """
    doc = _get_nlp()(sentence)  # Parse the sentence

    target_deps = ("dobj", "nsubjpass")
    verb_object_pairs = []
    for token in doc:
        if token.pos_ != "VERB":
            continue
        # A verb may have several matching children; every one is recorded.
        for child in token.children:
            if child.dep_ in target_deps:
                verb_object_pairs.append(
                    {"verb": token.text.lower(), "subject": child.text.lower()}
                )

    return verb_object_pairs


def test_ver2():
    """Run the dependency-based extractor on an active and a passive example."""
    examples = (
        "I submitted a proposal that can improve future research",
        "The proposal was submitted by me",
    )
    # The active sentence is expected to yield something like
    # [{'verb': 'submitted', 'subject': 'proposal'}] (possibly also
    # "improve"/"research"); the passive one exercises the "nsubjpass" branch.
    # Exact output depends on the loaded spaCy model.
    for sentence in examples:
        print(extract_verb_object(sentence))
    # Note: Only direct objects and passive subjects are matched. For a wider
    # range of objects, consider additional dependency labels.


def remove_tags(html):
    """
    Strip markup from an HTML string and return its visible text.

    Args:
        html: The HTML content to clean.

    Returns:
        The remaining text fragments, each stripped of surrounding
        whitespace and joined with single spaces. The contents of
        <style> and <script> elements are dropped.
    """
    parsed = BeautifulSoup(html, "html.parser")
    # Drop <style>/<script> elements entirely so their contents never
    # show up in the extracted text.
    for node in parsed.find_all(["style", "script"]):
        node.decompose()
    text_fragments = parsed.stripped_strings
    return " ".join(text_fragments)


def plural_to_singular(word):
    """
    Lemmatize a word as a noun using NLTK's WordNet lemmatizer.

    Args:
        word: The word to be converted (treated as a noun).

    Returns:
        The noun lemma produced by the lemmatizer. When the lemmatizer
        leaves the word untouched, the same word comes back.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    return lemmatizer.lemmatize(word, pos="n")


def verb_to_base(verb):
    """
    Lemmatize a word as a verb using NLTK's WordNet lemmatizer.

    Args:
        verb: The verb to be converted.

    Returns:
        The verb lemma produced by the lemmatizer. When the lemmatizer
        leaves the word untouched, the same word comes back.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    return lemmatizer.lemmatize(verb, pos="v")


def contains_non_letters(string):
    """
    Report whether any character of a string is not alphabetic.

    Each character is tested with str.isalpha(), so digits, whitespace,
    punctuation and hyphens all count as non-letters.

    Args:
        string: The string to be checked.

    Returns:
        True if at least one character fails str.isalpha(), False otherwise
        (including for the empty string).
    """
    return any(not character.isalpha() for character in string)


def do_extract_for_single_source(url: str, pair_lists: dict) -> dict:
    """
    Count the verb-object pairs found in the articles of one feed URL.

    The feed is saved under "exper/raw/" using the last path segment of the
    URL as the file name, read back, and turned into an article list built
    from the "title" and "description" fields of each rss/channel/item entry.
    Every article's content is stripped of HTML and run through
    extract_verb_object.

    Args:
        url: The URL of the source to extract data from (assumed to be XML format).
        pair_lists: A dictionary mapping "{verb},{singular_subject}" keys to
                    counts. It is updated in place.

    Returns:
        The same `pair_lists` dictionary, with counts incremented for every
        pair that passed the filters.
    """
    local_path = "exper/raw/" + url.split("/")[-1]
    download_xml(url=url, path=local_path)
    articles = xml_dict_to_article_list(
        read_xml_from_file(local_path),
        ["rss", "channel", "item"],
        ["title", "description"],
    )

    for article in articles.list:
        plain_text = remove_tags(article.content)
        for pair in extract_verb_object(plain_text):
            verb, subject = pair["verb"], pair["subject"]
            # Skip very short subjects and anything that is not purely
            # alphabetic (numbers, hyphenated words, punctuation, ...).
            if (
                len(subject) < 3
                or contains_non_letters(verb)
                or contains_non_letters(subject)
            ):
                continue
            # Normalize both words so inflected variants share one counter.
            count_key = "{0},{1}".format(
                verb_to_base(verb), plural_to_singular(subject)
            )
            pair_lists[count_key] = pair_lists.get(count_key, 0) + 1

    return pair_lists


if __name__ == "__main__":
    # Script entry point: count verb-object pairs across a fixed set of feeds
    # and print them as "verb,subject,count" lines, most frequent first.
    nltk.download("wordnet")
    nlp = spacy.load("en_core_web_sm")  # Load the English language model
    pair_lists = {}

    url_root = os.getenv("RSSHUB_ROOT")
    if url_root is None:
        # Explicit check instead of `assert`, which is stripped under `-O`.
        raise SystemExit("RSSHUB_ROOT environment variable must be set")
    urllist = [
        url_root + "/cncf/blog",
        "https://cloudblog.withgoogle.com/rss",
        url_root + "/uber/blog",
    ]

    for feed_url in urllist:
        pair_lists = do_extract_for_single_source(feed_url, pair_lists)

    # sorted() is stable, so pairs with equal counts keep insertion order.
    ranked_pairs = sorted(
        pair_lists.items(), key=lambda item: item[1], reverse=True
    )
    for pair_key, count in ranked_pairs:
        print("{0},{1}".format(pair_key, count))