Files
smartrss/verb_extract.py

213 lines
7.5 KiB
Python

import nltk
import os
from fetch_exper_data import download_xml, read_xml_from_file, xml_dict_to_article_list
import spacy
from bs4 import BeautifulSoup
def extract_verb_subject(sentence):
    """
    Pair each verb in a sentence with the first noun or pronoun that follows it.

    The sentence is tokenized and part-of-speech tagged with NLTK, then scanned
    left to right. The most recent verb is remembered; the next noun or pronoun
    completes the pair and clears the remembered verb.

    Args:
        sentence: A string representing the sentence to be analyzed.
    Returns:
        A list of dictionaries, where each dictionary has two keys:
        - verb: The lower-cased verb token.
        - subject: The lower-cased noun or pronoun found after that verb.
    """
    tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))
    pairs = []
    pending_verb = None
    for word, tag in tagged_words:
        if tag.startswith("VB"):
            # A newer verb replaces one that has not been paired yet.
            pending_verb = word.lower()
            continue
        if pending_verb is None:
            # Nouns seen before any verb are ignored.
            continue
        if tag.startswith(("NN", "PRP")):
            pairs.append({"verb": pending_verb, "subject": word.lower()})
            pending_verb = None
    return pairs
def test_ver1():
    """Run extract_verb_subject on a sample paragraph and print the pairs found."""
    # Fetch the NLTK tagger and tokenizer data that extract_verb_subject relies on.
    for resource in ("averaged_perceptron_tagger", "punkt"):
        nltk.download(resource)
    # Example usage
    sample_text = "The Microsoft Surface organization exists to create iconic end-to-end experiences across hardware, software, and services that people love to use every day. We believe that products are a reflection of the people who build them, and that the right tools and infrastructure can complement the talent and passion of designers and engineers to deliver innovative products. Product level simulation models are routinely used in day-to-day decision making on design, reliability, and product features. The organization is also on a multi-year journey to deliver differentiated products in a highly efficient manner. Microsoft Azure HPC plays a vital role in enabling this vision. Below is an account of how we were able to do more with less by leveraging the power of simulation and Azure HPC. "
    # Prints a list of {'verb': ..., 'subject': ...} dictionaries.
    print(extract_verb_subject(sample_text))
def extract_verb_object(sentence):
    """
    Extracts verbs and their direct objects from a sentence using dependency parsing.

    Args:
        sentence: A string representing the sentence to be analyzed.
    Returns:
        A list of dictionaries, one per matching (verb, child) pair, with two keys:
        - verb: The lower-cased text of a token whose part of speech is VERB.
        - subject: The lower-cased text of a child of that verb whose dependency
          label is "dobj" (direct object) or "nsubjpass" (passive subject).
          The key is named "subject" because callers in this file read
          pair["subject"].
    """
    global nlp
    try:
        pipeline = nlp
    except NameError:
        # The script entry point normally assigns the module-level ``nlp``.
        # When this module is imported instead, load the same model on first
        # use and cache it so later calls reuse it.
        nlp = pipeline = spacy.load("en_core_web_sm")
    verb_object_pairs = []
    for token in pipeline(sentence):  # Parse the sentence
        if token.pos_ != "VERB":
            continue
        for child in token.children:
            # Keep every matching child; a verb may contribute several pairs.
            if child.dep_ in ("dobj", "nsubjpass"):
                verb_object_pairs.append(
                    {"verb": token.text.lower(), "subject": child.text.lower()}
                )
    return verb_object_pairs
def test_ver2():
    """Print the pairs extract_verb_object finds in an active and a passive sentence."""
    # Example usage
    sentence1 = "I submitted a proposal that can improve future research"
    sentence2 = "The proposal was submitted by me"
    # Active voice: the direct object is matched through the "dobj" label.
    verb_object_pairs1 = extract_verb_object(sentence1)
    print(
        verb_object_pairs1
    )  # Likely: [{'verb': 'submitted', 'subject': 'proposal'}] (may also include "improve" - research)
    # Passive voice: the passive subject is matched through the "nsubjpass" label.
    verb_object_pairs2 = extract_verb_object(sentence2)
    print(verb_object_pairs2)
    # Note: only "dobj" and "nsubjpass" children are reported. For a wider range
    # of objects, consider additional dependency labels.
def remove_tags(html):
    """
    Reduce HTML markup to its text content.

    Args:
        html: The HTML markup to clean.
    Returns:
        The remaining text strings of the parsed document, each stripped of
        surrounding whitespace and joined with single spaces. <style> and
        <script> elements are removed before the text is collected.
    """
    document = BeautifulSoup(html, "html.parser")
    # Drop style sheets and scripts so their contents do not end up in the text.
    for element in document.find_all(["style", "script"]):
        element.decompose()
    text_pieces = document.stripped_strings
    return " ".join(text_pieces)
def plural_to_singular(word):
    """
    Converts a noun to its singular form using NLTK's WordNet lemmatizer.

    Args:
        word: The word to be converted (assumes it's a noun).
    Returns:
        The lemma of the word treated as a noun. If the lemmatizer yields the
        same word, that word is returned; the function never returns None.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    # Whatever lemmatize() yields is the result; comparing it with the input
    # first would not change the returned value.
    return lemmatizer.lemmatize(word, pos="n")  # Lemmatize with noun part-of-speech
def verb_to_base(verb):
    """
    Converts a verb to its base form (lemma) using NLTK's WordNet lemmatizer.

    Args:
        verb: The verb to be converted.
    Returns:
        The lemma of the word treated as a verb. If the lemmatizer yields the
        same word, that word is returned.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    # Whatever lemmatize() yields is the result; comparing it with the input
    # first would not change the returned value.
    return lemmatizer.lemmatize(verb, pos="v")  # Lemmatize with verb part-of-speech
def contains_non_letters(string):
    """
    Report whether any character of a string is not a letter.

    Letters are whatever str.isalpha() accepts, so alphabetic characters
    outside a-z / A-Z (for example accented letters) also count as letters.

    Args:
        string: The string to be checked.
    Returns:
        True if at least one character is not alphabetic, False otherwise
        (including for an empty string).
    """
    return any(not character.isalpha() for character in string)
def do_extract_for_single_source(url: str, pair_lists: dict) -> dict:
    """
    Count the verb/object pairs found in the articles of one feed URL.

    The URL and a local path under "exper/raw/" (named after the last path
    segment of the URL) are passed to download_xml. That path is then read back
    with read_xml_from_file and converted to articles via
    xml_dict_to_article_list. Each article's content is stripped of HTML and
    fed to extract_verb_object.

    Args:
        url: The URL of the source to extract data from.
        pair_lists: A dictionary of pair counts, updated in place. Keys are
            formatted as "{verb},{subject}", using the base form of the verb
            and the singular form of the subject.
    Returns:
        The same `pair_lists` dictionary, with counts incremented for every
        pair that passed the filters.
    """
    local_path = "exper/raw/" + url.split("/")[-1]
    download_xml(url=url, path=local_path)
    articles = xml_dict_to_article_list(
        read_xml_from_file(local_path),
        ["rss", "channel", "item"],
        ["title", "description"],
    )
    for article in articles.list:
        for pair in extract_verb_object(remove_tags(article.content)):
            subject = pair["subject"]
            # Skip subjects shorter than three characters.
            if len(subject) < 3:
                continue
            verb = pair["verb"]
            # Skip pairs where either word contains a non-letter character.
            if contains_non_letters(verb) or contains_non_letters(subject):
                continue
            count_key = "{0},{1}".format(
                verb_to_base(verb), plural_to_singular(subject)
            )
            pair_lists[count_key] = pair_lists.get(count_key, 0) + 1
    return pair_lists
if __name__ == "__main__":
    # Script entry point: count verb/object pairs across a fixed list of feeds
    # and print them as "verb,subject,count" rows, most frequent first.
    url_root = os.getenv("RSSHUB_ROOT")
    # Validate explicitly before doing any downloads: an ``assert`` is stripped
    # under ``python -O``, which would let a missing value surface later as a
    # TypeError when the URLs are built.
    if url_root is None:
        raise SystemExit("RSSHUB_ROOT environment variable must be set")
    nltk.download("wordnet")
    # extract_verb_object reads this module-level name.
    nlp = spacy.load("en_core_web_sm")  # Load the English language model
    pair_lists = {}
    urllist = [
        url_root + "/cncf/blog",
        "https://cloudblog.withgoogle.com/rss",
        url_root + "/uber/blog",
    ]
    for source_url in urllist:
        pair_lists = do_extract_for_single_source(source_url, pair_lists)
    # sorted() is stable, so pairs with equal counts keep their insertion order.
    ranked_pairs = sorted(pair_lists.items(), key=lambda item: item[1], reverse=True)
    for pair_key, count in ranked_pairs:
        print("{0},{1}".format(pair_key, count))