I have a function that checks for co-occurring words in a string of text. I would like to use this function on a pandas DataFrame so that I can check for co-occurrences within the sentences of different documents. Unfortunately, the way I pass the function to the DataFrame does not seem to work properly.
The following code checks whether any word in list 'bag1' occurs near ('dist = 4') a word from list 'bag2' in the string called 'sentence'. If there is a co-occurrence, the code prints True. This code works fine.
import re
import itertools
from nltk.tokenize import word_tokenize

sentence = "The plant is growing at a rapid rate. But the beans are growing slowly."
sentence = re.sub('[^A-Za-z0-9]+', ' ', sentence).lstrip().lower()
words = word_tokenize(sentence)

bag1 = ["plant", "beans", "banana", "apple"]
bag2 = ["growing", "fast", "fruit"]
dist = 4

def get_distance(lst1, lst2, dist):
    lst1 = [i for i in lst1 if i.lower() in words]
    lst2 = [i for i in lst2 if i.lower() in words]
    combinations = list(itertools.product(lst1, lst2))
    for w1, w2 in combinations:
        if w1 in words and w2 in words:
            w1_indexes = [index for index, value in enumerate(words) if value == w1]
            w2_indexes = [index for index, value in enumerate(words) if value == w2]
            distances = [abs(item[0] - item[1]) for item in itertools.product(w1_indexes, w2_indexes)]
            if min(distances) <= dist:
                print(True)
                break
        else:
            print(False)

def main():
    get_distance(bag1, bag2, dist)

main()
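
For reference, this is what the cleaned and tokenized sentence looks like: 'plant' sits at index 1 and the first 'growing' at index 3, two positions apart, which is why the script prints True.

import re
from nltk.tokenize import word_tokenize

sentence = "The plant is growing at a rapid rate. But the beans are growing slowly."
sentence = re.sub('[^A-Za-z0-9]+', ' ', sentence).lstrip().lower()
words = word_tokenize(sentence)
print(words)
# ['the', 'plant', 'is', 'growing', 'at', 'a', 'rapid', 'rate',
#  'but', 'the', 'beans', 'are', 'growing', 'slowly']
print(words.index('plant'), words.index('growing'))  # 1 3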
Here I am applying the 'get_distance' function row by row to check whether any word from list 'bag1' occurs near a word from list 'bag2' in the individual sentences of each document. If there is a co-occurrence, I would like to mark the 'Match?' column as True (see the expected output after the code). Unfortunately, this code does not work properly. The problems seem to start at 'lst1 = [i for i in lst1 if i.lower() in row.tokens]', which somehow produces an empty list. Even if I remove that line, there still seems to be some bug in the remaining code (see the descriptions below).
import pandas as pd
import re
import itertools
import nltk
from nltk.tokenize import word_tokenize

dataset = {
    "document": ["doc1", "doc2"],
    "text": ["The plant is growing at a rapid rate. But the beans are growing slowly.",
             "The beans are are growing fast in the region."],
}
df = pd.DataFrame(dataset)

bag1 = ["plant", "beans", "banana", "apple"]
bag2 = ["growing", "fast", "fruit"]
dist = 4

def clean_text():
    df['text'] = df.text.str.lower()
    df['text'] = df.text.str.replace('\ufeff', '')
    df['text'] = df.text.str.strip()

def split_sentences():
    global df
    df["sentences"] = df["text"].apply(nltk.sent_tokenize)
    df = df.explode('sentences')

def tokenize_words():
    df['words_in_text'] = df['sentences'].apply(word_tokenize)

def get_distance(row, lst1, lst2, dist):
    row.tokens = df["words_in_text"]
    # It seems that the two lines below generate empty lists. The rest of the
    # code does not work even if I remove these two lines.
    lst1 = [i for i in lst1 if i.lower() in row.tokens]
    lst2 = [i for i in lst2 if i.lower() in row.tokens]
    combinations = list(itertools.product(lst1, lst2))
    for w1, w2 in combinations:
        if w1 in row.tokens and w2 in row.tokens:
            w1_indexes = [index for index, value in enumerate(row.tokens) if value == w1]
            w2_indexes = [index for index, value in enumerate(row.tokens) if value == w2]
            distances = [abs(item[0] - item[1]) for item in itertools.product(w1_indexes, w2_indexes)]
            if min(distances) <= dist:
                return True
                break
        else:
            return False

# Despite get_distance having return statements, the values in the 'Match?' column are all None
def main():
    clean_text()
    split_sentences()
    tokenize_words()
    df['Match?'] = df.apply(get_distance, args=(bag1, bag2, dist), axis=1)
    display(df)

if __name__ == "__main__":
    main()
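
For clarity, this is roughly the output I expect (only the relevant columns shown). Every sentence in the sample data contains a bag1 word within four tokens of a bag2 word, so each row should be True:

  document                                       sentences  Match?
0     doc1          the plant is growing at a rapid rate.    True
0     doc1               but the beans are growing slowly.    True
1     doc2   the beans are are growing fast in the region.    True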
Clearly, I am doing something wrong when using the 'get_distance' function with a pandas DataFrame. Please let me know if you can spot the mistakes. Thank you.
Update: […] get_distance() function, so I imagine df['Match?'] won't have anything in it. get_distance is still not returning a True or False value; instead, the Match? column shows the value None.
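
In case it helps, here is a minimal probe I can run in place of main() (the name 'probe' is mine, purely for debugging) to see what each row actually contains inside df.apply(..., axis=1):

def probe(row):
    # With axis=1, apply passes each row as a Series indexed by the column labels
    print(type(row))            # <class 'pandas.core.series.Series'>
    print(row.index.tolist())   # ['document', 'text', 'sentences', 'words_in_text']
    print(row['words_in_text']) # the token list for this row's sentence

clean_text()
split_sentences()
tokenize_words()
df.apply(probe, axis=1)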