I need to replace KEY from with VAL. Key is a regex like import.* and val is a string like "important". I know that this code is not good, because key is regex, but i couldn't find a solution that works.
#import stem dict
d = {}
with open("Stem rečnik.txt") as f:
for line in f:
key, val = line.split(":")
d[key.replace("\n","")] = val.replace("\n","")
#define tokenizer
def custom_tokenizer(text):
#split- space
tokens = nltk.tokenize.word_tokenize(text)
#stemmer
for key,val in d.items():
tokens=[token.replace(key,val) for token in tokens]
#remove special characters
tokens=[re.sub(r'[^a-zA-Z0-9]',"",token) for token in tokens]
return tokens
cv=CountVectorizer(tokenizer= custom_tokenizer,analyzer ='word',encoding='utf-8', min_df=0, max_df=1.0)
post_textCV= cv.fit_transform(post_text)
df=DataFrame(post_textCV.A, columns=cv.get_feature_names())
print(df.head)
So, the problem is this line here:
tokens=[token.replace(key,val) for token in tokens]
post_text.