diff --git a/content/text_preprocessing.py b/content/text_preprocessing.py deleted file mode 100644 index cdfa3ad0..00000000 --- a/content/text_preprocessing.py +++ /dev/null @@ -1,188 +0,0 @@ -import pandas as pd -import argparse -import numpy as np -import re # (https://docs.python.org/3/library/re.html) for tokenising textual data -import string # (https://docs.python.org/3/library/string.html) for string operations - -# Creating the random instance -rng = np.random.default_rng() - -class TextPreprocess: - """Text Preprocessing for a Natural Language Processing model.""" - - - def cleantext(self, df, text_column, remove_stopwords = True, remove_punc = True): - """Function to clean text data by removing stopwords, tags and punctuation. - - Parameters - ---------- - df : pandas dataframe - The dataframe housing the input data. - text_column : str - Column in dataframe whose text is to be cleaned. - remove_stopwords : bool - if True, remove stopwords from text - remove_punc : bool - if True, remove punctuation suymbols from text - - Returns - ------- - Numpy array - Cleaned text. - - """ - data = df - # converting all characters to lowercase - data[text_column] = data[text_column].str.lower() - - # List of common stopwords taken from https://gist.github.com/sebleier/554280 - stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", - "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", - "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", - "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", - "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", - "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", - "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", - "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", - "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", - "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", - "your", "yours", "yourself", "yourselves" ] - - def remove_stopwords(data, column): - data[f'{column} without stopwords'] = data[column].apply(lambda x : ' '.join([word for word in x.split() if word not in (stopwords)])) - return data - - def remove_tags(string): - result = re.sub('<*>','',string) - return result - - # remove html tags and brackets from text - if remove_stopwords: - data_without_stopwords = remove_stopwords(data, text_column) - data_without_stopwords[f'clean_{text_column}']= data_without_stopwords[f'{text_column} without stopwords'].apply(lambda cw : remove_tags(cw)) - if remove_punc: - data_without_stopwords[f'clean_{text_column}'] = data_without_stopwords[f'clean_{text_column}'].str.replace('[{}]'.format(string.punctuation), ' ', regex = True) - - X = data_without_stopwords[f'clean_{text_column}'].to_numpy() - - return X - - def split_data (self, X, y, split_percentile): - """Function to split data into training and testing data. - - Parameters - ---------- - X : Numpy Array - Contains textual data. - y : Numpy Array - Contains target data. - split_percentile : int - Proportion of training to testing data. - - - Returns - ------- - Tuple - Contains numpy arrays of test and training data. - - """ - y = np.array(list(map(lambda x: 1 if x=="positive" else 0, y))) - arr_rand = rng.random(X.shape[0]) - split = arr_rand < np.percentile(arr_rand, split_percentile) - X_train = X[split] - y_train = y[split] - X_test = X[~split] - y_test = y[~split] - - return (X_train, y_train, X_test, y_test) - - - def sent_tokeniser (self, x): - """Function to split text into sentences. - - Parameters - ---------- - x : str - piece of text - - Returns - ------- - list - sentences with punctuation removed. - - """ - sentences = re.split(r'(?