
I am trying to clean my text data from text files and I am getting this error: TypeError: expected string or bytes-like object.

import os

filenames = os.listdir("/input")
raw_files = []

for filename in filenames:
    # read each file in the input directory
    with open(os.path.join("/input", filename)) as myfile:
        raw_files.append(myfile.read().split())

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

nltk.download('stopwords')

stopwords = stopwords.words('english')
stemmer = SnowballStemmer("english")


def clean_sentences(text):
    tokens = [sent for sent in nltk.sent_tokenize(text)].apply(str)

    sent_list = []
    for sent in tokens:
        sent_str = ''
        for i, word in enumerate(nltk.word_tokenize(sent)):
            # nltk doesn't handle apostrophes correctly
            if word[0] == "'":
                sent_str = sent_str[:-1]

            # only adds words and digits
            if re.sub('[a-zA-Z0-9]',"", str(word)):
                sent_str += str(word.lower() + ' ')
                sent_list.append(sent_str.strip()).apply(str)

    return str(sent_list)

# takes list of clean sentences and converts to list of tokens
def tokens_only(text):
    tokens = []

    for sentence in text:
        tokens.extend(sentence.split(" "))

    return tokens

# takes in text, cleans it, and returns stemmed tokens only
def lemma_tokens(text):
    tokens = tokens_only(str(clean_sentences(text)))
    return [stemmer.stem(token) for token in tokens]

all_lemma = []
all_tokens = []
all_sentences = []
all_sentences_label = []

for i, doc in enumerate(raw_files):

    # clean sentences    
    tmp_list= str(clean_sentences(doc))
    all_sentences.extend(tmp_list)
    for j in range(len(tmp_list)):
        all_sentences_label.append(filenames[i])

    # convert list of clean sentences to tokens
    tmp_list = tokens_only(tmp_list)
    all_tokens.extend(tmp_list)

    # gets root word for tokens in document
    all_lemma.extend(lemma_tokens(doc))

I am getting the traceback below:

File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\exception.py" in inner
  34.             response = get_response(request)

File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\base.py" in _get_response
  115.                 response = self.process_exception_by_middleware(e, request)

File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\base.py" in _get_response
  113.                 response = wrapped_callback(request, *callback_args, **callback_kwargs)

File "C:\Users\User\waqaf\waqaf\views.py" in output4
  572.      tmp_list= str(clean_sentences(doc))

File "C:\Users\User\waqaf\waqaf\views.py" in clean_sentences
  531.      tokens = [sent for sent in nltk.sent_tokenize(text)].apply(str)

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py" in sent_tokenize
  106.     return tokenizer.tokenize(text)

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in tokenize
  1277.         return list(self.sentences_from_text(text, realign_boundaries))

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in sentences_from_text
  1331.         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in <listcomp>
  1331.         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in span_tokenize
  1321.         for sl in slices:

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _realign_boundaries
  1362.         for sl1, sl2 in _pair_iter(slices):

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _pair_iter
  318.         prev = next(it)

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _slices_from_text
  1335.         for match in self._lang_vars.period_context_re().finditer(text):

Exception Type: TypeError at /output4
Exception Value: expected string or bytes-like object

I have seen many similar posts, but none of them solved my problem. I have already tried str() and apply(str), but it still doesn't work and I keep getting the same error.

1 Answer


Ultimately, what is passed to sent_tokenize is one of the items in raw_files, i.e. the output of myfile.read().split(), which is a list of strings, but sent_tokenize expects a single string.
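For instance (a hypothetical snippet, not from your code), calling sent_tokenize on a list reproduces exactly this error:

import nltk
nltk.download('punkt')

nltk.sent_tokenize("One sentence. Two sentences.")
# ['One sentence.', 'Two sentences.']

nltk.sent_tokenize(["One sentence.", "Two sentences."])
# TypeError: expected string or bytes-like object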

I suggest omitting .split().
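For example, the reading loop at the top could become (a minimal sketch, assuming the files live directly in /input):

import os

filenames = os.listdir("/input")
raw_files = []

for filename in filenames:
    with open(os.path.join("/input", filename)) as myfile:
        # read() returns the whole file as a single string,
        # which is what nltk.sent_tokenize expects
        raw_files.append(myfile.read())

Each item of raw_files is then one string per file, so clean_sentences(doc) hands sent_tokenize a single string instead of a list.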
