I'm pretty sure I'm using yield improperly:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
from gensim import corpora, models, similarities
from collections import defaultdict
from pprint import pprint # pretty-printer
from six import iteritems
import openpyxl
import string
from operator import itemgetter
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
#Creating a stoplist from file
with open('stop-word-list.txt') as f:
    stoplist = [x.strip('\n') for x in f.readlines()]
corpusFileName = 'content_sample_en.xlsx'
corpusSheetName = 'content_sample_en'
class MyCorpus(object):
    def __iter__(self):
        wb = openpyxl.load_workbook(corpusFileName)
        sheet = wb.get_sheet_by_name(corpusSheetName)
        for i in range(1, (sheet.max_row+1)/2):
            title = str(sheet.cell(row = i, column = 4).value.encode('utf-8'))
            summary = str(sheet.cell(row = i, column = 5).value.encode('utf-8'))
            content = str(sheet.cell(row = i, column = 10).value.encode('utf-8'))
            yield reBuildDoc("{} {} {}".format(title, summary, content))
def removeUnwantedPunctuations(doc):
"change all (/, \, <, >) into ' ' "
newDoc = ""
for l in doc:
if l == "<" or l == ">" or l == "/" or l == "\\":
newDoc += " "
else:
newDoc += l
return newDoc
def reBuildDoc(doc):
"""
:param doc:
:return: document after being dissected to our needs.
"""
doc = removeUnwantedPunctuations(doc).lower().translate(None, string.punctuation)
newDoc = [word for word in doc.split() if word not in stoplist]
return newDoc
corpus = MyCorpus()
tfidf = models.TfidfModel(corpus, normalize=True)
In the example above you can see me trying to create a corpus from an xlsx file. I'm reading three cells from each row of the xlsx file (title, summary and content) and joining them into one big string. My reBuildDoc() and removeUnwantedPunctuations() functions then adjust the text to my needs and in the end return a big list of words (for example: [hello, piano, computer, ...]). In the end I yield the result, but I get the following error:
Traceback (most recent call last):
File "C:/Users/Eran/PycharmProjects/tfidf/docproc.py", line 101, in <module>
tfidf = models.TfidfModel(corpus, normalize=True)
File "C:\Anaconda2\lib\site-packages\gensim-0.13.1-py2.7-win-amd64.egg\gensim\models\tfidfmodel.py", line 96, in __init__
self.initialize(corpus)
File "C:\Anaconda2\lib\site-packages\gensim-0.13.1-py2.7-win-amd64.egg\gensim\models\tfidfmodel.py", line 119, in initialize
for termid, _ in bow:
ValueError: too many values to unpack
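For reference: TfidfModel iterates each yielded document as (term_id, frequency) pairs (the traceback shows the loop for termid, _ in bow), so yielding a plain list of word strings makes that two-way unpacking fail. A tiny illustration of the shape difference, with made-up ids:

# Shape gensim can unpack: a bag-of-words list of (token_id, count) tuples
bow_doc = [(0, 1), (3, 2), (7, 1)]
for termid, count in bow_doc:
    print termid, count

# Shape MyCorpus currently yields: plain strings, which cannot be
# unpacked into (termid, count) -> "too many values to unpack"
word_doc = ['hello', 'piano', 'computer']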
I know the error is from the yield line because I had a different yield line that worked. It looked like this:
yield [word for word in dictionary.doc2bow("{} {} {}".format(title, summary, content).lower().translate(None, string.punctuation).split()) if word not in stoplist]
It was a bit messy and hard to add functionality to, so I changed it as you can see in the first example.
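For comparison, here is a rough sketch of how the refactored iterator could keep that doc2bow conversion while still using reBuildDoc. It assumes a gensim dictionary has already been built over the same corpus; the dictionary variable and the MyBowCorpus name below are assumptions for illustration, not something defined in the code above:

# Assumed to exist already, e.g. built in a first pass over the corpus:
# dictionary = corpora.Dictionary(reBuildDoc(text) for text in all_texts)

class MyBowCorpus(object):
    def __iter__(self):
        wb = openpyxl.load_workbook(corpusFileName)
        sheet = wb.get_sheet_by_name(corpusSheetName)
        for i in range(1, (sheet.max_row+1)/2):
            title = str(sheet.cell(row = i, column = 4).value.encode('utf-8'))
            summary = str(sheet.cell(row = i, column = 5).value.encode('utf-8'))
            content = str(sheet.cell(row = i, column = 10).value.encode('utf-8'))
            words = reBuildDoc("{} {} {}".format(title, summary, content))
            # doc2bow turns the cleaned word list into (token_id, count) pairs,
            # which is the shape TfidfModel expects
            yield dictionary.doc2bow(words)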
removeUnwantedPunctuations is implemented incredibly inefficiently, particularly since you're performing a translate call on the result anyway. Just run the following at the top level of your code:

unwanted_to_space, deletepunc = string.maketrans(r'\/<>', '    '), string.punctuation.translate(None, r'\/<>')

then change

doc = removeUnwantedPunctuations(doc).lower().translate(None, string.punctuation)

to

doc = doc.translate(unwanted_to_space, deletepunc).lower()

In simple tests, that reduces run time by a factor of ~10-15x (higher end for longer/less punctuated strings). You can go further and replace unwanted_to_space with

string.maketrans(r'\/<>' + string.ascii_uppercase, '    ' + string.ascii_lowercase)

(if it's not visible, there should be four spaces at the start of the second argument), which allows you to omit the call to lower() (if the input were non-ASCII you'd want a true lower(), but for ASCII str, folding the case change into the translate call is equivalent and free), getting a >20x saving on runtime. Obviously, if your inputs are small the savings don't matter, but for big data, parsing the input could be a big cost.
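Putting that suggestion together, a minimal Python 2 sketch of reBuildDoc rewritten around the two translation tables (the table names follow the comment above; the assembled function is an illustration, not a tested drop-in replacement):

import string

# map \, /, < and > to spaces and fold ASCII upper case to lower in one pass
unwanted_to_space = string.maketrans(r'\/<>' + string.ascii_uppercase,
                                     '    ' + string.ascii_lowercase)
# delete every punctuation character except the four mapped to spaces above
deletepunc = string.punctuation.translate(None, r'\/<>')

def reBuildDoc(doc):
    """Clean and tokenize with a single translate call; removeUnwantedPunctuations
    and the separate lower() call are no longer needed."""
    cleaned = doc.translate(unwanted_to_space, deletepunc)
    return [word for word in cleaned.split() if word not in stoplist]

Like the original, this assumes byte strings (the cell values are encoded to UTF-8 str before reaching reBuildDoc).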