I recently developed a Python program that makes an inverted index out of terms in a certain document. I now want to create position postings, such as
to, 993427:
⟨ 1, 6: ⟨7, 18, 33, 72, 86, 231⟩;
2, 5: ⟨1, 17, 74, 222, 255⟩; 4, 5: ⟨8, 16, 190, 429, 433⟩; 5, 2: ⟨363, 367⟩;
7, 3: ⟨13, 23, 191⟩; …⟩
I know the code is not complete as described above, I'm just trying to implement functionality.
from pprint import pprint as pp
from collections import Counter
import pprint
import re
import sys
import string
import fileinput
try:
reduce
except:
from functools import reduce
try:
raw_input
except:
raw_input = input
def readIn(fileglob): #Reads in multiple files and strips punctation/uppercase.
texts, words = {}, set()
for txtfile in (fileglob):
with open(txtfile, 'r') as splitWords:
txt = splitWords.read().lower().split()
txt = str(txt)
txt = re.findall(r'\w+', txt)
words |= set(txt)
texts[txtfile.split('\\')[-1]] = txt
return texts, words
def search(indexes): # Inverted index, based off the book and the web.
return reduce(set.intersection,
(index[word] for word in indexes),
set(texts.keys()))
def getWordBins(posOfWords):
cnt = Counter()
for word in posOfWords:
cnt[posOfWords] += 1
return cnt
def main(fileList, topWords):
tempArray = []
for x in range(1,len(fileList)):
tempArray.append(fileList[x])
texts, words = readIn(tempArray)
index = {word:set(txt
for txt, wrds in texts.items() if word in wrds)
for word in words}
test =({k + " " + str(len(v)) + " " + str(sorted(v)) for k,v in index.items()})
txt = readIn(fileList)
posWord = getWordBins(txt)
for key, value in posWord.most_common(topWords):
print key, value
#Writes out the information requested to a ".idx" file.
doc = open("document.idx", "w")
doc.write("# INPUT DOCUMENT REFERENCE LEGEND\n")
for fileNumber in range(1, len(fileList)):
doc.write(str(fileNumber) + "\t" + fileList[fileNumber] + "\n")
doc.write("# INVERTED INDEX RESULTS\n")
tempTest = []
for x in test:
tempTest.append(x.split(" "))
for x in tempTest:
tempStr = ""
for y in x:
tempStr += y + "\t"
doc.write(tempStr + "\n")
doc.close
main(sys.argv, sys.argv)
This is what I have so far, the only new functionality is the getWordBins function, and the loop:
txt = readIn(fileList)
posWord = getWordBins(txt)
for key, value in posWord.most_common(topWords):
print key, value
Now, what happens when I try to run the code is this:
Traceback (most recent call last):
File "Intro3.py", line 82, in <module>
main(sys.argv, sys.argv)
File "Intro3.py", line 60, in main
posWord = getWordBins(txt)
File "Intro3.py", line 41, in getWordBins
cnt[posOfWords] += 1
TypeError: unhashable type: 'dict'
Any guidance with this troubling error is gladly received. It is not a dictionary, so why the error? Thanks for your time!
dictionarycannot be hashed and therefore cannot be used as akeyto anotherdictionary(or itself for that matter!).posOfWords? Type? Also, even asetis not hashable, because asetis mutable type. And, whatever is a mutable type, cannot be hashed, hence cannot be stored in a dictionary.set()--frozenset()-- is hashable.