Counting words¶
Because what's a parallel computing demo without counting words?
In [5]:
from __future__ import print_function
Some utilities for excluding common words and normalizing words
In [6]:
import re
# Matches runs of non-word characters and digits (Unicode-aware).
non_word = re.compile(r'[\W\d]+', re.UNICODE)

def normalize_word(word):
    """Return *word* lowercased with non-word characters and digits stripped."""
    return non_word.sub('', word.lower())
# Stop words: ngrams made up mostly of these are not counted.
common_words = set(
    "the of and in to a is it that which as on by "
    "be this with are from will at you not for no have "
    "i or if his its they but their one all he when "
    "than so these them may see other was has an there "
    "more we footnote who had been she do what "
    "her him my me would could said am were very "
    "your did not".split()
)
def yield_words(filename):
    """Yield each normalized, non-empty word from *filename*.

    The file is read as latin-1 text; every whitespace-separated token
    is run through normalize_word() and empty results are skipped.
    """
    # Local import keeps the function self-contained when pushed to engines.
    import io
    with io.open(filename, encoding='latin-1') as handle:
        for line in handle:
            for token in line.split():
                cleaned = normalize_word(token)
                if cleaned:
                    yield cleaned
A function that reads a file and returns a dictionary whose string keys
are phrases of n words and
whose values are the number of times each phrase occurs.
In [7]:
def ngrams(filename, n=1):
    """Compute ngram counts for the contents of a file.

    Returns a dict mapping space-joined ngram strings to their counts.
    Ngrams in which more than half the words are common words are skipped.
    """
    words = yield_words(filename)
    counts = {}

    def _tally(window):
        # Skip windows where more than half the words are common words.
        n_common = sum(w in common_words for w in window)
        if n_common > n / 2.0:
            return
        key = ' '.join(window)
        counts[key] = counts.get(key, 0) + 1

    # Fill the initial window with the first n words.
    window = []
    while len(window) < n:
        try:
            w = next(words)
        except StopIteration:
            # Fewer than n words in the file: return what we have.
            return counts
        if w:
            window.append(w)
    _tally(window)

    # Slide the window one word at a time across the rest of the file.
    for w in words:
        window.append(w)
        window.pop(0)
        _tally(window)
    return counts
In [8]:
%%writefile cathat.txt
the cat in the hat is a cat whose hat is big.
Overwriting cathat.txt
In [9]:
ngrams('cathat.txt', 1)
Out[9]:
{u'big': 1, u'cat': 2, u'hat': 2, u'whose': 1}
In [10]:
ngrams('cathat.txt', 2)
Out[10]:
{u'a cat': 1,
u'cat in': 1,
u'cat whose': 1,
u'hat is': 2,
u'is big': 1,
u'the cat': 1,
u'the hat': 1,
u'whose hat': 1}
Now fetch some interesting data from Project Gutenberg:
In [11]:
# `os` must be imported here: the cell that imports it later in the
# notebook runs after this one on a fresh "Restart & Run All".
import os

try:
    from urllib.request import urlretrieve  # py3
except ImportError:
    from urllib import urlretrieve  # py2

davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt"

if not os.path.exists('davinci.txt'):
    # download from project gutenberg
    print("Downloading Da Vinci's notebooks from Project Gutenberg")
    urlretrieve(davinci_url, 'davinci.txt')
In [12]:
import sys

def print_common(freqs, n=10):
    """Print the n most common keys in *freqs* by count.

    freqs - dict mapping word -> count
    n     - how many of the top entries to print (default 10)
    """
    # Build (count, word) pairs and sort descending.  zip() returns an
    # iterator on Python 3 (no .sort method), so sorted() is required.
    items = sorted(zip(freqs.values(), freqs.keys()), reverse=True)
    top = items[:n]
    # Right-justify every word to the width of the longest one printed.
    justify = 0
    for (count, word) in top:
        justify = max(justify, len(word))
    for (count, word) in top:
        print(word.rjust(justify), count)
    sys.stdout.flush()
In [13]:
# Run the serial version
print("Serial word frequency count:")
%time counts = ngrams('davinci.txt', 1)
print_common(counts, 10)
Serial word frequency count: CPU times: user 1.61 s, sys: 16.8 ms, total: 1.63 s
Wall time: 1.64 s
light 852
eye 591
same 536
shadow 507
body 456
between 446
water 425
seen 415
leonardo 414
into 402
Let's split the file
In [15]:
# split the davinci.txt into one file per engine:
text = open('davinci.txt').read()
lines = text.splitlines()
nlines = len(lines)
n = 10

block = nlines // n
for i in range(n):
    # Give the final chunk any remainder lines; the original slice
    # silently dropped the last `nlines % n` lines of the text.
    end = (i + 1) * block if i < n - 1 else nlines
    chunk = lines[i * block:end]
    with open('davinci%i.txt' % i, 'w') as f:
        f.write('\n'.join(chunk))
In [17]:
import os
# Absolute paths of the per-engine chunk files written by the split cell.
# NOTE(review): relies on `n` (number of chunks) from the previous cell.
cwd = os.path.abspath(os.getcwd())
fnames = [ os.path.join(cwd, 'davinci%i.txt' % i) for i in range(n)]
In [16]:
# Connect to the running IPython parallel cluster.
from IPython import parallel
rc = parallel.Client()
# A load-balanced view schedules each task to whichever engine is free.
view = rc.load_balanced_view()
# A direct view on all engines, used to push shared state everywhere.
eall = rc[:]
# Push the word-counting helpers so they are defined on every engine.
eall.push(dict(
non_word=non_word,
yield_words=yield_words,
common_words=common_words,
normalize_word=normalize_word,
))
Out[29]:
<AsyncResult: _push>
Exercise: parallel ngrams¶
Write a version of ngrams that runs in parallel, rejoining the results into a single count dict.
In [ ]:
def ngrams_parallel(view, fnames, n=1):
    """Compute ngrams in parallel, merging the per-file results.

    view - An IPython View used to distribute the work.
    fnames - The filenames containing the split data.
    n - The ngram length (default 1).

    Returns a single dict mapping ngram strings to total counts
    across all files.
    """
    # One ngrams(fname, n) task per file, scheduled across the engines.
    results = view.map_async(ngrams, fnames, [n] * len(fnames))
    merged = {}
    for counts in results:
        for gram, count in counts.items():
            merged[gram] = merged.get(gram, 0) + count
    return merged
In [22]:
%load ../soln/ngrams.py
In [30]:
print("Parallel ngrams")
%time pcounts = ngrams_parallel(view, fnames, 3)
print_common(pcounts, 10)
Parallel ngrams CPU times: user 403 ms, sys: 80.3 ms, total: 483 ms
Wall time: 1.36 s light and shade
98
the same way 44
the luminous body 33
between the eye 31
the space between 29
pen and ink 29
leonardo da vinci 28
the solar rays 27
the right hand 27
space between the 27
A bit more data¶
Download some Project Gutenberg samples from nltk (to avoid rate-limiting on PG itself)
In [25]:
import os
import zipfile

gutenberg_samples = 'http://nltk.github.com/nltk_data/packages/corpora/gutenberg.zip'

if not os.path.isdir('gutenberg'):
    if not os.path.exists('gutenberg.zip'):
        urlretrieve(gutenberg_samples, 'gutenberg.zip')
    # Extract with the stdlib instead of `!unzip` so the cell does not
    # depend on an external unzip binary being on PATH.
    with zipfile.ZipFile('gutenberg.zip') as zf:
        zf.extractall()

import glob
gutenberg_files = glob.glob(os.path.abspath(os.path.join('gutenberg', '*.txt')))
# remove the bible, because it's too big relative to the rest
gutenberg_files.remove(os.path.abspath(os.path.join('gutenberg', 'bible-kjv.txt')))
In [26]:
ls gutenberg
README austen-sense.txt bryant-stories.txt chesterton-ball.txt edgeworth-parents.txt shakespeare-caesar.txt whitman-leaves.txt austen-emma.txt bible-kjv.txt burgess-busterbrown.txt chesterton-brown.txt melville-moby_dick.txt shakespeare-hamlet.txt austen-persuasion.txt blake-poems.txt carroll-alice.txt chesterton-thursday.txt milton-paradise.txt shakespeare-macbeth.txt
In [31]:
print("Parallel ngrams across several books")
%time pcounts = ngrams_parallel(view, gutenberg_files, 3)
print()
print_common(pcounts, 10)
pcounts = ngrams_parallel(view, gutenberg_files, 4)
print()
print_common(pcounts, 10)
Parallel ngrams across several books CPU times: user 1.55 s, sys: 229 ms, total: 1.78 s
Wall time: 5.69 s
a great deal
175
i dare say 107
farmer browns boy 88
the sperm whale 86
the same time 84
i dont know 76
two or three 74
a few minutes 73
the white whale 71
mr and mrs 71
at the same time
76
a great deal of 66
for the first time 48
in a low voice 36
i should like to 36
out of the room 34
of the sperm whale 31
much obliged to you 29
i beg your pardon 28
at the same moment 26