Counting words¶
Because what's a parallel computing demo without counting words?
In [5]:
from __future__ import print_function
Some utilities for excluding common words and normalizing words
In [6]:
import re
# Matches runs of non-word characters and digits (Unicode-aware).
non_word = re.compile(r'[\W\d]+', re.UNICODE)

def normalize_word(word):
    """Return *word* lowercased with non-word characters and digits stripped."""
    return non_word.sub('', word.lower())
# Stop words: ngrams made up mostly of these are not counted.
common_words = set(
    "the of and in to a is it that which as on by "
    "be this with are from will at you not for no have "
    "i or if his its they but their one all he when "
    "than so these them may see other was has an there "
    "more we footnote who had been she do what "
    "her him my me would could said am were very "
    "your did not".split()
)
def yield_words(filename):
    """Yield each normalized, non-empty word from *filename*.

    The file is read as latin-1 text; every whitespace-separated token
    is run through normalize_word() and empty results are skipped.
    """
    # Local import keeps the function self-contained when pushed to engines.
    import io
    with io.open(filename, encoding='latin-1') as handle:
        for line in handle:
            for token in line.split():
                cleaned = normalize_word(token)
                if cleaned:
                    yield cleaned
A function that reads a file and returns a dictionary whose string keys
are phrases of n words and
whose values are the number of times each phrase occurs.
In [7]:
def ngrams(filename, n=1):
    """Compute ngram counts for the contents of a file.

    Returns a dict mapping space-joined ngram strings to their counts.
    Ngrams in which more than half the words are common words are skipped.
    """
    words = yield_words(filename)
    counts = {}

    def _tally(window):
        # Skip windows where more than half the words are common words.
        n_common = sum(w in common_words for w in window)
        if n_common > n / 2.0:
            return
        key = ' '.join(window)
        counts[key] = counts.get(key, 0) + 1

    # Fill the initial window with the first n words.
    window = []
    while len(window) < n:
        try:
            w = next(words)
        except StopIteration:
            # Fewer than n words in the file: return what we have.
            return counts
        if w:
            window.append(w)
    _tally(window)

    # Slide the window one word at a time across the rest of the file.
    for w in words:
        window.append(w)
        window.pop(0)
        _tally(window)
    return counts
In [8]:
%%writefile cathat.txt
the cat in the hat is a cat whose hat is big.
Overwriting cathat.txt
In [9]:
ngrams('cathat.txt', 1)
Out[9]:
{u'big': 1, u'cat': 2, u'hat': 2, u'whose': 1}
In [10]:
ngrams('cathat.txt', 2)
Out[10]:
{u'a cat': 1,
u'cat in': 1,
u'cat whose': 1,
u'hat is': 2,
u'is big': 1,
u'the cat': 1,
u'the hat': 1,
u'whose hat': 1}
Now fetch some interesting data from Project Gutenberg:
In [11]:
# `os` must be imported here: the cell that imports it later in the
# notebook runs after this one on a fresh "Restart & Run All".
import os

try:
    from urllib.request import urlretrieve  # py3
except ImportError:
    from urllib import urlretrieve  # py2

davinci_url = "http://www.gutenberg.org/cache/epub/5000/pg5000.txt"

if not os.path.exists('davinci.txt'):
    # download from project gutenberg
    print("Downloading Da Vinci's notebooks from Project Gutenberg")
    urlretrieve(davinci_url, 'davinci.txt')
In [12]:
import sys

def print_common(freqs, n=10):
    """Print the n most common keys in *freqs* by count.

    freqs - dict mapping word -> count
    n     - how many of the top entries to print (default 10)
    """
    # Build (count, word) pairs and sort descending.  zip() returns an
    # iterator on Python 3 (no .sort method), so sorted() is required.
    items = sorted(zip(freqs.values(), freqs.keys()), reverse=True)
    top = items[:n]
    # Right-justify every word to the width of the longest one printed.
    justify = 0
    for (count, word) in top:
        justify = max(justify, len(word))
    for (count, word) in top:
        print(word.rjust(justify), count)
    sys.stdout.flush()
In [13]:
# Run the serial version
print("Serial word frequency count:")
%time counts = ngrams('davinci.txt', 1)
print_common(counts, 10)
Serial word frequency count: CPU times: user 1.61 s, sys: 16.8 ms, total: 1.63 s
Wall time: 1.64 s
light 852
eye 591
same 536
shadow 507
body 456
between 446
water 425
seen 415
leonardo 414
into 402
Let's split the file
In [15]:
# split the davinci.txt into one file per engine:
text = open('davinci.txt').read()
lines = text.splitlines()
nlines = len(lines)
n = 10

block = nlines // n
for i in range(n):
    # Give the final chunk any remainder lines; the original slice
    # silently dropped the last `nlines % n` lines of the text.
    end = (i + 1) * block if i < n - 1 else nlines
    chunk = lines[i * block:end]
    with open('davinci%i.txt' % i, 'w') as f:
        f.write('\n'.join(chunk))
In [17]:
import os
# Absolute paths of the per-engine chunk files written by the split cell.
# NOTE(review): relies on `n` (number of chunks) from the previous cell.
cwd = os.path.abspath(os.getcwd())
fnames = [ os.path.join(cwd, 'davinci%i.txt' % i) for i in range(n)]
In [16]:
# Connect to the running IPython parallel cluster.
from IPython import parallel
rc = parallel.Client()
# A load-balanced view schedules each task to whichever engine is free.
view = rc.load_balanced_view()
# A direct view on all engines, used to push shared state everywhere.
eall = rc[:]
# Push the word-counting helpers so they are defined on every engine.
eall.push(dict(
non_word=non_word,
yield_words=yield_words,
common_words=common_words,
normalize_word=normalize_word,
))
Out[29]:
<AsyncResult: _push>
Exercise: parallel ngrams¶
Write a version of ngrams that runs in parallel, rejoining the results into a single count dict.
In [ ]:
def ngrams_parallel(view, fnames, n=1):
    """Compute ngrams in parallel, merging the per-file results.

    view - An IPython View used to distribute the work.
    fnames - The filenames containing the split data.
    n - The ngram length (default 1).

    Returns a single dict mapping ngram strings to total counts
    across all files.
    """
    # One ngrams(fname, n) task per file, scheduled across the engines.
    results = view.map_async(ngrams, fnames, [n] * len(fnames))
    merged = {}
    for counts in results:
        for gram, count in counts.items():
            merged[gram] = merged.get(gram, 0) + count
    return merged
In [22]:
%load ../soln/ngrams.py
In [30]:
print("Parallel ngrams")
%time pcounts = ngrams_parallel(view, fnames, 3)
print_common(pcounts, 10)
Parallel ngrams CPU times: user 403 ms, sys: 80.3 ms, total: 483 ms
Wall time: 1.36 s light and shade
98
the same way 44
the luminous body 33
between the eye 31
the space between 29
pen and ink 29
leonardo da vinci 28
the solar rays 27
the right hand 27
space between the 27
A bit more data¶
Download some Project Gutenberg samples from nltk (to avoid rate-limiting on PG itself)
In [25]:
import os
import zipfile

gutenberg_samples = 'http://nltk.github.com/nltk_data/packages/corpora/gutenberg.zip'

if not os.path.isdir('gutenberg'):
    if not os.path.exists('gutenberg.zip'):
        urlretrieve(gutenberg_samples, 'gutenberg.zip')
    # Extract with the stdlib instead of `!unzip` so the cell does not
    # depend on an external unzip binary being on PATH.
    with zipfile.ZipFile('gutenberg.zip') as zf:
        zf.extractall()

import glob
gutenberg_files = glob.glob(os.path.abspath(os.path.join('gutenberg', '*.txt')))
# remove the bible, because it's too big relative to the rest
gutenberg_files.remove(os.path.abspath(os.path.join('gutenberg', 'bible-kjv.txt')))
In [26]:
ls gutenberg
README austen-sense.txt bryant-stories.txt chesterton-ball.txt edgeworth-parents.txt shakespeare-caesar.txt whitman-leaves.txt austen-emma.txt bible-kjv.txt burgess-busterbrown.txt chesterton-brown.txt melville-moby_dick.txt shakespeare-hamlet.txt austen-persuasion.txt blake-poems.txt carroll-alice.txt chesterton-thursday.txt milton-paradise.txt shakespeare-macbeth.txt
In [31]:
print("Parallel ngrams across several books")
%time pcounts = ngrams_parallel(view, gutenberg_files, 3)
print()
print_common(pcounts, 10)
pcounts = ngrams_parallel(view, gutenberg_files, 4)
print()
print_common(pcounts, 10)
Parallel ngrams across several books CPU times: user 1.55 s, sys: 229 ms, total: 1.78 s
Wall time: 5.69 s
a great deal
175
i dare say 107
farmer browns boy 88
the sperm whale 86
the same time 84
i dont know 76
two or three 74
a few minutes 73
the white whale 71
mr and mrs 71
at the same time
76
a great deal of 66
for the first time 48
in a low voice 36
i should like to 36
out of the room 34
of the sperm whale 31
much obliged to you 29
i beg your pardon 28
at the same moment 26