{ "metadata": { "name": "", "signature": "sha256:672af01d3bf0e650dc0e5b61c1406d95f9c484b8d5330113ab536702a7344d01" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [
{ "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Counting words" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Because what's a parallel computing demo without counting words?" ] },
{ "cell_type": "code", "collapsed": false, "input": [ "from __future__ import print_function" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Some utilities for excluding common phrases and normalizing words" ] },
{ "cell_type": "code", "collapsed": false, "input": [
"import re\n",
"\n",
"# matches runs of non-word characters and digits; these are stripped from every token\n",
"non_word = re.compile(r'[\\W\\d]+', re.UNICODE)\n",
"\n",
"def normalize_word(word):\n",
"    \"\"\"Normalize a word for counting.\n",
"\n",
"    Lower-cases the word and removes every run of non-word characters\n",
"    and digits, so the result may be an empty string.\n",
"    \"\"\"\n",
"    word = word.lower()\n",
"    word = non_word.sub('', word)\n",
"    return word\n",
"\n",
"# words considered too common to be interesting on their own\n",
"common_words = {\n",
"'the','of','and','in','to','a','is','it','that','which','as','on','by',\n",
"'be','this','with','are','from','will','at','you','not','for','no','have',\n",
"'i','or','if','his','its','they','but','their','one','all','he','when',\n",
"'than','so','these','them','may','see','other','was','has','an','there',\n",
"'more','we','footnote', 'who', 'had', 'been', 'she', 'do', 'what',\n",
"'her', 'him', 'my', 'me', 'would', 'could', 'said', 'am', 'were', 'very',\n",
"'your', 'did',\n",
"}\n",
"\n",
"def yield_words(filename):\n",
"    \"\"\"Return a generator of the normalized, non-empty words in a file.\n",
"\n",
"    The file is decoded as latin-1 and each line is split on whitespace;\n",
"    tokens that normalize to an empty string are skipped.\n",
"    \"\"\"\n",
"    # NOTE: imported inside the function on purpose - presumably so the\n",
"    # function keeps working when it is pushed to the engines by itself.\n",
"    import io\n",
"    with io.open(filename, encoding='latin-1') as f:\n",
"        for line in f:\n",
"            for word in line.split():\n",
"                word = normalize_word(word)\n",
"                if word:\n",
"                    yield word\n"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 },
{ "cell_type": "markdown", "metadata": {}, "source": [ "A function that 
reads a file, and returns a dictionary\n", "with string keys of phrases of `n` words,\n", "whose values are the number of times each phrase occurs." ] },
{ "cell_type": "code", "collapsed": false, "input": [
"def ngrams(filename, n=1):\n",
"    \"\"\"Compute ngram counts for the contents of a file.\n",
"\n",
"    Slides a window of `n` consecutive words over the words produced by\n",
"    `yield_words(filename)` and counts every window, keyed by its words\n",
"    joined with single spaces.  Windows in which more than half of the\n",
"    words are in `common_words` are not counted.\n",
"\n",
"    Returns a dict mapping each phrase to its count; the dict is empty\n",
"    when the file has fewer than `n` words.\n",
"    Raises ValueError if `n` is smaller than 1.\n",
"    \"\"\"\n",
"    # imported locally, like `io` in yield_words, so the function does not\n",
"    # depend on a name that exists only in this notebook's namespace\n",
"    from collections import deque\n",
"\n",
"    if n < 1:\n",
"        raise ValueError(\"n must be a positive integer, got %r\" % (n,))\n",
"\n",
"    counts = {}\n",
"    # a bounded deque drops the oldest word by itself on every append\n",
"    gram = deque(maxlen=n)\n",
"    for word in yield_words(filename):\n",
"        gram.append(word)\n",
"        if len(gram) < n:\n",
"            # still collecting the first n words\n",
"            continue\n",
"        common = sum(w in common_words for w in gram)\n",
"        if common > n / 2.0:\n",
"            # don't count ngrams that are more than 50% common words\n",
"            continue\n",
"        sgram = ' '.join(gram)\n",
"        counts[sgram] = counts.get(sgram, 0) + 1\n",
"    return counts"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 },
{ "cell_type": "code", "collapsed": false, "input": [ "%%writefile cathat.txt\n", "the cat in the hat is a cat whose hat is big." 
], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Overwriting cathat.txt\n" ] } ], "prompt_number": 8 },
{ "cell_type": "code", "collapsed": false, "input": [ "ngrams('cathat.txt', 1)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 9, "text": [ "{u'big': 1, u'cat': 2, u'hat': 2, u'whose': 1}" ] } ], "prompt_number": 9 },
{ "cell_type": "code", "collapsed": false, "input": [ "ngrams('cathat.txt', 2)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 10, "text": [ "{u'a cat': 1,\n", " u'cat in': 1,\n", " u'cat whose': 1,\n", " u'hat is': 2,\n", " u'is big': 1,\n", " u'the cat': 1,\n", " u'the hat': 1,\n", " u'whose hat': 1}" ] } ], "prompt_number": 10 },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Now fetch some interesting data from Project Gutenberg:" ] },
{ "cell_type": "code", "collapsed": false, "input": [
"# os is needed right here; importing it only in a later cell breaks a fresh Run All\n",
"import os\n",
"\n",
"try:\n",
"    from urllib.request import urlretrieve  # py3\n",
"except ImportError:\n",
"    from urllib import urlretrieve  # py2\n",
"\n",
"davinci_url = \"http://www.gutenberg.org/cache/epub/5000/pg5000.txt\"\n",
"\n",
"if not os.path.exists('davinci.txt'):\n",
"    # download from project gutenberg\n",
"    print(\"Downloading Da Vinci's notebooks from Project Gutenberg\")\n",
"    urlretrieve(davinci_url, 'davinci.txt')"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 11 },
{ "cell_type": "code", "collapsed": false, "input": [
"import sys\n",
"\n",
"def print_common(freqs, n=10):\n",
"    \"\"\"Print the n most common keys by count.\n",
"\n",
"    freqs - dict mapping each key (a string) to its count\n",
"    n     - maximum number of entries to print\n",
"\n",
"    Entries are printed highest count first (ties are ordered by key,\n",
"    descending), with the keys right-justified to a common width.\n",
"    \"\"\"\n",
"    # sorted() instead of zip(...).sort(): on Python 3 zip returns an\n",
"    # iterator, which has no sort() method\n",
"    items = sorted(((count, word) for word, count in freqs.items()), reverse=True)\n",
"    top = items[:n]\n",
"    # width of the longest key that will be printed (0 when there is none)\n",
"    justify = max([len(word) for (count, word) in top] or [0])\n",
"\n",
"    for (count, word) in top:\n",
"        print(word.rjust(justify), count)\n",
"    sys.stdout.flush()"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 12 },
{ "cell_type": "code", "collapsed": false, "input": [ "# Run the serial version\n", "print(\"Serial word frequency count:\")\n", "%time counts = ngrams('davinci.txt', 1)\n", "print_common(counts, 10)\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Serial word frequency count:\n", "CPU times: user 1.61 s, sys: 16.8 ms, total: 1.63 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Wall time: 1.64 s\n", " light 852\n", " eye 591\n", " same 536\n", " shadow 507\n", " body 456\n", " between 446\n", " water 425\n", " seen 415\n", "leonardo 414\n", " into 402\n" ] } ], "prompt_number": 13 },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Let's split the file" ] },
{ "cell_type": "code", "collapsed": false, "input": [
"# split the davinci.txt into one file per engine:\n",
"import io\n",
"\n",
"# latin-1 is the encoding yield_words uses to read the pieces back\n",
"with io.open('davinci.txt', encoding='latin-1') as f:\n",
"    text = f.read()\n",
"lines = text.splitlines()\n",
"nlines = len(lines)\n",
"n = 10\n",
"\n",
"block = nlines//n\n",
"for i in range(n):\n",
"    # the last piece also takes the nlines % n lines that the floor\n",
"    # division leaves over, so no text is dropped\n",
"    stop = (i+1)*block if i < n-1 else nlines\n",
"    chunk = lines[i*block:stop]\n",
"    with io.open('davinci%i.txt' % i, 'w', encoding='latin-1') as f:\n",
"        f.write(u'\\n'.join(chunk))"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 15 },
{ "cell_type": "code", "collapsed": false, "input": [ "import os\n", "cwd = os.path.abspath(os.getcwd())\n", "fnames = [ os.path.join(cwd, 'davinci%i.txt' % i) for i in range(n)]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 17 },
{ "cell_type": "code", "collapsed": false, "input": [ "from IPython import parallel\n", "rc = parallel.Client()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 16 },
{ "cell_type": "code", "collapsed": false, "input": [ "view = rc.load_balanced_view()\n", "eall = rc[:]\n", "eall.push(dict(\n", " non_word=non_word,\n", " yield_words=yield_words,\n", " 
common_words=common_words,\n", " normalize_word=normalize_word,\n", "))" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 29, "text": [ "" ] } ], "prompt_number": 29 },
{ "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Exercise: parallel ngrams" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Write a version of ngrams that runs in parallel,\n", "rejoining the results into a single count dict." ] },
{ "cell_type": "code", "collapsed": false, "input": [
"def ngrams_parallel(view, fnames, n=1):\n",
"    \"\"\"Compute ngrams in parallel\n",
"\n",
"    view - An IPython View\n",
"    fnames - The filenames containing the split data.\n",
"    n - The number of words per ngram.\n",
"\n",
"    Should return a single dict of counts, merged from all the files.\n",
"    \"\"\"\n",
"    # exercise: intentionally left unimplemented; the next cell loads a solution\n",
"    pass"
], "language": "python", "metadata": {}, "outputs": [] },
{ "cell_type": "code", "collapsed": false, "input": [ "%load ../soln/ngrams.py" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 22 },
{ "cell_type": "code", "collapsed": false, "input": [ "print(\"Parallel ngrams\")\n", "%time pcounts = ngrams_parallel(view, fnames, 3)\n", "print_common(pcounts, 10)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Parallel ngrams\n", "CPU times: user 403 ms, sys: 80.3 ms, total: 483 ms" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Wall time: 1.36 s\n", " light and shade" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 98\n", " the same way 44\n", "the luminous body 33\n", " between the eye 31\n", "the space between 29\n", " pen and ink 29\n", "leonardo da vinci 28\n", " the solar rays 27\n", " the right hand 27\n", "space between the 27\n" ] } ], "prompt_number": 30 },
{ "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "A bit more data" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Download some Project Gutenberg samples from NLTK (avoid rate-limiting on PG itself)" ] },
{ "cell_type": "code", "collapsed": false, "input": [
"import glob\n",
"import zipfile\n",
"\n",
"gutenberg_samples = 'http://nltk.github.com/nltk_data/packages/corpora/gutenberg.zip'\n",
"if not os.path.isdir('gutenberg'):\n",
"    if not os.path.exists('gutenberg.zip'):\n",
"        urlretrieve(gutenberg_samples, 'gutenberg.zip')\n",
"    # extract with the standard library rather than shelling out to `unzip`,\n",
"    # so the cell does not depend on an external program being installed\n",
"    with zipfile.ZipFile('gutenberg.zip') as zf:\n",
"        zf.extractall()\n",
"\n",
"# remove the bible, because it's too big relative to the rest\n",
"# (filtering instead of list.remove: no error if the file is not there)\n",
"gutenberg_files = [\n",
"    fname\n",
"    for fname in glob.glob(os.path.abspath(os.path.join('gutenberg', '*.txt')))\n",
"    if os.path.basename(fname) != 'bible-kjv.txt'\n",
"]"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 25 },
{ "cell_type": "code", "collapsed": false, "input": [ "ls gutenberg" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "README austen-sense.txt bryant-stories.txt chesterton-ball.txt edgeworth-parents.txt shakespeare-caesar.txt whitman-leaves.txt\r\n", "austen-emma.txt bible-kjv.txt burgess-busterbrown.txt chesterton-brown.txt melville-moby_dick.txt shakespeare-hamlet.txt\r\n", "austen-persuasion.txt blake-poems.txt carroll-alice.txt chesterton-thursday.txt milton-paradise.txt shakespeare-macbeth.txt\r\n" ] } ], "prompt_number": 26 },
{ "cell_type": "code", "collapsed": false, "input": [ "print(\"Parallel ngrams across several books\")\n", "%time pcounts = ngrams_parallel(view, gutenberg_files, 3)\n", "print()\n", "print_common(pcounts, 10)\n", "pcounts = ngrams_parallel(view, gutenberg_files, 4)\n", "print()\n", "print_common(pcounts, 10)\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Parallel ngrams across several books\n", "CPU times: user 1.55 s, sys: 229 ms, total: 1.78 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Wall time: 5.69 s\n", "\n", " a great deal" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 175\n", " i dare say 107\n", "farmer browns boy 88\n", " the 
sperm whale 86\n", " the same time 84\n", " i dont know 76\n", " two or three 74\n", " a few minutes 73\n", " the white whale 71\n", " mr and mrs 71\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", " at the same time" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 76\n", " a great deal of 66\n", " for the first time 48\n", " in a low voice 36\n", " i should like to 36\n", " out of the room 34\n", " of the sperm whale 31\n", "much obliged to you 29\n", " i beg your pardon 28\n", " at the same moment 26\n" ] } ], "prompt_number": 31 } ], "metadata": {} } ] }