{
 "metadata": {
  "name": "",
  "signature": "sha256:672af01d3bf0e650dc0e5b61c1406d95f9c484b8d5330113ab536702a7344d01"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Counting words"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Because what's a parallel computing demo without counting words?"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from __future__ import print_function"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Some utilitiles for excluding commmon phrases and normalizing words"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import re\n",
      "non_word = re.compile(r'[\\W\\d]+', re.UNICODE)\n",
      "\n",
      "def normalize_word(word):\n",
      "    \"\"\"normalize a word\n",
      "    \n",
      "    simply strips non-word characters and case\n",
      "    \"\"\"\n",
      "    word = word.lower()\n",
      "    word = non_word.sub('', word)\n",
      "    return word\n",
      "\n",
      "common_words = {\n",
      "'the','of','and','in','to','a','is','it','that','which','as','on','by',\n",
      "'be','this','with','are','from','will','at','you','not','for','no','have',\n",
      "'i','or','if','his','its','they','but','their','one','all','he','when',\n",
      "'than','so','these','them','may','see','other','was','has','an','there',\n",
      "'more','we','footnote', 'who', 'had', 'been',  'she', 'do', 'what',\n",
      "'her', 'him', 'my', 'me', 'would', 'could', 'said', 'am', 'were', 'very',\n",
      "'your', 'did', 'not',\n",
      "}\n",
      "\n",
      "def yield_words(filename):\n",
      "    \"\"\"returns a generator of words in a file\"\"\"\n",
      "    import io\n",
      "    with io.open(filename, encoding='latin-1') as f:\n",
      "        for line in f:\n",
      "            for word in line.split():\n",
      "                word = normalize_word(word)\n",
      "                if word:\n",
      "                    yield word\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "A function that reads a file, and returns a dictionary\n",
      "with string keys of phrases of `n` words,\n",
      "whose values"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def ngrams(filename, n=1):\n",
      "    \"\"\"compute ngram counts for the contents of a file\"\"\"\n",
      "    word_iterator = yield_words(filename)\n",
      "    counts = {}\n",
      "    def _count_gram(gram):\n",
      "        common = sum(word in common_words for word in gram)\n",
      "        if common > n / 2.0:\n",
      "            # don't count ngrams that are >= 50% common words\n",
      "            return\n",
      "        sgram = ' '.join(gram)\n",
      "        counts.setdefault(sgram, 0)\n",
      "        counts[sgram] += 1\n",
      "    \n",
      "    gram = []\n",
      "    \n",
      "    # get the first word\n",
      "    while len(gram) < n:\n",
      "        try:\n",
      "            word = next(word_iterator)\n",
      "            if not word:\n",
      "                continue\n",
      "        except StopIteration:\n",
      "            return counts\n",
      "        else:\n",
      "            gram.append(word)\n",
      "    \n",
      "    _count_gram(gram)\n",
      "\n",
      "    while True:\n",
      "        try:\n",
      "            word = next(word_iterator)\n",
      "        except StopIteration:\n",
      "            break\n",
      "        else:\n",
      "            gram.append(word)\n",
      "            gram.pop(0)\n",
      "            _count_gram(gram)\n",
      "    return counts\n",
      "            "
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%writefile cathat.txt\n",
      "the cat in the hat is a cat whose hat is big."
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Overwriting cathat.txt\n"
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "ngrams('cathat.txt', 1)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 9,
       "text": [
        "{u'big': 1, u'cat': 2, u'hat': 2, u'whose': 1}"
       ]
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "ngrams('cathat.txt', 2)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 10,
       "text": [
        "{u'a cat': 1,\n",
        " u'cat in': 1,\n",
        " u'cat whose': 1,\n",
        " u'hat is': 2,\n",
        " u'is big': 1,\n",
        " u'the cat': 1,\n",
        " u'the hat': 1,\n",
        " u'whose hat': 1}"
       ]
      }
     ],
     "prompt_number": 10
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Now fetch some interesting data from Project Gutenberg:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "try: \n",
      "    from urllib.request import urlretrieve # py3\n",
      "except ImportError:\n",
      "    from urllib import urlretrieve # py2\n",
      "\n",
      "davinci_url = \"http://www.gutenberg.org/cache/epub/5000/pg5000.txt\"\n",
      "\n",
      "if not os.path.exists('davinci.txt'):\n",
      "    # download from project gutenberg\n",
      "    print(\"Downloading Da Vinci's notebooks from Project Gutenberg\")\n",
      "    urlretrieve(davinci_url, 'davinci.txt')\n",
      "\n",
      "    "
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 11
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import sys\n",
      "\n",
      "def print_common(freqs, n=10):\n",
      "    \"\"\"Print the n most common keys by count.\"\"\"\n",
      "    words, counts = freqs.keys(), freqs.values()\n",
      "    items = zip(counts, words)\n",
      "    items.sort(reverse=True)\n",
      "    justify = 0\n",
      "    for (count, word) in items[:n]:\n",
      "        justify = max(justify, len(word))\n",
      "    \n",
      "    for (count, word) in items[:n]:\n",
      "        print(word.rjust(justify), count)\n",
      "    sys.stdout.flush()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Run the serial version\n",
      "print(\"Serial word frequency count:\")\n",
      "%time counts = ngrams('davinci.txt', 1)\n",
      "print_common(counts, 10)\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Serial word frequency count:\n",
        "CPU times: user 1.61 s, sys: 16.8 ms, total: 1.63 s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "Wall time: 1.64 s\n",
        "   light 852\n",
        "     eye 591\n",
        "    same 536\n",
        "  shadow 507\n",
        "    body 456\n",
        " between 446\n",
        "   water 425\n",
        "    seen 415\n",
        "leonardo 414\n",
        "    into 402\n"
       ]
      }
     ],
     "prompt_number": 13
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Let's split the file"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# split the davinci.txt into one file per engine:\n",
      "text = open('davinci.txt').read()\n",
      "lines = text.splitlines()\n",
      "nlines = len(lines)\n",
      "n = 10\n",
      "\n",
      "block = nlines//n\n",
      "for i in range(n):\n",
      "    chunk = lines[i*block:(i+1)*(block)]\n",
      "    with open('davinci%i.txt' % i, 'w') as f:\n",
      "        f.write('\\n'.join(chunk))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import os\n",
      "cwd = os.path.abspath(os.getcwd())\n",
      "fnames = [ os.path.join(cwd, 'davinci%i.txt' % i) for i in range(n)]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 17
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from IPython import parallel\n",
      "rc = parallel.Client()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 16
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "view = rc.load_balanced_view()\n",
      "eall = rc[:]\n",
      "eall.push(dict(\n",
      "    non_word=non_word,\n",
      "    yield_words=yield_words,\n",
      "    common_words=common_words,\n",
      "    normalize_word=normalize_word,\n",
      "))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 29,
       "text": [
        "<AsyncResult: _push>"
       ]
      }
     ],
     "prompt_number": 29
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Exercise: parallel ngrams"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Write a version of ngrams that runs in parallel,\n",
      "rejoining the results into a single count dict."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def ngrams_parallel(view, fnames, n=1):\n",
      "    \"\"\"Compute ngrams in parallel\n",
      "    \n",
      "    view - An IPython View\n",
      "    fnames - The filenames containing the split data.\n",
      "    \"\"\"\n",
      "    pass"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%load ../soln/ngrams.py"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 22
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print(\"Parallel ngrams\")\n",
      "%time pcounts = ngrams_parallel(view, fnames, 3)\n",
      "print_common(pcounts, 10)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Parallel ngrams\n",
        "CPU times: user 403 ms, sys: 80.3 ms, total: 483 ms"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "Wall time: 1.36 s\n",
        "  light and shade"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 98\n",
        "     the same way 44\n",
        "the luminous body 33\n",
        "  between the eye 31\n",
        "the space between 29\n",
        "      pen and ink 29\n",
        "leonardo da vinci 28\n",
        "   the solar rays 27\n",
        "   the right hand 27\n",
        "space between the 27\n"
       ]
      }
     ],
     "prompt_number": 30
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "A bit more data"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Download some Project Gutenberg samples from ntlk (avoid rate-limiting on PG itself)"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "gutenberg_samples = 'http://nltk.github.com/nltk_data/packages/corpora/gutenberg.zip'\n",
      "if not os.path.isdir('gutenberg'):\n",
      "    if not os.path.exists('gutenberg.zip'):\n",
      "        urlretrieve(gutenberg_samples, 'gutenberg.zip')\n",
      "    !unzip gutenberg.zip\n",
      "\n",
      "import glob\n",
      "gutenberg_files = glob.glob(os.path.abspath(os.path.join('gutenberg', '*.txt')))\n",
      "# remove the bible, because it's too big relative to the rest\n",
      "gutenberg_files.remove(os.path.abspath(os.path.join('gutenberg', 'bible-kjv.txt')))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 25
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "ls gutenberg"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "README                   austen-sense.txt         bryant-stories.txt       chesterton-ball.txt      edgeworth-parents.txt    shakespeare-caesar.txt   whitman-leaves.txt\r\n",
        "austen-emma.txt          bible-kjv.txt            burgess-busterbrown.txt  chesterton-brown.txt     melville-moby_dick.txt   shakespeare-hamlet.txt\r\n",
        "austen-persuasion.txt    blake-poems.txt          carroll-alice.txt        chesterton-thursday.txt  milton-paradise.txt      shakespeare-macbeth.txt\r\n"
       ]
      }
     ],
     "prompt_number": 26
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print(\"Parallel ngrams across several books\")\n",
      "%time pcounts = ngrams_parallel(view, gutenberg_files, 3)\n",
      "print()\n",
      "print_common(pcounts, 10)\n",
      "pcounts = ngrams_parallel(view, gutenberg_files, 4)\n",
      "print()\n",
      "print_common(pcounts, 10)\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Parallel ngrams across several books\n",
        "CPU times: user 1.55 s, sys: 229 ms, total: 1.78 s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "Wall time: 5.69 s\n",
        "\n",
        "     a great deal"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 175\n",
        "       i dare say 107\n",
        "farmer browns boy 88\n",
        "  the sperm whale 86\n",
        "    the same time 84\n",
        "      i dont know 76\n",
        "     two or three 74\n",
        "    a few minutes 73\n",
        "  the white whale 71\n",
        "       mr and mrs 71\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "   at the same time"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 76\n",
        "    a great deal of 66\n",
        " for the first time 48\n",
        "     in a low voice 36\n",
        "   i should like to 36\n",
        "    out of the room 34\n",
        " of the sperm whale 31\n",
        "much obliged to you 29\n",
        "  i beg your pardon 28\n",
        " at the same moment 26\n"
       ]
      }
     ],
     "prompt_number": 31
    }
   ],
   "metadata": {}
  }
 ]
}