Downloading images from Wikimedia Commons
This notebook defines a few functions that download images matching given search terms from Wikimedia Commons, via the MediaWiki search API.
The results are used in the Parallel face detection notebook.
In [2]:
import sys, os
import requests
# Response caching is optional: when the third-party requests_cache package is
# importable, install a cache named "mediawiki"; otherwise carry on without
# one and just say so.
try:
    import requests_cache
except ImportError:
    print("no cache, no worries")
else:
    requests_cache.install_cache("mediawiki")

# Endpoint of the Wikimedia Commons MediaWiki API.  HTTPS is used so requests
# are not sent in plaintext and do not depend on an HTTP->HTTPS redirect.
api_url = "https://commons.wikimedia.org/w/api.php"
no cache, no worries
In [3]:
def api_request(**kwargs):
    """Make a request of the Wikimedia Commons API.

    Every keyword argument is sent as a query-string parameter, layered on
    top of the defaults ``action=query`` and ``format=json`` (a caller may
    override either by passing it explicitly).  One ``.`` is written to
    stdout per call as a progress indicator.

    Returns the response body parsed from JSON.

    Raises ``requests.HTTPError`` on a 4xx/5xx response, and a
    ``requests.Timeout`` subclass if the server does not answer in time.
    """
    sys.stdout.write('.')
    sys.stdout.flush()
    params = dict(
        action='query',
        format='json',
    )
    params.update(kwargs)
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would hang the whole download loop.
    r = requests.get(api_url, params=params, timeout=30)
    r.raise_for_status()
    return r.json()
import json
def search_images(search, limit=100, size_limit=400000):
    """Search Wikimedia Commons for images matching a given term.

    search: the search string passed to the API as ``srsearch``.
    limit: maximum number of image URLs to return.
    size_limit: images whose reported ``size`` exceeds this value are skipped.

    Returns a list of at most `limit` URLs of PNG/JPEG images.  Fewer are
    returned when the search results run out before `limit` acceptable
    images have been found.
    """
    urls = []
    continue_params = {}
    while limit > 0:
        data = api_request(
            srnamespace=6,
            prop='imageinfo',
            list='search',
            srsearch=search,
            srlimit=min(limit, 50),
            **continue_params
        )
        results = data['query']['search']
        for r in results:
            info = api_request(
                prop='imageinfo',
                titles=r['title'],
                iiprop='url|size|mime')
            # The 'pages' mapping is keyed by page id, which is not known in
            # advance, so take its first value.  next(iter(...)) works on both
            # Python 2 and 3, unlike indexing .values() directly.
            page = next(iter(info['query']['pages'].values()), {})
            imageinfos = page.get('imageinfo')
            if not imageinfos:
                # No image metadata for this title; nothing to download.
                continue
            imageinfo = imageinfos[0]
            if imageinfo['mime'] in ('image/png', 'image/jpeg') and imageinfo['size'] <= size_limit:
                urls.append(imageinfo['url'])
                # Only accepted images count against the limit; rejected ones
                # are made up for by further search requests.
                limit -= 1

        # Work out how to ask for the next page of results.  The API reports
        # this either as a top-level 'continue' object or, in the legacy
        # format, under 'query-continue'.  With neither present (or an empty
        # page) the results are exhausted, so stop instead of raising KeyError
        # or looping forever.
        if not results:
            break
        if 'continue' in data:
            continue_params = data['continue']
        elif 'search' in data.get('query-continue', {}):
            continue_params = data['query-continue']['search']
        else:
            break
    return urls
In [4]:
def download_images(search, n):
    """Download images from Wikimedia Commons into a folder named after the search term.

    search: the search term; files are saved under ``images/<search>/``.
    n: the number of images requested from ``search_images``.

    Each file is named after the last path component of its URL.  One ``+``
    is written to stdout per image as a progress indicator.

    Raises ``requests.HTTPError`` if an image request returns a 4xx/5xx
    status, mirroring how ``api_request`` treats failed API calls.
    """
    tagdir = os.path.join('images', search)
    if not os.path.exists(tagdir):
        # makedirs also creates the parent 'images' directory when needed.
        os.makedirs(tagdir)
    for url in search_images(search, n):
        # A timeout keeps one stalled transfer from hanging the whole batch.
        r = requests.get(url, timeout=60)
        # Fail loudly rather than saving an HTTP error page as an "image".
        r.raise_for_status()
        fname = url.rsplit('/', 1)[-1]
        dest = os.path.join(tagdir, fname)
        sys.stdout.write('+')
        sys.stdout.flush()
        with open(dest, 'wb') as f:
            f.write(r.content)
In [5]:
# Fetch up to 100 images matching 'portrait' into images/portrait/.
# Progress output: one '.' per API request, one '+' per image saved.
download_images('portrait', 100)
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
In [74]:
# Fetch up to 100 images matching 'face' into images/face/.
download_images('face', 100)
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
In [75]:
# Fetch up to 100 images matching 'headshot' into images/headshot/.
download_images('headshot', 100)
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
In [2]:
# Fetch up to 100 images matching 'castle' into images/castle/.
# NOTE(review): the recorded NameError suggests this cell was run in a session
# where the cell defining download_images had not been executed yet - run the
# definition cells first.
download_images('castle', 100)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-2-bf59cc0f091f> in <module>() ----> 1 download_images('castle', 100) NameError: name 'download_images' is not defined