Description
I recently wrote a Python script to download files from the Library of Congress (LOC) based on a search query. The code fetches metadata, extracts the URLs of files in a specific format (e.g., JPEG), and downloads those files into a designated directory. I'm seeking feedback on its structure, efficiency, and readability.
Functionality
The script uses the Library of Congress API to:
- Retrieve Item IDs: Query the API for items matching the search URL and parse their IDs (a request sketch follows this section).
- Get File URLs: For each item, extract URLs for files of a specified MIME type (e.g., image/jpeg for .jpg).
- Download Files: Save the matching files locally with a logical directory and naming structure.
Key features include retry logic with exponential backoff, rate limiting between requests, pagination handling with a page-count safeguard, streamed chunked downloads, and per-item directories with unique filenames.
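For context, here is a minimal sketch of the kind of search request the script builds on. The URL is a placeholder, and the fo, c, and at parameters mirror the ones the class passes when paging through results:

import requests

search_url = "https://www.loc.gov/photos/?q=bridges"  # placeholder; any loc.gov search URL works the same way
params = {"fo": "json", "c": 100, "at": "results,pagination"}

response = requests.get(search_url, params=params, timeout=10)
response.raise_for_status()
data = response.json()

# "results" lists the matching items; "pagination" -> "next" holds the next page URL, if any
for result in data.get("results", []):
    print(result.get("id"))
print(data.get("pagination", {}).get("next"))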
Code Walkthrough
- Initialization (__init__): Validates inputs like search_url, file_extension, and save_to, and maps the specified file extension to a MIME type using MIME_TYPE_MAP.
- Data Fetching (fetch_json): Sends HTTP requests with retries on failure and ensures the response is JSON, logging a warning otherwise.
- Item ID Retrieval (get_item_ids): Parses paginated results from the LOC API to collect relevant item IDs, filtering out items that don't match the criteria (e.g., collections).
- File URL Extraction (get_image_urls): Iterates through the items, gathering file URLs that match the specified MIME type, with rate limiting to respect the server (a sketch of the item JSON this step reads follows the list).
- File Downloading (download_files): Saves files to the designated directory with unique filenames, streaming downloads in chunks to handle large files.
- Main Workflow (run): Combines the above steps: fetch item IDs, extract file URLs, and download the files.
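To make the extraction step concrete, below is a trimmed, made-up sketch of the per-item JSON shape the code relies on. Only the keys the script actually reads (resources, files, mimetype, url) are shown; the URLs and nesting are illustrative, not the full LOC schema:

item_data = {
    "resources": [
        {
            "files": [
                [  # file entries can be nested one level deep, which is why flatten_files() exists
                    {"mimetype": "image/jpeg", "url": "https://example.org/item/full.jpg"},
                    {"mimetype": "image/tiff", "url": "https://example.org/item/master.tif"},
                ]
            ]
        }
    ]
}

# Equivalent of the extraction loop, keeping only JPEGs:
jpeg_urls = [
    f["url"]
    for resource in item_data.get("resources", [])
    for group in resource.get("files", [])
    for f in (group if isinstance(group, list) else [group])
    if f.get("mimetype") == "image/jpeg"
]
print(jpeg_urls)  # ['https://example.org/item/full.jpg']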
Source Code
import requests
import os
import time
import logging
from tenacity import retry, wait_exponential, stop_after_attempt
from typing import List, Dict, Optional

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)


class LOCImageDownloader:
    """
    A class to download files from the Library of Congress based on a search query.

    Example:
        downloader = LOCImageDownloader(search_url, file_extension, save_to)
        downloader.run()
    """

    MIME_TYPE_MAP = {
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "tif": "image/tiff",
        "tiff": "image/tiff",
        "pdf": "application/pdf",
        "png": "image/png",
    }

    def __init__(self, search_url: str, file_extension: str, save_to: str):
        """
        Initialize the downloader with search URL, desired file extension, and save path.

        Args:
            search_url (str): The search URL to query the LOC API.
            file_extension (str): Desired file extension (e.g., 'jpg', 'pdf').
            save_to (str): Directory to save downloaded files.

        Raises:
            ValueError: If the specified file extension is unsupported.
        """
        if not search_url.startswith("http"):
            raise ValueError("Invalid search_url. Must start with http or https.")
        if not os.access(os.path.dirname(save_to) or ".", os.W_OK):
            raise ValueError(f"Save directory '{save_to}' is not writable.")
        self.search_url = search_url
        self.file_extension = file_extension.lower()
        self.mime_type = self.MIME_TYPE_MAP.get(self.file_extension)
        if not self.mime_type:
            raise ValueError(f"Unsupported file extension: {self.file_extension}")
        self.save_to = save_to
        os.makedirs(self.save_to, exist_ok=True)
        self.session = requests.Session()

    @retry(wait=wait_exponential(multiplier=1, min=2, max=10), stop=stop_after_attempt(5))
    def fetch_json(self, url: str, params: Optional[Dict] = None) -> Optional[Dict]:
        """Fetch JSON data from a given URL."""
        try:
            response = self.session.get(url, params=params, timeout=10)
            response.raise_for_status()
            if 'application/json' not in response.headers.get("Content-Type", ""):
                logging.warning(f"Unexpected Content-Type for {url}: {response.headers.get('Content-Type')}")
                return None
            return response.json()
        except requests.RequestException as e:
            logging.error(f"Error fetching {url}: {e}")
            return None

    def get_item_ids(self) -> List[str]:
        """Retrieve item IDs from the LOC search URL."""
        logging.info("Fetching item IDs from the Library of Congress...")
        item_ids, url, count = [], self.search_url, 0
        max_pages = 100  # Safeguard for pagination loops
        pages_processed = 0
        while url and pages_processed < max_pages:
            data = self.fetch_json(url, {"fo": "json", "c": 100, "at": "results,pagination"})
            if not data:
                break
            results = data.get("results", [])
            for result in results:
                if "item" in result.get("id", "") and "collection" not in result.get("original_format", []):
                    item_ids.append(result["id"])
            count += len(results)
            url = data.get("pagination", {}).get("next")  # Get the next page URL
            pages_processed += 1
            # Be kind to the server (this is a magic number provided by the LOC)
            time.sleep(1)
        logging.info(f"Found {len(item_ids)} items.")
        return item_ids

    def get_image_urls(self, item_ids: List[str]) -> List[Dict[str, str]]:
        """Retrieve URLs for files matching the specified file type."""
        logging.info("Retrieving file URLs for the specified items...")
        file_urls, processed = [], 0
        for item_url in item_ids:
            data = self.fetch_json(item_url, {"fo": "json"})
            processed += 1
            logging.info(f"Processing item {processed}/{len(item_ids)}...")
            if not data:
                continue
            for resource in data.get("resources", []):
                for file_info in self.flatten_files(resource.get("files", [])):
                    if file_info.get("mimetype") == self.mime_type:
                        file_urls.append({"image_url": file_info["url"], "item_id": item_url})
            # Be kind to the server (this is a magic number provided by the LOC)
            time.sleep(2)
        logging.info(f"Found {len(file_urls)} matching files.")
        return file_urls

    @staticmethod
    def flatten_files(files: List) -> List[Dict]:
        """Flatten a potentially nested list of files into a single list of dictionaries."""
        if not isinstance(files, list):
            raise ValueError("Expected a list for 'files'")
        return [item for sublist in files for item in (sublist if isinstance(sublist, list) else [sublist])]

    def download_files(self, file_urls: List[Dict[str, str]]) -> None:
        """Download files from the given URLs and save them to the specified directory."""
        logging.info("Downloading files...")
        for index, file_info in enumerate(file_urls, start=1):
            file_url = file_info["image_url"]
            item_id = file_info["item_id"].strip("/").split("/")[-1]
            save_path = os.path.join(self.save_to, item_id)
            os.makedirs(save_path, exist_ok=True)

            # Determine the filename
            if "image-services/iiif" in file_url:
                url_parts = file_url.split("/")
                filename_part = next((part.split(":")[-1] for part in url_parts if "service:" in part), "image")
                ext = file_url.split(".")[-1]
                filename = f"{filename_part}.{ext}"
            else:
                filename = file_url.split("/")[-1]

            # Ensure unique filename
            file_path = os.path.join(save_path, filename)
            counter = 1
            while os.path.exists(file_path):
                file_path = os.path.join(save_path, f"{filename.rsplit('.', 1)[0]}_{counter}.{self.file_extension}")
                counter += 1

            logging.info(f"[{index}/{len(file_urls)}] Downloading {file_url} as {file_path}...")

            # Download the file
            try:
                with self.session.get(file_url, stream=True) as response:
                    response.raise_for_status()
                    with open(file_path, "wb") as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
            except requests.RequestException as e:
                logging.error(f"Error downloading {file_url}: {e}")

            # Be kind to the server (this is a magic number provided by the LOC)
            time.sleep(2)

    def run(self) -> None:
        """Execute the downloader: fetch item IDs, retrieve file URLs, and download files."""
        item_ids = self.get_item_ids()
        if not item_ids:
            logging.error("No items found.")
            return
        file_urls = self.get_image_urls(item_ids)
        if not file_urls:
            logging.error("No matching files found.")
            return
        self.download_files(file_urls)
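For reference, this is roughly how the class is invoked (the search URL and output directory below are placeholders):

if __name__ == "__main__":
    downloader = LOCImageDownloader(
        search_url="https://www.loc.gov/photos/?q=bridges",  # placeholder query
        file_extension="jpg",
        save_to="loc_downloads",  # created automatically if it does not exist
    )
    downloader.run()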
Feedback Areas
The code seems to work well and as expected. However, I haven't done much web scraping before, so I'm looking for general feedback. I'm most concerned about the following:
- Overall Readability: Is the flow of the program intuitive and maintainable?
- Error Handling: Are exceptions handled effectively, particularly for network-related errors? Are there additional checks or validations I should include?
- Best Practices: Are there any Pythonic improvements or standard practices I should adopt?