diff --git a/.ci/docker/Dockerfile b/.ci/docker/Dockerfile index 11cd773428e..8aefbfe8f47 100644 --- a/.ci/docker/Dockerfile +++ b/.ci/docker/Dockerfile @@ -15,15 +15,11 @@ RUN bash ./install_user.sh && rm install_user.sh COPY ./common/install_docs_reqs.sh install_docs_reqs.sh RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh -# Install conda and other packages -ENV ANACONDA_PYTHON_VERSION=3.10 -ENV CONDA_CMAKE yes -ENV DOCS yes -ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH -COPY ./requirements.txt /opt/conda/ -COPY ./common/install_conda.sh install_conda.sh -COPY ./common/common_utils.sh common_utils.sh -RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements.txt +COPY ./common/install_pip_requirements.sh install_pip_requirements.sh +COPY ./requirements.txt requirements.txt +RUN bash ./install_pip_requirements.sh && rm install_pip_requirements.sh + +RUN ln -s /usr/bin/python3 /usr/bin/python USER ci-user CMD ["bash"] diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 31f42fdbd85..f40c45fea3d 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -10,9 +10,10 @@ set -exu IMAGE_NAME="$1" shift -export UBUNTU_VERSION="20.04" +export UBUNTU_VERSION="22.04" +export CUDA_VERSION="12.6.3" -export BASE_IMAGE="ubuntu:${UBUNTU_VERSION}" +export BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}" echo "Building ${IMAGE_NAME} Docker image" docker build \ diff --git a/.ci/docker/common/common_utils.sh b/.ci/docker/common/common_utils.sh deleted file mode 100644 index b20286a4099..00000000000 --- a/.ci/docker/common/common_utils.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -# Work around bug where devtoolset replaces sudo and breaks it. -as_ci_user() { - # NB: unsetting the environment variables works around a conda bug - # https://github.com/conda/conda/issues/6576 - # NB: Pass on PATH and LD_LIBRARY_PATH to sudo invocation - # NB: This must be run from a directory that the user has access to, - # works around https://github.com/conda/conda-package-handling/pull/34 - sudo -E -H -u ci-user env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env "PATH=$PATH" "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" $* -} - -conda_install() { - # Ensure that the install command don't upgrade/downgrade Python - # This should be called as - # conda_install pkg1 pkg2 ... 
[-c channel] - as_ci_user conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $* -} - -conda_run() { - as_ci_user conda run -n py_$ANACONDA_PYTHON_VERSION --no-capture-output $* -} - -pip_install() { - as_ci_user conda run -n py_$ANACONDA_PYTHON_VERSION pip install --progress-bar off $* -} diff --git a/.ci/docker/common/install_base.sh b/.ci/docker/common/install_base.sh index 7fcb81ffeaf..3100b550a89 100644 --- a/.ci/docker/common/install_base.sh +++ b/.ci/docker/common/install_base.sh @@ -10,7 +10,7 @@ install_ubuntu() { apt-get install -y --no-install-recommends \ build-essential \ ca-certificates \ - cmake=3.16* \ + cmake=3.22* \ curl \ git \ wget \ @@ -27,7 +27,9 @@ install_ubuntu() { libglfw3-dev \ sox \ libsox-dev \ - libsox-fmt-all + libsox-fmt-all \ + python3-pip \ + python3-dev # Cleanup package manager apt-get autoclean && apt-get clean diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh deleted file mode 100644 index cdc4f135e2c..00000000000 --- a/.ci/docker/common/install_conda.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -set -ex - -# Optionally install conda -if [ -n "$ANACONDA_PYTHON_VERSION" ]; then - BASE_URL="https://repo.anaconda.com/miniconda" - - MAJOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 1) - MINOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 2) - - CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh" - - mkdir -p /opt/conda - chown ci-user:ci-user /opt/conda - - source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" - - pushd /tmp - wget -q "${BASE_URL}/${CONDA_FILE}" - # NB: Manually invoke bash per https://github.com/conda/conda/issues/10431 - as_ci_user bash "${CONDA_FILE}" -b -f -p "/opt/conda" - popd - - # NB: Don't do this, rely on the rpath to get it right - #echo "/opt/conda/lib" > /etc/ld.so.conf.d/conda-python.conf - #ldconfig - sed -e 's|PATH="\(.*\)"|PATH="/opt/conda/bin:\1"|g' -i /etc/environment - export PATH="/opt/conda/bin:$PATH" - - # Ensure we run conda in a directory that the user has write access to - pushd /opt/conda - - # Prevent conda from updating to 4.14.0, which causes docker build failures - # See https://hud.pytorch.org/pytorch/pytorch/commit/754d7f05b6841e555cea5a4b2c505dd9e0baec1d - # Uncomment the below when resolved to track the latest conda update - # as_ci_user conda update -y -n base conda - - # Install correct Python version - as_ci_user conda create -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" - - # Use conda cmake in some cases. Conda cmake will be newer than our supported - # min version (3.5 for xenial and 3.10 for bionic), so we only do it in those - # following builds that we know should use conda. 
Specifically, Ubuntu bionic - # and focal cannot find conda mkl with stock cmake, so we need a cmake from conda - conda_install cmake - - # Install pip packages - pip_install -r /opt/conda/requirements.txt - - apt-get update - apt-get -y install expect-dev - - popd -fi diff --git a/.ci/docker/common/install_pip_requirements.sh b/.ci/docker/common/install_pip_requirements.sh new file mode 100644 index 00000000000..a548d200462 --- /dev/null +++ b/.ci/docker/common/install_pip_requirements.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +set -ex + +# Install pip packages +pip install --upgrade pip +pip install -r ./requirements.txt diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt index 7aede51ddaa..086633cf043 100644 --- a/.ci/docker/requirements.txt +++ b/.ci/docker/requirements.txt @@ -1,21 +1,30 @@ # --extra-index-url https://download.pytorch.org/whl/cu117/index.html # Use this to run/publish tutorials against the latest binaries during the RC stage. Comment out after the release. Each release verify the correct cuda version. -# Refer to ./jenkins/build.sh for tutorial build instructions +# Refer to ./jenkins/build.sh for tutorial build instructions. -sphinx==5.0.0 -sphinx-gallery==0.11.1 -sphinx_design -docutils==0.16 -sphinx-copybutton -pypandoc==1.12 -pandocfilters -markdown +# Sphinx dependencies +sphinx==7.2.6 +sphinx-gallery==0.19.0 +sphinx-reredirects==0.1.4 +sphinx_design==0.6.1 +docutils>=0.18.1,<0.21 +sphinx-copybutton==0.5.2 +sphinx_sitemap==2.7.1 +sphinxcontrib-mermaid==1.0.0 +sphinxcontrib.katex==0.9.10 +pypandoc==1.15 +pandocfilters==1.5.1 +markdown==3.8.2 + +# PyTorch Theme +pytorch_sphinx_theme2==0.2.0 + +# Tutorial dependencies tqdm==4.66.1 numpy==1.24.4 matplotlib librosa -torch==2.3 +torch==2.9 torchvision -torchtext torchdata networkx PyHamcrest @@ -28,24 +37,21 @@ tensorboard jinja2==3.1.3 pytorch-lightning torchx -torchrl==0.3.0 -tensordict==0.3.0 -ax-platform -nbformat>==5.9.2 +torchrl==0.9.2 +tensordict==0.9.1 +# For ax_multiobjective_nas_tutorial.py +ax-platform>=0.4.0,<0.5.0 +nbformat>=5.9.2 datasets transformers -torchmultimodal-nightly # needs to be updated to stable as soon as it's avaialable onnx -onnxscript +onnxscript>=0.2.2 onnxruntime evaluate accelerate>=0.20.1 importlib-metadata==6.8.0 -# PyTorch Theme --e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme - ipython sphinxcontrib.katex @@ -63,9 +69,10 @@ gym-super-mario-bros==7.4.0 pyopengl gymnasium[mujoco]==0.27.0 timm -iopath -pygame==2.1.2 +pygame==2.6.0 pycocotools semilearn==0.3.2 -torchao==0.0.3 +torchao==0.10.0 segment_anything==1.0 +torchrec==1.2.0; platform_system == "Linux" +fbgemm-gpu==1.2.0; platform_system == "Linux" diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index c5dd6181d4a..937417f4999 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -13,17 +13,17 @@ body: - type: textarea attributes: label: Add Link - description: | + description: | **Add the link to the tutorial*** placeholder: | Link to the tutorial on the website: validations: - required: true + required: true - type: textarea attributes: label: Describe the bug - description: | - **Add the bug description** + description: | + **Add the bug description** placeholder: | Provide a detailed description of the issue with code samples if relevant ```python diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml index c6e0885eaa9..c1c449c29fe 100644 --- 
a/.github/ISSUE_TEMPLATE/feature-request.yml +++ b/.github/ISSUE_TEMPLATE/feature-request.yml @@ -18,7 +18,7 @@ body: - type: textarea attributes: label: Existing tutorials on this topic - description: | + description: | **Add a list of existing tutorials on the same topic.** placeholder: | List tutorials that already explain this functionality if exist. On pytorch.org or elsewhere. diff --git a/.github/scripts/check_redirects.sh b/.github/scripts/check_redirects.sh new file mode 100755 index 00000000000..6aa31819820 --- /dev/null +++ b/.github/scripts/check_redirects.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +if [ "$CURRENT_BRANCH" == "$BASE_BRANCH" ]; then + echo "Running on $BASE_BRANCH branch. Skipping check." + exit 0 +fi + + +# Get list of deleted or renamed files in this branch compared to base +DELETED_FILES=$(git diff --name-status $BASE_BRANCH $CURRENT_BRANCH --diff-filter=DR | awk '{print $2}' | grep -E '\.(rst|py|md)$' | grep -v 'redirects.py') +# Check if any deleted or renamed files were found +if [ -z "$DELETED_FILES" ]; then + echo "No deleted or renamed files found. Skipping check." + exit 0 +fi + +echo "Deleted or renamed files:" +echo "$DELETED_FILES" + +# Check if redirects.py has been updated +REDIRECTS_UPDATED=$(git diff --name-status $BASE_BRANCH $CURRENT_BRANCH --diff-filter=AM | grep 'redirects.py' && echo "yes" || echo "no") + +if [ "$REDIRECTS_UPDATED" == "no" ]; then + echo "ERROR: Files were deleted or renamed but redirects.py was not updated. Please update .github/scripts/redirects.py to redirect these files." + exit 1 +fi + +# Check if each deleted file has a redirect entry +MISSING_REDIRECTS=0 +for FILE in $DELETED_FILES; do + # Convert file path to URL path format (remove extension and adjust path) + REDIRECT_PATH=$(echo $FILE | sed -E 's/(.+)_source\/(.+)\.(py|rst|md)$/\1\/\2.html/') + + # Check if this path exists in redirects.py as a key. We don't check for values. + if ! grep -q "\"$REDIRECT_PATH\":" redirects.py; then + echo "ERROR: Missing redirect for deleted file: $FILE (should have entry for \"$REDIRECT_PATH\")" + MISSING_REDIRECTS=1 + fi +done + +if [ $MISSING_REDIRECTS -eq 1 ]; then + echo "ERROR: Please add redirects for all deleted/renamed files to redirects.py" + exit 1 +fi + +echo "All deleted/renamed files have proper redirects. Check passed!" diff --git a/.github/scripts/docathon-label-sync.py b/.github/scripts/docathon-label-sync.py index a8e512a3204..7241e1370ce 100644 --- a/.github/scripts/docathon-label-sync.py +++ b/.github/scripts/docathon-label-sync.py @@ -25,12 +25,12 @@ def main(): issue_number = int(re.findall(r'#(\d{1,5})', pull_request_body)[0]) issue = repo.get_issue(issue_number) issue_labels = issue.labels - docathon_label_present = any(label.name == 'docathon-h1-2024' for label in issue_labels) + docathon_label_present = any(label.name == 'docathon-h1-2025' for label in issue_labels) # if the issue has a docathon label, add all labels from the issue to the PR. 
if not docathon_label_present: - print("The 'docathon-h1-2024' label is not present in the issue.") - return + print("The 'docathon-h1-2025' label is not present in the issue.") + return pull_request_labels = pull_request.get_labels() issue_label_names = [label.name for label in issue_labels] labels_to_add = [label for label in issue_label_names if label not in pull_request_labels] @@ -39,8 +39,8 @@ def main(): return pull_request.add_to_labels(*labels_to_add) print("Labels added to the pull request!") - - + + if __name__ == "__main__": main() diff --git a/.github/workflows/MonthlyLinkCheck.yml b/.github/workflows/MonthlyLinkCheck.yml new file mode 100644 index 00000000000..fb9283862ad --- /dev/null +++ b/.github/workflows/MonthlyLinkCheck.yml @@ -0,0 +1,55 @@ +#Runs once a month and checks links in the repo to ensure they are valid +#If action fails, it creates an issue with the failing links and an "incorrect link" label +#If link is valid but failing, it can be added to the .lycheeignore file +#Action can also be run manually as needed. + + +name: Monthly Link Check +on: + schedule: + - cron: '0 0 1 * *' # Runs at midnight on the first day of every month + workflow_dispatch: # Allows manual triggering of the workflow +jobs: + linkChecker: + runs-on: ubuntu-latest + permissions: + issues: write + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 1 + - name: Check Links + id: lychee + uses: lycheeverse/lychee-action@v2 + with: + args: --accept=200,202,403,429 --base $GITHUB_WORKSPACE --no-progress './**/*.md' './**/*.html' './**/*.rst' + token: ${{ secrets.CUSTOM_TOKEN }} + jobSummary: false # Disable default summary + fail: true + - name: Create Filtered Summary + if: always() + run: | + if [ -f ./lychee/out.md ]; then + echo "## Link Check Results (Errors Only)" >> $GITHUB_STEP_SUMMARY + # Extract only error sections, skip redirects + grep -A 10000 "^## Errors per input" ./lychee/out.md | \ + grep -B 10000 "^## Redirects per input" | \ + head -n -1 >> $GITHUB_STEP_SUMMARY || \ + grep -A 10000 "^## Errors per input" ./lychee/out.md >> $GITHUB_STEP_SUMMARY + fi + - name: Create Issue From File + if: failure() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') + uses: peter-evans/create-issue-from-file@v5 + with: + title: Broken links detected in docs 🔗 + content-filepath: ./lychee/out.md + labels: 'incorrect link' + #token: ${{ secrets.CUSTOM_TOKEN }} + + + - name: Suggestions + if: failure() + run: | + echo -e "\nPlease review the links reported in the Check links step above." + echo -e "If a link is valid but fails due to a CAPTCHA challenge, IP blocking, login requirements, etc., consider adding such links to .lycheeignore file to bypass future checks.\n" + exit 1 diff --git a/.github/workflows/StalePRs.yml b/.github/workflows/StalePRs.yml new file mode 100644 index 00000000000..e7393948518 --- /dev/null +++ b/.github/workflows/StalePRs.yml @@ -0,0 +1,156 @@ +# A workflow copied from the pytorch/pytorch repo stale PRs that implements similar logic to actions/stale. +# +# Compared to actions/stale, it is implemented to make API requests proportional +# to the number of stale PRs, not the total number of issues in the repo. This +# is because PyTorch has a lot of issues/PRs, so the actions/stale runs into +# rate limits way too quickly. +# +# The behavior is: +# - If a PR is not labeled stale, after 60 days inactivity label the PR as stale and comment about it. +# - If a PR is labeled stale, after 30 days inactivity close the PR. 
+# - `high priority` and `no-stale` PRs are exempt. + +name: Close stale pull requests + +on: + schedule: + # Run at midnight UTC. + - cron: '0 0 * * *' + workflow_dispatch: + +jobs: + stale: + if: ${{ github.repository == 'pytorch/tutorials' }} + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + + steps: + - uses: actions/github-script@v6 + with: + script: | + // Do some dumb retries on requests. + const retries = 7; + const baseBackoff = 100; + const sleep = timeout => new Promise(resolve => setTimeout(resolve, timeout)); + github.hook.wrap('request', async (request, options) => { + for (let attempt = 1; attempt <= retries; attempt++) { + try { + return await request(options); + } catch (err) { + if (attempt < retries) { + core.warning(`Request getting retried. Attempt: ${attempt}`); + await sleep(baseBackoff * Math.pow(2, attempt)); + continue; + } + throw err; + } + } + }); + + const MAX_API_REQUESTS = 100; + + // If a PRs not labeled stale, label them stale after no update for 60 days. + const STALE_LABEL_THRESHOLD_MS = 1000 * 60 * 60 * 24 * 60; + // For PRs already labeled stale, close after not update for 30 days. + const STALE_CLOSE_THRESHOLD_MS = 1000 * 60 * 60 * 24 * 30; + + const STALE_MESSAGE = + "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as `stale`.
" + + "Feel free to remove the `stale` label if you feel this was a mistake. <br>
" + + "If you are unable to remove the `stale` label please contact a maintainer in order to do so. <br>
" + + "If you want the bot to never mark this PR stale again, add the `no-stale` label. <br>
" + + "`stale` pull requests will automatically be closed after 30 days of inactivity. <br>
"; + + let numAPIRequests = 0; + let numProcessed = 0; + + async function processPull(pull) { + core.info(`[${pull.number}] URL: ${pull.html_url}`); + numProcessed += 1; + const labels = pull.labels.map((label) => label.name); + + // Skip if certain labels are present. + if (labels.includes("no-stale") || labels.includes("high priority")) { + core.info(`[${pull.number}] Skipping because PR has an exempting label.`); + return false; + } + + // Check if the PR is stale, according to our configured thresholds. + let staleThresholdMillis; + if (labels.includes("stale")) { + core.info(`[${pull.number}] PR is labeled stale, checking whether we should close it.`); + staleThresholdMillis = STALE_CLOSE_THRESHOLD_MS; + } else { + core.info(`[${pull.number}] Checking whether to label PR as stale.`); + staleThresholdMillis = STALE_LABEL_THRESHOLD_MS; + } + + const millisSinceLastUpdated = + new Date().getTime() - new Date(pull.updated_at).getTime(); + + if (millisSinceLastUpdated < staleThresholdMillis) { + core.info(`[${pull.number}] Skipping because PR was updated recently`); + return false; + } + + // At this point, we know we should do something. + // For PRs already labeled stale, close them. + if (labels.includes("stale")) { + core.info(`[${pull.number}] Closing PR.`); + numAPIRequests += 1; + await github.rest.issues.update({ + owner: "pytorch", + repo: "tutorials", + issue_number: pull.number, + state: "closed", + }); + } else { + // For PRs not labeled stale, label them stale. + core.info(`[${pull.number}] Labeling PR as stale.`); + + numAPIRequests += 1; + await github.rest.issues.createComment({ + owner: "pytorch", + repo: "tutorials", + issue_number: pull.number, + body: STALE_MESSAGE, + }); + + numAPIRequests += 1; + await github.rest.issues.addLabels({ + owner: "pytorch", + repo: "tutorials", + issue_number: pull.number, + labels: ["stale"], + }); + } + } + + for await (const response of github.paginate.iterator( + github.rest.pulls.list, + { + owner: "pytorch", + repo: "tutorials", + state: "open", + sort: "created", + direction: "asc", + per_page: 100, + } + )) { + numAPIRequests += 1; + const pulls = response.data; + // Awaiting in a loop is intentional here. 
We want to serialize execution so + // that log groups are printed correctl + for (const pull of pulls) { + if (numAPIRequests > MAX_API_REQUESTS) { + core.warning("Max API requests exceeded, exiting."); + process.exit(0); + } + await core.group(`Processing PR #${pull.number}`, async () => { + await processPull(pull); + }); + } + } + core.info(`Processed ${numProcessed} PRs total.`); diff --git a/.github/workflows/_build-tutorials-base.yml b/.github/workflows/_build-tutorials-base.yml new file mode 100644 index 00000000000..e33c8c4bd67 --- /dev/null +++ b/.github/workflows/_build-tutorials-base.yml @@ -0,0 +1,194 @@ +name: Build tutorials + +on: + workflow_call: + inputs: + USE_NIGHTLY: + description: "Use nightly builds inside build.sh" + required: false + type: number + default: 0 + UPLOAD: + description: "Upload built docs to PR preview and main site" + required: false + type: number + default: 0 + +jobs: + worker: + name: pytorch_tutorial_build_worker + strategy: + matrix: + include: + - { shard: 1, num_shards: 15, runner: "linux.g5.12xlarge.nvidia.gpu" } + - { shard: 2, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } + - { shard: 3, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } + - { shard: 4, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } + - { shard: 5, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } + - { shard: 6, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } + - { shard: 7, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } + - { shard: 8, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } + - { shard: 9, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } + - { shard: 10, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } + - { shard: 11, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } + - { shard: 12, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } + - { shard: 13, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } + - { shard: 14, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } + - { shard: 15, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } + fail-fast: false + runs-on: ${{ matrix.runner }} + steps: + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@main + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + instructions: | + All testing is done inside the container, to start an interactive session run: + docker exec -it $(docker container ps --format '{{.ID}}') bash + + - name: Checkout Tutorials + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Setup Linux + uses: pytorch/pytorch/.github/actions/setup-linux@main + + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + uses: pytorch/test-infra/.github/actions/setup-nvidia@main + + - name: Calculate/build docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: tutorials + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Build + shell: bash + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + NUM_WORKERS: ${{ matrix.num_shards }} + WORKER_ID: ${{ matrix.shard }} + COMMIT_ID: ${{ github.sha }} + JOB_TYPE: worker + COMMIT_SOURCE: ${{ github.ref }} + USE_NIGHTLY: ${{ inputs.USE_NIGHTLY }} + UPLOAD: ${{ inputs.UPLOAD }} + run: | + set -ex + + chmod +x ".jenkins/build.sh" + + container_name=$(docker run 
\ + ${GPU_FLAG:-} \ + -e WORKER_ID \ + -e NUM_WORKERS \ + -e COMMIT_ID \ + -e JOB_TYPE \ + -e COMMIT_SOURCE \ + -e USE_NIGHTLY \ + -e UPLOAD \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --tty \ + --detach \ + --shm-size=2gb \ + --name="${container_name}" \ + -v "${GITHUB_WORKSPACE}:/var/lib/workspace" \ + -w /var/lib/workspace \ + "${DOCKER_IMAGE}" + ) + + docker exec -u ci-user -t "${container_name}" sh -c ".jenkins/build.sh" + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() + + manager: + name: pytorch_tutorial_build_manager + needs: worker + runs-on: [self-hosted, linux.2xlarge] + if: ${{ inputs.UPLOAD == 1 }} + environment: ${{ github.ref == 'refs/heads/main' && 'pytorchbot-env' || '' }} + steps: + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@main + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + instructions: | + All testing is done inside the container, to start an interactive session run: + docker exec -it $(docker container ps --format '{{.ID}}') bash + + - name: Checkout Tutorials + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Setup Linux + uses: pytorch/pytorch/.github/actions/setup-linux@main + + - name: Calculate/build docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: tutorials + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Build + shell: bash + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + NUM_WORKERS: 15 + WORKER_ID: 0 + COMMIT_ID: ${{ github.sha }} + JOB_TYPE: manager + COMMIT_SOURCE: ${{ github.ref }} + GITHUB_PYTORCHBOT_TOKEN: ${{ secrets.PYTORCHBOT_TOKEN }} + USE_NIGHTLY: ${{ inputs.USE_NIGHTLY }} + run: | + set -ex + + chmod +x ".jenkins/build.sh" + + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e WORKER_ID \ + -e NUM_WORKERS \ + -e COMMIT_ID \ + -e JOB_TYPE \ + -e COMMIT_SOURCE \ + -e GITHUB_PYTORCHBOT_TOKEN \ + -e USE_NIGHTLY \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --tty \ + --detach \ + --name="${container_name}" \ + -v "${GITHUB_WORKSPACE}:/var/lib/workspace" \ + -w /var/lib/workspace \ + "${DOCKER_IMAGE}" + ) + + docker exec -u ci-user -t "${container_name}" sh -c ".jenkins/build.sh" + + - name: Upload docs preview + uses: seemethere/upload-artifact-s3@v5 + if: ${{ github.event_name == 'pull_request' }} + with: + retention-days: 14 + s3-bucket: doc-previews + if-no-files-found: error + path: docs + s3-prefix: pytorch/tutorials/${{ github.event.pull_request.number }} + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() diff --git a/.github/workflows/build-tutorials-nightly.yml b/.github/workflows/build-tutorials-nightly.yml new file mode 100644 index 00000000000..60f8d3a1fdc --- /dev/null +++ b/.github/workflows/build-tutorials-nightly.yml @@ -0,0 +1,35 @@ +name: Build tutorials (nightly/test) +# This is a workflow to build tutorials using nightly or the test/release +# candidate builds for pytorch libraries. It downloads torch and other torch +# related libraries from the nightly or test channel and checks that the +# tutorials can run. This workflow will not upload the built docs anywhere in +# order to prevent polluting the official documentation. 
+ +# During releases, this workflow should be run on PRs to verify that the +# tutorials work with the test/rc builds before the official release is made. +# When there is no release candidate, this workflow should only be run on the +# main branch since nightly can be unstable and we do not want to block PRs due +# to failures in this workflow. + +# To change the channel between nightly and test/rc, change the index used to +# download the binaries in .jenkins/build.sh. +on: + # Only main branch for now. Uncomment the below line to enable it on PRs. + # pull_request: + + # Comment out the below line to disable on the main branch + push: + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + build: + uses: ./.github/workflows/_build-tutorials-base.yml + secrets: inherit + with: + USE_NIGHTLY: 1 + UPLOAD: 0 diff --git a/.github/workflows/build-tutorials.yml b/.github/workflows/build-tutorials.yml index ffee49f4a76..58372d557e6 100644 --- a/.github/workflows/build-tutorials.yml +++ b/.github/workflows/build-tutorials.yml @@ -11,169 +11,9 @@ concurrency: cancel-in-progress: true jobs: - worker: - name: pytorch_tutorial_build_worker - strategy: - matrix: - include: - - { shard: 1, num_shards: 15, runner: "linux.g5.12xlarge.nvidia.gpu" } - - { shard: 2, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } - - { shard: 3, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } - - { shard: 4, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } - - { shard: 5, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } - - { shard: 6, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } - - { shard: 7, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" } - - { shard: 8, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" } - - { shard: 9, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" } - - { shard: 10, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" } - - { shard: 11, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" } - - { shard: 12, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" } - - { shard: 13, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" } - - { shard: 14, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" } - - { shard: 15, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" } - fail-fast: false - runs-on: ${{ matrix.runner }} - steps: - - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main - with: - github-secret: ${{ secrets.GITHUB_TOKEN }} - instructions: | - All testing is done inside the container, to start an interactive session run: - docker exec -it $(docker container ps --format '{{.ID}}') bash - - - name: Checkout Tutorials - uses: actions/checkout@v3 - - - name: Setup Linux - uses: pytorch/pytorch/.github/actions/setup-linux@main - - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - uses: pytorch/test-infra/.github/actions/setup-nvidia@main - - - name: Calculate/build docker image - id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main - with: - docker-image-name: tutorials - - - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main - with: - docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - - - name: Build - shell: bash - env: - DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image 
}} - NUM_WORKERS: ${{ matrix.num_shards }} - WORKER_ID: ${{ matrix.shard }} - COMMIT_ID: ${{ github.sha }} - JOB_TYPE: worker - COMMIT_SOURCE: ${{ github.ref }} - run: | - set -ex - - chmod +x ".jenkins/build.sh" - - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e WORKER_ID \ - -e NUM_WORKERS \ - -e COMMIT_ID \ - -e JOB_TYPE \ - -e COMMIT_SOURCE \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --tty \ - --detach \ - --shm-size=2gb \ - --name="${container_name}" \ - -v "${GITHUB_WORKSPACE}:/var/lib/workspace" \ - -w /var/lib/workspace \ - "${DOCKER_IMAGE}" - ) - - docker exec -u ci-user -t "${container_name}" sh -c ".jenkins/build.sh" - - - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main - if: always() - - manager: - name: pytorch_tutorial_build_manager - needs: worker - runs-on: [self-hosted, linux.2xlarge] - environment: ${{ github.ref == 'refs/heads/main' && 'pytorchbot-env' || '' }} - steps: - - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main - with: - github-secret: ${{ secrets.GITHUB_TOKEN }} - instructions: | - All testing is done inside the container, to start an interactive session run: - docker exec -it $(docker container ps --format '{{.ID}}') bash - - - name: Checkout Tutorials - uses: actions/checkout@v3 - - - name: Setup Linux - uses: pytorch/pytorch/.github/actions/setup-linux@main - - - name: Calculate/build docker image - id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main - with: - docker-image-name: tutorials - - - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main - with: - docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - - - name: Build - shell: bash - env: - DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - NUM_WORKERS: 15 - WORKER_ID: ${{ matrix.shard }} - COMMIT_ID: ${{ github.sha }} - JOB_TYPE: manager - COMMIT_SOURCE: ${{ github.ref }} - GITHUB_PYTORCHBOT_TOKEN: ${{ secrets.PYTORCHBOT_TOKEN }} - run: | - set -ex - - chmod +x ".jenkins/build.sh" - - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e WORKER_ID \ - -e NUM_WORKERS \ - -e COMMIT_ID \ - -e JOB_TYPE \ - -e COMMIT_SOURCE \ - -e GITHUB_PYTORCHBOT_TOKEN \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --tty \ - --detach \ - --name="${container_name}" \ - -v "${GITHUB_WORKSPACE}:/var/lib/workspace" \ - -w /var/lib/workspace \ - "${DOCKER_IMAGE}" - ) - - docker exec -u ci-user -t "${container_name}" sh -c ".jenkins/build.sh" - - - name: Upload docs preview - uses: seemethere/upload-artifact-s3@v5 - if: ${{ github.event_name == 'pull_request' }} - with: - retention-days: 14 - s3-bucket: doc-previews - if-no-files-found: error - path: docs - s3-prefix: pytorch/tutorials/${{ github.event.pull_request.number }} - - - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main - if: always() + build: + uses: ./.github/workflows/_build-tutorials-base.yml + secrets: inherit + with: + USE_NIGHTLY: 0 + UPLOAD: 1 diff --git a/.github/workflows/check-redirects.yml b/.github/workflows/check-redirects.yml new file mode 100644 index 00000000000..380e3989bf4 --- /dev/null +++ b/.github/workflows/check-redirects.yml @@ -0,0 +1,25 @@ +name: Check Redirects for Deleted or Renamed Files + +on: + pull_request: + paths: + - '*/**/*.rst' + - '*/**/*.py' + - '*/**/*.md' + +jobs: + check-redirects: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: 
actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Run redirect check script + run: | + chmod +x ./.github/scripts/check_redirects.sh + ./.github/scripts/check_redirects.sh + env: + BASE_BRANCH: ${{ github.base_ref }} + CURRENT_BRANCH: ${{ github.head_ref }} diff --git a/.github/workflows/docathon-assign.yml b/.github/workflows/docathon-assign.yml index 31fa28289b0..8eef2b2fc88 100644 --- a/.github/workflows/docathon-assign.yml +++ b/.github/workflows/docathon-assign.yml @@ -28,14 +28,14 @@ jobs: repo: context.repo.repo, issue_number: issueNumber }); - const hasLabel = issue.labels.some(label => label.name === 'docathon-h1-2024'); + const hasLabel = issue.labels.some(label => label.name === 'docathon-h1-2025'); if (hasLabel) { if (issue.assignee !== null) { await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: issueNumber, - body: "The issue is already assigned. Please pick an opened and unnasigned issue with the [docathon-h1-2024 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2024)." + body: "The issue is already assigned. Please pick an opened and unnasigned issue with the [docathon-h1-2025 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2025)." }); } else { await github.rest.issues.addAssignees({ @@ -46,7 +46,7 @@ jobs: }); } } else { - const commmentMessage = "This issue does not have the correct label. Please pick an opened and unnasigned issue with the [docathon-h1-2024 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2024)." + const commmentMessage = "This issue does not have the correct label. Please pick an opened and unnasigned issue with the [docathon-h1-2025 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2025)." await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, diff --git a/.github/workflows/link_checkPR.yml b/.github/workflows/link_checkPR.yml new file mode 100644 index 00000000000..830e470c1c0 --- /dev/null +++ b/.github/workflows/link_checkPR.yml @@ -0,0 +1,57 @@ +#Checks links in a PR to ensure they are valid. If link is valid but failing, it can be added to the .lycheeignore file +#Use the skip-link-check label on a PR to skip checking links on a PR + +name: link check on PR + +on: + pull_request: + branches: [main] + +jobs: + linkChecker: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Get Changed Files + id: changed-files + uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f # v45.0.6 + + - name: Check for Skip Label + id: skip-label + uses: actions/github-script@v6 + with: + script: | + const labels = await github.rest.issues.listLabelsOnIssue({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number + }); + return labels.data.some(label => label.name === 'skip-link-check'); + + - name: Check Links + if: steps.skip-label.outputs.result == 'false' + uses: lycheeverse/lychee-action@v1 + with: + args: --accept=200,403,429 --base . --verbose --no-progress ${{ steps.changed-files.outputs.all_changed_files }} + token: ${{ secrets.CUSTOM_TOKEN }} + fail: true + + - name: Skip Message + if: steps.skip-label.outputs.result == 'true' + run: echo "Link check was skipped due to the presence of the 'skip-link-check' label." 
+ + # Per tj-actions, a delete file is not a changed file so this ensures lint checking does not occur on deleted files + - name: No Files to Check + if: steps.skip-label.outputs.result == 'false' && steps.changed-files.outputs.any_changed == 'true' + run: echo "No relevant files were changed in this PR that require link checking." + + - name: Suggestions + if: failure() + run: | + echo -e "\nPlease review the links reported in the Check links step above." + echo -e "If a link is valid but fails due to a CAPTCHA challenge, IP blocking, login requirements, etc., consider adding such links to .lycheeignore file to bypass future checks.\n" + exit 1 diff --git a/.github/workflows/lintrunner.yml b/.github/workflows/lintrunner.yml new file mode 100644 index 00000000000..e1a6889eb28 --- /dev/null +++ b/.github/workflows/lintrunner.yml @@ -0,0 +1,38 @@ +name: Lintrunner + +on: + push: + branches: + - main + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + lintrunner: + name: lintrunner + runs-on: ubuntu-latest + steps: + - name: Checkout tutorials + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Setup Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: '3.12' + + - name: Install Lintrunner + run: | + pip install lintrunner==0.12.5 + lintrunner init + + - name: Run lintrunner on all files - Linux + run: | + set +e + if ! lintrunner -v --force-color --all-files --tee-json=lint.json; then + echo "" + echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner -m main\`.\e[0m" + exit 1 + fi diff --git a/.github/workflows/spelling.yml b/.github/workflows/spelling.yml index 07c86ed4a28..e1cba836c96 100644 --- a/.github/workflows/spelling.yml +++ b/.github/workflows/spelling.yml @@ -5,16 +5,149 @@ on: push: branches: - main + jobs: pyspelling: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - name: Check for skip label and get changed files + id: check-files + uses: actions/github-script@v6 + with: + script: | + let skipCheck = false; + let changedFiles = []; + + if (context.eventName === 'pull_request') { + // Check for skip label + const { data: labels } = await github.rest.issues.listLabelsOnIssue({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number + }); + skipCheck = labels.some(label => label.name === 'skip-spell-check'); + + if (!skipCheck) { + // Get changed files in PR + const { data: files } = await github.rest.pulls.listFiles({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: context.issue.number + }); + + changedFiles = files + .filter(file => file.filename.match(/\.(py|rst|md)$/)) + .map(file => file.filename); + } + } else { + // For push events, we'll still need to use git diff + // We'll handle this after checkout + } + + core.setOutput('skip', skipCheck.toString()); + core.setOutput('files', changedFiles.join('\n')); + core.setOutput('is-pr', (context.eventName === 'pull_request').toString()); + + - uses: actions/checkout@v4 + if: steps.check-files.outputs.skip != 'true' + with: + fetch-depth: 0 + + - name: Get changed files for push event + if: | + steps.check-files.outputs.skip != 'true' && + steps.check-files.outputs.is-pr != 'true' + id: push-files + run: | + CHANGED_FILES=$(git diff --name-only HEAD^..HEAD -- '*.py' '*.rst' '*.md') + echo 
"files<> $GITHUB_OUTPUT + echo "$CHANGED_FILES" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + + - name: Check if relevant files changed + if: steps.check-files.outputs.skip != 'true' + id: check + run: | + if [ "${{ steps.check-files.outputs.is-pr }}" == "true" ]; then + FILES="${{ steps.check-files.outputs.files }}" + else + FILES="${{ steps.push-files.outputs.files }}" + fi + + if [ -z "$FILES" ]; then + echo "skip=true" >> $GITHUB_OUTPUT + echo "No relevant files changed (*.py, *.rst, *.md), skipping spell check" + else + echo "skip=false" >> $GITHUB_OUTPUT + echo "Found changed files to check:" + echo "$FILES" + fi + - uses: actions/setup-python@v4 + if: | + steps.check-files.outputs.skip != 'true' && + steps.check.outputs.skip != 'true' with: python-version: '3.9' cache: 'pip' - - run: pip install pyspelling - - run: sudo apt-get install aspell aspell-en - - run: pyspelling + - name: Install dependencies + if: | + steps.check-files.outputs.skip != 'true' && + steps.check.outputs.skip != 'true' + run: | + pip install pyspelling + sudo apt-get install aspell aspell-en + + - name: Run spell check on each file + id: spellcheck + if: | + steps.check-files.outputs.skip != 'true' && + steps.check.outputs.skip != 'true' + run: | + if [ "${{ steps.check-files.outputs.is-pr }}" == "true" ]; then + mapfile -t FILES <<< "${{ steps.check-files.outputs.files }}" + else + mapfile -t FILES <<< "${{ steps.push-files.outputs.files }}" + fi + + # Check each file individually + FINAL_EXIT_CODE=0 + SPELLCHECK_LOG="" + for file in "${FILES[@]}"; do + if [ -n "$file" ]; then + echo "Checking spelling in $file" + python3 -c "import yaml; config = yaml.safe_load(open('.pyspelling.yml')); new_matrix = [matrix.copy() for matrix in config['matrix'] if (('python' in matrix['name'].lower() and '$file'.endswith('.py')) or ('rest' in matrix['name'].lower() and '$file'.endswith('.rst')) or ('markdown' in matrix['name'].lower() and '$file'.endswith('.md'))) and not matrix.update({'sources': ['$file']})]; config['matrix'] = new_matrix; yaml.dump(config, open('temp_config.yml', 'w'))" + + if OUTPUT=$(pyspelling -c temp_config.yml 2>&1); then + echo "No spelling errors found in $file" + else + FINAL_EXIT_CODE=1 + echo "Spelling errors found in $file:" + echo "$OUTPUT" + SPELLCHECK_LOG+="### $file\n$OUTPUT\n\n" + fi + fi + done + + # Save the results to GITHUB_OUTPUT + echo "spell_failed=$FINAL_EXIT_CODE" >> $GITHUB_OUTPUT + echo "spell_log<> $GITHUB_OUTPUT + echo "$SPELLCHECK_LOG" >> $GITHUB_OUTPUT + echo "SPELLEOF" >> $GITHUB_OUTPUT + + if [ $FINAL_EXIT_CODE -ne 0 ]; then + echo "Spell check failed! See above for details." + echo + echo "Here are a few tips:" + echo "- All PyTorch API objects must be in double backticks or use an intersphinx directive." + echo " Example: ``torch.nn``, :func:" + echo "- Consult en-wordlist.txt for spellings of some of the words." + echo " You can add a word to en-wordlist.txt if:" + echo " 1) It's a common abbreviation, like RNN." + echo " 2) It's a word widely accepted in the industry." + echo "- Please do not add words like 'dtype', 'torch.nn.Transformer' to pass spellcheck." + echo " Instead wrap it in double backticks or use an intersphinx directive." 
+ echo + exit 1 + fi diff --git a/.gitignore b/.gitignore index 1d9d572e565..3f1f927ee33 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ advanced pytorch_basics /recipes prototype +/unstable +sg_execution_times.rst #data things _data/ @@ -127,3 +129,6 @@ cleanup.sh # pyspelling dictionary.dic + +# linters +/.lintbin diff --git a/.gitmodules b/.gitmodules index 3a3c564c8fa..e69de29bb2d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "src/pytorch-sphinx-theme"] - path = src/pytorch-sphinx-theme - url = https://github.com/pytorch/pytorch_sphinx_theme diff --git a/.jenkins/build.sh b/.jenkins/build.sh index cf3da8461e5..32ceec660da 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -19,15 +19,21 @@ sudo apt-get install -y pandoc # NS: Path to python runtime should already be part of docker container # export PATH=/opt/conda/bin:$PATH -#Install PyTorch Nightly for test. -# Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html -# Install 2.2 for testing - uncomment to install nightly binaries (update the version as needed). -# pip uninstall -y torch torchvision torchaudio torchtext torchdata -# pip3 install torch==2.3.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu121 +# Install PyTorch Nightly for test. +if [ "${USE_NIGHTLY:-0}" -eq 1 ]; then + sudo pip uninstall -y torch torchvision torchaudio + pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu130 + pip show torch +fi +# Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html +# Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed). 
+# sudo pip uninstall -y fbgemm-gpu torchrec +# sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata torchrl tensordict +# sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124 +# pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126 # Install two language tokenizers for Translation with TorchText tutorial -python -m spacy download en_core_web_sm -python -m spacy download de_core_news_sm +pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl awsv2 -i awsv2 configure set default.s3.multipart_threshold 5120MB @@ -114,8 +120,10 @@ if [[ "${JOB_TYPE}" == "worker" ]]; then python .jenkins/validate_tutorials_built.py # Step 6: Copy generated files to S3, tag with commit ID - 7z a worker_${WORKER_ID}.7z docs - awsv2 s3 cp worker_${WORKER_ID}.7z s3://${BUCKET_NAME}/${COMMIT_ID}/worker_${WORKER_ID}.7z + if [ "${UPLOAD:-0}" -eq 1 ]; then + 7z a worker_${WORKER_ID}.7z docs + awsv2 s3 cp worker_${WORKER_ID}.7z s3://${BUCKET_NAME}/${COMMIT_ID}/worker_${WORKER_ID}.7z + fi elif [[ "${JOB_TYPE}" == "manager" ]]; then # Step 1: Generate no-plot HTML pages for all tutorials pip3 install -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme @@ -150,6 +158,12 @@ elif [[ "${JOB_TYPE}" == "manager" ]]; then # Step 7: push new HTML files and static files to gh-pages if [[ "$COMMIT_SOURCE" == "refs/heads/master" || "$COMMIT_SOURCE" == "refs/heads/main" ]]; then git clone https://github.com/pytorch/tutorials.git -b gh-pages gh-pages + # Clean up directories that contain tutorials + + for dir in beginner intermediate prototype recipes advanced distributed vision text audio; do + rm -rf "gh-pages/$dir" + done + cp -r docs/* gh-pages/ pushd gh-pages # DANGER! DO NOT REMOVE THE `set +x` SETTING HERE! 
diff --git a/.jenkins/download_data.py b/.jenkins/download_data.py index cc07c72561b..939e63fc7a8 100644 --- a/.jenkins/download_data.py +++ b/.jenkins/download_data.py @@ -12,7 +12,7 @@ BEGINNER_DATA_DIR = REPO_BASE_DIR / "beginner_source" / "data" INTERMEDIATE_DATA_DIR = REPO_BASE_DIR / "intermediate_source" / "data" ADVANCED_DATA_DIR = REPO_BASE_DIR / "advanced_source" / "data" -PROTOTYPE_DATA_DIR = REPO_BASE_DIR / "prototype_source" / "data" +PROTOTYPE_DATA_DIR = REPO_BASE_DIR / "unstable_source" / "data" FILES_TO_RUN = os.getenv("FILES_TO_RUN") @@ -106,7 +106,7 @@ def download_lenet_mnist() -> None: ) def download_gpu_quantization_torchao() -> None: - # Download SAM model checkpoint for prototype_source/gpu_quantization_torchao_tutorial.py + # Download SAM model checkpoint unstable_source/gpu_quantization_torchao_tutorial.py download_url_to_file("https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth", prefix=PROTOTYPE_DATA_DIR, dst="sam_vit_h_4b8939.pth", diff --git a/.jenkins/insert_last_verified.py b/.jenkins/insert_last_verified.py new file mode 100644 index 00000000000..b43ef8de8e8 --- /dev/null +++ b/.jenkins/insert_last_verified.py @@ -0,0 +1,160 @@ +import json +import os +import subprocess +import sys +from datetime import datetime + +from bs4 import BeautifulSoup + + +json_file_path = "tutorials-review-data.json" + +# paths to skip from the post-processing script +paths_to_skip = [ + "beginner/examples_autograd/two_layer_net_custom_function", # not present in the repo + "beginner/examples_nn/two_layer_net_module", # not present in the repo + "beginner/examples_tensor/two_layer_net_numpy", # not present in the repo + "beginner/examples_tensor/two_layer_net_tensor", # not present in the repo + "beginner/examples_autograd/two_layer_net_autograd", # not present in the repo + "beginner/examples_nn/two_layer_net_optim", # not present in the repo + "beginner/examples_nn/two_layer_net_nn", # not present in the repo + "intermediate/coding_ddpg", # not present in the repo - will delete the carryover +] +# Mapping of source directories to build directories +source_to_build_mapping = { + "beginner": "beginner_source", + "recipes": "recipes_source", + "distributed": "distributed", + "intermediate": "intermediate_source", + "prototype": "prototype_source", + "advanced": "advanced_source", + "": "", # root dir for index.rst +} + +def get_git_log_date(file_path, git_log_args): + try: + result = subprocess.run( + ["git", "log"] + git_log_args + ["--", file_path], + capture_output=True, + text=True, + check=True, + ) + if result.stdout: + date_str = result.stdout.splitlines()[0] + return datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %z") + except subprocess.CalledProcessError: + pass + raise ValueError(f"Could not find date for {file_path}") + +def get_creation_date(file_path): + return get_git_log_date(file_path, ["--diff-filter=A", "--format=%aD"]).strftime("%b %d, %Y") + + +def get_last_updated_date(file_path): + return get_git_log_date(file_path, ["-1", "--format=%aD"]).strftime("%b %d, %Y") + +# Try to find the source file with the given base path and the extensions .rst and .py +def find_source_file(base_path): + for ext in [".rst", ".py"]: + source_file_path = base_path + ext + if os.path.exists(source_file_path): + return source_file_path + return None + + +# Function to process a JSON file and insert the "Last Verified" information into the HTML files +def process_json_file(build_dir , json_file_path): + with open(json_file_path, "r", encoding="utf-8") as 
json_file: + json_data = json.load(json_file) + + for entry in json_data: + path = entry["Path"] + last_verified = entry["Last Verified"] + status = entry.get("Status", "") + if path in paths_to_skip: + print(f"Skipping path: {path}") + continue + if status in ["needs update", "not verified"]: + formatted_last_verified = "Not Verified" + elif last_verified: + try: + last_verified_date = datetime.strptime(last_verified, "%Y-%m-%d") + formatted_last_verified = last_verified_date.strftime("%b %d, %Y") + except ValueError: + formatted_last_verified = "Unknown" + else: + formatted_last_verified = "Not Verified" + if status == "deprecated": + formatted_last_verified += "Deprecated" + + for build_subdir, source_subdir in source_to_build_mapping.items(): + if path.startswith(build_subdir): + html_file_path = os.path.join(build_dir, path + ".html") + base_source_path = os.path.join( + source_subdir, path[len(build_subdir) + 1 :] + ) + source_file_path = find_source_file(base_source_path) + break + else: + print(f"Warning: No mapping found for path {path}") + continue + + if not os.path.exists(html_file_path): + print( + f"Warning: HTML file not found for path {html_file_path}." + "If this is a new tutorial, please add it to the audit JSON file and set the Verified status and todays's date." + ) + continue + + if not source_file_path: + print(f"Warning: Source file not found for path {base_source_path}.") + continue + + created_on = get_creation_date(source_file_path) + last_updated = get_last_updated_date(source_file_path) + + with open(html_file_path, "r", encoding="utf-8") as file: + soup = BeautifulSoup(file, "html.parser") + # Check if the

<p> tag with class "date-info-last-verified" already exists + existing_date_info = soup.find("p", {"class": "date-info-last-verified"}) + if existing_date_info: + print( + f"Warning: <p>
tag with class 'date-info-last-verified' already exists in {html_file_path}" + ) + continue + + h1_tag = soup.find("h1") # Find the h1 tag to insert the dates + if h1_tag: + date_info_tag = soup.new_tag("p", **{"class": "date-info-last-verified"}) + date_info_tag["style"] = "color: #6c6c6d; font-size: small;" + # Add the "Created On", "Last Updated", and "Last Verified" information + date_info_tag.string = ( + f"Created On: {created_on} | " + f"Last Updated: {last_updated} | " + f"Last Verified: {formatted_last_verified}" + ) + # Insert the new tag after the <h1>
tag + h1_tag.insert_after(date_info_tag) + # Save back to the HTML. + with open(html_file_path, "w", encoding="utf-8") as file: + file.write(str(soup)) + else: + print(f"Warning: <h1>
tag not found in {html_file_path}") + + +def main(): + if len(sys.argv) < 2: + print("Error: Build directory not provided. Exiting.") + exit(1) + build_dir = sys.argv[1] + print(f"Build directory: {build_dir}") + process_json_file(build_dir , json_file_path) + print( + "Finished processing JSON file. Please check the output for any warnings. " + "Pages like `nlp/index.html` are generated only during the full `make docs` " + "or `make html` build. Warnings about these files when you run `make html-noplot` " + "can be ignored." + ) + +if __name__ == "__main__": + main() diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index 4814f9a7d2b..6e82d054b4e 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -28,6 +28,12 @@ "intermediate_source/model_parallel_tutorial.py": { "needs": "linux.16xlarge.nvidia.gpu" }, + "intermediate_source/torchrec_intro_tutorial.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" + }, + "recipes_source/torch_export_aoti_python.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" + }, "advanced_source/pendulum.py": { "needs": "linux.g5.4xlarge.nvidia.gpu", "_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run." @@ -52,9 +58,15 @@ "intermediate_source/scaled_dot_product_attention_tutorial.py": { "needs": "linux.g5.4xlarge.nvidia.gpu" }, + "intermediate_source/transformer_building_blocks.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" + }, "recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py": { "needs": "linux.g5.4xlarge.nvidia.gpu" }, + "recipes_source/regional_compilation.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" + }, "advanced_source/semi_structured_sparse.py": { "needs": "linux.g5.4xlarge.nvidia.gpu" }, diff --git a/.jenkins/post_process_notebooks.py b/.jenkins/post_process_notebooks.py index 81f51766c3e..d10eb5a1bcc 100644 --- a/.jenkins/post_process_notebooks.py +++ b/.jenkins/post_process_notebooks.py @@ -5,14 +5,14 @@ """ This post-processing script needs to run after the .ipynb files are generated. The script removes extraneous ```{=html} syntax from the -admonitions and splits the cells that have video iframe into a +admonitions and splits the cells that have video iframe into a separate code cell that can be run to load the video directly in the notebook. This script is included in build.sh. 
""" # Pattern to search ``` {.python .jupyter-code-cell} -pattern = re.compile(r'(.*?)``` {.python .jupyter-code-cell}\n\n(from IPython.display import display, HTML\nhtml_code = """\n.*?\n"""\ndisplay\(HTML\(html_code\)\))\n```(.*)', re.DOTALL) +pattern = re.compile(r'(.*?)``` {\.python \.jupyter-code-cell}\n(.*?from IPython\.display import display, HTML.*?display\(HTML\(html_code\)\))\n```(.*)', re.DOTALL) def process_video_cell(notebook_path): @@ -36,7 +36,7 @@ def process_video_cell(notebook_path): before_html_block = match.group(1) code_block = match.group(2) - # Add a comment to run the cell to display the video + # Add a comment to run the cell to display the video code_block = "# Run this cell to load the video\n" + code_block # Create a new code cell new_code_cell = nbf.v4.new_code_cell(source=code_block) diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py index 03101df481c..75dd51dd789 100644 --- a/.jenkins/validate_tutorials_built.py +++ b/.jenkins/validate_tutorials_built.py @@ -10,40 +10,26 @@ NOT_RUN = [ "beginner_source/basics/intro", # no code + "beginner_source/introyt/introyt_index", # no code "beginner_source/onnx/intro_onnx", - "beginner_source/translation_transformer", "beginner_source/profiler", "beginner_source/saving_loading_models", "beginner_source/introyt/captumyt", "beginner_source/examples_nn/polynomial_module", "beginner_source/examples_nn/dynamic_net", "beginner_source/examples_nn/polynomial_optim", - "beginner_source/former_torchies/autograd_tutorial_old", - "beginner_source/former_torchies/tensor_tutorial_old", "beginner_source/examples_autograd/polynomial_autograd", "beginner_source/examples_autograd/polynomial_custom_function", - "beginner_source/torchtext_custom_dataset_tutorial", # not building with 2.3 RC, might be able to turn on with GA - "beginner_source/text_sentiment_ngrams_tutorial", # not building with 2.3 RC, might be able to turn on with GA - "beginner_source/t5_tutorial", # re-enable after this is fixed: https://github.com/pytorch/text/issues/1756 + "intermediate_source/dqn_with_rnn_tutorial", #not working on 2.8 release reenable after 3514 "intermediate_source/mnist_train_nas", # used by ax_multiobjective_nas_tutorial.py - "intermediate_source/fx_conv_bn_fuser", + "intermediate_source/torch_compile_conv_bn_fuser", "intermediate_source/_torch_export_nightly_tutorial", # does not work on release - "advanced_source/super_resolution_with_onnxruntime", - "advanced_source/python_custom_ops", # https://github.com/pytorch/pytorch/issues/127443 "advanced_source/usb_semisup_learn", # fails with CUDA OOM error, should try on a different worker - "prototype_source/fx_graph_mode_ptq_dynamic", - "prototype_source/vmap_recipe", - "prototype_source/torchscript_freezing", - "prototype_source/nestedtensor", - "recipes_source/recipes/saving_and_loading_models_for_inference", - "recipes_source/recipes/saving_multiple_models_in_one_file", + "unstable_source/gpu_direct_storage", # requires specific filesystem + GPUDirect Storage to be set up "recipes_source/recipes/tensorboard_with_pytorch", "recipes_source/recipes/what_is_state_dict", "recipes_source/recipes/profiler_recipe", - "recipes_source/recipes/save_load_across_devices", "recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model", - "recipes_source/recipes/dynamic_quantization", - "recipes_source/recipes/saving_and_loading_a_general_checkpoint", "recipes_source/recipes/benchmark", "recipes_source/recipes/tuning_guide", 
"recipes_source/recipes/zeroing_out_gradients", @@ -51,12 +37,9 @@ "recipes_source/recipes/timer_quick_start", "recipes_source/recipes/amp_recipe", "recipes_source/recipes/Captum_Recipe", - "intermediate_source/flask_rest_api_tutorial", - "intermediate_source/text_to_speech_with_torchaudio", "intermediate_source/tensorboard_profiler_tutorial", # reenable after 2.0 release. - "intermediate_source/inductor_debug_cpu", # reenable after 2942 - "beginner_source/onnx/onnx_registry_tutorial", # reenable after 2941 is fixed. - "intermediate_source/torch_export_tutorial" # reenable after 2940 is fixed. + "advanced_source/semi_structured_sparse", # reenable after 3303 is fixed. + "intermediate_source/torchrec_intro_tutorial.py", #failing with 2.8 reenable after 3498 ] def tutorial_source_dirs() -> List[Path]: diff --git a/.lintrunner.toml b/.lintrunner.toml new file mode 100644 index 00000000000..d3a1cbd9885 --- /dev/null +++ b/.lintrunner.toml @@ -0,0 +1,225 @@ +merge_base_with = "origin/main" + +# 4805a6ead6f1e7f32351056e2602be4e908f69b7 is from pytorch/pytorch main branch 2025-07-16 + +[[linter]] +code = 'SPACES' +include_patterns = ['**'] +exclude_patterns = [ + "_static/**/*", # Contains some files that should usually not be linted + # All files below this should be checked and either removed from the + # exclusion list by fixing them or have a reason to be excluded. + "advanced_source/coding_ddpg.py", + "advanced_source/cpp_autograd.rst", + "advanced_source/cpp_custom_ops.rst", + "advanced_source/generic_join.rst", + "advanced_source/neural_style_tutorial.py", + "advanced_source/pendulum.py", + "advanced_source/privateuseone.rst", + "advanced_source/semi_structured_sparse.py", + "advanced_source/sharding.rst", + "advanced_source/torch_script_custom_classes/custom_class_project/custom_test.py", + "advanced_source/transformer__timeseries_cpp_tutorial/transformer_timeseries.cpp", + "advanced_source/usb_semisup_learn.py", + "beginner_source/blitz/README.txt", + "beginner_source/blitz/neural_networks_tutorial.py", + "beginner_source/dcgan_faces_tutorial.py", + "beginner_source/ddp_series_fault_tolerance.rst", + "beginner_source/ddp_series_theory.rst", + "beginner_source/examples_nn/polynomial_module.py", + "beginner_source/examples_nn/polynomial_nn.py", + "beginner_source/hta_intro_tutorial.rst", + "beginner_source/hta_trace_diff_tutorial.rst", + "beginner_source/hybrid_frontend/README.txt", + "beginner_source/hybrid_frontend_tutorial.rst", + "beginner_source/hyperparameter_tuning_tutorial.py", + "beginner_source/introyt/README.txt", + "beginner_source/introyt/autogradyt_tutorial.py", + "beginner_source/introyt/captumyt.py", + "beginner_source/introyt/introyt1_tutorial.py", + "beginner_source/introyt/modelsyt_tutorial.py", + "beginner_source/introyt/tensorboardyt_tutorial.py", + "beginner_source/introyt/tensors_deeper_tutorial.py", + "beginner_source/introyt/trainingyt.py", + "beginner_source/knowledge_distillation_tutorial.py", + "beginner_source/nlp/sequence_models_tutorial.py", + "beginner_source/onnx/export_control_flow_model_to_onnx_tutorial.py", + "beginner_source/onnx/onnx_registry_tutorial.py", + "beginner_source/pytorch_with_examples.rst", + "beginner_source/saving_loading_models.py", + "beginner_source/template_tutorial.py", + "beginner_source/transfer_learning_tutorial.py", + "intermediate_source/TCPStore_libuv_backend.rst", + "intermediate_source/ax_multiobjective_nas_tutorial.py", + "intermediate_source/compiled_autograd_tutorial.rst", + "intermediate_source/ddp_series_multinode.rst", 
+ "intermediate_source/dqn_with_rnn_tutorial.py", + "intermediate_source/fx_profiling_tutorial.py", + "intermediate_source/inductor_debug_cpu.py", + "intermediate_source/jacobians_hessians.py", + "intermediate_source/optimizer_step_in_backward_tutorial.py", + "intermediate_source/per_sample_grads.py", + "intermediate_source/pruning_tutorial.py", + "intermediate_source/reinforcement_q_learning.py", + "intermediate_source/tensorboard_profiler_tutorial.py", + "intermediate_source/torch_compile_tutorial.py", + "intermediate_source/transformer_building_blocks.py", + "unstable_source/README.md", + "unstable_source/README.txt", + "unstable_source/gpu_direct_storage.py", + "unstable_source/inductor_cpp_wrapper_tutorial.rst", + "unstable_source/inductor_windows.rst", + "unstable_source/maskedtensor_advanced_semantics.py", + "unstable_source/max_autotune_on_CPU_tutorial.rst", + "unstable_source/vmap_recipe.py", + "recipes_source/README.txt", + "recipes_source/compiling_optimizer.rst", + "recipes_source/compiling_optimizer_lr_scheduler.py", + "recipes_source/distributed_optim_torchscript.rst", + "recipes_source/foreach_map.py", + "recipes_source/profile_with_itt.rst", + "recipes_source/recipes/Captum_Recipe.py", + "recipes_source/recipes/benchmark.py", + "recipes_source/recipes/changing_default_device.py", + "recipes_source/recipes/defining_a_neural_network.py", + "recipes_source/recipes/tensorboard_with_pytorch.py", + "recipes_source/recipes/timer_quick_start.py", + "recipes_source/recipes/tuning_guide.py", + "recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py", + "recipes_source/recipes/what_is_state_dict.py", + "recipes_source/torch_compile_caching_tutorial.rst", + "recipes_source/torch_compile_torch_function_modes.py", + "recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py", + "recipes_source/torch_compiler_set_stance_tutorial.py", + "recipes_source/torch_export_aoti_python.py", + "recipes_source/xeon_run_cpu.rst", + "advanced_source/cpp_export.rst", + "advanced_source/torch-script-parallelism.rst", + "advanced_source/torch_script_custom_classes.rst", + "advanced_source/torch_script_custom_ops.rst", + "recipes_source/torchscript_inference.rst", +] +init_command = [ + 'python3', + 'tools/linter/adapters/run_from_link.py', + '--lint-name=grep_linter.py', + '--lint-link=https://raw.githubusercontent.com/pytorch/pytorch/4805a6ead6f1e7f32351056e2602be4e908f69b7/tools/linter/adapters/grep_linter.py', + '--', + '--dry-run={{DRYRUN}}', +] +command = [ + 'python3', + 'tools/linter/adapters/run_from_link.py', + '--run-lint', + '--lint-name=grep_linter.py', + '--', + '--pattern=[[:blank:]]$', + '--linter-name=SPACES', + '--error-name=trailing spaces', + '--replace-pattern=s/[[:blank:]]+$//', + """--error-description=\ + This line has trailing spaces; please remove them.\ + """, + '--', + '@{{PATHSFILE}}' +] + +[[linter]] +code = 'TABS' +include_patterns = ['**'] +exclude_patterns = [ + "_static/**/*", # Contains some files that should usually not be linted + ".lintrunner.toml", # Ironically needs to contain the tab character to find in other files + "Makefile", # Wants tabs for indentationo + # All files below this should be checked and either removed from the + # exclusion list by fixing them or have a reason to be excluded. 
+ "advanced_source/README.txt", + "advanced_source/cpp_frontend.rst", + "advanced_source/torch_script_custom_ops.rst", + "beginner_source/README.txt", + "beginner_source/basics/tensorqs_tutorial.py", + "beginner_source/blitz/README.txt", + "beginner_source/blitz/tensor_tutorial.py", + "beginner_source/hybrid_frontend/README.txt", + "beginner_source/nlp/README.txt", + "beginner_source/nlp/pytorch_tutorial.py", + "intermediate_source/README.txt", + "intermediate_source/TP_tutorial.rst", + "intermediate_source/inductor_debug_cpu.py", + "unstable_source/README.txt", + "recipes_source/README.txt", + "recipes_source/recipes/README.txt", + "recipes_source/xeon_run_cpu.rst", +] +init_command = [ + 'python3', + 'tools/linter/adapters/run_from_link.py', + '--lint-name=grep_linter.py', + '--lint-link=https://raw.githubusercontent.com/pytorch/pytorch/4805a6ead6f1e7f32351056e2602be4e908f69b7/tools/linter/adapters/grep_linter.py', + '--', + '--dry-run={{DRYRUN}}', +] +command = [ + 'python3', + 'tools/linter/adapters/run_from_link.py', + '--run-lint', + '--lint-name=grep_linter.py', + '--', + # @lint-ignore TXT2 + '--pattern= ', + '--linter-name=TABS', + '--error-name=saw some tabs', + '--replace-pattern=s/\t/ /', + """--error-description=\ + This line has tabs; please replace them with spaces.\ + """, + '--', + '@{{PATHSFILE}}' +] + +[[linter]] +code = 'NEWLINE' +include_patterns=['**'] +exclude_patterns=[ + "_static/**/*", # Contains some files that should usually not be linted + # All files below this should be checked and either removed from the + # exclusion list by fixing them or have a reason to be excluded. + "advanced_source/extend_dispatcher.rst", + "advanced_source/neural_style_tutorial.py", + "advanced_source/sharding.rst", + "advanced_source/torch_script_custom_classes/custom_class_project/custom_test.py", + "advanced_source/transformer__timeseries_cpp_tutorial/transformer_timeseries.cpp", + "beginner_source/blitz/README.txt", + "beginner_source/dcgan_faces_tutorial.py", + "beginner_source/hta_trace_diff_tutorial.rst", + "beginner_source/hybrid_frontend/README.txt", + "beginner_source/nlp/pytorch_tutorial.py", + "beginner_source/template_tutorial.py", + "beginner_source/transfer_learning_tutorial.py", + "intermediate_source/custom_function_conv_bn_tutorial.py", + "intermediate_source/custom_function_double_backward_tutorial.rst", + "intermediate_source/forced_alignment_with_torchaudio_tutorial.rst", + "intermediate_source/nlp_from_scratch_index.rst", + "intermediate_source/pipeline_tutorial.rst", + "recipes_source/README.txt", + "recipes_source/script_optimized.rst", + "recipes_source/torch_compile_caching_configuration_tutorial.rst", + "recipes_source/torch_compile_caching_tutorial.rst", +] +init_command = [ + 'python3', + 'tools/linter/adapters/run_from_link.py', + '--lint-name=newlines_linter.py', + '--lint-link=https://raw.githubusercontent.com/pytorch/pytorch/4805a6ead6f1e7f32351056e2602be4e908f69b7/tools/linter/adapters/newlines_linter.py', + '--', + '--dry-run={{DRYRUN}}', +] +command = [ + 'python3', + 'tools/linter/adapters/run_from_link.py', + '--run-lint', + '--lint-name=newlines_linter.py', + '--', + '@{{PATHSFILE}}', +] +is_formatter = true diff --git a/.lycheeignore b/.lycheeignore new file mode 100644 index 00000000000..fc1e3f1fa85 --- /dev/null +++ b/.lycheeignore @@ -0,0 +1,17 @@ +# Used for links to be ignored during the link check. 
+# Add link to file along with comment as to why it should be ignored + +#Example link in some of the tutorials that should be ignored +file:///f:/libtmp/some_file + +#Ignore links with "file:///" to catch any other example links +file:\/\/\/.* + +# Ignore colab link in the setting of conf.py +https://pytorch.org/tutorials/beginner/colab/n + +# Ignore local host link from intermediate_source/tensorboard_tutorial.rst +http://localhost:6006 + +# Ignore link from advanced_source/cpp_frontend.rst +https://www.uber.com/blog/deep-neuroevolution/ diff --git a/.pyspelling.yml b/.pyspelling.yml index 1afe6dbb45e..bce797e6559 100644 --- a/.pyspelling.yml +++ b/.pyspelling.yml @@ -2,10 +2,7 @@ spellchecker: aspell matrix: - name: python sources: - - beginner_source/*.py - - intermediate_source/*.py - - advanced_source/*.py - - recipes_source/*/*.py + - "**/*.py" dictionary: wordlists: - en-wordlist.txt @@ -56,7 +53,7 @@ matrix: - pyspelling.filters.url: - name: reST sources: - - beginner_source/*.rst + - "**/*.rst" dictionary: wordlists: - en-wordlist.txt @@ -119,3 +116,48 @@ matrix: - open: '\.\.\s+(image|include|only)::' close: '$' - pyspelling.filters.url: +- name: markdown + sources: + - '**/*.md' + dictionary: + wordlists: + - en-wordlist.txt + pipeline: + - pyspelling.filters.markdown: + markdown_extensions: + - markdown.extensions.extra: + - markdown.extensions.admonition: + - markdown.extensions.codehilite: + - markdown.extensions.meta: + - markdown.extensions.tables: + - markdown.extensions.toc: + - pyspelling.filters.html: + comments: false + ignores: + - code + - pre + - tt + - img + - a + - table + - thead + - tbody + - th + - tr + - td + - pyspelling.filters.context: + context_visible_first: true + delimiters: + # Ignore code blocks + - open: '```[a-z]*\n' + close: '```\n' + # Ignore inline code + - open: '`' + close: '`' + # Ignore links + - open: '\[([^]]*)\]' + close: '\([^)]*\)' + # Ignore HTML comments + - open: '<!--' + close: '-->' + - pyspelling.filters.url: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5a3ab42272c..0879eeebdff 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -71,8 +71,7 @@ There are three types of tutorial content that we host on reStructuredText files. The build system only converts them into HTML; the code in them does not run on build. These tutorials are easier to create and maintain but they do not provide an interactive experience. - An example is the [Dynamic Quantization - tutorial](https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html). + * **Recipes** are tutorials that provide bite-sized, actionable examples of how to use specific features, which differentiates them @@ -162,7 +161,7 @@ Write for a global audience with an instructive and directive voice. - PyTorch has a global audience; use clear, easy to understand language. Avoid idioms or other figures of speech. -- To keep your instructions concise, use +- To keep your instructions concise, use [active voice](https://writing.wisc.edu/handbook/style/ccs_activevoice/) as much as possible.
- For a short guide on the essentials of writing style, [The Elements of Style](https://www.gutenberg.org/files/37134/37134-h/37134-h.htm) @@ -218,9 +217,8 @@ described in the preceding sections: - [NLP From Scratch: Generating Names with a Character-Level RNN Tutorial](https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html) -If you are creating a recipe, we recommend that you use [this -template](https://github.com/pytorch/tutorials/blob/tutorials_refresh/recipes_source/recipes/example_recipe.py) -as a guide. +If you are creating a recipe, [this is a good +example](https://github.com/pytorch/tutorials/blob/main/recipes_source/recipes/what_is_state_dict.py). # Submission Process # @@ -261,12 +259,12 @@ For Python files, our CI system runs your code during each build. In order for your tutorial to appear on the website, and through tag search, you need to include it in `index.rst`, or for recipes, in -`recipes_index.rst`. +`recipes_index.rst`. 1. Open the relevant file [`index.rst`](https://github.com/pytorch/tutorials/blob/main/index.rst) or - [`recipes_index.rst`](https://github.com/pytorch/tutorials/blob/main/recipes_source/recipes_index.rst) + [`recipes_index.rst`](https://github.com/pytorch/tutorials/blob/main/recipes_index.rst) 1. Add a _card_ in reStructuredText format similar to the following: ``` @@ -358,13 +356,12 @@ Submit the changes as a PR to the main branch of 1. Address all feedback comments from your reviewers. 1. Make sure all CI checks are passing. -Once you submit your PR, you can see a generated Netlify preview of your -build. You can see an example Netlify preview at the following URL: - -> - +Once you submit your PR, you can see a preview of your +build, titled "Preview Python docs built from this PR", under Helpful Links. +This preview will show you how your tutorial will appear on the website, +but it is not the final version. The final version will be published +after the PR is merged. ## Do not merge the PR yourself ## Please **DO NOT MERGE** your own PR; the tutorial won't be published. In order to avoid potential build breaks with the tutorials site, only certain maintainers can authorize publishing.
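As a reference for the "Add a _card_" step in the CONTRIBUTING.md hunk above, a `customcarditem` entry typically looks something like the sketch below. The header, description, thumbnail path, link, and tags here are placeholders, and the exact option names may vary between releases, so copy an existing card from `index.rst` rather than relying on this sketch verbatim.

```
.. customcarditem::
   :header: My New Tutorial
   :card_description: One-sentence summary of what the tutorial teaches.
   :image: _static/img/thumbnails/cropped/my-new-tutorial.png
   :link: beginner/my_new_tutorial.html
   :tags: Getting-Started
```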
- diff --git a/Makefile b/Makefile index 0a36670dd6c..7fcf1de6636 100644 --- a/Makefile +++ b/Makefile @@ -61,47 +61,33 @@ download: wget -nv -N https://s3.amazonaws.com/pytorch-tutorial-assets/cornell_movie_dialogs_corpus_v2.zip -P $(DATADIR) unzip $(ZIPOPTS) $(DATADIR)/cornell_movie_dialogs_corpus_v2.zip -d beginner_source/data/ - # Download model for advanced_source/dynamic_quantization_tutorial.py - wget -nv -N https://s3.amazonaws.com/pytorch-tutorial-assets/word_language_model_quantize.pth -P $(DATADIR) - cp $(DATADIR)/word_language_model_quantize.pth advanced_source/data/word_language_model_quantize.pth - - # Download data for advanced_source/dynamic_quantization_tutorial.py - wget -nv -N https://s3.amazonaws.com/pytorch-tutorial-assets/wikitext-2.zip -P $(DATADIR) - unzip $(ZIPOPTS) $(DATADIR)/wikitext-2.zip -d advanced_source/data/ - - # Download model for advanced_source/static_quantization_tutorial.py - wget -nv -N https://download.pytorch.org/models/mobilenet_v2-b0353104.pth -P $(DATADIR) - cp $(DATADIR)/mobilenet_v2-b0353104.pth advanced_source/data/mobilenet_pretrained_float.pth - - - # Download model for prototype_source/graph_mode_static_quantization_tutorial.py - wget -nv -N https://download.pytorch.org/models/resnet18-5c106cde.pth -P $(DATADIR) - cp $(DATADIR)/resnet18-5c106cde.pth prototype_source/data/resnet18_pretrained_float.pth - - # Download vocab for beginner_source/flava_finetuning_tutorial.py - wget -nv -N http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz -P $(DATADIR) - tar $(TAROPTS) -xzf $(DATADIR)/vocab.tar.gz -C ./beginner_source/data/ - - # Download dataset for beginner_source/torchtext_custom_dataset_tutorial.py - wget -nv -N https://www.manythings.org/anki/deu-eng.zip -P $(DATADIR) - unzip -o $(DATADIR)/deu-eng.zip -d beginner_source/data/ - # Download PennFudanPed dataset for intermediate_source/torchvision_tutorial.py wget https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip -P $(DATADIR) unzip -o $(DATADIR)/PennFudanPed.zip -d intermediate_source/data/ +download-last-reviewed-json: + @echo "Downloading tutorials-review-data.json..." + curl -o tutorials-review-data.json https://raw.githubusercontent.com/pytorch/tutorials/refs/heads/last-reviewed-data-json/tutorials-review-data.json + @echo "Finished downloading tutorials-review-data.json." docs: make download + make download-last-reviewed-json make html + @python .jenkins/insert_last_verified.py $(BUILDDIR)/html rm -rf docs cp -r $(BUILDDIR)/html docs touch docs/.nojekyll + rm -rf tutorials-review-data.json html-noplot: $(SPHINXBUILD) -D plot_gallery=0 -b html $(SPHINXOPTS) "$(SOURCEDIR)" "$(BUILDDIR)/html" # bash .jenkins/remove_invisible_code_block_batch.sh "$(BUILDDIR)/html" @echo + make download-last-reviewed-json @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + @echo "Running post-processing script to insert 'Last Verified' dates..." + @python .jenkins/insert_last_verified.py $(BUILDDIR)/html + rm -rf tutorials-review-data.json clean-cache: make clean diff --git a/README.md b/README.md index 0c961afd262..3b858a3882b 100644 --- a/README.md +++ b/README.md @@ -22,16 +22,18 @@ We use sphinx-gallery's [notebook styled examples](https://sphinx-gallery.github Here is how you can create a new tutorial (for a detailed description, see [CONTRIBUTING.md](./CONTRIBUTING.md)): +NOTE: Before submitting a new tutorial, read [PyTorch Tutorial Submission Policy](./tutorial_submission_policy.md). + 1. Create a Python file. 
If you want it executed while inserted into documentation, save the file with the suffix `tutorial` so that the file name is `your_tutorial.py`. 2. Put it in one of the `beginner_source`, `intermediate_source`, `advanced_source` directory based on the level of difficulty. If it is a recipe, add it to `recipes_source`. For tutorials demonstrating unstable prototype features, add to the `prototype_source`. 3. For Tutorials (except if it is a prototype feature), include it in the `toctree` directive and create a `customcarditem` in [index.rst](./index.rst). -4. For Tutorials (except if it is a prototype feature), create a thumbnail in the [index.rst file](https://github.com/pytorch/tutorials/blob/main/index.rst) using a command like `.. customcarditem:: beginner/your_tutorial.html`. For Recipes, create a thumbnail in the [recipes_index.rst](https://github.com/pytorch/tutorials/blob/main/recipes_source/recipes_index.rst) +4. For Tutorials (except if it is a prototype feature), create a thumbnail in the [index.rst file](https://github.com/pytorch/tutorials/blob/main/index.rst) using a command like `.. customcarditem:: beginner/your_tutorial.html`. For Recipes, create a thumbnail in the [recipes_index.rst](https://github.com/pytorch/tutorials/blob/main/recipes_index.rst) If you are starting off with a Jupyter notebook, you can use [this script](https://gist.github.com/chsasank/7218ca16f8d022e02a9c0deb94a310fe) to convert the notebook to Python file. After conversion and addition to the project, please make sure that section headings and other things are in logical order. ## Building locally -The tutorial build is very large and requires a GPU. If your machine does not have a GPU device, you can preview your HTML build without actually downloading the data and running the tutorial code: +The tutorial build is very large and requires a GPU. If your machine does not have a GPU device, you can preview your HTML build without actually downloading the data and running the tutorial code: 1. Install required dependencies by running: `pip install -r requirements.txt`. @@ -40,8 +42,6 @@ The tutorial build is very large and requires a GPU. If your machine does not ha - If you have a GPU-powered laptop, you can build using `make docs`. This will download the data, execute the tutorials and build the documentation to `docs/` directory. This might take about 60-120 min for systems with GPUs. If you do not have a GPU installed on your system, then see next step. - You can skip the computationally intensive graph generation by running `make html-noplot` to build basic html documentation to `_build/html`. This way, you can quickly preview your tutorial. -> If you get **ModuleNotFoundError: No module named 'pytorch_sphinx_theme' make: *** [html-noplot] Error 2** from /tutorials/src/pytorch-sphinx-theme or /venv/src/pytorch-sphinx-theme (while using virtualenv), run `python setup.py install`. - ## Building a single tutorial You can build a single tutorial by using the `GALLERY_PATTERN` environment variable. For example to run only `neural_style_transfer_tutorial.py`, run: @@ -57,10 +57,20 @@ GALLERY_PATTERN="neural_style_transfer_tutorial.py" sphinx-build . _build The `GALLERY_PATTERN` variable respects regular expressions. +## Spell Check +You can run pyspelling to check for spelling errors in the tutorials. To check only Python files, run pyspelling -n python. To check only .rst files, use pyspelling -n reST. Currently, .rst spell checking is limited to the beginner/ directory. 
Contributions to enable spell checking in other directories are welcome! + + +``` +pyspelling # full check (~3 mins) +pyspelling -n python # Python files only +pyspelling -n reST # reST files (only beginner/ dir currently included) +``` + ## About contributing to PyTorch Documentation and Tutorials -* You can find information about contributing to PyTorch documentation in the -PyTorch Repo [README.md](https://github.com/pytorch/pytorch/blob/master/README.md) file. +* You can find information about contributing to PyTorch documentation in the +PyTorch Repo [README.md](https://github.com/pytorch/pytorch/blob/master/README.md) file. * Additional information can be found in [PyTorch CONTRIBUTING.md](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md). diff --git a/_static/css/custom.css b/_static/css/custom.css index a467a088159..a0882c1d4fc 100755 --- a/_static/css/custom.css +++ b/_static/css/custom.css @@ -91,3 +91,7 @@ transition: none; transform-origin: none; } + +.pytorch-left-menu-search input[type=text] { + background-image: url("../images/search-icon.svg"); +} diff --git a/_static/css/custom2.css b/_static/css/custom2.css index 4e263b67759..a24ee796872 100644 --- a/_static/css/custom2.css +++ b/_static/css/custom2.css @@ -17,3 +17,96 @@ margin-bottom: 5px; } } + +/* Left nav for 2nd level nav */ + +.pytorch-left-menu li.toctree-l2 { + padding-left: 10px; +} + +.pytorch-left-menu li.toctree-l2.current > a { + color: #ee4c2c; +} + +.pytorch-left-menu li.toctree-l2.current a:link.reference.internal { + color: #ee4c2c; +} + +.pytorch-left-menu li.toctree-l1.current > a:before { + content: ""; +} + +/* search radio button*/ + +input[type="radio"] { + accent-color: #ee4c2c; +} + +.gsst_b { + display: none; +} + +#gsc-i-id1 { + height: 1.5rem; + text-indent: 12px !important; + font-size: 1rem !important; + font-family: "FreightSansi"; + background-image: url(../images/search-icon.svg) !important; + background-repeat: no-repeat !important; + background-size: 18px 18px !important; + background-position: 5px 0px !important; + padding-left: 20px !important; +} + +#gsc-i-id1::placeholder { + font-family: 'FreightSans'; + font-size: 1rem; + color: #262626; +} + +.gsc-control-cse { + padding: 0 !important; + border-radius: 0px !important; + border: none !important; +} + +.gsc-overflow-hidden { + overflow: visible !important; +} + +#___gcse_0 { + height: 44px !important; + padding: 0 !important; +} + +table.gsc-search-box td.gsc-input { + padding-right: 0 !important; +} + +table.gsc-search-box td { + height: 44px; + margin-bottom: 0 !important; + padding-bottom: 0 !important; +} + +.gsc-search-button-v2 { + display: none; +} + +.gs_id50 { + width: 308px; +} + +.gsib_a { + padding: 0px 8px 4px 9px !important; +} + +.gsc-input-box { + border-radius: 0px !important; + border: none !important; +} + +form.gsc-search-box { + margin-bottom: 0px; +} + diff --git a/_static/doctools.js b/_static/doctools.js deleted file mode 100755 index c3db08d1c38..00000000000 --- a/_static/doctools.js +++ /dev/null @@ -1,264 +0,0 @@ -/* - * doctools.js - * ~~~~~~~~~~~ - * - * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details.
- * - */ -"use strict"; - -const _ready = (callback) => { - if (document.readyState !== "loading") { - callback(); - } else { - document.addEventListener("DOMContentLoaded", callback); - } -}; - -/** - * highlight a given string on a node by wrapping it in - * span elements with the given class name. - */ -const _highlight = (node, addItems, text, className) => { - if (node.nodeType === Node.TEXT_NODE) { - const val = node.nodeValue; - const parent = node.parentNode; - const pos = val.toLowerCase().indexOf(text); - if ( - pos >= 0 && - !parent.classList.contains(className) && - !parent.classList.contains("nohighlight") - ) { - let span; - - const closestNode = parent.closest("body, svg, foreignObject"); - const isInSVG = closestNode && closestNode.matches("svg"); - if (isInSVG) { - span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); - } else { - span = document.createElement("span"); - span.classList.add(className); - } - - span.appendChild(document.createTextNode(val.substr(pos, text.length))); - parent.insertBefore( - span, - parent.insertBefore( - document.createTextNode(val.substr(pos + text.length)), - node.nextSibling - ) - ); - node.nodeValue = val.substr(0, pos); - - if (isInSVG) { - const rect = document.createElementNS( - "http://www.w3.org/2000/svg", - "rect" - ); - const bbox = parent.getBBox(); - rect.x.baseVal.value = bbox.x; - rect.y.baseVal.value = bbox.y; - rect.width.baseVal.value = bbox.width; - rect.height.baseVal.value = bbox.height; - rect.setAttribute("class", className); - addItems.push({ parent: parent, target: rect }); - } - } - } else if (node.matches && !node.matches("button, select, textarea")) { - node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); - } -}; -const _highlightText = (thisNode, text, className) => { - let addItems = []; - _highlight(thisNode, addItems, text, className); - addItems.forEach((obj) => - obj.parent.insertAdjacentElement("beforebegin", obj.target) - ); -}; - -/** - * Small JavaScript module for the documentation. - */ -const Documentation = { - init: () => { - Documentation.highlightSearchWords(); - Documentation.initDomainIndexTable(); - Documentation.initOnKeyListeners(); - }, - - /** - * i18n support - */ - TRANSLATIONS: {}, - PLURAL_EXPR: (n) => (n === 1 ? 0 : 1), - LOCALE: "unknown", - - // gettext and ngettext don't access this so that the functions - // can safely bound to a different name (_ = Documentation.gettext) - gettext: (string) => { - const translated = Documentation.TRANSLATIONS[string]; - switch (typeof translated) { - case "undefined": - return string; // no translation - case "string": - return translated; // translation exists - default: - return translated[0]; // (singular, plural) translation tuple exists - } - }, - - ngettext: (singular, plural, n) => { - const translated = Documentation.TRANSLATIONS[singular]; - if (typeof translated !== "undefined") - return translated[Documentation.PLURAL_EXPR(n)]; - return n === 1 ? 
singular : plural; - }, - - addTranslations: (catalog) => { - Object.assign(Documentation.TRANSLATIONS, catalog.messages); - Documentation.PLURAL_EXPR = new Function( - "n", - `return (${catalog.plural_expr})` - ); - Documentation.LOCALE = catalog.locale; - }, - - /** - * highlight the search words provided in the url in the text - */ - highlightSearchWords: () => { - const highlight = - new URLSearchParams(window.location.search).get("highlight") || ""; - const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); - if (terms.length === 0) return; // nothing to do - - // There should never be more than one element matching "div.body" - const divBody = document.querySelectorAll("div.body"); - const body = divBody.length ? divBody[0] : document.querySelector("body"); - window.setTimeout(() => { - terms.forEach((term) => _highlightText(body, term, "highlighted")); - }, 10); - - const searchBox = document.getElementById("searchbox"); - if (searchBox === null) return; - searchBox.appendChild( - document - .createRange() - .createContextualFragment( - '" - ) - ); - }, - - /** - * helper function to hide the search marks again - */ - hideSearchWords: () => { - document - .querySelectorAll("#searchbox .highlight-link") - .forEach((el) => el.remove()); - document - .querySelectorAll("span.highlighted") - .forEach((el) => el.classList.remove("highlighted")); - const url = new URL(window.location); - url.searchParams.delete("highlight"); - window.history.replaceState({}, "", url); - }, - - /** - * helper function to focus on search bar - */ - focusSearchBar: () => { - document.querySelectorAll("input[name=q]")[0]?.focus(); - }, - - /** - * Initialise the domain index toggle buttons - */ - initDomainIndexTable: () => { - const toggler = (el) => { - const idNumber = el.id.substr(7); - const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`); - if (el.src.substr(-9) === "minus.png") { - el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`; - toggledRows.forEach((el) => (el.style.display = "none")); - } else { - el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`; - toggledRows.forEach((el) => (el.style.display = "")); - } - }; - - const togglerElements = document.querySelectorAll("img.toggler"); - togglerElements.forEach((el) => - el.addEventListener("click", (event) => toggler(event.currentTarget)) - ); - togglerElements.forEach((el) => (el.style.display = "")); - if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler); - }, - - initOnKeyListeners: () => { - // only install a listener if it is really needed - if ( - !DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS && - !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS - ) - return; - - const blacklistedElements = new Set([ - "TEXTAREA", - "INPUT", - "SELECT", - "BUTTON", - ]); - document.addEventListener("keydown", (event) => { - if (blacklistedElements.has(document.activeElement.tagName)) return; // bail for input elements - if (event.altKey || event.ctrlKey || event.metaKey) return; // bail with special keys - - if (!event.shiftKey) { - switch (event.key) { - case "ArrowLeft": - if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; - - const prevLink = document.querySelector('link[rel="prev"]'); - if (prevLink && prevLink.href) { - window.location.href = prevLink.href; - event.preventDefault(); - } - break; - case "ArrowRight": - if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; - - const nextLink = document.querySelector('link[rel="next"]'); - if (nextLink && nextLink.href) { - window.location.href = 
nextLink.href; - event.preventDefault(); - } - break; - case "Escape": - if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; - Documentation.hideSearchWords(); - event.preventDefault(); - } - } - - // some keyboard layouts may need Shift to get / - switch (event.key) { - case "/": - if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; - Documentation.focusSearchBar(); - event.preventDefault(); - } - }); - }, -}; - -// quick alias for translations -const _ = Documentation.gettext; - -_ready(Documentation.init); diff --git a/_static/documentation_options.js b/_static/documentation_options.js deleted file mode 100755 index a9214d61b9c..00000000000 --- a/_static/documentation_options.js +++ /dev/null @@ -1,9 +0,0 @@ -var DOCUMENTATION_OPTIONS = { - URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), - VERSION: '0.5.0a0+a24163a', - LANGUAGE: 'None', - COLLAPSE_INDEX: false, - FILE_SUFFIX: '.html', - HAS_SOURCE: true, - SOURCELINK_SUFFIX: '.txt' -}; \ No newline at end of file diff --git a/_static/img/bert.png b/_static/img/bert.png deleted file mode 100644 index 6e23a8acfd3..00000000000 Binary files a/_static/img/bert.png and /dev/null differ diff --git a/_static/img/cat_224x224.jpg b/_static/img/cat_224x224.jpg deleted file mode 100755 index 05660ce53f9..00000000000 Binary files a/_static/img/cat_224x224.jpg and /dev/null differ diff --git a/_static/img/cat_resized.jpg b/_static/img/cat_resized.jpg deleted file mode 100644 index c7746e65308..00000000000 Binary files a/_static/img/cat_resized.jpg and /dev/null differ diff --git a/_static/img/cat_superres_with_ort.jpg b/_static/img/cat_superres_with_ort.jpg deleted file mode 100644 index 7e4143c3e79..00000000000 Binary files a/_static/img/cat_superres_with_ort.jpg and /dev/null differ diff --git a/_static/img/compare_output.png b/_static/img/compare_output.png deleted file mode 100644 index 4ece4d11483..00000000000 Binary files a/_static/img/compare_output.png and /dev/null differ diff --git a/_static/img/compare_stub.png b/_static/img/compare_stub.png deleted file mode 100644 index 8140a99b182..00000000000 Binary files a/_static/img/compare_stub.png and /dev/null differ diff --git a/_static/img/compiled_autograd/call_hook_node.png b/_static/img/compiled_autograd/call_hook_node.png new file mode 100644 index 00000000000..3e094cf6f73 Binary files /dev/null and b/_static/img/compiled_autograd/call_hook_node.png differ diff --git a/_static/img/compiled_autograd/entire_verbose_log.png b/_static/img/compiled_autograd/entire_verbose_log.png new file mode 100644 index 00000000000..4ce2b8538ee Binary files /dev/null and b/_static/img/compiled_autograd/entire_verbose_log.png differ diff --git a/_static/img/compiled_autograd/recompile_due_to_dynamic.png b/_static/img/compiled_autograd/recompile_due_to_dynamic.png new file mode 100644 index 00000000000..41ae56acf2d Binary files /dev/null and b/_static/img/compiled_autograd/recompile_due_to_dynamic.png differ diff --git a/_static/img/compiled_autograd/recompile_due_to_node.png b/_static/img/compiled_autograd/recompile_due_to_node.png new file mode 100644 index 00000000000..800a1784587 Binary files /dev/null and b/_static/img/compiled_autograd/recompile_due_to_node.png differ diff --git a/_static/img/distributed/fsdp_implicit.png b/_static/img/distributed/fsdp_implicit.png new file mode 100644 index 00000000000..85b19b7e72e Binary files /dev/null and b/_static/img/distributed/fsdp_implicit.png differ diff --git 
a/_static/img/distributed/tcpstore_barrier_time.png b/_static/img/distributed/tcpstore_barrier_time.png new file mode 100644 index 00000000000..5ece3a7471d Binary files /dev/null and b/_static/img/distributed/tcpstore_barrier_time.png differ diff --git a/_static/img/distributed/tcpstore_init_time.png b/_static/img/distributed/tcpstore_init_time.png new file mode 100644 index 00000000000..df514b4dc48 Binary files /dev/null and b/_static/img/distributed/tcpstore_init_time.png differ diff --git a/_static/img/install_msvc.png b/_static/img/install_msvc.png new file mode 100644 index 00000000000..fce73207a80 Binary files /dev/null and b/_static/img/install_msvc.png differ diff --git a/_static/img/itt_tutorial/vtune_xpu_config.png b/_static/img/itt_tutorial/vtune_xpu_config.png new file mode 100644 index 00000000000..80dd1812d26 Binary files /dev/null and b/_static/img/itt_tutorial/vtune_xpu_config.png differ diff --git a/_static/img/itt_tutorial/vtune_xpu_timeline.png b/_static/img/itt_tutorial/vtune_xpu_timeline.png new file mode 100644 index 00000000000..43818cf105c Binary files /dev/null and b/_static/img/itt_tutorial/vtune_xpu_timeline.png differ diff --git a/_static/img/onnx/custom_addandround_function.png b/_static/img/onnx/custom_addandround_function.png deleted file mode 100644 index a0c7000161e..00000000000 Binary files a/_static/img/onnx/custom_addandround_function.png and /dev/null differ diff --git a/_static/img/onnx/custom_addandround_model.png b/_static/img/onnx/custom_addandround_model.png deleted file mode 100644 index 793d8cfbb5d..00000000000 Binary files a/_static/img/onnx/custom_addandround_model.png and /dev/null differ diff --git a/_static/img/onnx/custom_aten_add_function.png b/_static/img/onnx/custom_aten_add_function.png deleted file mode 100644 index d9f927ce707..00000000000 Binary files a/_static/img/onnx/custom_aten_add_function.png and /dev/null differ diff --git a/_static/img/onnx/custom_aten_add_model.png b/_static/img/onnx/custom_aten_add_model.png deleted file mode 100644 index e5ef1c71742..00000000000 Binary files a/_static/img/onnx/custom_aten_add_model.png and /dev/null differ diff --git a/_static/img/onnx/custom_aten_gelu_function.png b/_static/img/onnx/custom_aten_gelu_function.png deleted file mode 100644 index 5cb573e7dcb..00000000000 Binary files a/_static/img/onnx/custom_aten_gelu_function.png and /dev/null differ diff --git a/_static/img/onnx/custom_aten_gelu_model.png b/_static/img/onnx/custom_aten_gelu_model.png deleted file mode 100644 index 6bc46337b48..00000000000 Binary files a/_static/img/onnx/custom_aten_gelu_model.png and /dev/null differ diff --git a/_static/img/onnx/image_classifier_onnx_model_on_netron_web_ui.png b/_static/img/onnx/image_classifier_onnx_model_on_netron_web_ui.png new file mode 100644 index 00000000000..6430e4943ff Binary files /dev/null and b/_static/img/onnx/image_classifier_onnx_model_on_netron_web_ui.png differ diff --git a/_static/img/onnx/image_clossifier_onnx_modelon_netron_web_ui.png b/_static/img/onnx/image_clossifier_onnx_modelon_netron_web_ui.png deleted file mode 100755 index 0c29c168798..00000000000 Binary files a/_static/img/onnx/image_clossifier_onnx_modelon_netron_web_ui.png and /dev/null differ diff --git a/_static/img/pinmem/pinmem.png b/_static/img/pinmem/pinmem.png new file mode 100644 index 00000000000..9d84e9d229d Binary files /dev/null and b/_static/img/pinmem/pinmem.png differ diff --git a/_static/img/pinmem/trace_streamed0_pinned0.png b/_static/img/pinmem/trace_streamed0_pinned0.png new file mode 
100644 index 00000000000..dedac997b0b Binary files /dev/null and b/_static/img/pinmem/trace_streamed0_pinned0.png differ diff --git a/_static/img/pinmem/trace_streamed0_pinned1.png b/_static/img/pinmem/trace_streamed0_pinned1.png new file mode 100644 index 00000000000..2d5ff462e1a Binary files /dev/null and b/_static/img/pinmem/trace_streamed0_pinned1.png differ diff --git a/_static/img/pinmem/trace_streamed1_pinned0.png b/_static/img/pinmem/trace_streamed1_pinned0.png new file mode 100644 index 00000000000..130182a1978 Binary files /dev/null and b/_static/img/pinmem/trace_streamed1_pinned0.png differ diff --git a/_static/img/pinmem/trace_streamed1_pinned1.png b/_static/img/pinmem/trace_streamed1_pinned1.png new file mode 100644 index 00000000000..c596fcdb691 Binary files /dev/null and b/_static/img/pinmem/trace_streamed1_pinned1.png differ diff --git a/_static/img/python_extension_autoload_impl.png b/_static/img/python_extension_autoload_impl.png new file mode 100644 index 00000000000..64e18fc7b4b Binary files /dev/null and b/_static/img/python_extension_autoload_impl.png differ diff --git a/_static/img/quant_asym.png b/_static/img/quant_asym.png deleted file mode 100644 index 9dc43817a59..00000000000 Binary files a/_static/img/quant_asym.png and /dev/null differ diff --git a/_static/img/quantized_transfer_learning.png b/_static/img/quantized_transfer_learning.png deleted file mode 100644 index c138cbdb0c1..00000000000 Binary files a/_static/img/quantized_transfer_learning.png and /dev/null differ diff --git a/_static/img/shadow.png b/_static/img/shadow.png deleted file mode 100644 index e09d0b87f01..00000000000 Binary files a/_static/img/shadow.png and /dev/null differ diff --git a/_static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-BERT.png b/_static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-BERT.png deleted file mode 100644 index 34bbf8c7bdf..00000000000 Binary files a/_static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-BERT.png and /dev/null differ diff --git a/_static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-an-LSTM-Word-Language-Model.png b/_static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-an-LSTM-Word-Language-Model.png deleted file mode 100644 index 986efaa3f88..00000000000 Binary files a/_static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-an-LSTM-Word-Language-Model.png and /dev/null differ diff --git a/_static/img/thumbnails/cropped/graph-mode-dynamic-bert.png b/_static/img/thumbnails/cropped/graph-mode-dynamic-bert.png deleted file mode 100644 index 34bbf8c7bdf..00000000000 Binary files a/_static/img/thumbnails/cropped/graph-mode-dynamic-bert.png and /dev/null differ diff --git a/_static/img/thumbnails/cropped/mobile.png b/_static/img/thumbnails/cropped/mobile.png deleted file mode 100644 index 12dc917519c..00000000000 Binary files a/_static/img/thumbnails/cropped/mobile.png and /dev/null differ diff --git a/_static/img/thumbnails/cropped/optional-Exporting-a-Model-from-PyTorch-to-ONNX-and-Running-it-using-ONNX-Runtime.png b/_static/img/thumbnails/cropped/optional-Exporting-a-Model-from-PyTorch-to-ONNX-and-Running-it-using-ONNX-Runtime.png deleted file mode 100644 index 00156df042e..00000000000 Binary files a/_static/img/thumbnails/cropped/optional-Exporting-a-Model-from-PyTorch-to-ONNX-and-Running-it-using-ONNX-Runtime.png and /dev/null differ diff --git a/_static/img/thumbnails/cropped/understanding_leaf_vs_nonleaf.png 
b/_static/img/thumbnails/cropped/understanding_leaf_vs_nonleaf.png new file mode 100644 index 00000000000..0590cf227d9 Binary files /dev/null and b/_static/img/thumbnails/cropped/understanding_leaf_vs_nonleaf.png differ diff --git a/_static/img/thumbnails/cropped/using-dynamic-post-training-quantization.png b/_static/img/thumbnails/cropped/using-dynamic-post-training-quantization.png deleted file mode 100644 index 6ce22e4862a..00000000000 Binary files a/_static/img/thumbnails/cropped/using-dynamic-post-training-quantization.png and /dev/null differ diff --git a/_static/img/thumbnails/cropped/visualizing_gradients_tutorial.png b/_static/img/thumbnails/cropped/visualizing_gradients_tutorial.png new file mode 100644 index 00000000000..6ff6d97f2e2 Binary files /dev/null and b/_static/img/thumbnails/cropped/visualizing_gradients_tutorial.png differ diff --git a/_static/img/tiatoolbox_tutorial/read_bounds_tissue.webp b/_static/img/tiatoolbox_tutorial/read_bounds_tissue.webp deleted file mode 100644 index 5a1ca81e07d..00000000000 Binary files a/_static/img/tiatoolbox_tutorial/read_bounds_tissue.webp and /dev/null differ diff --git a/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_001.png b/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_001.png deleted file mode 100644 index fafd95768a1..00000000000 Binary files a/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_001.png and /dev/null differ diff --git a/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_002.png b/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_002.png deleted file mode 100644 index fd6f7aba1f4..00000000000 Binary files a/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_002.png and /dev/null differ diff --git a/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_003.png b/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_003.png deleted file mode 100644 index 8feda69de2d..00000000000 Binary files a/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_003.png and /dev/null differ diff --git a/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_004.png b/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_004.png deleted file mode 100644 index 8feda69de2d..00000000000 Binary files a/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_004.png and /dev/null differ diff --git a/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_005.png b/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_005.png deleted file mode 100644 index e17e03812ce..00000000000 Binary files a/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_005.png and /dev/null differ diff --git a/_static/img/trace_xpu_img.png b/_static/img/trace_xpu_img.png new file mode 100644 index 00000000000..2eca0a78cb6 Binary files /dev/null and b/_static/img/trace_xpu_img.png differ diff --git a/_static/img/understanding_leaf_vs_nonleaf/comp-graph-1.png b/_static/img/understanding_leaf_vs_nonleaf/comp-graph-1.png new file mode 100644 index 00000000000..1fa3d80d339 Binary files /dev/null and b/_static/img/understanding_leaf_vs_nonleaf/comp-graph-1.png differ diff --git a/_static/img/understanding_leaf_vs_nonleaf/comp-graph-2.png b/_static/img/understanding_leaf_vs_nonleaf/comp-graph-2.png new file mode 100644 index 00000000000..3f76deab3bf Binary files /dev/null and b/_static/img/understanding_leaf_vs_nonleaf/comp-graph-2.png differ diff --git a/_static/jquery-3.2.1.js b/_static/jquery-3.2.1.js deleted file mode 100755 index 2cbd2ab50e7..00000000000 --- a/_static/jquery-3.2.1.js +++ /dev/null @@ -1,10253 +0,0 @@ -/*! 
- * jQuery JavaScript Library v3.2.1 - * https://jquery.com/ - * - * Includes Sizzle.js - * https://sizzlejs.com/ - * - * Copyright JS Foundation and other contributors - * Released under the MIT license - * https://jquery.org/license - * - * Date: 2017-03-20T18:59Z - */ -( function( global, factory ) { - - "use strict"; - - if ( typeof module === "object" && typeof module.exports === "object" ) { - - // For CommonJS and CommonJS-like environments where a proper `window` - // is present, execute the factory and get jQuery. - // For environments that do not have a `window` with a `document` - // (such as Node.js), expose a factory as module.exports. - // This accentuates the need for the creation of a real `window`. - // e.g. var jQuery = require("jquery")(window); - // See ticket #14549 for more info. - module.exports = global.document ? - factory( global, true ) : - function( w ) { - if ( !w.document ) { - throw new Error( "jQuery requires a window with a document" ); - } - return factory( w ); - }; - } else { - factory( global ); - } - -// Pass this if window is not defined yet -} )( typeof window !== "undefined" ? window : this, function( window, noGlobal ) { - -// Edge <= 12 - 13+, Firefox <=18 - 45+, IE 10 - 11, Safari 5.1 - 9+, iOS 6 - 9.1 -// throw exceptions when non-strict code (e.g., ASP.NET 4.5) accesses strict mode -// arguments.callee.caller (trac-13335). But as of jQuery 3.0 (2016), strict mode should be common -// enough that all such attempts are guarded in a try block. -"use strict"; - -var arr = []; - -var document = window.document; - -var getProto = Object.getPrototypeOf; - -var slice = arr.slice; - -var concat = arr.concat; - -var push = arr.push; - -var indexOf = arr.indexOf; - -var class2type = {}; - -var toString = class2type.toString; - -var hasOwn = class2type.hasOwnProperty; - -var fnToString = hasOwn.toString; - -var ObjectFunctionString = fnToString.call( Object ); - -var support = {}; - - - - function DOMEval( code, doc ) { - doc = doc || document; - - var script = doc.createElement( "script" ); - - script.text = code; - doc.head.appendChild( script ).parentNode.removeChild( script ); - } -/* global Symbol */ -// Defining this global in .eslintrc.json would create a danger of using the global -// unguarded in another place, it seems safer to define global only for this module - - - -var - version = "3.2.1", - - // Define a local copy of jQuery - jQuery = function( selector, context ) { - - // The jQuery object is actually just the init constructor 'enhanced' - // Need init if jQuery is called (just allow error to be thrown if not included) - return new jQuery.fn.init( selector, context ); - }, - - // Support: Android <=4.0 only - // Make sure we trim BOM and NBSP - rtrim = /^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g, - - // Matches dashed string for camelizing - rmsPrefix = /^-ms-/, - rdashAlpha = /-([a-z])/g, - - // Used by jQuery.camelCase as callback to replace() - fcamelCase = function( all, letter ) { - return letter.toUpperCase(); - }; - -jQuery.fn = jQuery.prototype = { - - // The current version of jQuery being used - jquery: version, - - constructor: jQuery, - - // The default length of a jQuery object is 0 - length: 0, - - toArray: function() { - return slice.call( this ); - }, - - // Get the Nth element in the matched element set OR - // Get the whole matched element set as a clean array - get: function( num ) { - - // Return all the elements in a clean array - if ( num == null ) { - return slice.call( this ); - } - - // Return just the one element from 
the set - return num < 0 ? this[ num + this.length ] : this[ num ]; - }, - - // Take an array of elements and push it onto the stack - // (returning the new matched element set) - pushStack: function( elems ) { - - // Build a new jQuery matched element set - var ret = jQuery.merge( this.constructor(), elems ); - - // Add the old object onto the stack (as a reference) - ret.prevObject = this; - - // Return the newly-formed element set - return ret; - }, - - // Execute a callback for every element in the matched set. - each: function( callback ) { - return jQuery.each( this, callback ); - }, - - map: function( callback ) { - return this.pushStack( jQuery.map( this, function( elem, i ) { - return callback.call( elem, i, elem ); - } ) ); - }, - - slice: function() { - return this.pushStack( slice.apply( this, arguments ) ); - }, - - first: function() { - return this.eq( 0 ); - }, - - last: function() { - return this.eq( -1 ); - }, - - eq: function( i ) { - var len = this.length, - j = +i + ( i < 0 ? len : 0 ); - return this.pushStack( j >= 0 && j < len ? [ this[ j ] ] : [] ); - }, - - end: function() { - return this.prevObject || this.constructor(); - }, - - // For internal use only. - // Behaves like an Array's method, not like a jQuery method. - push: push, - sort: arr.sort, - splice: arr.splice -}; - -jQuery.extend = jQuery.fn.extend = function() { - var options, name, src, copy, copyIsArray, clone, - target = arguments[ 0 ] || {}, - i = 1, - length = arguments.length, - deep = false; - - // Handle a deep copy situation - if ( typeof target === "boolean" ) { - deep = target; - - // Skip the boolean and the target - target = arguments[ i ] || {}; - i++; - } - - // Handle case when target is a string or something (possible in deep copy) - if ( typeof target !== "object" && !jQuery.isFunction( target ) ) { - target = {}; - } - - // Extend jQuery itself if only one argument is passed - if ( i === length ) { - target = this; - i--; - } - - for ( ; i < length; i++ ) { - - // Only deal with non-null/undefined values - if ( ( options = arguments[ i ] ) != null ) { - - // Extend the base object - for ( name in options ) { - src = target[ name ]; - copy = options[ name ]; - - // Prevent never-ending loop - if ( target === copy ) { - continue; - } - - // Recurse if we're merging plain objects or arrays - if ( deep && copy && ( jQuery.isPlainObject( copy ) || - ( copyIsArray = Array.isArray( copy ) ) ) ) { - - if ( copyIsArray ) { - copyIsArray = false; - clone = src && Array.isArray( src ) ? src : []; - - } else { - clone = src && jQuery.isPlainObject( src ) ? 
src : {}; - } - - // Never move original objects, clone them - target[ name ] = jQuery.extend( deep, clone, copy ); - - // Don't bring in undefined values - } else if ( copy !== undefined ) { - target[ name ] = copy; - } - } - } - } - - // Return the modified object - return target; -}; - -jQuery.extend( { - - // Unique for each copy of jQuery on the page - expando: "jQuery" + ( version + Math.random() ).replace( /\D/g, "" ), - - // Assume jQuery is ready without the ready module - isReady: true, - - error: function( msg ) { - throw new Error( msg ); - }, - - noop: function() {}, - - isFunction: function( obj ) { - return jQuery.type( obj ) === "function"; - }, - - isWindow: function( obj ) { - return obj != null && obj === obj.window; - }, - - isNumeric: function( obj ) { - - // As of jQuery 3.0, isNumeric is limited to - // strings and numbers (primitives or objects) - // that can be coerced to finite numbers (gh-2662) - var type = jQuery.type( obj ); - return ( type === "number" || type === "string" ) && - - // parseFloat NaNs numeric-cast false positives ("") - // ...but misinterprets leading-number strings, particularly hex literals ("0x...") - // subtraction forces infinities to NaN - !isNaN( obj - parseFloat( obj ) ); - }, - - isPlainObject: function( obj ) { - var proto, Ctor; - - // Detect obvious negatives - // Use toString instead of jQuery.type to catch host objects - if ( !obj || toString.call( obj ) !== "[object Object]" ) { - return false; - } - - proto = getProto( obj ); - - // Objects with no prototype (e.g., `Object.create( null )`) are plain - if ( !proto ) { - return true; - } - - // Objects with prototype are plain iff they were constructed by a global Object function - Ctor = hasOwn.call( proto, "constructor" ) && proto.constructor; - return typeof Ctor === "function" && fnToString.call( Ctor ) === ObjectFunctionString; - }, - - isEmptyObject: function( obj ) { - - /* eslint-disable no-unused-vars */ - // See https://github.com/eslint/eslint/issues/6125 - var name; - - for ( name in obj ) { - return false; - } - return true; - }, - - type: function( obj ) { - if ( obj == null ) { - return obj + ""; - } - - // Support: Android <=2.3 only (functionish RegExp) - return typeof obj === "object" || typeof obj === "function" ? - class2type[ toString.call( obj ) ] || "object" : - typeof obj; - }, - - // Evaluates a script in a global context - globalEval: function( code ) { - DOMEval( code ); - }, - - // Convert dashed to camelCase; used by the css and data modules - // Support: IE <=9 - 11, Edge 12 - 13 - // Microsoft forgot to hump their vendor prefix (#9572) - camelCase: function( string ) { - return string.replace( rmsPrefix, "ms-" ).replace( rdashAlpha, fcamelCase ); - }, - - each: function( obj, callback ) { - var length, i = 0; - - if ( isArrayLike( obj ) ) { - length = obj.length; - for ( ; i < length; i++ ) { - if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { - break; - } - } - } else { - for ( i in obj ) { - if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { - break; - } - } - } - - return obj; - }, - - // Support: Android <=4.0 only - trim: function( text ) { - return text == null ? - "" : - ( text + "" ).replace( rtrim, "" ); - }, - - // results is for internal usage only - makeArray: function( arr, results ) { - var ret = results || []; - - if ( arr != null ) { - if ( isArrayLike( Object( arr ) ) ) { - jQuery.merge( ret, - typeof arr === "string" ? 
- [ arr ] : arr - ); - } else { - push.call( ret, arr ); - } - } - - return ret; - }, - - inArray: function( elem, arr, i ) { - return arr == null ? -1 : indexOf.call( arr, elem, i ); - }, - - // Support: Android <=4.0 only, PhantomJS 1 only - // push.apply(_, arraylike) throws on ancient WebKit - merge: function( first, second ) { - var len = +second.length, - j = 0, - i = first.length; - - for ( ; j < len; j++ ) { - first[ i++ ] = second[ j ]; - } - - first.length = i; - - return first; - }, - - grep: function( elems, callback, invert ) { - var callbackInverse, - matches = [], - i = 0, - length = elems.length, - callbackExpect = !invert; - - // Go through the array, only saving the items - // that pass the validator function - for ( ; i < length; i++ ) { - callbackInverse = !callback( elems[ i ], i ); - if ( callbackInverse !== callbackExpect ) { - matches.push( elems[ i ] ); - } - } - - return matches; - }, - - // arg is for internal usage only - map: function( elems, callback, arg ) { - var length, value, - i = 0, - ret = []; - - // Go through the array, translating each of the items to their new values - if ( isArrayLike( elems ) ) { - length = elems.length; - for ( ; i < length; i++ ) { - value = callback( elems[ i ], i, arg ); - - if ( value != null ) { - ret.push( value ); - } - } - - // Go through every key on the object, - } else { - for ( i in elems ) { - value = callback( elems[ i ], i, arg ); - - if ( value != null ) { - ret.push( value ); - } - } - } - - // Flatten any nested arrays - return concat.apply( [], ret ); - }, - - // A global GUID counter for objects - guid: 1, - - // Bind a function to a context, optionally partially applying any - // arguments. - proxy: function( fn, context ) { - var tmp, args, proxy; - - if ( typeof context === "string" ) { - tmp = fn[ context ]; - context = fn; - fn = tmp; - } - - // Quick check to determine if target is callable, in the spec - // this throws a TypeError, but we will just return undefined. - if ( !jQuery.isFunction( fn ) ) { - return undefined; - } - - // Simulated bind - args = slice.call( arguments, 2 ); - proxy = function() { - return fn.apply( context || this, args.concat( slice.call( arguments ) ) ); - }; - - // Set the guid of unique handler to the same of original handler, so it can be removed - proxy.guid = fn.guid = fn.guid || jQuery.guid++; - - return proxy; - }, - - now: Date.now, - - // jQuery.support is not used in Core but other projects attach their - // properties to it so it needs to exist. - support: support -} ); - -if ( typeof Symbol === "function" ) { - jQuery.fn[ Symbol.iterator ] = arr[ Symbol.iterator ]; -} - -// Populate the class2type map -jQuery.each( "Boolean Number String Function Array Date RegExp Object Error Symbol".split( " " ), -function( i, name ) { - class2type[ "[object " + name + "]" ] = name.toLowerCase(); -} ); - -function isArrayLike( obj ) { - - // Support: real iOS 8.2 only (not reproducible in simulator) - // `in` check used to prevent JIT error (gh-2145) - // hasOwn isn't used here due to false negatives - // regarding Nodelist length in IE - var length = !!obj && "length" in obj && obj.length, - type = jQuery.type( obj ); - - if ( type === "function" || jQuery.isWindow( obj ) ) { - return false; - } - - return type === "array" || length === 0 || - typeof length === "number" && length > 0 && ( length - 1 ) in obj; -} -var Sizzle = -/*! 
- * Sizzle CSS Selector Engine v2.3.3 - * https://sizzlejs.com/ - * - * Copyright jQuery Foundation and other contributors - * Released under the MIT license - * https://jquery.org/license - * - * Date: 2016-08-08 - */ -(function( window ) { - -var i, - support, - Expr, - getText, - isXML, - tokenize, - compile, - select, - outermostContext, - sortInput, - hasDuplicate, - - // Local document vars - setDocument, - document, - docElem, - documentIsHTML, - rbuggyQSA, - rbuggyMatches, - matches, - contains, - - // Instance-specific data - expando = "sizzle" + 1 * new Date(), - preferredDoc = window.document, - dirruns = 0, - done = 0, - classCache = createCache(), - tokenCache = createCache(), - compilerCache = createCache(), - sortOrder = function( a, b ) { - if ( a === b ) { - hasDuplicate = true; - } - return 0; - }, - - // Instance methods - hasOwn = ({}).hasOwnProperty, - arr = [], - pop = arr.pop, - push_native = arr.push, - push = arr.push, - slice = arr.slice, - // Use a stripped-down indexOf as it's faster than native - // https://jsperf.com/thor-indexof-vs-for/5 - indexOf = function( list, elem ) { - var i = 0, - len = list.length; - for ( ; i < len; i++ ) { - if ( list[i] === elem ) { - return i; - } - } - return -1; - }, - - booleans = "checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|ismap|loop|multiple|open|readonly|required|scoped", - - // Regular expressions - - // https://www.w3.org/TR/css3-selectors/#whitespace - whitespace = "[\\x20\\t\\r\\n\\f]", - - // https://www.w3.org/TR/CSS21/syndata.html#value-def-identifier - identifier = "(?:\\\\.|[\\w-]|[^\0-\\xa0])+", - - // Attribute selectors: https://www.w3.org/TR/selectors/#attribute-selectors - attributes = "\\[" + whitespace + "*(" + identifier + ")(?:" + whitespace + - // Operator (capture 2) - "*([*^$|!~]?=)" + whitespace + - // "Attribute values must be CSS identifiers [capture 5] or strings [capture 3 or capture 4]" - "*(?:'((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\"|(" + identifier + "))|)" + whitespace + - "*\\]", - - pseudos = ":(" + identifier + ")(?:\\((" + - // To reduce the number of selectors needing tokenize in the preFilter, prefer arguments: - // 1. quoted (capture 3; capture 4 or capture 5) - "('((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\")|" + - // 2. simple (capture 6) - "((?:\\\\.|[^\\\\()[\\]]|" + attributes + ")*)|" + - // 3. 
anything else (capture 2) - ".*" + - ")\\)|)", - - // Leading and non-escaped trailing whitespace, capturing some non-whitespace characters preceding the latter - rwhitespace = new RegExp( whitespace + "+", "g" ), - rtrim = new RegExp( "^" + whitespace + "+|((?:^|[^\\\\])(?:\\\\.)*)" + whitespace + "+$", "g" ), - - rcomma = new RegExp( "^" + whitespace + "*," + whitespace + "*" ), - rcombinators = new RegExp( "^" + whitespace + "*([>+~]|" + whitespace + ")" + whitespace + "*" ), - - rattributeQuotes = new RegExp( "=" + whitespace + "*([^\\]'\"]*?)" + whitespace + "*\\]", "g" ), - - rpseudo = new RegExp( pseudos ), - ridentifier = new RegExp( "^" + identifier + "$" ), - - matchExpr = { - "ID": new RegExp( "^#(" + identifier + ")" ), - "CLASS": new RegExp( "^\\.(" + identifier + ")" ), - "TAG": new RegExp( "^(" + identifier + "|[*])" ), - "ATTR": new RegExp( "^" + attributes ), - "PSEUDO": new RegExp( "^" + pseudos ), - "CHILD": new RegExp( "^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\(" + whitespace + - "*(even|odd|(([+-]|)(\\d*)n|)" + whitespace + "*(?:([+-]|)" + whitespace + - "*(\\d+)|))" + whitespace + "*\\)|)", "i" ), - "bool": new RegExp( "^(?:" + booleans + ")$", "i" ), - // For use in libraries implementing .is() - // We use this for POS matching in `select` - "needsContext": new RegExp( "^" + whitespace + "*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\(" + - whitespace + "*((?:-\\d)?\\d*)" + whitespace + "*\\)|)(?=[^-]|$)", "i" ) - }, - - rinputs = /^(?:input|select|textarea|button)$/i, - rheader = /^h\d$/i, - - rnative = /^[^{]+\{\s*\[native \w/, - - // Easily-parseable/retrievable ID or TAG or CLASS selectors - rquickExpr = /^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/, - - rsibling = /[+~]/, - - // CSS escapes - // https://www.w3.org/TR/CSS21/syndata.html#escaped-characters - runescape = new RegExp( "\\\\([\\da-f]{1,6}" + whitespace + "?|(" + whitespace + ")|.)", "ig" ), - funescape = function( _, escaped, escapedWhitespace ) { - var high = "0x" + escaped - 0x10000; - // NaN means non-codepoint - // Support: Firefox<24 - // Workaround erroneous numeric interpretation of +"0x" - return high !== high || escapedWhitespace ? - escaped : - high < 0 ? 
- // BMP codepoint - String.fromCharCode( high + 0x10000 ) : - // Supplemental Plane codepoint (surrogate pair) - String.fromCharCode( high >> 10 | 0xD800, high & 0x3FF | 0xDC00 ); - }, - - // CSS string/identifier serialization - // https://drafts.csswg.org/cssom/#common-serializing-idioms - rcssescape = /([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g, - fcssescape = function( ch, asCodePoint ) { - if ( asCodePoint ) { - - // U+0000 NULL becomes U+FFFD REPLACEMENT CHARACTER - if ( ch === "\0" ) { - return "\uFFFD"; - } - - // Control characters and (dependent upon position) numbers get escaped as code points - return ch.slice( 0, -1 ) + "\\" + ch.charCodeAt( ch.length - 1 ).toString( 16 ) + " "; - } - - // Other potentially-special ASCII characters get backslash-escaped - return "\\" + ch; - }, - - // Used for iframes - // See setDocument() - // Removing the function wrapper causes a "Permission Denied" - // error in IE - unloadHandler = function() { - setDocument(); - }, - - disabledAncestor = addCombinator( - function( elem ) { - return elem.disabled === true && ("form" in elem || "label" in elem); - }, - { dir: "parentNode", next: "legend" } - ); - -// Optimize for push.apply( _, NodeList ) -try { - push.apply( - (arr = slice.call( preferredDoc.childNodes )), - preferredDoc.childNodes - ); - // Support: Android<4.0 - // Detect silently failing push.apply - arr[ preferredDoc.childNodes.length ].nodeType; -} catch ( e ) { - push = { apply: arr.length ? - - // Leverage slice if possible - function( target, els ) { - push_native.apply( target, slice.call(els) ); - } : - - // Support: IE<9 - // Otherwise append directly - function( target, els ) { - var j = target.length, - i = 0; - // Can't trust NodeList.length - while ( (target[j++] = els[i++]) ) {} - target.length = j - 1; - } - }; -} - -function Sizzle( selector, context, results, seed ) { - var m, i, elem, nid, match, groups, newSelector, - newContext = context && context.ownerDocument, - - // nodeType defaults to 9, since context defaults to document - nodeType = context ? context.nodeType : 9; - - results = results || []; - - // Return early from calls with invalid selector or context - if ( typeof selector !== "string" || !selector || - nodeType !== 1 && nodeType !== 9 && nodeType !== 11 ) { - - return results; - } - - // Try to shortcut find operations (as opposed to filters) in HTML documents - if ( !seed ) { - - if ( ( context ? 
context.ownerDocument || context : preferredDoc ) !== document ) { - setDocument( context ); - } - context = context || document; - - if ( documentIsHTML ) { - - // If the selector is sufficiently simple, try using a "get*By*" DOM method - // (excepting DocumentFragment context, where the methods don't exist) - if ( nodeType !== 11 && (match = rquickExpr.exec( selector )) ) { - - // ID selector - if ( (m = match[1]) ) { - - // Document context - if ( nodeType === 9 ) { - if ( (elem = context.getElementById( m )) ) { - - // Support: IE, Opera, Webkit - // TODO: identify versions - // getElementById can match elements by name instead of ID - if ( elem.id === m ) { - results.push( elem ); - return results; - } - } else { - return results; - } - - // Element context - } else { - - // Support: IE, Opera, Webkit - // TODO: identify versions - // getElementById can match elements by name instead of ID - if ( newContext && (elem = newContext.getElementById( m )) && - contains( context, elem ) && - elem.id === m ) { - - results.push( elem ); - return results; - } - } - - // Type selector - } else if ( match[2] ) { - push.apply( results, context.getElementsByTagName( selector ) ); - return results; - - // Class selector - } else if ( (m = match[3]) && support.getElementsByClassName && - context.getElementsByClassName ) { - - push.apply( results, context.getElementsByClassName( m ) ); - return results; - } - } - - // Take advantage of querySelectorAll - if ( support.qsa && - !compilerCache[ selector + " " ] && - (!rbuggyQSA || !rbuggyQSA.test( selector )) ) { - - if ( nodeType !== 1 ) { - newContext = context; - newSelector = selector; - - // qSA looks outside Element context, which is not what we want - // Thanks to Andrew Dupont for this workaround technique - // Support: IE <=8 - // Exclude object elements - } else if ( context.nodeName.toLowerCase() !== "object" ) { - - // Capture the context ID, setting it first if necessary - if ( (nid = context.getAttribute( "id" )) ) { - nid = nid.replace( rcssescape, fcssescape ); - } else { - context.setAttribute( "id", (nid = expando) ); - } - - // Prefix every selector in the list - groups = tokenize( selector ); - i = groups.length; - while ( i-- ) { - groups[i] = "#" + nid + " " + toSelector( groups[i] ); - } - newSelector = groups.join( "," ); - - // Expand context for sibling selectors - newContext = rsibling.test( selector ) && testContext( context.parentNode ) || - context; - } - - if ( newSelector ) { - try { - push.apply( results, - newContext.querySelectorAll( newSelector ) - ); - return results; - } catch ( qsaError ) { - } finally { - if ( nid === expando ) { - context.removeAttribute( "id" ); - } - } - } - } - } - } - - // All others - return select( selector.replace( rtrim, "$1" ), context, results, seed ); -} - -/** - * Create key-value caches of limited size - * @returns {function(string, object)} Returns the Object data after storing it on itself with - * property name the (space-suffixed) string and (if the cache is larger than Expr.cacheLength) - * deleting the oldest entry - */ -function createCache() { - var keys = []; - - function cache( key, value ) { - // Use (key + " ") to avoid collision with native prototype properties (see Issue #157) - if ( keys.push( key + " " ) > Expr.cacheLength ) { - // Only keep the most recent entries - delete cache[ keys.shift() ]; - } - return (cache[ key + " " ] = value); - } - return cache; -} - -/** - * Mark a function for special use by Sizzle - * @param {Function} fn The function to mark - */ 
-function markFunction( fn ) { - fn[ expando ] = true; - return fn; -} - -/** - * Support testing using an element - * @param {Function} fn Passed the created element and returns a boolean result - */ -function assert( fn ) { - var el = document.createElement("fieldset"); - - try { - return !!fn( el ); - } catch (e) { - return false; - } finally { - // Remove from its parent by default - if ( el.parentNode ) { - el.parentNode.removeChild( el ); - } - // release memory in IE - el = null; - } -} - -/** - * Adds the same handler for all of the specified attrs - * @param {String} attrs Pipe-separated list of attributes - * @param {Function} handler The method that will be applied - */ -function addHandle( attrs, handler ) { - var arr = attrs.split("|"), - i = arr.length; - - while ( i-- ) { - Expr.attrHandle[ arr[i] ] = handler; - } -} - -/** - * Checks document order of two siblings - * @param {Element} a - * @param {Element} b - * @returns {Number} Returns less than 0 if a precedes b, greater than 0 if a follows b - */ -function siblingCheck( a, b ) { - var cur = b && a, - diff = cur && a.nodeType === 1 && b.nodeType === 1 && - a.sourceIndex - b.sourceIndex; - - // Use IE sourceIndex if available on both nodes - if ( diff ) { - return diff; - } - - // Check if b follows a - if ( cur ) { - while ( (cur = cur.nextSibling) ) { - if ( cur === b ) { - return -1; - } - } - } - - return a ? 1 : -1; -} - -/** - * Returns a function to use in pseudos for input types - * @param {String} type - */ -function createInputPseudo( type ) { - return function( elem ) { - var name = elem.nodeName.toLowerCase(); - return name === "input" && elem.type === type; - }; -} - -/** - * Returns a function to use in pseudos for buttons - * @param {String} type - */ -function createButtonPseudo( type ) { - return function( elem ) { - var name = elem.nodeName.toLowerCase(); - return (name === "input" || name === "button") && elem.type === type; - }; -} - -/** - * Returns a function to use in pseudos for :enabled/:disabled - * @param {Boolean} disabled true for :disabled; false for :enabled - */ -function createDisabledPseudo( disabled ) { - - // Known :disabled false positives: fieldset[disabled] > legend:nth-of-type(n+2) :can-disable - return function( elem ) { - - // Only certain elements can match :enabled or :disabled - // https://html.spec.whatwg.org/multipage/scripting.html#selector-enabled - // https://html.spec.whatwg.org/multipage/scripting.html#selector-disabled - if ( "form" in elem ) { - - // Check for inherited disabledness on relevant non-disabled elements: - // * listed form-associated elements in a disabled fieldset - // https://html.spec.whatwg.org/multipage/forms.html#category-listed - // https://html.spec.whatwg.org/multipage/forms.html#concept-fe-disabled - // * option elements in a disabled optgroup - // https://html.spec.whatwg.org/multipage/forms.html#concept-option-disabled - // All such elements have a "form" property. 
- if ( elem.parentNode && elem.disabled === false ) { - - // Option elements defer to a parent optgroup if present - if ( "label" in elem ) { - if ( "label" in elem.parentNode ) { - return elem.parentNode.disabled === disabled; - } else { - return elem.disabled === disabled; - } - } - - // Support: IE 6 - 11 - // Use the isDisabled shortcut property to check for disabled fieldset ancestors - return elem.isDisabled === disabled || - - // Where there is no isDisabled, check manually - /* jshint -W018 */ - elem.isDisabled !== !disabled && - disabledAncestor( elem ) === disabled; - } - - return elem.disabled === disabled; - - // Try to winnow out elements that can't be disabled before trusting the disabled property. - // Some victims get caught in our net (label, legend, menu, track), but it shouldn't - // even exist on them, let alone have a boolean value. - } else if ( "label" in elem ) { - return elem.disabled === disabled; - } - - // Remaining elements are neither :enabled nor :disabled - return false; - }; -} - -/** - * Returns a function to use in pseudos for positionals - * @param {Function} fn - */ -function createPositionalPseudo( fn ) { - return markFunction(function( argument ) { - argument = +argument; - return markFunction(function( seed, matches ) { - var j, - matchIndexes = fn( [], seed.length, argument ), - i = matchIndexes.length; - - // Match elements found at the specified indexes - while ( i-- ) { - if ( seed[ (j = matchIndexes[i]) ] ) { - seed[j] = !(matches[j] = seed[j]); - } - } - }); - }); -} - -/** - * Checks a node for validity as a Sizzle context - * @param {Element|Object=} context - * @returns {Element|Object|Boolean} The input node if acceptable, otherwise a falsy value - */ -function testContext( context ) { - return context && typeof context.getElementsByTagName !== "undefined" && context; -} - -// Expose support vars for convenience -support = Sizzle.support = {}; - -/** - * Detects XML nodes - * @param {Element|Object} elem An element or a document - * @returns {Boolean} True iff elem is a non-HTML XML node - */ -isXML = Sizzle.isXML = function( elem ) { - // documentElement is verified for cases where it doesn't yet exist - // (such as loading iframes in IE - #4833) - var documentElement = elem && (elem.ownerDocument || elem).documentElement; - return documentElement ? documentElement.nodeName !== "HTML" : false; -}; - -/** - * Sets document-related variables once based on the current document - * @param {Element|Object} [doc] An element or document object to use to set the document - * @returns {Object} Returns the current document - */ -setDocument = Sizzle.setDocument = function( node ) { - var hasCompare, subWindow, - doc = node ? 
node.ownerDocument || node : preferredDoc; - - // Return early if doc is invalid or already selected - if ( doc === document || doc.nodeType !== 9 || !doc.documentElement ) { - return document; - } - - // Update global variables - document = doc; - docElem = document.documentElement; - documentIsHTML = !isXML( document ); - - // Support: IE 9-11, Edge - // Accessing iframe documents after unload throws "permission denied" errors (jQuery #13936) - if ( preferredDoc !== document && - (subWindow = document.defaultView) && subWindow.top !== subWindow ) { - - // Support: IE 11, Edge - if ( subWindow.addEventListener ) { - subWindow.addEventListener( "unload", unloadHandler, false ); - - // Support: IE 9 - 10 only - } else if ( subWindow.attachEvent ) { - subWindow.attachEvent( "onunload", unloadHandler ); - } - } - - /* Attributes - ---------------------------------------------------------------------- */ - - // Support: IE<8 - // Verify that getAttribute really returns attributes and not properties - // (excepting IE8 booleans) - support.attributes = assert(function( el ) { - el.className = "i"; - return !el.getAttribute("className"); - }); - - /* getElement(s)By* - ---------------------------------------------------------------------- */ - - // Check if getElementsByTagName("*") returns only elements - support.getElementsByTagName = assert(function( el ) { - el.appendChild( document.createComment("") ); - return !el.getElementsByTagName("*").length; - }); - - // Support: IE<9 - support.getElementsByClassName = rnative.test( document.getElementsByClassName ); - - // Support: IE<10 - // Check if getElementById returns elements by name - // The broken getElementById methods don't pick up programmatically-set names, - // so use a roundabout getElementsByName test - support.getById = assert(function( el ) { - docElem.appendChild( el ).id = expando; - return !document.getElementsByName || !document.getElementsByName( expando ).length; - }); - - // ID filter and find - if ( support.getById ) { - Expr.filter["ID"] = function( id ) { - var attrId = id.replace( runescape, funescape ); - return function( elem ) { - return elem.getAttribute("id") === attrId; - }; - }; - Expr.find["ID"] = function( id, context ) { - if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { - var elem = context.getElementById( id ); - return elem ? [ elem ] : []; - } - }; - } else { - Expr.filter["ID"] = function( id ) { - var attrId = id.replace( runescape, funescape ); - return function( elem ) { - var node = typeof elem.getAttributeNode !== "undefined" && - elem.getAttributeNode("id"); - return node && node.value === attrId; - }; - }; - - // Support: IE 6 - 7 only - // getElementById is not reliable as a find shortcut - Expr.find["ID"] = function( id, context ) { - if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { - var node, i, elems, - elem = context.getElementById( id ); - - if ( elem ) { - - // Verify the id attribute - node = elem.getAttributeNode("id"); - if ( node && node.value === id ) { - return [ elem ]; - } - - // Fall back on getElementsByName - elems = context.getElementsByName( id ); - i = 0; - while ( (elem = elems[i++]) ) { - node = elem.getAttributeNode("id"); - if ( node && node.value === id ) { - return [ elem ]; - } - } - } - - return []; - } - }; - } - - // Tag - Expr.find["TAG"] = support.getElementsByTagName ? 
- function( tag, context ) { - if ( typeof context.getElementsByTagName !== "undefined" ) { - return context.getElementsByTagName( tag ); - - // DocumentFragment nodes don't have gEBTN - } else if ( support.qsa ) { - return context.querySelectorAll( tag ); - } - } : - - function( tag, context ) { - var elem, - tmp = [], - i = 0, - // By happy coincidence, a (broken) gEBTN appears on DocumentFragment nodes too - results = context.getElementsByTagName( tag ); - - // Filter out possible comments - if ( tag === "*" ) { - while ( (elem = results[i++]) ) { - if ( elem.nodeType === 1 ) { - tmp.push( elem ); - } - } - - return tmp; - } - return results; - }; - - // Class - Expr.find["CLASS"] = support.getElementsByClassName && function( className, context ) { - if ( typeof context.getElementsByClassName !== "undefined" && documentIsHTML ) { - return context.getElementsByClassName( className ); - } - }; - - /* QSA/matchesSelector - ---------------------------------------------------------------------- */ - - // QSA and matchesSelector support - - // matchesSelector(:active) reports false when true (IE9/Opera 11.5) - rbuggyMatches = []; - - // qSa(:focus) reports false when true (Chrome 21) - // We allow this because of a bug in IE8/9 that throws an error - // whenever `document.activeElement` is accessed on an iframe - // So, we allow :focus to pass through QSA all the time to avoid the IE error - // See https://bugs.jquery.com/ticket/13378 - rbuggyQSA = []; - - if ( (support.qsa = rnative.test( document.querySelectorAll )) ) { - // Build QSA regex - // Regex strategy adopted from Diego Perini - assert(function( el ) { - // Select is set to empty string on purpose - // This is to test IE's treatment of not explicitly - // setting a boolean content attribute, - // since its presence should be enough - // https://bugs.jquery.com/ticket/12359 - docElem.appendChild( el ).innerHTML = "" + - ""; - - // Support: IE8, Opera 11-12.16 - // Nothing should be selected when empty strings follow ^= or $= or *= - // The test attribute must be unknown in Opera but "safe" for WinRT - // https://msdn.microsoft.com/en-us/library/ie/hh465388.aspx#attribute_section - if ( el.querySelectorAll("[msallowcapture^='']").length ) { - rbuggyQSA.push( "[*^$]=" + whitespace + "*(?:''|\"\")" ); - } - - // Support: IE8 - // Boolean attributes and "value" are not treated correctly - if ( !el.querySelectorAll("[selected]").length ) { - rbuggyQSA.push( "\\[" + whitespace + "*(?:value|" + booleans + ")" ); - } - - // Support: Chrome<29, Android<4.4, Safari<7.0+, iOS<7.0+, PhantomJS<1.9.8+ - if ( !el.querySelectorAll( "[id~=" + expando + "-]" ).length ) { - rbuggyQSA.push("~="); - } - - // Webkit/Opera - :checked should return selected option elements - // https://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked - // IE8 throws error here and will not see later tests - if ( !el.querySelectorAll(":checked").length ) { - rbuggyQSA.push(":checked"); - } - - // Support: Safari 8+, iOS 8+ - // https://bugs.webkit.org/show_bug.cgi?id=136851 - // In-page `selector#id sibling-combinator selector` fails - if ( !el.querySelectorAll( "a#" + expando + "+*" ).length ) { - rbuggyQSA.push(".#.+[+~]"); - } - }); - - assert(function( el ) { - el.innerHTML = "" + - ""; - - // Support: Windows 8 Native Apps - // The type and name attributes are restricted during .innerHTML assignment - var input = document.createElement("input"); - input.setAttribute( "type", "hidden" ); - el.appendChild( input ).setAttribute( "name", "D" ); - - // Support: IE8 - 
// Enforce case-sensitivity of name attribute - if ( el.querySelectorAll("[name=d]").length ) { - rbuggyQSA.push( "name" + whitespace + "*[*^$|!~]?=" ); - } - - // FF 3.5 - :enabled/:disabled and hidden elements (hidden elements are still enabled) - // IE8 throws error here and will not see later tests - if ( el.querySelectorAll(":enabled").length !== 2 ) { - rbuggyQSA.push( ":enabled", ":disabled" ); - } - - // Support: IE9-11+ - // IE's :disabled selector does not pick up the children of disabled fieldsets - docElem.appendChild( el ).disabled = true; - if ( el.querySelectorAll(":disabled").length !== 2 ) { - rbuggyQSA.push( ":enabled", ":disabled" ); - } - - // Opera 10-11 does not throw on post-comma invalid pseudos - el.querySelectorAll("*,:x"); - rbuggyQSA.push(",.*:"); - }); - } - - if ( (support.matchesSelector = rnative.test( (matches = docElem.matches || - docElem.webkitMatchesSelector || - docElem.mozMatchesSelector || - docElem.oMatchesSelector || - docElem.msMatchesSelector) )) ) { - - assert(function( el ) { - // Check to see if it's possible to do matchesSelector - // on a disconnected node (IE 9) - support.disconnectedMatch = matches.call( el, "*" ); - - // This should fail with an exception - // Gecko does not error, returns false instead - matches.call( el, "[s!='']:x" ); - rbuggyMatches.push( "!=", pseudos ); - }); - } - - rbuggyQSA = rbuggyQSA.length && new RegExp( rbuggyQSA.join("|") ); - rbuggyMatches = rbuggyMatches.length && new RegExp( rbuggyMatches.join("|") ); - - /* Contains - ---------------------------------------------------------------------- */ - hasCompare = rnative.test( docElem.compareDocumentPosition ); - - // Element contains another - // Purposefully self-exclusive - // As in, an element does not contain itself - contains = hasCompare || rnative.test( docElem.contains ) ? - function( a, b ) { - var adown = a.nodeType === 9 ? a.documentElement : a, - bup = b && b.parentNode; - return a === bup || !!( bup && bup.nodeType === 1 && ( - adown.contains ? - adown.contains( bup ) : - a.compareDocumentPosition && a.compareDocumentPosition( bup ) & 16 - )); - } : - function( a, b ) { - if ( b ) { - while ( (b = b.parentNode) ) { - if ( b === a ) { - return true; - } - } - } - return false; - }; - - /* Sorting - ---------------------------------------------------------------------- */ - - // Document order sorting - sortOrder = hasCompare ? - function( a, b ) { - - // Flag for duplicate removal - if ( a === b ) { - hasDuplicate = true; - return 0; - } - - // Sort on method existence if only one input has compareDocumentPosition - var compare = !a.compareDocumentPosition - !b.compareDocumentPosition; - if ( compare ) { - return compare; - } - - // Calculate position if both inputs belong to the same document - compare = ( a.ownerDocument || a ) === ( b.ownerDocument || b ) ? - a.compareDocumentPosition( b ) : - - // Otherwise we know they are disconnected - 1; - - // Disconnected nodes - if ( compare & 1 || - (!support.sortDetached && b.compareDocumentPosition( a ) === compare) ) { - - // Choose the first element that is related to our preferred document - if ( a === document || a.ownerDocument === preferredDoc && contains(preferredDoc, a) ) { - return -1; - } - if ( b === document || b.ownerDocument === preferredDoc && contains(preferredDoc, b) ) { - return 1; - } - - // Maintain original order - return sortInput ? - ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : - 0; - } - - return compare & 4 ? 
-1 : 1; - } : - function( a, b ) { - // Exit early if the nodes are identical - if ( a === b ) { - hasDuplicate = true; - return 0; - } - - var cur, - i = 0, - aup = a.parentNode, - bup = b.parentNode, - ap = [ a ], - bp = [ b ]; - - // Parentless nodes are either documents or disconnected - if ( !aup || !bup ) { - return a === document ? -1 : - b === document ? 1 : - aup ? -1 : - bup ? 1 : - sortInput ? - ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : - 0; - - // If the nodes are siblings, we can do a quick check - } else if ( aup === bup ) { - return siblingCheck( a, b ); - } - - // Otherwise we need full lists of their ancestors for comparison - cur = a; - while ( (cur = cur.parentNode) ) { - ap.unshift( cur ); - } - cur = b; - while ( (cur = cur.parentNode) ) { - bp.unshift( cur ); - } - - // Walk down the tree looking for a discrepancy - while ( ap[i] === bp[i] ) { - i++; - } - - return i ? - // Do a sibling check if the nodes have a common ancestor - siblingCheck( ap[i], bp[i] ) : - - // Otherwise nodes in our document sort first - ap[i] === preferredDoc ? -1 : - bp[i] === preferredDoc ? 1 : - 0; - }; - - return document; -}; - -Sizzle.matches = function( expr, elements ) { - return Sizzle( expr, null, null, elements ); -}; - -Sizzle.matchesSelector = function( elem, expr ) { - // Set document vars if needed - if ( ( elem.ownerDocument || elem ) !== document ) { - setDocument( elem ); - } - - // Make sure that attribute selectors are quoted - expr = expr.replace( rattributeQuotes, "='$1']" ); - - if ( support.matchesSelector && documentIsHTML && - !compilerCache[ expr + " " ] && - ( !rbuggyMatches || !rbuggyMatches.test( expr ) ) && - ( !rbuggyQSA || !rbuggyQSA.test( expr ) ) ) { - - try { - var ret = matches.call( elem, expr ); - - // IE 9's matchesSelector returns false on disconnected nodes - if ( ret || support.disconnectedMatch || - // As well, disconnected nodes are said to be in a document - // fragment in IE 9 - elem.document && elem.document.nodeType !== 11 ) { - return ret; - } - } catch (e) {} - } - - return Sizzle( expr, document, null, [ elem ] ).length > 0; -}; - -Sizzle.contains = function( context, elem ) { - // Set document vars if needed - if ( ( context.ownerDocument || context ) !== document ) { - setDocument( context ); - } - return contains( context, elem ); -}; - -Sizzle.attr = function( elem, name ) { - // Set document vars if needed - if ( ( elem.ownerDocument || elem ) !== document ) { - setDocument( elem ); - } - - var fn = Expr.attrHandle[ name.toLowerCase() ], - // Don't get fooled by Object.prototype properties (jQuery #13807) - val = fn && hasOwn.call( Expr.attrHandle, name.toLowerCase() ) ? - fn( elem, name, !documentIsHTML ) : - undefined; - - return val !== undefined ? - val : - support.attributes || !documentIsHTML ? - elem.getAttribute( name ) : - (val = elem.getAttributeNode(name)) && val.specified ? 
- val.value : - null; -}; - -Sizzle.escape = function( sel ) { - return (sel + "").replace( rcssescape, fcssescape ); -}; - -Sizzle.error = function( msg ) { - throw new Error( "Syntax error, unrecognized expression: " + msg ); -}; - -/** - * Document sorting and removing duplicates - * @param {ArrayLike} results - */ -Sizzle.uniqueSort = function( results ) { - var elem, - duplicates = [], - j = 0, - i = 0; - - // Unless we *know* we can detect duplicates, assume their presence - hasDuplicate = !support.detectDuplicates; - sortInput = !support.sortStable && results.slice( 0 ); - results.sort( sortOrder ); - - if ( hasDuplicate ) { - while ( (elem = results[i++]) ) { - if ( elem === results[ i ] ) { - j = duplicates.push( i ); - } - } - while ( j-- ) { - results.splice( duplicates[ j ], 1 ); - } - } - - // Clear input after sorting to release objects - // See https://github.com/jquery/sizzle/pull/225 - sortInput = null; - - return results; -}; - -/** - * Utility function for retrieving the text value of an array of DOM nodes - * @param {Array|Element} elem - */ -getText = Sizzle.getText = function( elem ) { - var node, - ret = "", - i = 0, - nodeType = elem.nodeType; - - if ( !nodeType ) { - // If no nodeType, this is expected to be an array - while ( (node = elem[i++]) ) { - // Do not traverse comment nodes - ret += getText( node ); - } - } else if ( nodeType === 1 || nodeType === 9 || nodeType === 11 ) { - // Use textContent for elements - // innerText usage removed for consistency of new lines (jQuery #11153) - if ( typeof elem.textContent === "string" ) { - return elem.textContent; - } else { - // Traverse its children - for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { - ret += getText( elem ); - } - } - } else if ( nodeType === 3 || nodeType === 4 ) { - return elem.nodeValue; - } - // Do not include comment or processing instruction nodes - - return ret; -}; - -Expr = Sizzle.selectors = { - - // Can be adjusted by the user - cacheLength: 50, - - createPseudo: markFunction, - - match: matchExpr, - - attrHandle: {}, - - find: {}, - - relative: { - ">": { dir: "parentNode", first: true }, - " ": { dir: "parentNode" }, - "+": { dir: "previousSibling", first: true }, - "~": { dir: "previousSibling" } - }, - - preFilter: { - "ATTR": function( match ) { - match[1] = match[1].replace( runescape, funescape ); - - // Move the given value to match[3] whether quoted or unquoted - match[3] = ( match[3] || match[4] || match[5] || "" ).replace( runescape, funescape ); - - if ( match[2] === "~=" ) { - match[3] = " " + match[3] + " "; - } - - return match.slice( 0, 4 ); - }, - - "CHILD": function( match ) { - /* matches from matchExpr["CHILD"] - 1 type (only|nth|...) - 2 what (child|of-type) - 3 argument (even|odd|\d*|\d*n([+-]\d+)?|...) - 4 xn-component of xn+y argument ([+-]?\d*n|) - 5 sign of xn-component - 6 x of xn-component - 7 sign of y-component - 8 y of y-component - */ - match[1] = match[1].toLowerCase(); - - if ( match[1].slice( 0, 3 ) === "nth" ) { - // nth-* requires argument - if ( !match[3] ) { - Sizzle.error( match[0] ); - } - - // numeric x and y parameters for Expr.filter.CHILD - // remember that false/true cast respectively to 0/1 - match[4] = +( match[4] ? 
match[5] + (match[6] || 1) : 2 * ( match[3] === "even" || match[3] === "odd" ) ); - match[5] = +( ( match[7] + match[8] ) || match[3] === "odd" ); - - // other types prohibit arguments - } else if ( match[3] ) { - Sizzle.error( match[0] ); - } - - return match; - }, - - "PSEUDO": function( match ) { - var excess, - unquoted = !match[6] && match[2]; - - if ( matchExpr["CHILD"].test( match[0] ) ) { - return null; - } - - // Accept quoted arguments as-is - if ( match[3] ) { - match[2] = match[4] || match[5] || ""; - - // Strip excess characters from unquoted arguments - } else if ( unquoted && rpseudo.test( unquoted ) && - // Get excess from tokenize (recursively) - (excess = tokenize( unquoted, true )) && - // advance to the next closing parenthesis - (excess = unquoted.indexOf( ")", unquoted.length - excess ) - unquoted.length) ) { - - // excess is a negative index - match[0] = match[0].slice( 0, excess ); - match[2] = unquoted.slice( 0, excess ); - } - - // Return only captures needed by the pseudo filter method (type and argument) - return match.slice( 0, 3 ); - } - }, - - filter: { - - "TAG": function( nodeNameSelector ) { - var nodeName = nodeNameSelector.replace( runescape, funescape ).toLowerCase(); - return nodeNameSelector === "*" ? - function() { return true; } : - function( elem ) { - return elem.nodeName && elem.nodeName.toLowerCase() === nodeName; - }; - }, - - "CLASS": function( className ) { - var pattern = classCache[ className + " " ]; - - return pattern || - (pattern = new RegExp( "(^|" + whitespace + ")" + className + "(" + whitespace + "|$)" )) && - classCache( className, function( elem ) { - return pattern.test( typeof elem.className === "string" && elem.className || typeof elem.getAttribute !== "undefined" && elem.getAttribute("class") || "" ); - }); - }, - - "ATTR": function( name, operator, check ) { - return function( elem ) { - var result = Sizzle.attr( elem, name ); - - if ( result == null ) { - return operator === "!="; - } - if ( !operator ) { - return true; - } - - result += ""; - - return operator === "=" ? result === check : - operator === "!=" ? result !== check : - operator === "^=" ? check && result.indexOf( check ) === 0 : - operator === "*=" ? check && result.indexOf( check ) > -1 : - operator === "$=" ? check && result.slice( -check.length ) === check : - operator === "~=" ? ( " " + result.replace( rwhitespace, " " ) + " " ).indexOf( check ) > -1 : - operator === "|=" ? result === check || result.slice( 0, check.length + 1 ) === check + "-" : - false; - }; - }, - - "CHILD": function( type, what, argument, first, last ) { - var simple = type.slice( 0, 3 ) !== "nth", - forward = type.slice( -4 ) !== "last", - ofType = what === "of-type"; - - return first === 1 && last === 0 ? - - // Shortcut for :nth-*(n) - function( elem ) { - return !!elem.parentNode; - } : - - function( elem, context, xml ) { - var cache, uniqueCache, outerCache, node, nodeIndex, start, - dir = simple !== forward ? "nextSibling" : "previousSibling", - parent = elem.parentNode, - name = ofType && elem.nodeName.toLowerCase(), - useCache = !xml && !ofType, - diff = false; - - if ( parent ) { - - // :(first|last|only)-(child|of-type) - if ( simple ) { - while ( dir ) { - node = elem; - while ( (node = node[ dir ]) ) { - if ( ofType ? 
- node.nodeName.toLowerCase() === name : - node.nodeType === 1 ) { - - return false; - } - } - // Reverse direction for :only-* (if we haven't yet done so) - start = dir = type === "only" && !start && "nextSibling"; - } - return true; - } - - start = [ forward ? parent.firstChild : parent.lastChild ]; - - // non-xml :nth-child(...) stores cache data on `parent` - if ( forward && useCache ) { - - // Seek `elem` from a previously-cached index - - // ...in a gzip-friendly way - node = parent; - outerCache = node[ expando ] || (node[ expando ] = {}); - - // Support: IE <9 only - // Defend against cloned attroperties (jQuery gh-1709) - uniqueCache = outerCache[ node.uniqueID ] || - (outerCache[ node.uniqueID ] = {}); - - cache = uniqueCache[ type ] || []; - nodeIndex = cache[ 0 ] === dirruns && cache[ 1 ]; - diff = nodeIndex && cache[ 2 ]; - node = nodeIndex && parent.childNodes[ nodeIndex ]; - - while ( (node = ++nodeIndex && node && node[ dir ] || - - // Fallback to seeking `elem` from the start - (diff = nodeIndex = 0) || start.pop()) ) { - - // When found, cache indexes on `parent` and break - if ( node.nodeType === 1 && ++diff && node === elem ) { - uniqueCache[ type ] = [ dirruns, nodeIndex, diff ]; - break; - } - } - - } else { - // Use previously-cached element index if available - if ( useCache ) { - // ...in a gzip-friendly way - node = elem; - outerCache = node[ expando ] || (node[ expando ] = {}); - - // Support: IE <9 only - // Defend against cloned attroperties (jQuery gh-1709) - uniqueCache = outerCache[ node.uniqueID ] || - (outerCache[ node.uniqueID ] = {}); - - cache = uniqueCache[ type ] || []; - nodeIndex = cache[ 0 ] === dirruns && cache[ 1 ]; - diff = nodeIndex; - } - - // xml :nth-child(...) - // or :nth-last-child(...) or :nth(-last)?-of-type(...) - if ( diff === false ) { - // Use the same loop as above to seek `elem` from the start - while ( (node = ++nodeIndex && node && node[ dir ] || - (diff = nodeIndex = 0) || start.pop()) ) { - - if ( ( ofType ? - node.nodeName.toLowerCase() === name : - node.nodeType === 1 ) && - ++diff ) { - - // Cache the index of each encountered element - if ( useCache ) { - outerCache = node[ expando ] || (node[ expando ] = {}); - - // Support: IE <9 only - // Defend against cloned attroperties (jQuery gh-1709) - uniqueCache = outerCache[ node.uniqueID ] || - (outerCache[ node.uniqueID ] = {}); - - uniqueCache[ type ] = [ dirruns, diff ]; - } - - if ( node === elem ) { - break; - } - } - } - } - } - - // Incorporate the offset, then check against cycle size - diff -= last; - return diff === first || ( diff % first === 0 && diff / first >= 0 ); - } - }; - }, - - "PSEUDO": function( pseudo, argument ) { - // pseudo-class names are case-insensitive - // https://www.w3.org/TR/selectors/#pseudo-classes - // Prioritize by case sensitivity in case custom pseudos are added with uppercase letters - // Remember that setFilters inherits from pseudos - var args, - fn = Expr.pseudos[ pseudo ] || Expr.setFilters[ pseudo.toLowerCase() ] || - Sizzle.error( "unsupported pseudo: " + pseudo ); - - // The user may use createPseudo to indicate that - // arguments are needed to create the filter function - // just as Sizzle does - if ( fn[ expando ] ) { - return fn( argument ); - } - - // But maintain support for old signatures - if ( fn.length > 1 ) { - args = [ pseudo, pseudo, "", argument ]; - return Expr.setFilters.hasOwnProperty( pseudo.toLowerCase() ) ? 
- markFunction(function( seed, matches ) { - var idx, - matched = fn( seed, argument ), - i = matched.length; - while ( i-- ) { - idx = indexOf( seed, matched[i] ); - seed[ idx ] = !( matches[ idx ] = matched[i] ); - } - }) : - function( elem ) { - return fn( elem, 0, args ); - }; - } - - return fn; - } - }, - - pseudos: { - // Potentially complex pseudos - "not": markFunction(function( selector ) { - // Trim the selector passed to compile - // to avoid treating leading and trailing - // spaces as combinators - var input = [], - results = [], - matcher = compile( selector.replace( rtrim, "$1" ) ); - - return matcher[ expando ] ? - markFunction(function( seed, matches, context, xml ) { - var elem, - unmatched = matcher( seed, null, xml, [] ), - i = seed.length; - - // Match elements unmatched by `matcher` - while ( i-- ) { - if ( (elem = unmatched[i]) ) { - seed[i] = !(matches[i] = elem); - } - } - }) : - function( elem, context, xml ) { - input[0] = elem; - matcher( input, null, xml, results ); - // Don't keep the element (issue #299) - input[0] = null; - return !results.pop(); - }; - }), - - "has": markFunction(function( selector ) { - return function( elem ) { - return Sizzle( selector, elem ).length > 0; - }; - }), - - "contains": markFunction(function( text ) { - text = text.replace( runescape, funescape ); - return function( elem ) { - return ( elem.textContent || elem.innerText || getText( elem ) ).indexOf( text ) > -1; - }; - }), - - // "Whether an element is represented by a :lang() selector - // is based solely on the element's language value - // being equal to the identifier C, - // or beginning with the identifier C immediately followed by "-". - // The matching of C against the element's language value is performed case-insensitively. - // The identifier C does not have to be a valid language name." - // https://www.w3.org/TR/selectors/#lang-pseudo - "lang": markFunction( function( lang ) { - // lang value must be a valid identifier - if ( !ridentifier.test(lang || "") ) { - Sizzle.error( "unsupported lang: " + lang ); - } - lang = lang.replace( runescape, funescape ).toLowerCase(); - return function( elem ) { - var elemLang; - do { - if ( (elemLang = documentIsHTML ? 
- elem.lang : - elem.getAttribute("xml:lang") || elem.getAttribute("lang")) ) { - - elemLang = elemLang.toLowerCase(); - return elemLang === lang || elemLang.indexOf( lang + "-" ) === 0; - } - } while ( (elem = elem.parentNode) && elem.nodeType === 1 ); - return false; - }; - }), - - // Miscellaneous - "target": function( elem ) { - var hash = window.location && window.location.hash; - return hash && hash.slice( 1 ) === elem.id; - }, - - "root": function( elem ) { - return elem === docElem; - }, - - "focus": function( elem ) { - return elem === document.activeElement && (!document.hasFocus || document.hasFocus()) && !!(elem.type || elem.href || ~elem.tabIndex); - }, - - // Boolean properties - "enabled": createDisabledPseudo( false ), - "disabled": createDisabledPseudo( true ), - - "checked": function( elem ) { - // In CSS3, :checked should return both checked and selected elements - // https://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked - var nodeName = elem.nodeName.toLowerCase(); - return (nodeName === "input" && !!elem.checked) || (nodeName === "option" && !!elem.selected); - }, - - "selected": function( elem ) { - // Accessing this property makes selected-by-default - // options in Safari work properly - if ( elem.parentNode ) { - elem.parentNode.selectedIndex; - } - - return elem.selected === true; - }, - - // Contents - "empty": function( elem ) { - // https://www.w3.org/TR/selectors/#empty-pseudo - // :empty is negated by element (1) or content nodes (text: 3; cdata: 4; entity ref: 5), - // but not by others (comment: 8; processing instruction: 7; etc.) - // nodeType < 6 works because attributes (2) do not appear as children - for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { - if ( elem.nodeType < 6 ) { - return false; - } - } - return true; - }, - - "parent": function( elem ) { - return !Expr.pseudos["empty"]( elem ); - }, - - // Element/input types - "header": function( elem ) { - return rheader.test( elem.nodeName ); - }, - - "input": function( elem ) { - return rinputs.test( elem.nodeName ); - }, - - "button": function( elem ) { - var name = elem.nodeName.toLowerCase(); - return name === "input" && elem.type === "button" || name === "button"; - }, - - "text": function( elem ) { - var attr; - return elem.nodeName.toLowerCase() === "input" && - elem.type === "text" && - - // Support: IE<8 - // New HTML5 attribute values (e.g., "search") appear with elem.type === "text" - ( (attr = elem.getAttribute("type")) == null || attr.toLowerCase() === "text" ); - }, - - // Position-in-collection - "first": createPositionalPseudo(function() { - return [ 0 ]; - }), - - "last": createPositionalPseudo(function( matchIndexes, length ) { - return [ length - 1 ]; - }), - - "eq": createPositionalPseudo(function( matchIndexes, length, argument ) { - return [ argument < 0 ? argument + length : argument ]; - }), - - "even": createPositionalPseudo(function( matchIndexes, length ) { - var i = 0; - for ( ; i < length; i += 2 ) { - matchIndexes.push( i ); - } - return matchIndexes; - }), - - "odd": createPositionalPseudo(function( matchIndexes, length ) { - var i = 1; - for ( ; i < length; i += 2 ) { - matchIndexes.push( i ); - } - return matchIndexes; - }), - - "lt": createPositionalPseudo(function( matchIndexes, length, argument ) { - var i = argument < 0 ? argument + length : argument; - for ( ; --i >= 0; ) { - matchIndexes.push( i ); - } - return matchIndexes; - }), - - "gt": createPositionalPseudo(function( matchIndexes, length, argument ) { - var i = argument < 0 ? 
argument + length : argument; - for ( ; ++i < length; ) { - matchIndexes.push( i ); - } - return matchIndexes; - }) - } -}; - -Expr.pseudos["nth"] = Expr.pseudos["eq"]; - -// Add button/input type pseudos -for ( i in { radio: true, checkbox: true, file: true, password: true, image: true } ) { - Expr.pseudos[ i ] = createInputPseudo( i ); -} -for ( i in { submit: true, reset: true } ) { - Expr.pseudos[ i ] = createButtonPseudo( i ); -} - -// Easy API for creating new setFilters -function setFilters() {} -setFilters.prototype = Expr.filters = Expr.pseudos; -Expr.setFilters = new setFilters(); - -tokenize = Sizzle.tokenize = function( selector, parseOnly ) { - var matched, match, tokens, type, - soFar, groups, preFilters, - cached = tokenCache[ selector + " " ]; - - if ( cached ) { - return parseOnly ? 0 : cached.slice( 0 ); - } - - soFar = selector; - groups = []; - preFilters = Expr.preFilter; - - while ( soFar ) { - - // Comma and first run - if ( !matched || (match = rcomma.exec( soFar )) ) { - if ( match ) { - // Don't consume trailing commas as valid - soFar = soFar.slice( match[0].length ) || soFar; - } - groups.push( (tokens = []) ); - } - - matched = false; - - // Combinators - if ( (match = rcombinators.exec( soFar )) ) { - matched = match.shift(); - tokens.push({ - value: matched, - // Cast descendant combinators to space - type: match[0].replace( rtrim, " " ) - }); - soFar = soFar.slice( matched.length ); - } - - // Filters - for ( type in Expr.filter ) { - if ( (match = matchExpr[ type ].exec( soFar )) && (!preFilters[ type ] || - (match = preFilters[ type ]( match ))) ) { - matched = match.shift(); - tokens.push({ - value: matched, - type: type, - matches: match - }); - soFar = soFar.slice( matched.length ); - } - } - - if ( !matched ) { - break; - } - } - - // Return the length of the invalid excess - // if we're just parsing - // Otherwise, throw an error or return tokens - return parseOnly ? - soFar.length : - soFar ? - Sizzle.error( selector ) : - // Cache the tokens - tokenCache( selector, groups ).slice( 0 ); -}; - -function toSelector( tokens ) { - var i = 0, - len = tokens.length, - selector = ""; - for ( ; i < len; i++ ) { - selector += tokens[i].value; - } - return selector; -} - -function addCombinator( matcher, combinator, base ) { - var dir = combinator.dir, - skip = combinator.next, - key = skip || dir, - checkNonElements = base && key === "parentNode", - doneName = done++; - - return combinator.first ? 
- // Check against closest ancestor/preceding element - function( elem, context, xml ) { - while ( (elem = elem[ dir ]) ) { - if ( elem.nodeType === 1 || checkNonElements ) { - return matcher( elem, context, xml ); - } - } - return false; - } : - - // Check against all ancestor/preceding elements - function( elem, context, xml ) { - var oldCache, uniqueCache, outerCache, - newCache = [ dirruns, doneName ]; - - // We can't set arbitrary data on XML nodes, so they don't benefit from combinator caching - if ( xml ) { - while ( (elem = elem[ dir ]) ) { - if ( elem.nodeType === 1 || checkNonElements ) { - if ( matcher( elem, context, xml ) ) { - return true; - } - } - } - } else { - while ( (elem = elem[ dir ]) ) { - if ( elem.nodeType === 1 || checkNonElements ) { - outerCache = elem[ expando ] || (elem[ expando ] = {}); - - // Support: IE <9 only - // Defend against cloned attroperties (jQuery gh-1709) - uniqueCache = outerCache[ elem.uniqueID ] || (outerCache[ elem.uniqueID ] = {}); - - if ( skip && skip === elem.nodeName.toLowerCase() ) { - elem = elem[ dir ] || elem; - } else if ( (oldCache = uniqueCache[ key ]) && - oldCache[ 0 ] === dirruns && oldCache[ 1 ] === doneName ) { - - // Assign to newCache so results back-propagate to previous elements - return (newCache[ 2 ] = oldCache[ 2 ]); - } else { - // Reuse newcache so results back-propagate to previous elements - uniqueCache[ key ] = newCache; - - // A match means we're done; a fail means we have to keep checking - if ( (newCache[ 2 ] = matcher( elem, context, xml )) ) { - return true; - } - } - } - } - } - return false; - }; -} - -function elementMatcher( matchers ) { - return matchers.length > 1 ? - function( elem, context, xml ) { - var i = matchers.length; - while ( i-- ) { - if ( !matchers[i]( elem, context, xml ) ) { - return false; - } - } - return true; - } : - matchers[0]; -} - -function multipleContexts( selector, contexts, results ) { - var i = 0, - len = contexts.length; - for ( ; i < len; i++ ) { - Sizzle( selector, contexts[i], results ); - } - return results; -} - -function condense( unmatched, map, filter, context, xml ) { - var elem, - newUnmatched = [], - i = 0, - len = unmatched.length, - mapped = map != null; - - for ( ; i < len; i++ ) { - if ( (elem = unmatched[i]) ) { - if ( !filter || filter( elem, context, xml ) ) { - newUnmatched.push( elem ); - if ( mapped ) { - map.push( i ); - } - } - } - } - - return newUnmatched; -} - -function setMatcher( preFilter, selector, matcher, postFilter, postFinder, postSelector ) { - if ( postFilter && !postFilter[ expando ] ) { - postFilter = setMatcher( postFilter ); - } - if ( postFinder && !postFinder[ expando ] ) { - postFinder = setMatcher( postFinder, postSelector ); - } - return markFunction(function( seed, results, context, xml ) { - var temp, i, elem, - preMap = [], - postMap = [], - preexisting = results.length, - - // Get initial elements from seed or context - elems = seed || multipleContexts( selector || "*", context.nodeType ? [ context ] : context, [] ), - - // Prefilter to get matcher input, preserving a map for seed-results synchronization - matcherIn = preFilter && ( seed || !selector ) ? - condense( elems, preMap, preFilter, context, xml ) : - elems, - - matcherOut = matcher ? - // If we have a postFinder, or filtered seed, or non-seed postFilter or preexisting results, - postFinder || ( seed ? preFilter : preexisting || postFilter ) ? 
- - // ...intermediate processing is necessary - [] : - - // ...otherwise use results directly - results : - matcherIn; - - // Find primary matches - if ( matcher ) { - matcher( matcherIn, matcherOut, context, xml ); - } - - // Apply postFilter - if ( postFilter ) { - temp = condense( matcherOut, postMap ); - postFilter( temp, [], context, xml ); - - // Un-match failing elements by moving them back to matcherIn - i = temp.length; - while ( i-- ) { - if ( (elem = temp[i]) ) { - matcherOut[ postMap[i] ] = !(matcherIn[ postMap[i] ] = elem); - } - } - } - - if ( seed ) { - if ( postFinder || preFilter ) { - if ( postFinder ) { - // Get the final matcherOut by condensing this intermediate into postFinder contexts - temp = []; - i = matcherOut.length; - while ( i-- ) { - if ( (elem = matcherOut[i]) ) { - // Restore matcherIn since elem is not yet a final match - temp.push( (matcherIn[i] = elem) ); - } - } - postFinder( null, (matcherOut = []), temp, xml ); - } - - // Move matched elements from seed to results to keep them synchronized - i = matcherOut.length; - while ( i-- ) { - if ( (elem = matcherOut[i]) && - (temp = postFinder ? indexOf( seed, elem ) : preMap[i]) > -1 ) { - - seed[temp] = !(results[temp] = elem); - } - } - } - - // Add elements to results, through postFinder if defined - } else { - matcherOut = condense( - matcherOut === results ? - matcherOut.splice( preexisting, matcherOut.length ) : - matcherOut - ); - if ( postFinder ) { - postFinder( null, results, matcherOut, xml ); - } else { - push.apply( results, matcherOut ); - } - } - }); -} - -function matcherFromTokens( tokens ) { - var checkContext, matcher, j, - len = tokens.length, - leadingRelative = Expr.relative[ tokens[0].type ], - implicitRelative = leadingRelative || Expr.relative[" "], - i = leadingRelative ? 1 : 0, - - // The foundational matcher ensures that elements are reachable from top-level context(s) - matchContext = addCombinator( function( elem ) { - return elem === checkContext; - }, implicitRelative, true ), - matchAnyContext = addCombinator( function( elem ) { - return indexOf( checkContext, elem ) > -1; - }, implicitRelative, true ), - matchers = [ function( elem, context, xml ) { - var ret = ( !leadingRelative && ( xml || context !== outermostContext ) ) || ( - (checkContext = context).nodeType ? - matchContext( elem, context, xml ) : - matchAnyContext( elem, context, xml ) ); - // Avoid hanging onto element (issue #299) - checkContext = null; - return ret; - } ]; - - for ( ; i < len; i++ ) { - if ( (matcher = Expr.relative[ tokens[i].type ]) ) { - matchers = [ addCombinator(elementMatcher( matchers ), matcher) ]; - } else { - matcher = Expr.filter[ tokens[i].type ].apply( null, tokens[i].matches ); - - // Return special upon seeing a positional matcher - if ( matcher[ expando ] ) { - // Find the next relative operator (if any) for proper handling - j = ++i; - for ( ; j < len; j++ ) { - if ( Expr.relative[ tokens[j].type ] ) { - break; - } - } - return setMatcher( - i > 1 && elementMatcher( matchers ), - i > 1 && toSelector( - // If the preceding token was a descendant combinator, insert an implicit any-element `*` - tokens.slice( 0, i - 1 ).concat({ value: tokens[ i - 2 ].type === " " ? 
"*" : "" }) - ).replace( rtrim, "$1" ), - matcher, - i < j && matcherFromTokens( tokens.slice( i, j ) ), - j < len && matcherFromTokens( (tokens = tokens.slice( j )) ), - j < len && toSelector( tokens ) - ); - } - matchers.push( matcher ); - } - } - - return elementMatcher( matchers ); -} - -function matcherFromGroupMatchers( elementMatchers, setMatchers ) { - var bySet = setMatchers.length > 0, - byElement = elementMatchers.length > 0, - superMatcher = function( seed, context, xml, results, outermost ) { - var elem, j, matcher, - matchedCount = 0, - i = "0", - unmatched = seed && [], - setMatched = [], - contextBackup = outermostContext, - // We must always have either seed elements or outermost context - elems = seed || byElement && Expr.find["TAG"]( "*", outermost ), - // Use integer dirruns iff this is the outermost matcher - dirrunsUnique = (dirruns += contextBackup == null ? 1 : Math.random() || 0.1), - len = elems.length; - - if ( outermost ) { - outermostContext = context === document || context || outermost; - } - - // Add elements passing elementMatchers directly to results - // Support: IE<9, Safari - // Tolerate NodeList properties (IE: "length"; Safari: ) matching elements by id - for ( ; i !== len && (elem = elems[i]) != null; i++ ) { - if ( byElement && elem ) { - j = 0; - if ( !context && elem.ownerDocument !== document ) { - setDocument( elem ); - xml = !documentIsHTML; - } - while ( (matcher = elementMatchers[j++]) ) { - if ( matcher( elem, context || document, xml) ) { - results.push( elem ); - break; - } - } - if ( outermost ) { - dirruns = dirrunsUnique; - } - } - - // Track unmatched elements for set filters - if ( bySet ) { - // They will have gone through all possible matchers - if ( (elem = !matcher && elem) ) { - matchedCount--; - } - - // Lengthen the array for every element, matched or not - if ( seed ) { - unmatched.push( elem ); - } - } - } - - // `i` is now the count of elements visited above, and adding it to `matchedCount` - // makes the latter nonnegative. - matchedCount += i; - - // Apply set filters to unmatched elements - // NOTE: This can be skipped if there are no unmatched elements (i.e., `matchedCount` - // equals `i`), unless we didn't visit _any_ elements in the above loop because we have - // no element matchers and no seed. - // Incrementing an initially-string "0" `i` allows `i` to remain a string only in that - // case, which will result in a "00" `matchedCount` that differs from `i` but is also - // numerically zero. - if ( bySet && i !== matchedCount ) { - j = 0; - while ( (matcher = setMatchers[j++]) ) { - matcher( unmatched, setMatched, context, xml ); - } - - if ( seed ) { - // Reintegrate element matches to eliminate the need for sorting - if ( matchedCount > 0 ) { - while ( i-- ) { - if ( !(unmatched[i] || setMatched[i]) ) { - setMatched[i] = pop.call( results ); - } - } - } - - // Discard index placeholder values to get only actual matches - setMatched = condense( setMatched ); - } - - // Add matches to results - push.apply( results, setMatched ); - - // Seedless set matches succeeding multiple successful matchers stipulate sorting - if ( outermost && !seed && setMatched.length > 0 && - ( matchedCount + setMatchers.length ) > 1 ) { - - Sizzle.uniqueSort( results ); - } - } - - // Override manipulation of globals by nested matchers - if ( outermost ) { - dirruns = dirrunsUnique; - outermostContext = contextBackup; - } - - return unmatched; - }; - - return bySet ? 
- markFunction( superMatcher ) : - superMatcher; -} - -compile = Sizzle.compile = function( selector, match /* Internal Use Only */ ) { - var i, - setMatchers = [], - elementMatchers = [], - cached = compilerCache[ selector + " " ]; - - if ( !cached ) { - // Generate a function of recursive functions that can be used to check each element - if ( !match ) { - match = tokenize( selector ); - } - i = match.length; - while ( i-- ) { - cached = matcherFromTokens( match[i] ); - if ( cached[ expando ] ) { - setMatchers.push( cached ); - } else { - elementMatchers.push( cached ); - } - } - - // Cache the compiled function - cached = compilerCache( selector, matcherFromGroupMatchers( elementMatchers, setMatchers ) ); - - // Save selector and tokenization - cached.selector = selector; - } - return cached; -}; - -/** - * A low-level selection function that works with Sizzle's compiled - * selector functions - * @param {String|Function} selector A selector or a pre-compiled - * selector function built with Sizzle.compile - * @param {Element} context - * @param {Array} [results] - * @param {Array} [seed] A set of elements to match against - */ -select = Sizzle.select = function( selector, context, results, seed ) { - var i, tokens, token, type, find, - compiled = typeof selector === "function" && selector, - match = !seed && tokenize( (selector = compiled.selector || selector) ); - - results = results || []; - - // Try to minimize operations if there is only one selector in the list and no seed - // (the latter of which guarantees us context) - if ( match.length === 1 ) { - - // Reduce context if the leading compound selector is an ID - tokens = match[0] = match[0].slice( 0 ); - if ( tokens.length > 2 && (token = tokens[0]).type === "ID" && - context.nodeType === 9 && documentIsHTML && Expr.relative[ tokens[1].type ] ) { - - context = ( Expr.find["ID"]( token.matches[0].replace(runescape, funescape), context ) || [] )[0]; - if ( !context ) { - return results; - - // Precompiled matchers will still verify ancestry, so step up a level - } else if ( compiled ) { - context = context.parentNode; - } - - selector = selector.slice( tokens.shift().value.length ); - } - - // Fetch a seed set for right-to-left matching - i = matchExpr["needsContext"].test( selector ) ? 
0 : tokens.length; - while ( i-- ) { - token = tokens[i]; - - // Abort if we hit a combinator - if ( Expr.relative[ (type = token.type) ] ) { - break; - } - if ( (find = Expr.find[ type ]) ) { - // Search, expanding context for leading sibling combinators - if ( (seed = find( - token.matches[0].replace( runescape, funescape ), - rsibling.test( tokens[0].type ) && testContext( context.parentNode ) || context - )) ) { - - // If seed is empty or no tokens remain, we can return early - tokens.splice( i, 1 ); - selector = seed.length && toSelector( tokens ); - if ( !selector ) { - push.apply( results, seed ); - return results; - } - - break; - } - } - } - } - - // Compile and execute a filtering function if one is not provided - // Provide `match` to avoid retokenization if we modified the selector above - ( compiled || compile( selector, match ) )( - seed, - context, - !documentIsHTML, - results, - !context || rsibling.test( selector ) && testContext( context.parentNode ) || context - ); - return results; -}; - -// One-time assignments - -// Sort stability -support.sortStable = expando.split("").sort( sortOrder ).join("") === expando; - -// Support: Chrome 14-35+ -// Always assume duplicates if they aren't passed to the comparison function -support.detectDuplicates = !!hasDuplicate; - -// Initialize against the default document -setDocument(); - -// Support: Webkit<537.32 - Safari 6.0.3/Chrome 25 (fixed in Chrome 27) -// Detached nodes confoundingly follow *each other* -support.sortDetached = assert(function( el ) { - // Should return 1, but returns 4 (following) - return el.compareDocumentPosition( document.createElement("fieldset") ) & 1; -}); - -// Support: IE<8 -// Prevent attribute/property "interpolation" -// https://msdn.microsoft.com/en-us/library/ms536429%28VS.85%29.aspx -if ( !assert(function( el ) { - el.innerHTML = ""; - return el.firstChild.getAttribute("href") === "#" ; -}) ) { - addHandle( "type|href|height|width", function( elem, name, isXML ) { - if ( !isXML ) { - return elem.getAttribute( name, name.toLowerCase() === "type" ? 1 : 2 ); - } - }); -} - -// Support: IE<9 -// Use defaultValue in place of getAttribute("value") -if ( !support.attributes || !assert(function( el ) { - el.innerHTML = ""; - el.firstChild.setAttribute( "value", "" ); - return el.firstChild.getAttribute( "value" ) === ""; -}) ) { - addHandle( "value", function( elem, name, isXML ) { - if ( !isXML && elem.nodeName.toLowerCase() === "input" ) { - return elem.defaultValue; - } - }); -} - -// Support: IE<9 -// Use getAttributeNode to fetch booleans when getAttribute lies -if ( !assert(function( el ) { - return el.getAttribute("disabled") == null; -}) ) { - addHandle( booleans, function( elem, name, isXML ) { - var val; - if ( !isXML ) { - return elem[ name ] === true ? name.toLowerCase() : - (val = elem.getAttributeNode( name )) && val.specified ? 
- val.value : - null; - } - }); -} - -return Sizzle; - -})( window ); - - - -jQuery.find = Sizzle; -jQuery.expr = Sizzle.selectors; - -// Deprecated -jQuery.expr[ ":" ] = jQuery.expr.pseudos; -jQuery.uniqueSort = jQuery.unique = Sizzle.uniqueSort; -jQuery.text = Sizzle.getText; -jQuery.isXMLDoc = Sizzle.isXML; -jQuery.contains = Sizzle.contains; -jQuery.escapeSelector = Sizzle.escape; - - - - -var dir = function( elem, dir, until ) { - var matched = [], - truncate = until !== undefined; - - while ( ( elem = elem[ dir ] ) && elem.nodeType !== 9 ) { - if ( elem.nodeType === 1 ) { - if ( truncate && jQuery( elem ).is( until ) ) { - break; - } - matched.push( elem ); - } - } - return matched; -}; - - -var siblings = function( n, elem ) { - var matched = []; - - for ( ; n; n = n.nextSibling ) { - if ( n.nodeType === 1 && n !== elem ) { - matched.push( n ); - } - } - - return matched; -}; - - -var rneedsContext = jQuery.expr.match.needsContext; - - - -function nodeName( elem, name ) { - - return elem.nodeName && elem.nodeName.toLowerCase() === name.toLowerCase(); - -}; -var rsingleTag = ( /^<([a-z][^\/\0>:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i ); - - - -var risSimple = /^.[^:#\[\.,]*$/; - -// Implement the identical functionality for filter and not -function winnow( elements, qualifier, not ) { - if ( jQuery.isFunction( qualifier ) ) { - return jQuery.grep( elements, function( elem, i ) { - return !!qualifier.call( elem, i, elem ) !== not; - } ); - } - - // Single element - if ( qualifier.nodeType ) { - return jQuery.grep( elements, function( elem ) { - return ( elem === qualifier ) !== not; - } ); - } - - // Arraylike of elements (jQuery, arguments, Array) - if ( typeof qualifier !== "string" ) { - return jQuery.grep( elements, function( elem ) { - return ( indexOf.call( qualifier, elem ) > -1 ) !== not; - } ); - } - - // Simple selector that can be filtered directly, removing non-Elements - if ( risSimple.test( qualifier ) ) { - return jQuery.filter( qualifier, elements, not ); - } - - // Complex selector, compare the two sets, removing non-Elements - qualifier = jQuery.filter( qualifier, elements ); - return jQuery.grep( elements, function( elem ) { - return ( indexOf.call( qualifier, elem ) > -1 ) !== not && elem.nodeType === 1; - } ); -} - -jQuery.filter = function( expr, elems, not ) { - var elem = elems[ 0 ]; - - if ( not ) { - expr = ":not(" + expr + ")"; - } - - if ( elems.length === 1 && elem.nodeType === 1 ) { - return jQuery.find.matchesSelector( elem, expr ) ? [ elem ] : []; - } - - return jQuery.find.matches( expr, jQuery.grep( elems, function( elem ) { - return elem.nodeType === 1; - } ) ); -}; - -jQuery.fn.extend( { - find: function( selector ) { - var i, ret, - len = this.length, - self = this; - - if ( typeof selector !== "string" ) { - return this.pushStack( jQuery( selector ).filter( function() { - for ( i = 0; i < len; i++ ) { - if ( jQuery.contains( self[ i ], this ) ) { - return true; - } - } - } ) ); - } - - ret = this.pushStack( [] ); - - for ( i = 0; i < len; i++ ) { - jQuery.find( selector, self[ i ], ret ); - } - - return len > 1 ? 
jQuery.uniqueSort( ret ) : ret; - }, - filter: function( selector ) { - return this.pushStack( winnow( this, selector || [], false ) ); - }, - not: function( selector ) { - return this.pushStack( winnow( this, selector || [], true ) ); - }, - is: function( selector ) { - return !!winnow( - this, - - // If this is a positional/relative selector, check membership in the returned set - // so $("p:first").is("p:last") won't return true for a doc with two "p". - typeof selector === "string" && rneedsContext.test( selector ) ? - jQuery( selector ) : - selector || [], - false - ).length; - } -} ); - - -// Initialize a jQuery object - - -// A central reference to the root jQuery(document) -var rootjQuery, - - // A simple way to check for HTML strings - // Prioritize #id over to avoid XSS via location.hash (#9521) - // Strict HTML recognition (#11290: must start with <) - // Shortcut simple #id case for speed - rquickExpr = /^(?:\s*(<[\w\W]+>)[^>]*|#([\w-]+))$/, - - init = jQuery.fn.init = function( selector, context, root ) { - var match, elem; - - // HANDLE: $(""), $(null), $(undefined), $(false) - if ( !selector ) { - return this; - } - - // Method init() accepts an alternate rootjQuery - // so migrate can support jQuery.sub (gh-2101) - root = root || rootjQuery; - - // Handle HTML strings - if ( typeof selector === "string" ) { - if ( selector[ 0 ] === "<" && - selector[ selector.length - 1 ] === ">" && - selector.length >= 3 ) { - - // Assume that strings that start and end with <> are HTML and skip the regex check - match = [ null, selector, null ]; - - } else { - match = rquickExpr.exec( selector ); - } - - // Match html or make sure no context is specified for #id - if ( match && ( match[ 1 ] || !context ) ) { - - // HANDLE: $(html) -> $(array) - if ( match[ 1 ] ) { - context = context instanceof jQuery ? context[ 0 ] : context; - - // Option to run scripts is true for back-compat - // Intentionally let the error be thrown if parseHTML is not present - jQuery.merge( this, jQuery.parseHTML( - match[ 1 ], - context && context.nodeType ? context.ownerDocument || context : document, - true - ) ); - - // HANDLE: $(html, props) - if ( rsingleTag.test( match[ 1 ] ) && jQuery.isPlainObject( context ) ) { - for ( match in context ) { - - // Properties of context are called as methods if possible - if ( jQuery.isFunction( this[ match ] ) ) { - this[ match ]( context[ match ] ); - - // ...and otherwise set as attributes - } else { - this.attr( match, context[ match ] ); - } - } - } - - return this; - - // HANDLE: $(#id) - } else { - elem = document.getElementById( match[ 2 ] ); - - if ( elem ) { - - // Inject the element directly into the jQuery object - this[ 0 ] = elem; - this.length = 1; - } - return this; - } - - // HANDLE: $(expr, $(...)) - } else if ( !context || context.jquery ) { - return ( context || root ).find( selector ); - - // HANDLE: $(expr, context) - // (which is just equivalent to: $(context).find(expr) - } else { - return this.constructor( context ).find( selector ); - } - - // HANDLE: $(DOMElement) - } else if ( selector.nodeType ) { - this[ 0 ] = selector; - this.length = 1; - return this; - - // HANDLE: $(function) - // Shortcut for document ready - } else if ( jQuery.isFunction( selector ) ) { - return root.ready !== undefined ? 
- root.ready( selector ) : - - // Execute immediately if ready is not present - selector( jQuery ); - } - - return jQuery.makeArray( selector, this ); - }; - -// Give the init function the jQuery prototype for later instantiation -init.prototype = jQuery.fn; - -// Initialize central reference -rootjQuery = jQuery( document ); - - -var rparentsprev = /^(?:parents|prev(?:Until|All))/, - - // Methods guaranteed to produce a unique set when starting from a unique set - guaranteedUnique = { - children: true, - contents: true, - next: true, - prev: true - }; - -jQuery.fn.extend( { - has: function( target ) { - var targets = jQuery( target, this ), - l = targets.length; - - return this.filter( function() { - var i = 0; - for ( ; i < l; i++ ) { - if ( jQuery.contains( this, targets[ i ] ) ) { - return true; - } - } - } ); - }, - - closest: function( selectors, context ) { - var cur, - i = 0, - l = this.length, - matched = [], - targets = typeof selectors !== "string" && jQuery( selectors ); - - // Positional selectors never match, since there's no _selection_ context - if ( !rneedsContext.test( selectors ) ) { - for ( ; i < l; i++ ) { - for ( cur = this[ i ]; cur && cur !== context; cur = cur.parentNode ) { - - // Always skip document fragments - if ( cur.nodeType < 11 && ( targets ? - targets.index( cur ) > -1 : - - // Don't pass non-elements to Sizzle - cur.nodeType === 1 && - jQuery.find.matchesSelector( cur, selectors ) ) ) { - - matched.push( cur ); - break; - } - } - } - } - - return this.pushStack( matched.length > 1 ? jQuery.uniqueSort( matched ) : matched ); - }, - - // Determine the position of an element within the set - index: function( elem ) { - - // No argument, return index in parent - if ( !elem ) { - return ( this[ 0 ] && this[ 0 ].parentNode ) ? this.first().prevAll().length : -1; - } - - // Index in selector - if ( typeof elem === "string" ) { - return indexOf.call( jQuery( elem ), this[ 0 ] ); - } - - // Locate the position of the desired element - return indexOf.call( this, - - // If it receives a jQuery object, the first element is used - elem.jquery ? elem[ 0 ] : elem - ); - }, - - add: function( selector, context ) { - return this.pushStack( - jQuery.uniqueSort( - jQuery.merge( this.get(), jQuery( selector, context ) ) - ) - ); - }, - - addBack: function( selector ) { - return this.add( selector == null ? - this.prevObject : this.prevObject.filter( selector ) - ); - } -} ); - -function sibling( cur, dir ) { - while ( ( cur = cur[ dir ] ) && cur.nodeType !== 1 ) {} - return cur; -} - -jQuery.each( { - parent: function( elem ) { - var parent = elem.parentNode; - return parent && parent.nodeType !== 11 ? 
parent : null; - }, - parents: function( elem ) { - return dir( elem, "parentNode" ); - }, - parentsUntil: function( elem, i, until ) { - return dir( elem, "parentNode", until ); - }, - next: function( elem ) { - return sibling( elem, "nextSibling" ); - }, - prev: function( elem ) { - return sibling( elem, "previousSibling" ); - }, - nextAll: function( elem ) { - return dir( elem, "nextSibling" ); - }, - prevAll: function( elem ) { - return dir( elem, "previousSibling" ); - }, - nextUntil: function( elem, i, until ) { - return dir( elem, "nextSibling", until ); - }, - prevUntil: function( elem, i, until ) { - return dir( elem, "previousSibling", until ); - }, - siblings: function( elem ) { - return siblings( ( elem.parentNode || {} ).firstChild, elem ); - }, - children: function( elem ) { - return siblings( elem.firstChild ); - }, - contents: function( elem ) { - if ( nodeName( elem, "iframe" ) ) { - return elem.contentDocument; - } - - // Support: IE 9 - 11 only, iOS 7 only, Android Browser <=4.3 only - // Treat the template element as a regular one in browsers that - // don't support it. - if ( nodeName( elem, "template" ) ) { - elem = elem.content || elem; - } - - return jQuery.merge( [], elem.childNodes ); - } -}, function( name, fn ) { - jQuery.fn[ name ] = function( until, selector ) { - var matched = jQuery.map( this, fn, until ); - - if ( name.slice( -5 ) !== "Until" ) { - selector = until; - } - - if ( selector && typeof selector === "string" ) { - matched = jQuery.filter( selector, matched ); - } - - if ( this.length > 1 ) { - - // Remove duplicates - if ( !guaranteedUnique[ name ] ) { - jQuery.uniqueSort( matched ); - } - - // Reverse order for parents* and prev-derivatives - if ( rparentsprev.test( name ) ) { - matched.reverse(); - } - } - - return this.pushStack( matched ); - }; -} ); -var rnothtmlwhite = ( /[^\x20\t\r\n\f]+/g ); - - - -// Convert String-formatted options into Object-formatted ones -function createOptions( options ) { - var object = {}; - jQuery.each( options.match( rnothtmlwhite ) || [], function( _, flag ) { - object[ flag ] = true; - } ); - return object; -} - -/* - * Create a callback list using the following parameters: - * - * options: an optional list of space-separated options that will change how - * the callback list behaves or a more traditional option object - * - * By default a callback list will act like an event callback list and can be - * "fired" multiple times. - * - * Possible options: - * - * once: will ensure the callback list can only be fired once (like a Deferred) - * - * memory: will keep track of previous values and will call any callback added - * after the list has been fired right away with the latest "memorized" - * values (like a Deferred) - * - * unique: will ensure a callback can only be added once (no duplicate in the list) - * - * stopOnFalse: interrupt callings when a callback returns false - * - */ -jQuery.Callbacks = function( options ) { - - // Convert options from String-formatted to Object-formatted if needed - // (we check in cache first) - options = typeof options === "string" ? 
- createOptions( options ) : - jQuery.extend( {}, options ); - - var // Flag to know if list is currently firing - firing, - - // Last fire value for non-forgettable lists - memory, - - // Flag to know if list was already fired - fired, - - // Flag to prevent firing - locked, - - // Actual callback list - list = [], - - // Queue of execution data for repeatable lists - queue = [], - - // Index of currently firing callback (modified by add/remove as needed) - firingIndex = -1, - - // Fire callbacks - fire = function() { - - // Enforce single-firing - locked = locked || options.once; - - // Execute callbacks for all pending executions, - // respecting firingIndex overrides and runtime changes - fired = firing = true; - for ( ; queue.length; firingIndex = -1 ) { - memory = queue.shift(); - while ( ++firingIndex < list.length ) { - - // Run callback and check for early termination - if ( list[ firingIndex ].apply( memory[ 0 ], memory[ 1 ] ) === false && - options.stopOnFalse ) { - - // Jump to end and forget the data so .add doesn't re-fire - firingIndex = list.length; - memory = false; - } - } - } - - // Forget the data if we're done with it - if ( !options.memory ) { - memory = false; - } - - firing = false; - - // Clean up if we're done firing for good - if ( locked ) { - - // Keep an empty list if we have data for future add calls - if ( memory ) { - list = []; - - // Otherwise, this object is spent - } else { - list = ""; - } - } - }, - - // Actual Callbacks object - self = { - - // Add a callback or a collection of callbacks to the list - add: function() { - if ( list ) { - - // If we have memory from a past run, we should fire after adding - if ( memory && !firing ) { - firingIndex = list.length - 1; - queue.push( memory ); - } - - ( function add( args ) { - jQuery.each( args, function( _, arg ) { - if ( jQuery.isFunction( arg ) ) { - if ( !options.unique || !self.has( arg ) ) { - list.push( arg ); - } - } else if ( arg && arg.length && jQuery.type( arg ) !== "string" ) { - - // Inspect recursively - add( arg ); - } - } ); - } )( arguments ); - - if ( memory && !firing ) { - fire(); - } - } - return this; - }, - - // Remove a callback from the list - remove: function() { - jQuery.each( arguments, function( _, arg ) { - var index; - while ( ( index = jQuery.inArray( arg, list, index ) ) > -1 ) { - list.splice( index, 1 ); - - // Handle firing indexes - if ( index <= firingIndex ) { - firingIndex--; - } - } - } ); - return this; - }, - - // Check if a given callback is in the list. - // If no argument is given, return whether or not list has callbacks attached. - has: function( fn ) { - return fn ? - jQuery.inArray( fn, list ) > -1 : - list.length > 0; - }, - - // Remove all callbacks from the list - empty: function() { - if ( list ) { - list = []; - } - return this; - }, - - // Disable .fire and .add - // Abort any current/pending executions - // Clear all callbacks and values - disable: function() { - locked = queue = []; - list = memory = ""; - return this; - }, - disabled: function() { - return !list; - }, - - // Disable .fire - // Also disable .add unless we have memory (since it would have no effect) - // Abort any pending executions - lock: function() { - locked = queue = []; - if ( !memory && !firing ) { - list = memory = ""; - } - return this; - }, - locked: function() { - return !!locked; - }, - - // Call all callbacks with the given context and arguments - fireWith: function( context, args ) { - if ( !locked ) { - args = args || []; - args = [ context, args.slice ? 
args.slice() : args ]; - queue.push( args ); - if ( !firing ) { - fire(); - } - } - return this; - }, - - // Call all the callbacks with the given arguments - fire: function() { - self.fireWith( this, arguments ); - return this; - }, - - // To know if the callbacks have already been called at least once - fired: function() { - return !!fired; - } - }; - - return self; -}; - - -function Identity( v ) { - return v; -} -function Thrower( ex ) { - throw ex; -} - -function adoptValue( value, resolve, reject, noValue ) { - var method; - - try { - - // Check for promise aspect first to privilege synchronous behavior - if ( value && jQuery.isFunction( ( method = value.promise ) ) ) { - method.call( value ).done( resolve ).fail( reject ); - - // Other thenables - } else if ( value && jQuery.isFunction( ( method = value.then ) ) ) { - method.call( value, resolve, reject ); - - // Other non-thenables - } else { - - // Control `resolve` arguments by letting Array#slice cast boolean `noValue` to integer: - // * false: [ value ].slice( 0 ) => resolve( value ) - // * true: [ value ].slice( 1 ) => resolve() - resolve.apply( undefined, [ value ].slice( noValue ) ); - } - - // For Promises/A+, convert exceptions into rejections - // Since jQuery.when doesn't unwrap thenables, we can skip the extra checks appearing in - // Deferred#then to conditionally suppress rejection. - } catch ( value ) { - - // Support: Android 4.0 only - // Strict mode functions invoked without .call/.apply get global-object context - reject.apply( undefined, [ value ] ); - } -} - -jQuery.extend( { - - Deferred: function( func ) { - var tuples = [ - - // action, add listener, callbacks, - // ... .then handlers, argument index, [final state] - [ "notify", "progress", jQuery.Callbacks( "memory" ), - jQuery.Callbacks( "memory" ), 2 ], - [ "resolve", "done", jQuery.Callbacks( "once memory" ), - jQuery.Callbacks( "once memory" ), 0, "resolved" ], - [ "reject", "fail", jQuery.Callbacks( "once memory" ), - jQuery.Callbacks( "once memory" ), 1, "rejected" ] - ], - state = "pending", - promise = { - state: function() { - return state; - }, - always: function() { - deferred.done( arguments ).fail( arguments ); - return this; - }, - "catch": function( fn ) { - return promise.then( null, fn ); - }, - - // Keep pipe for back-compat - pipe: function( /* fnDone, fnFail, fnProgress */ ) { - var fns = arguments; - - return jQuery.Deferred( function( newDefer ) { - jQuery.each( tuples, function( i, tuple ) { - - // Map tuples (progress, done, fail) to arguments (done, fail, progress) - var fn = jQuery.isFunction( fns[ tuple[ 4 ] ] ) && fns[ tuple[ 4 ] ]; - - // deferred.progress(function() { bind to newDefer or newDefer.notify }) - // deferred.done(function() { bind to newDefer or newDefer.resolve }) - // deferred.fail(function() { bind to newDefer or newDefer.reject }) - deferred[ tuple[ 1 ] ]( function() { - var returned = fn && fn.apply( this, arguments ); - if ( returned && jQuery.isFunction( returned.promise ) ) { - returned.promise() - .progress( newDefer.notify ) - .done( newDefer.resolve ) - .fail( newDefer.reject ); - } else { - newDefer[ tuple[ 0 ] + "With" ]( - this, - fn ? 
[ returned ] : arguments - ); - } - } ); - } ); - fns = null; - } ).promise(); - }, - then: function( onFulfilled, onRejected, onProgress ) { - var maxDepth = 0; - function resolve( depth, deferred, handler, special ) { - return function() { - var that = this, - args = arguments, - mightThrow = function() { - var returned, then; - - // Support: Promises/A+ section 2.3.3.3.3 - // https://promisesaplus.com/#point-59 - // Ignore double-resolution attempts - if ( depth < maxDepth ) { - return; - } - - returned = handler.apply( that, args ); - - // Support: Promises/A+ section 2.3.1 - // https://promisesaplus.com/#point-48 - if ( returned === deferred.promise() ) { - throw new TypeError( "Thenable self-resolution" ); - } - - // Support: Promises/A+ sections 2.3.3.1, 3.5 - // https://promisesaplus.com/#point-54 - // https://promisesaplus.com/#point-75 - // Retrieve `then` only once - then = returned && - - // Support: Promises/A+ section 2.3.4 - // https://promisesaplus.com/#point-64 - // Only check objects and functions for thenability - ( typeof returned === "object" || - typeof returned === "function" ) && - returned.then; - - // Handle a returned thenable - if ( jQuery.isFunction( then ) ) { - - // Special processors (notify) just wait for resolution - if ( special ) { - then.call( - returned, - resolve( maxDepth, deferred, Identity, special ), - resolve( maxDepth, deferred, Thrower, special ) - ); - - // Normal processors (resolve) also hook into progress - } else { - - // ...and disregard older resolution values - maxDepth++; - - then.call( - returned, - resolve( maxDepth, deferred, Identity, special ), - resolve( maxDepth, deferred, Thrower, special ), - resolve( maxDepth, deferred, Identity, - deferred.notifyWith ) - ); - } - - // Handle all other returned values - } else { - - // Only substitute handlers pass on context - // and multiple values (non-spec behavior) - if ( handler !== Identity ) { - that = undefined; - args = [ returned ]; - } - - // Process the value(s) - // Default process is resolve - ( special || deferred.resolveWith )( that, args ); - } - }, - - // Only normal processors (resolve) catch and reject exceptions - process = special ? - mightThrow : - function() { - try { - mightThrow(); - } catch ( e ) { - - if ( jQuery.Deferred.exceptionHook ) { - jQuery.Deferred.exceptionHook( e, - process.stackTrace ); - } - - // Support: Promises/A+ section 2.3.3.3.4.1 - // https://promisesaplus.com/#point-61 - // Ignore post-resolution exceptions - if ( depth + 1 >= maxDepth ) { - - // Only substitute handlers pass on context - // and multiple values (non-spec behavior) - if ( handler !== Thrower ) { - that = undefined; - args = [ e ]; - } - - deferred.rejectWith( that, args ); - } - } - }; - - // Support: Promises/A+ section 2.3.3.3.1 - // https://promisesaplus.com/#point-57 - // Re-resolve promises immediately to dodge false rejection from - // subsequent errors - if ( depth ) { - process(); - } else { - - // Call an optional hook to record the stack, in case of exception - // since it's otherwise lost when execution goes async - if ( jQuery.Deferred.getStackHook ) { - process.stackTrace = jQuery.Deferred.getStackHook(); - } - window.setTimeout( process ); - } - }; - } - - return jQuery.Deferred( function( newDefer ) { - - // progress_handlers.add( ... ) - tuples[ 0 ][ 3 ].add( - resolve( - 0, - newDefer, - jQuery.isFunction( onProgress ) ? - onProgress : - Identity, - newDefer.notifyWith - ) - ); - - // fulfilled_handlers.add( ... 
) - tuples[ 1 ][ 3 ].add( - resolve( - 0, - newDefer, - jQuery.isFunction( onFulfilled ) ? - onFulfilled : - Identity - ) - ); - - // rejected_handlers.add( ... ) - tuples[ 2 ][ 3 ].add( - resolve( - 0, - newDefer, - jQuery.isFunction( onRejected ) ? - onRejected : - Thrower - ) - ); - } ).promise(); - }, - - // Get a promise for this deferred - // If obj is provided, the promise aspect is added to the object - promise: function( obj ) { - return obj != null ? jQuery.extend( obj, promise ) : promise; - } - }, - deferred = {}; - - // Add list-specific methods - jQuery.each( tuples, function( i, tuple ) { - var list = tuple[ 2 ], - stateString = tuple[ 5 ]; - - // promise.progress = list.add - // promise.done = list.add - // promise.fail = list.add - promise[ tuple[ 1 ] ] = list.add; - - // Handle state - if ( stateString ) { - list.add( - function() { - - // state = "resolved" (i.e., fulfilled) - // state = "rejected" - state = stateString; - }, - - // rejected_callbacks.disable - // fulfilled_callbacks.disable - tuples[ 3 - i ][ 2 ].disable, - - // progress_callbacks.lock - tuples[ 0 ][ 2 ].lock - ); - } - - // progress_handlers.fire - // fulfilled_handlers.fire - // rejected_handlers.fire - list.add( tuple[ 3 ].fire ); - - // deferred.notify = function() { deferred.notifyWith(...) } - // deferred.resolve = function() { deferred.resolveWith(...) } - // deferred.reject = function() { deferred.rejectWith(...) } - deferred[ tuple[ 0 ] ] = function() { - deferred[ tuple[ 0 ] + "With" ]( this === deferred ? undefined : this, arguments ); - return this; - }; - - // deferred.notifyWith = list.fireWith - // deferred.resolveWith = list.fireWith - // deferred.rejectWith = list.fireWith - deferred[ tuple[ 0 ] + "With" ] = list.fireWith; - } ); - - // Make the deferred a promise - promise.promise( deferred ); - - // Call given func if any - if ( func ) { - func.call( deferred, deferred ); - } - - // All done! - return deferred; - }, - - // Deferred helper - when: function( singleValue ) { - var - - // count of uncompleted subordinates - remaining = arguments.length, - - // count of unprocessed arguments - i = remaining, - - // subordinate fulfillment data - resolveContexts = Array( i ), - resolveValues = slice.call( arguments ), - - // the master Deferred - master = jQuery.Deferred(), - - // subordinate callback factory - updateFunc = function( i ) { - return function( value ) { - resolveContexts[ i ] = this; - resolveValues[ i ] = arguments.length > 1 ? slice.call( arguments ) : value; - if ( !( --remaining ) ) { - master.resolveWith( resolveContexts, resolveValues ); - } - }; - }; - - // Single- and empty arguments are adopted like Promise.resolve - if ( remaining <= 1 ) { - adoptValue( singleValue, master.done( updateFunc( i ) ).resolve, master.reject, - !remaining ); - - // Use .then() to unwrap secondary thenables (cf. gh-3000) - if ( master.state() === "pending" || - jQuery.isFunction( resolveValues[ i ] && resolveValues[ i ].then ) ) { - - return master.then(); - } - } - - // Multiple arguments are aggregated like Promise.all array elements - while ( i-- ) { - adoptValue( resolveValues[ i ], updateFunc( i ), master.reject ); - } - - return master.promise(); - } -} ); - - -// These usually indicate a programmer mistake during development, -// warn about them ASAP rather than swallowing them by default. 
-var rerrorNames = /^(Eval|Internal|Range|Reference|Syntax|Type|URI)Error$/; - -jQuery.Deferred.exceptionHook = function( error, stack ) { - - // Support: IE 8 - 9 only - // Console exists when dev tools are open, which can happen at any time - if ( window.console && window.console.warn && error && rerrorNames.test( error.name ) ) { - window.console.warn( "jQuery.Deferred exception: " + error.message, error.stack, stack ); - } -}; - - - - -jQuery.readyException = function( error ) { - window.setTimeout( function() { - throw error; - } ); -}; - - - - -// The deferred used on DOM ready -var readyList = jQuery.Deferred(); - -jQuery.fn.ready = function( fn ) { - - readyList - .then( fn ) - - // Wrap jQuery.readyException in a function so that the lookup - // happens at the time of error handling instead of callback - // registration. - .catch( function( error ) { - jQuery.readyException( error ); - } ); - - return this; -}; - -jQuery.extend( { - - // Is the DOM ready to be used? Set to true once it occurs. - isReady: false, - - // A counter to track how many items to wait for before - // the ready event fires. See #6781 - readyWait: 1, - - // Handle when the DOM is ready - ready: function( wait ) { - - // Abort if there are pending holds or we're already ready - if ( wait === true ? --jQuery.readyWait : jQuery.isReady ) { - return; - } - - // Remember that the DOM is ready - jQuery.isReady = true; - - // If a normal DOM Ready event fired, decrement, and wait if need be - if ( wait !== true && --jQuery.readyWait > 0 ) { - return; - } - - // If there are functions bound, to execute - readyList.resolveWith( document, [ jQuery ] ); - } -} ); - -jQuery.ready.then = readyList.then; - -// The ready event handler and self cleanup method -function completed() { - document.removeEventListener( "DOMContentLoaded", completed ); - window.removeEventListener( "load", completed ); - jQuery.ready(); -} - -// Catch cases where $(document).ready() is called -// after the browser event has already occurred. -// Support: IE <=9 - 10 only -// Older IE sometimes signals "interactive" too soon -if ( document.readyState === "complete" || - ( document.readyState !== "loading" && !document.documentElement.doScroll ) ) { - - // Handle it asynchronously to allow scripts the opportunity to delay ready - window.setTimeout( jQuery.ready ); - -} else { - - // Use the handy event callback - document.addEventListener( "DOMContentLoaded", completed ); - - // A fallback to window.onload, that will always work - window.addEventListener( "load", completed ); -} - - - - -// Multifunctional method to get and set values of a collection -// The value/s can optionally be executed if it's a function -var access = function( elems, fn, key, value, chainable, emptyGet, raw ) { - var i = 0, - len = elems.length, - bulk = key == null; - - // Sets many values - if ( jQuery.type( key ) === "object" ) { - chainable = true; - for ( i in key ) { - access( elems, fn, i, key[ i ], true, emptyGet, raw ); - } - - // Sets one value - } else if ( value !== undefined ) { - chainable = true; - - if ( !jQuery.isFunction( value ) ) { - raw = true; - } - - if ( bulk ) { - - // Bulk operations run against the entire set - if ( raw ) { - fn.call( elems, value ); - fn = null; - - // ...except when executing function values - } else { - bulk = fn; - fn = function( elem, key, value ) { - return bulk.call( jQuery( elem ), value ); - }; - } - } - - if ( fn ) { - for ( ; i < len; i++ ) { - fn( - elems[ i ], key, raw ? 
- value : - value.call( elems[ i ], i, fn( elems[ i ], key ) ) - ); - } - } - } - - if ( chainable ) { - return elems; - } - - // Gets - if ( bulk ) { - return fn.call( elems ); - } - - return len ? fn( elems[ 0 ], key ) : emptyGet; -}; -var acceptData = function( owner ) { - - // Accepts only: - // - Node - // - Node.ELEMENT_NODE - // - Node.DOCUMENT_NODE - // - Object - // - Any - return owner.nodeType === 1 || owner.nodeType === 9 || !( +owner.nodeType ); -}; - - - - -function Data() { - this.expando = jQuery.expando + Data.uid++; -} - -Data.uid = 1; - -Data.prototype = { - - cache: function( owner ) { - - // Check if the owner object already has a cache - var value = owner[ this.expando ]; - - // If not, create one - if ( !value ) { - value = {}; - - // We can accept data for non-element nodes in modern browsers, - // but we should not, see #8335. - // Always return an empty object. - if ( acceptData( owner ) ) { - - // If it is a node unlikely to be stringify-ed or looped over - // use plain assignment - if ( owner.nodeType ) { - owner[ this.expando ] = value; - - // Otherwise secure it in a non-enumerable property - // configurable must be true to allow the property to be - // deleted when data is removed - } else { - Object.defineProperty( owner, this.expando, { - value: value, - configurable: true - } ); - } - } - } - - return value; - }, - set: function( owner, data, value ) { - var prop, - cache = this.cache( owner ); - - // Handle: [ owner, key, value ] args - // Always use camelCase key (gh-2257) - if ( typeof data === "string" ) { - cache[ jQuery.camelCase( data ) ] = value; - - // Handle: [ owner, { properties } ] args - } else { - - // Copy the properties one-by-one to the cache object - for ( prop in data ) { - cache[ jQuery.camelCase( prop ) ] = data[ prop ]; - } - } - return cache; - }, - get: function( owner, key ) { - return key === undefined ? - this.cache( owner ) : - - // Always use camelCase key (gh-2257) - owner[ this.expando ] && owner[ this.expando ][ jQuery.camelCase( key ) ]; - }, - access: function( owner, key, value ) { - - // In cases where either: - // - // 1. No key was specified - // 2. A string key was specified, but no value provided - // - // Take the "read" path and allow the get method to determine - // which value to return, respectively either: - // - // 1. The entire cache object - // 2. The data stored at the key - // - if ( key === undefined || - ( ( key && typeof key === "string" ) && value === undefined ) ) { - - return this.get( owner, key ); - } - - // When the key is not a string, or both a key and value - // are specified, set or extend (existing objects) with either: - // - // 1. An object of properties - // 2. A key and value - // - this.set( owner, key, value ); - - // Since the "set" path can have two possible entry points - // return the expected data based on which path was taken[*] - return value !== undefined ? value : key; - }, - remove: function( owner, key ) { - var i, - cache = owner[ this.expando ]; - - if ( cache === undefined ) { - return; - } - - if ( key !== undefined ) { - - // Support array or space separated string of keys - if ( Array.isArray( key ) ) { - - // If key is an array of keys... - // We always set camelCase keys, so remove that. - key = key.map( jQuery.camelCase ); - } else { - key = jQuery.camelCase( key ); - - // If a key with the spaces exists, use it. - // Otherwise, create an array by matching non-whitespace - key = key in cache ? 
- [ key ] : - ( key.match( rnothtmlwhite ) || [] ); - } - - i = key.length; - - while ( i-- ) { - delete cache[ key[ i ] ]; - } - } - - // Remove the expando if there's no more data - if ( key === undefined || jQuery.isEmptyObject( cache ) ) { - - // Support: Chrome <=35 - 45 - // Webkit & Blink performance suffers when deleting properties - // from DOM nodes, so set to undefined instead - // https://bugs.chromium.org/p/chromium/issues/detail?id=378607 (bug restricted) - if ( owner.nodeType ) { - owner[ this.expando ] = undefined; - } else { - delete owner[ this.expando ]; - } - } - }, - hasData: function( owner ) { - var cache = owner[ this.expando ]; - return cache !== undefined && !jQuery.isEmptyObject( cache ); - } -}; -var dataPriv = new Data(); - -var dataUser = new Data(); - - - -// Implementation Summary -// -// 1. Enforce API surface and semantic compatibility with 1.9.x branch -// 2. Improve the module's maintainability by reducing the storage -// paths to a single mechanism. -// 3. Use the same single mechanism to support "private" and "user" data. -// 4. _Never_ expose "private" data to user code (TODO: Drop _data, _removeData) -// 5. Avoid exposing implementation details on user objects (eg. expando properties) -// 6. Provide a clear path for implementation upgrade to WeakMap in 2014 - -var rbrace = /^(?:\{[\w\W]*\}|\[[\w\W]*\])$/, - rmultiDash = /[A-Z]/g; - -function getData( data ) { - if ( data === "true" ) { - return true; - } - - if ( data === "false" ) { - return false; - } - - if ( data === "null" ) { - return null; - } - - // Only convert to a number if it doesn't change the string - if ( data === +data + "" ) { - return +data; - } - - if ( rbrace.test( data ) ) { - return JSON.parse( data ); - } - - return data; -} - -function dataAttr( elem, key, data ) { - var name; - - // If nothing was found internally, try to fetch any - // data from the HTML5 data-* attribute - if ( data === undefined && elem.nodeType === 1 ) { - name = "data-" + key.replace( rmultiDash, "-$&" ).toLowerCase(); - data = elem.getAttribute( name ); - - if ( typeof data === "string" ) { - try { - data = getData( data ); - } catch ( e ) {} - - // Make sure we set the data so it isn't changed later - dataUser.set( elem, key, data ); - } else { - data = undefined; - } - } - return data; -} - -jQuery.extend( { - hasData: function( elem ) { - return dataUser.hasData( elem ) || dataPriv.hasData( elem ); - }, - - data: function( elem, name, data ) { - return dataUser.access( elem, name, data ); - }, - - removeData: function( elem, name ) { - dataUser.remove( elem, name ); - }, - - // TODO: Now that all calls to _data and _removeData have been replaced - // with direct calls to dataPriv methods, these can be deprecated. 
- _data: function( elem, name, data ) { - return dataPriv.access( elem, name, data ); - }, - - _removeData: function( elem, name ) { - dataPriv.remove( elem, name ); - } -} ); - -jQuery.fn.extend( { - data: function( key, value ) { - var i, name, data, - elem = this[ 0 ], - attrs = elem && elem.attributes; - - // Gets all values - if ( key === undefined ) { - if ( this.length ) { - data = dataUser.get( elem ); - - if ( elem.nodeType === 1 && !dataPriv.get( elem, "hasDataAttrs" ) ) { - i = attrs.length; - while ( i-- ) { - - // Support: IE 11 only - // The attrs elements can be null (#14894) - if ( attrs[ i ] ) { - name = attrs[ i ].name; - if ( name.indexOf( "data-" ) === 0 ) { - name = jQuery.camelCase( name.slice( 5 ) ); - dataAttr( elem, name, data[ name ] ); - } - } - } - dataPriv.set( elem, "hasDataAttrs", true ); - } - } - - return data; - } - - // Sets multiple values - if ( typeof key === "object" ) { - return this.each( function() { - dataUser.set( this, key ); - } ); - } - - return access( this, function( value ) { - var data; - - // The calling jQuery object (element matches) is not empty - // (and therefore has an element appears at this[ 0 ]) and the - // `value` parameter was not undefined. An empty jQuery object - // will result in `undefined` for elem = this[ 0 ] which will - // throw an exception if an attempt to read a data cache is made. - if ( elem && value === undefined ) { - - // Attempt to get data from the cache - // The key will always be camelCased in Data - data = dataUser.get( elem, key ); - if ( data !== undefined ) { - return data; - } - - // Attempt to "discover" the data in - // HTML5 custom data-* attrs - data = dataAttr( elem, key ); - if ( data !== undefined ) { - return data; - } - - // We tried really hard, but the data doesn't exist. - return; - } - - // Set the data... 
- this.each( function() { - - // We always store the camelCased key - dataUser.set( this, key, value ); - } ); - }, null, value, arguments.length > 1, null, true ); - }, - - removeData: function( key ) { - return this.each( function() { - dataUser.remove( this, key ); - } ); - } -} ); - - -jQuery.extend( { - queue: function( elem, type, data ) { - var queue; - - if ( elem ) { - type = ( type || "fx" ) + "queue"; - queue = dataPriv.get( elem, type ); - - // Speed up dequeue by getting out quickly if this is just a lookup - if ( data ) { - if ( !queue || Array.isArray( data ) ) { - queue = dataPriv.access( elem, type, jQuery.makeArray( data ) ); - } else { - queue.push( data ); - } - } - return queue || []; - } - }, - - dequeue: function( elem, type ) { - type = type || "fx"; - - var queue = jQuery.queue( elem, type ), - startLength = queue.length, - fn = queue.shift(), - hooks = jQuery._queueHooks( elem, type ), - next = function() { - jQuery.dequeue( elem, type ); - }; - - // If the fx queue is dequeued, always remove the progress sentinel - if ( fn === "inprogress" ) { - fn = queue.shift(); - startLength--; - } - - if ( fn ) { - - // Add a progress sentinel to prevent the fx queue from being - // automatically dequeued - if ( type === "fx" ) { - queue.unshift( "inprogress" ); - } - - // Clear up the last queue stop function - delete hooks.stop; - fn.call( elem, next, hooks ); - } - - if ( !startLength && hooks ) { - hooks.empty.fire(); - } - }, - - // Not public - generate a queueHooks object, or return the current one - _queueHooks: function( elem, type ) { - var key = type + "queueHooks"; - return dataPriv.get( elem, key ) || dataPriv.access( elem, key, { - empty: jQuery.Callbacks( "once memory" ).add( function() { - dataPriv.remove( elem, [ type + "queue", key ] ); - } ) - } ); - } -} ); - -jQuery.fn.extend( { - queue: function( type, data ) { - var setter = 2; - - if ( typeof type !== "string" ) { - data = type; - type = "fx"; - setter--; - } - - if ( arguments.length < setter ) { - return jQuery.queue( this[ 0 ], type ); - } - - return data === undefined ? 
- this : - this.each( function() { - var queue = jQuery.queue( this, type, data ); - - // Ensure a hooks for this queue - jQuery._queueHooks( this, type ); - - if ( type === "fx" && queue[ 0 ] !== "inprogress" ) { - jQuery.dequeue( this, type ); - } - } ); - }, - dequeue: function( type ) { - return this.each( function() { - jQuery.dequeue( this, type ); - } ); - }, - clearQueue: function( type ) { - return this.queue( type || "fx", [] ); - }, - - // Get a promise resolved when queues of a certain type - // are emptied (fx is the type by default) - promise: function( type, obj ) { - var tmp, - count = 1, - defer = jQuery.Deferred(), - elements = this, - i = this.length, - resolve = function() { - if ( !( --count ) ) { - defer.resolveWith( elements, [ elements ] ); - } - }; - - if ( typeof type !== "string" ) { - obj = type; - type = undefined; - } - type = type || "fx"; - - while ( i-- ) { - tmp = dataPriv.get( elements[ i ], type + "queueHooks" ); - if ( tmp && tmp.empty ) { - count++; - tmp.empty.add( resolve ); - } - } - resolve(); - return defer.promise( obj ); - } -} ); -var pnum = ( /[+-]?(?:\d*\.|)\d+(?:[eE][+-]?\d+|)/ ).source; - -var rcssNum = new RegExp( "^(?:([+-])=|)(" + pnum + ")([a-z%]*)$", "i" ); - - -var cssExpand = [ "Top", "Right", "Bottom", "Left" ]; - -var isHiddenWithinTree = function( elem, el ) { - - // isHiddenWithinTree might be called from jQuery#filter function; - // in that case, element will be second argument - elem = el || elem; - - // Inline style trumps all - return elem.style.display === "none" || - elem.style.display === "" && - - // Otherwise, check computed style - // Support: Firefox <=43 - 45 - // Disconnected elements can have computed display: none, so first confirm that elem is - // in the document. - jQuery.contains( elem.ownerDocument, elem ) && - - jQuery.css( elem, "display" ) === "none"; - }; - -var swap = function( elem, options, callback, args ) { - var ret, name, - old = {}; - - // Remember the old values, and insert the new ones - for ( name in options ) { - old[ name ] = elem.style[ name ]; - elem.style[ name ] = options[ name ]; - } - - ret = callback.apply( elem, args || [] ); - - // Revert the old values - for ( name in options ) { - elem.style[ name ] = old[ name ]; - } - - return ret; -}; - - - - -function adjustCSS( elem, prop, valueParts, tween ) { - var adjusted, - scale = 1, - maxIterations = 20, - currentValue = tween ? - function() { - return tween.cur(); - } : - function() { - return jQuery.css( elem, prop, "" ); - }, - initial = currentValue(), - unit = valueParts && valueParts[ 3 ] || ( jQuery.cssNumber[ prop ] ? "" : "px" ), - - // Starting value computation is required for potential unit mismatches - initialInUnit = ( jQuery.cssNumber[ prop ] || unit !== "px" && +initial ) && - rcssNum.exec( jQuery.css( elem, prop ) ); - - if ( initialInUnit && initialInUnit[ 3 ] !== unit ) { - - // Trust units reported by jQuery.css - unit = unit || initialInUnit[ 3 ]; - - // Make sure we update the tween properties later on - valueParts = valueParts || []; - - // Iteratively approximate from a nonzero starting point - initialInUnit = +initial || 1; - - do { - - // If previous iteration zeroed out, double until we get *something*. 
- // Use string for doubling so we don't accidentally see scale as unchanged below - scale = scale || ".5"; - - // Adjust and apply - initialInUnit = initialInUnit / scale; - jQuery.style( elem, prop, initialInUnit + unit ); - - // Update scale, tolerating zero or NaN from tween.cur() - // Break the loop if scale is unchanged or perfect, or if we've just had enough. - } while ( - scale !== ( scale = currentValue() / initial ) && scale !== 1 && --maxIterations - ); - } - - if ( valueParts ) { - initialInUnit = +initialInUnit || +initial || 0; - - // Apply relative offset (+=/-=) if specified - adjusted = valueParts[ 1 ] ? - initialInUnit + ( valueParts[ 1 ] + 1 ) * valueParts[ 2 ] : - +valueParts[ 2 ]; - if ( tween ) { - tween.unit = unit; - tween.start = initialInUnit; - tween.end = adjusted; - } - } - return adjusted; -} - - -var defaultDisplayMap = {}; - -function getDefaultDisplay( elem ) { - var temp, - doc = elem.ownerDocument, - nodeName = elem.nodeName, - display = defaultDisplayMap[ nodeName ]; - - if ( display ) { - return display; - } - - temp = doc.body.appendChild( doc.createElement( nodeName ) ); - display = jQuery.css( temp, "display" ); - - temp.parentNode.removeChild( temp ); - - if ( display === "none" ) { - display = "block"; - } - defaultDisplayMap[ nodeName ] = display; - - return display; -} - -function showHide( elements, show ) { - var display, elem, - values = [], - index = 0, - length = elements.length; - - // Determine new display value for elements that need to change - for ( ; index < length; index++ ) { - elem = elements[ index ]; - if ( !elem.style ) { - continue; - } - - display = elem.style.display; - if ( show ) { - - // Since we force visibility upon cascade-hidden elements, an immediate (and slow) - // check is required in this first loop unless we have a nonempty display value (either - // inline or about-to-be-restored) - if ( display === "none" ) { - values[ index ] = dataPriv.get( elem, "display" ) || null; - if ( !values[ index ] ) { - elem.style.display = ""; - } - } - if ( elem.style.display === "" && isHiddenWithinTree( elem ) ) { - values[ index ] = getDefaultDisplay( elem ); - } - } else { - if ( display !== "none" ) { - values[ index ] = "none"; - - // Remember what we're overwriting - dataPriv.set( elem, "display", display ); - } - } - } - - // Set the display of the elements in a second loop to avoid constant reflow - for ( index = 0; index < length; index++ ) { - if ( values[ index ] != null ) { - elements[ index ].style.display = values[ index ]; - } - } - - return elements; -} - -jQuery.fn.extend( { - show: function() { - return showHide( this, true ); - }, - hide: function() { - return showHide( this ); - }, - toggle: function( state ) { - if ( typeof state === "boolean" ) { - return state ? this.show() : this.hide(); - } - - return this.each( function() { - if ( isHiddenWithinTree( this ) ) { - jQuery( this ).show(); - } else { - jQuery( this ).hide(); - } - } ); - } -} ); -var rcheckableType = ( /^(?:checkbox|radio)$/i ); - -var rtagName = ( /<([a-z][^\/\0>\x20\t\r\n\f]+)/i ); - -var rscriptType = ( /^$|\/(?:java|ecma)script/i ); - - - -// We have to close these tags to support XHTML (#13200) -var wrapMap = { - - // Support: IE <=9 only - option: [ 1, "" ], - - // XHTML parsers do not magically insert elements in the - // same way that tag soup parsers do. So we cannot shorten - // this by omitting or other required elements. - thead: [ 1, "", "
" ], - col: [ 2, "", "
" ], - tr: [ 2, "", "
" ], - td: [ 3, "", "
" ], - - _default: [ 0, "", "" ] -}; - -// Support: IE <=9 only -wrapMap.optgroup = wrapMap.option; - -wrapMap.tbody = wrapMap.tfoot = wrapMap.colgroup = wrapMap.caption = wrapMap.thead; -wrapMap.th = wrapMap.td; - - -function getAll( context, tag ) { - - // Support: IE <=9 - 11 only - // Use typeof to avoid zero-argument method invocation on host objects (#15151) - var ret; - - if ( typeof context.getElementsByTagName !== "undefined" ) { - ret = context.getElementsByTagName( tag || "*" ); - - } else if ( typeof context.querySelectorAll !== "undefined" ) { - ret = context.querySelectorAll( tag || "*" ); - - } else { - ret = []; - } - - if ( tag === undefined || tag && nodeName( context, tag ) ) { - return jQuery.merge( [ context ], ret ); - } - - return ret; -} - - -// Mark scripts as having already been evaluated -function setGlobalEval( elems, refElements ) { - var i = 0, - l = elems.length; - - for ( ; i < l; i++ ) { - dataPriv.set( - elems[ i ], - "globalEval", - !refElements || dataPriv.get( refElements[ i ], "globalEval" ) - ); - } -} - - -var rhtml = /<|&#?\w+;/; - -function buildFragment( elems, context, scripts, selection, ignored ) { - var elem, tmp, tag, wrap, contains, j, - fragment = context.createDocumentFragment(), - nodes = [], - i = 0, - l = elems.length; - - for ( ; i < l; i++ ) { - elem = elems[ i ]; - - if ( elem || elem === 0 ) { - - // Add nodes directly - if ( jQuery.type( elem ) === "object" ) { - - // Support: Android <=4.0 only, PhantomJS 1 only - // push.apply(_, arraylike) throws on ancient WebKit - jQuery.merge( nodes, elem.nodeType ? [ elem ] : elem ); - - // Convert non-html into a text node - } else if ( !rhtml.test( elem ) ) { - nodes.push( context.createTextNode( elem ) ); - - // Convert html into DOM nodes - } else { - tmp = tmp || fragment.appendChild( context.createElement( "div" ) ); - - // Deserialize a standard representation - tag = ( rtagName.exec( elem ) || [ "", "" ] )[ 1 ].toLowerCase(); - wrap = wrapMap[ tag ] || wrapMap._default; - tmp.innerHTML = wrap[ 1 ] + jQuery.htmlPrefilter( elem ) + wrap[ 2 ]; - - // Descend through wrappers to the right content - j = wrap[ 0 ]; - while ( j-- ) { - tmp = tmp.lastChild; - } - - // Support: Android <=4.0 only, PhantomJS 1 only - // push.apply(_, arraylike) throws on ancient WebKit - jQuery.merge( nodes, tmp.childNodes ); - - // Remember the top-level container - tmp = fragment.firstChild; - - // Ensure the created nodes are orphaned (#12392) - tmp.textContent = ""; - } - } - } - - // Remove wrapper from fragment - fragment.textContent = ""; - - i = 0; - while ( ( elem = nodes[ i++ ] ) ) { - - // Skip elements already in the context collection (trac-4087) - if ( selection && jQuery.inArray( elem, selection ) > -1 ) { - if ( ignored ) { - ignored.push( elem ); - } - continue; - } - - contains = jQuery.contains( elem.ownerDocument, elem ); - - // Append to fragment - tmp = getAll( fragment.appendChild( elem ), "script" ); - - // Preserve script evaluation history - if ( contains ) { - setGlobalEval( tmp ); - } - - // Capture executables - if ( scripts ) { - j = 0; - while ( ( elem = tmp[ j++ ] ) ) { - if ( rscriptType.test( elem.type || "" ) ) { - scripts.push( elem ); - } - } - } - } - - return fragment; -} - - -( function() { - var fragment = document.createDocumentFragment(), - div = fragment.appendChild( document.createElement( "div" ) ), - input = document.createElement( "input" ); - - // Support: Android 4.0 - 4.3 only - // Check state lost if the name is set (#11217) - // Support: Windows Web Apps 
(WWA) - // `name` and `type` must use .setAttribute for WWA (#14901) - input.setAttribute( "type", "radio" ); - input.setAttribute( "checked", "checked" ); - input.setAttribute( "name", "t" ); - - div.appendChild( input ); - - // Support: Android <=4.1 only - // Older WebKit doesn't clone checked state correctly in fragments - support.checkClone = div.cloneNode( true ).cloneNode( true ).lastChild.checked; - - // Support: IE <=11 only - // Make sure textarea (and checkbox) defaultValue is properly cloned - div.innerHTML = ""; - support.noCloneChecked = !!div.cloneNode( true ).lastChild.defaultValue; -} )(); -var documentElement = document.documentElement; - - - -var - rkeyEvent = /^key/, - rmouseEvent = /^(?:mouse|pointer|contextmenu|drag|drop)|click/, - rtypenamespace = /^([^.]*)(?:\.(.+)|)/; - -function returnTrue() { - return true; -} - -function returnFalse() { - return false; -} - -// Support: IE <=9 only -// See #13393 for more info -function safeActiveElement() { - try { - return document.activeElement; - } catch ( err ) { } -} - -function on( elem, types, selector, data, fn, one ) { - var origFn, type; - - // Types can be a map of types/handlers - if ( typeof types === "object" ) { - - // ( types-Object, selector, data ) - if ( typeof selector !== "string" ) { - - // ( types-Object, data ) - data = data || selector; - selector = undefined; - } - for ( type in types ) { - on( elem, type, selector, data, types[ type ], one ); - } - return elem; - } - - if ( data == null && fn == null ) { - - // ( types, fn ) - fn = selector; - data = selector = undefined; - } else if ( fn == null ) { - if ( typeof selector === "string" ) { - - // ( types, selector, fn ) - fn = data; - data = undefined; - } else { - - // ( types, data, fn ) - fn = data; - data = selector; - selector = undefined; - } - } - if ( fn === false ) { - fn = returnFalse; - } else if ( !fn ) { - return elem; - } - - if ( one === 1 ) { - origFn = fn; - fn = function( event ) { - - // Can use an empty set, since event contains the info - jQuery().off( event ); - return origFn.apply( this, arguments ); - }; - - // Use same guid so caller can remove using origFn - fn.guid = origFn.guid || ( origFn.guid = jQuery.guid++ ); - } - return elem.each( function() { - jQuery.event.add( this, types, fn, data, selector ); - } ); -} - -/* - * Helper functions for managing events -- not part of the public interface. - * Props to Dean Edwards' addEvent library for many of the ideas. 
- */ -jQuery.event = { - - global: {}, - - add: function( elem, types, handler, data, selector ) { - - var handleObjIn, eventHandle, tmp, - events, t, handleObj, - special, handlers, type, namespaces, origType, - elemData = dataPriv.get( elem ); - - // Don't attach events to noData or text/comment nodes (but allow plain objects) - if ( !elemData ) { - return; - } - - // Caller can pass in an object of custom data in lieu of the handler - if ( handler.handler ) { - handleObjIn = handler; - handler = handleObjIn.handler; - selector = handleObjIn.selector; - } - - // Ensure that invalid selectors throw exceptions at attach time - // Evaluate against documentElement in case elem is a non-element node (e.g., document) - if ( selector ) { - jQuery.find.matchesSelector( documentElement, selector ); - } - - // Make sure that the handler has a unique ID, used to find/remove it later - if ( !handler.guid ) { - handler.guid = jQuery.guid++; - } - - // Init the element's event structure and main handler, if this is the first - if ( !( events = elemData.events ) ) { - events = elemData.events = {}; - } - if ( !( eventHandle = elemData.handle ) ) { - eventHandle = elemData.handle = function( e ) { - - // Discard the second event of a jQuery.event.trigger() and - // when an event is called after a page has unloaded - return typeof jQuery !== "undefined" && jQuery.event.triggered !== e.type ? - jQuery.event.dispatch.apply( elem, arguments ) : undefined; - }; - } - - // Handle multiple events separated by a space - types = ( types || "" ).match( rnothtmlwhite ) || [ "" ]; - t = types.length; - while ( t-- ) { - tmp = rtypenamespace.exec( types[ t ] ) || []; - type = origType = tmp[ 1 ]; - namespaces = ( tmp[ 2 ] || "" ).split( "." ).sort(); - - // There *must* be a type, no attaching namespace-only handlers - if ( !type ) { - continue; - } - - // If event changes its type, use the special event handlers for the changed type - special = jQuery.event.special[ type ] || {}; - - // If selector defined, determine special event api type, otherwise given type - type = ( selector ? special.delegateType : special.bindType ) || type; - - // Update special based on newly reset type - special = jQuery.event.special[ type ] || {}; - - // handleObj is passed to all event handlers - handleObj = jQuery.extend( { - type: type, - origType: origType, - data: data, - handler: handler, - guid: handler.guid, - selector: selector, - needsContext: selector && jQuery.expr.match.needsContext.test( selector ), - namespace: namespaces.join( "." 
) - }, handleObjIn ); - - // Init the event handler queue if we're the first - if ( !( handlers = events[ type ] ) ) { - handlers = events[ type ] = []; - handlers.delegateCount = 0; - - // Only use addEventListener if the special events handler returns false - if ( !special.setup || - special.setup.call( elem, data, namespaces, eventHandle ) === false ) { - - if ( elem.addEventListener ) { - elem.addEventListener( type, eventHandle ); - } - } - } - - if ( special.add ) { - special.add.call( elem, handleObj ); - - if ( !handleObj.handler.guid ) { - handleObj.handler.guid = handler.guid; - } - } - - // Add to the element's handler list, delegates in front - if ( selector ) { - handlers.splice( handlers.delegateCount++, 0, handleObj ); - } else { - handlers.push( handleObj ); - } - - // Keep track of which events have ever been used, for event optimization - jQuery.event.global[ type ] = true; - } - - }, - - // Detach an event or set of events from an element - remove: function( elem, types, handler, selector, mappedTypes ) { - - var j, origCount, tmp, - events, t, handleObj, - special, handlers, type, namespaces, origType, - elemData = dataPriv.hasData( elem ) && dataPriv.get( elem ); - - if ( !elemData || !( events = elemData.events ) ) { - return; - } - - // Once for each type.namespace in types; type may be omitted - types = ( types || "" ).match( rnothtmlwhite ) || [ "" ]; - t = types.length; - while ( t-- ) { - tmp = rtypenamespace.exec( types[ t ] ) || []; - type = origType = tmp[ 1 ]; - namespaces = ( tmp[ 2 ] || "" ).split( "." ).sort(); - - // Unbind all events (on this namespace, if provided) for the element - if ( !type ) { - for ( type in events ) { - jQuery.event.remove( elem, type + types[ t ], handler, selector, true ); - } - continue; - } - - special = jQuery.event.special[ type ] || {}; - type = ( selector ? 
special.delegateType : special.bindType ) || type; - handlers = events[ type ] || []; - tmp = tmp[ 2 ] && - new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" ); - - // Remove matching events - origCount = j = handlers.length; - while ( j-- ) { - handleObj = handlers[ j ]; - - if ( ( mappedTypes || origType === handleObj.origType ) && - ( !handler || handler.guid === handleObj.guid ) && - ( !tmp || tmp.test( handleObj.namespace ) ) && - ( !selector || selector === handleObj.selector || - selector === "**" && handleObj.selector ) ) { - handlers.splice( j, 1 ); - - if ( handleObj.selector ) { - handlers.delegateCount--; - } - if ( special.remove ) { - special.remove.call( elem, handleObj ); - } - } - } - - // Remove generic event handler if we removed something and no more handlers exist - // (avoids potential for endless recursion during removal of special event handlers) - if ( origCount && !handlers.length ) { - if ( !special.teardown || - special.teardown.call( elem, namespaces, elemData.handle ) === false ) { - - jQuery.removeEvent( elem, type, elemData.handle ); - } - - delete events[ type ]; - } - } - - // Remove data and the expando if it's no longer used - if ( jQuery.isEmptyObject( events ) ) { - dataPriv.remove( elem, "handle events" ); - } - }, - - dispatch: function( nativeEvent ) { - - // Make a writable jQuery.Event from the native event object - var event = jQuery.event.fix( nativeEvent ); - - var i, j, ret, matched, handleObj, handlerQueue, - args = new Array( arguments.length ), - handlers = ( dataPriv.get( this, "events" ) || {} )[ event.type ] || [], - special = jQuery.event.special[ event.type ] || {}; - - // Use the fix-ed jQuery.Event rather than the (read-only) native event - args[ 0 ] = event; - - for ( i = 1; i < arguments.length; i++ ) { - args[ i ] = arguments[ i ]; - } - - event.delegateTarget = this; - - // Call the preDispatch hook for the mapped type, and let it bail if desired - if ( special.preDispatch && special.preDispatch.call( this, event ) === false ) { - return; - } - - // Determine handlers - handlerQueue = jQuery.event.handlers.call( this, event, handlers ); - - // Run delegates first; they may want to stop propagation beneath us - i = 0; - while ( ( matched = handlerQueue[ i++ ] ) && !event.isPropagationStopped() ) { - event.currentTarget = matched.elem; - - j = 0; - while ( ( handleObj = matched.handlers[ j++ ] ) && - !event.isImmediatePropagationStopped() ) { - - // Triggered event must either 1) have no namespace, or 2) have namespace(s) - // a subset or equal to those in the bound event (both can have no namespace). 
- if ( !event.rnamespace || event.rnamespace.test( handleObj.namespace ) ) { - - event.handleObj = handleObj; - event.data = handleObj.data; - - ret = ( ( jQuery.event.special[ handleObj.origType ] || {} ).handle || - handleObj.handler ).apply( matched.elem, args ); - - if ( ret !== undefined ) { - if ( ( event.result = ret ) === false ) { - event.preventDefault(); - event.stopPropagation(); - } - } - } - } - } - - // Call the postDispatch hook for the mapped type - if ( special.postDispatch ) { - special.postDispatch.call( this, event ); - } - - return event.result; - }, - - handlers: function( event, handlers ) { - var i, handleObj, sel, matchedHandlers, matchedSelectors, - handlerQueue = [], - delegateCount = handlers.delegateCount, - cur = event.target; - - // Find delegate handlers - if ( delegateCount && - - // Support: IE <=9 - // Black-hole SVG instance trees (trac-13180) - cur.nodeType && - - // Support: Firefox <=42 - // Suppress spec-violating clicks indicating a non-primary pointer button (trac-3861) - // https://www.w3.org/TR/DOM-Level-3-Events/#event-type-click - // Support: IE 11 only - // ...but not arrow key "clicks" of radio inputs, which can have `button` -1 (gh-2343) - !( event.type === "click" && event.button >= 1 ) ) { - - for ( ; cur !== this; cur = cur.parentNode || this ) { - - // Don't check non-elements (#13208) - // Don't process clicks on disabled elements (#6911, #8165, #11382, #11764) - if ( cur.nodeType === 1 && !( event.type === "click" && cur.disabled === true ) ) { - matchedHandlers = []; - matchedSelectors = {}; - for ( i = 0; i < delegateCount; i++ ) { - handleObj = handlers[ i ]; - - // Don't conflict with Object.prototype properties (#13203) - sel = handleObj.selector + " "; - - if ( matchedSelectors[ sel ] === undefined ) { - matchedSelectors[ sel ] = handleObj.needsContext ? - jQuery( sel, this ).index( cur ) > -1 : - jQuery.find( sel, this, null, [ cur ] ).length; - } - if ( matchedSelectors[ sel ] ) { - matchedHandlers.push( handleObj ); - } - } - if ( matchedHandlers.length ) { - handlerQueue.push( { elem: cur, handlers: matchedHandlers } ); - } - } - } - } - - // Add the remaining (directly-bound) handlers - cur = this; - if ( delegateCount < handlers.length ) { - handlerQueue.push( { elem: cur, handlers: handlers.slice( delegateCount ) } ); - } - - return handlerQueue; - }, - - addProp: function( name, hook ) { - Object.defineProperty( jQuery.Event.prototype, name, { - enumerable: true, - configurable: true, - - get: jQuery.isFunction( hook ) ? - function() { - if ( this.originalEvent ) { - return hook( this.originalEvent ); - } - } : - function() { - if ( this.originalEvent ) { - return this.originalEvent[ name ]; - } - }, - - set: function( value ) { - Object.defineProperty( this, name, { - enumerable: true, - configurable: true, - writable: true, - value: value - } ); - } - } ); - }, - - fix: function( originalEvent ) { - return originalEvent[ jQuery.expando ] ? 
- originalEvent : - new jQuery.Event( originalEvent ); - }, - - special: { - load: { - - // Prevent triggered image.load events from bubbling to window.load - noBubble: true - }, - focus: { - - // Fire native event if possible so blur/focus sequence is correct - trigger: function() { - if ( this !== safeActiveElement() && this.focus ) { - this.focus(); - return false; - } - }, - delegateType: "focusin" - }, - blur: { - trigger: function() { - if ( this === safeActiveElement() && this.blur ) { - this.blur(); - return false; - } - }, - delegateType: "focusout" - }, - click: { - - // For checkbox, fire native event so checked state will be right - trigger: function() { - if ( this.type === "checkbox" && this.click && nodeName( this, "input" ) ) { - this.click(); - return false; - } - }, - - // For cross-browser consistency, don't fire native .click() on links - _default: function( event ) { - return nodeName( event.target, "a" ); - } - }, - - beforeunload: { - postDispatch: function( event ) { - - // Support: Firefox 20+ - // Firefox doesn't alert if the returnValue field is not set. - if ( event.result !== undefined && event.originalEvent ) { - event.originalEvent.returnValue = event.result; - } - } - } - } -}; - -jQuery.removeEvent = function( elem, type, handle ) { - - // This "if" is needed for plain objects - if ( elem.removeEventListener ) { - elem.removeEventListener( type, handle ); - } -}; - -jQuery.Event = function( src, props ) { - - // Allow instantiation without the 'new' keyword - if ( !( this instanceof jQuery.Event ) ) { - return new jQuery.Event( src, props ); - } - - // Event object - if ( src && src.type ) { - this.originalEvent = src; - this.type = src.type; - - // Events bubbling up the document may have been marked as prevented - // by a handler lower down the tree; reflect the correct value. - this.isDefaultPrevented = src.defaultPrevented || - src.defaultPrevented === undefined && - - // Support: Android <=2.3 only - src.returnValue === false ? - returnTrue : - returnFalse; - - // Create target properties - // Support: Safari <=6 - 7 only - // Target should not be a text node (#504, #13143) - this.target = ( src.target && src.target.nodeType === 3 ) ? 
- src.target.parentNode : - src.target; - - this.currentTarget = src.currentTarget; - this.relatedTarget = src.relatedTarget; - - // Event type - } else { - this.type = src; - } - - // Put explicitly provided properties onto the event object - if ( props ) { - jQuery.extend( this, props ); - } - - // Create a timestamp if incoming event doesn't have one - this.timeStamp = src && src.timeStamp || jQuery.now(); - - // Mark it as fixed - this[ jQuery.expando ] = true; -}; - -// jQuery.Event is based on DOM3 Events as specified by the ECMAScript Language Binding -// https://www.w3.org/TR/2003/WD-DOM-Level-3-Events-20030331/ecma-script-binding.html -jQuery.Event.prototype = { - constructor: jQuery.Event, - isDefaultPrevented: returnFalse, - isPropagationStopped: returnFalse, - isImmediatePropagationStopped: returnFalse, - isSimulated: false, - - preventDefault: function() { - var e = this.originalEvent; - - this.isDefaultPrevented = returnTrue; - - if ( e && !this.isSimulated ) { - e.preventDefault(); - } - }, - stopPropagation: function() { - var e = this.originalEvent; - - this.isPropagationStopped = returnTrue; - - if ( e && !this.isSimulated ) { - e.stopPropagation(); - } - }, - stopImmediatePropagation: function() { - var e = this.originalEvent; - - this.isImmediatePropagationStopped = returnTrue; - - if ( e && !this.isSimulated ) { - e.stopImmediatePropagation(); - } - - this.stopPropagation(); - } -}; - -// Includes all common event props including KeyEvent and MouseEvent specific props -jQuery.each( { - altKey: true, - bubbles: true, - cancelable: true, - changedTouches: true, - ctrlKey: true, - detail: true, - eventPhase: true, - metaKey: true, - pageX: true, - pageY: true, - shiftKey: true, - view: true, - "char": true, - charCode: true, - key: true, - keyCode: true, - button: true, - buttons: true, - clientX: true, - clientY: true, - offsetX: true, - offsetY: true, - pointerId: true, - pointerType: true, - screenX: true, - screenY: true, - targetTouches: true, - toElement: true, - touches: true, - - which: function( event ) { - var button = event.button; - - // Add which for key events - if ( event.which == null && rkeyEvent.test( event.type ) ) { - return event.charCode != null ? event.charCode : event.keyCode; - } - - // Add which for click: 1 === left; 2 === middle; 3 === right - if ( !event.which && button !== undefined && rmouseEvent.test( event.type ) ) { - if ( button & 1 ) { - return 1; - } - - if ( button & 2 ) { - return 3; - } - - if ( button & 4 ) { - return 2; - } - - return 0; - } - - return event.which; - } -}, jQuery.event.addProp ); - -// Create mouseenter/leave events using mouseover/out and event-time checks -// so that event delegation works in jQuery. -// Do the same for pointerenter/pointerleave and pointerover/pointerout -// -// Support: Safari 7 only -// Safari sends mouseenter too often; see: -// https://bugs.chromium.org/p/chromium/issues/detail?id=470258 -// for the description of the bug (it existed in older Chrome versions as well). -jQuery.each( { - mouseenter: "mouseover", - mouseleave: "mouseout", - pointerenter: "pointerover", - pointerleave: "pointerout" -}, function( orig, fix ) { - jQuery.event.special[ orig ] = { - delegateType: fix, - bindType: fix, - - handle: function( event ) { - var ret, - target = this, - related = event.relatedTarget, - handleObj = event.handleObj; - - // For mouseenter/leave call the handler if related is outside the target. 
- // NB: No relatedTarget if the mouse left/entered the browser window - if ( !related || ( related !== target && !jQuery.contains( target, related ) ) ) { - event.type = handleObj.origType; - ret = handleObj.handler.apply( this, arguments ); - event.type = fix; - } - return ret; - } - }; -} ); - -jQuery.fn.extend( { - - on: function( types, selector, data, fn ) { - return on( this, types, selector, data, fn ); - }, - one: function( types, selector, data, fn ) { - return on( this, types, selector, data, fn, 1 ); - }, - off: function( types, selector, fn ) { - var handleObj, type; - if ( types && types.preventDefault && types.handleObj ) { - - // ( event ) dispatched jQuery.Event - handleObj = types.handleObj; - jQuery( types.delegateTarget ).off( - handleObj.namespace ? - handleObj.origType + "." + handleObj.namespace : - handleObj.origType, - handleObj.selector, - handleObj.handler - ); - return this; - } - if ( typeof types === "object" ) { - - // ( types-object [, selector] ) - for ( type in types ) { - this.off( type, selector, types[ type ] ); - } - return this; - } - if ( selector === false || typeof selector === "function" ) { - - // ( types [, fn] ) - fn = selector; - selector = undefined; - } - if ( fn === false ) { - fn = returnFalse; - } - return this.each( function() { - jQuery.event.remove( this, types, fn, selector ); - } ); - } -} ); - - -var - - /* eslint-disable max-len */ - - // See https://github.com/eslint/eslint/issues/3229 - rxhtmlTag = /<(?!area|br|col|embed|hr|img|input|link|meta|param)(([a-z][^\/\0>\x20\t\r\n\f]*)[^>]*)\/>/gi, - - /* eslint-enable */ - - // Support: IE <=10 - 11, Edge 12 - 13 - // In IE/Edge using regex groups here causes severe slowdowns. - // See https://connect.microsoft.com/IE/feedback/details/1736512/ - rnoInnerhtml = /\s*$/g; - -// Prefer a tbody over its parent table for containing new rows -function manipulationTarget( elem, content ) { - if ( nodeName( elem, "table" ) && - nodeName( content.nodeType !== 11 ? content : content.firstChild, "tr" ) ) { - - return jQuery( ">tbody", elem )[ 0 ] || elem; - } - - return elem; -} - -// Replace/restore the type attribute of script elements for safe DOM manipulation -function disableScript( elem ) { - elem.type = ( elem.getAttribute( "type" ) !== null ) + "/" + elem.type; - return elem; -} -function restoreScript( elem ) { - var match = rscriptTypeMasked.exec( elem.type ); - - if ( match ) { - elem.type = match[ 1 ]; - } else { - elem.removeAttribute( "type" ); - } - - return elem; -} - -function cloneCopyEvent( src, dest ) { - var i, l, type, pdataOld, pdataCur, udataOld, udataCur, events; - - if ( dest.nodeType !== 1 ) { - return; - } - - // 1. Copy private data: events, handlers, etc. - if ( dataPriv.hasData( src ) ) { - pdataOld = dataPriv.access( src ); - pdataCur = dataPriv.set( dest, pdataOld ); - events = pdataOld.events; - - if ( events ) { - delete pdataCur.handle; - pdataCur.events = {}; - - for ( type in events ) { - for ( i = 0, l = events[ type ].length; i < l; i++ ) { - jQuery.event.add( dest, type, events[ type ][ i ] ); - } - } - } - } - - // 2. Copy user data - if ( dataUser.hasData( src ) ) { - udataOld = dataUser.access( src ); - udataCur = jQuery.extend( {}, udataOld ); - - dataUser.set( dest, udataCur ); - } -} - -// Fix IE bugs, see support tests -function fixInput( src, dest ) { - var nodeName = dest.nodeName.toLowerCase(); - - // Fails to persist the checked state of a cloned checkbox or radio button. 
- if ( nodeName === "input" && rcheckableType.test( src.type ) ) { - dest.checked = src.checked; - - // Fails to return the selected option to the default selected state when cloning options - } else if ( nodeName === "input" || nodeName === "textarea" ) { - dest.defaultValue = src.defaultValue; - } -} - -function domManip( collection, args, callback, ignored ) { - - // Flatten any nested arrays - args = concat.apply( [], args ); - - var fragment, first, scripts, hasScripts, node, doc, - i = 0, - l = collection.length, - iNoClone = l - 1, - value = args[ 0 ], - isFunction = jQuery.isFunction( value ); - - // We can't cloneNode fragments that contain checked, in WebKit - if ( isFunction || - ( l > 1 && typeof value === "string" && - !support.checkClone && rchecked.test( value ) ) ) { - return collection.each( function( index ) { - var self = collection.eq( index ); - if ( isFunction ) { - args[ 0 ] = value.call( this, index, self.html() ); - } - domManip( self, args, callback, ignored ); - } ); - } - - if ( l ) { - fragment = buildFragment( args, collection[ 0 ].ownerDocument, false, collection, ignored ); - first = fragment.firstChild; - - if ( fragment.childNodes.length === 1 ) { - fragment = first; - } - - // Require either new content or an interest in ignored elements to invoke the callback - if ( first || ignored ) { - scripts = jQuery.map( getAll( fragment, "script" ), disableScript ); - hasScripts = scripts.length; - - // Use the original fragment for the last item - // instead of the first because it can end up - // being emptied incorrectly in certain situations (#8070). - for ( ; i < l; i++ ) { - node = fragment; - - if ( i !== iNoClone ) { - node = jQuery.clone( node, true, true ); - - // Keep references to cloned scripts for later restoration - if ( hasScripts ) { - - // Support: Android <=4.0 only, PhantomJS 1 only - // push.apply(_, arraylike) throws on ancient WebKit - jQuery.merge( scripts, getAll( node, "script" ) ); - } - } - - callback.call( collection[ i ], node, i ); - } - - if ( hasScripts ) { - doc = scripts[ scripts.length - 1 ].ownerDocument; - - // Reenable scripts - jQuery.map( scripts, restoreScript ); - - // Evaluate executable scripts on first document insertion - for ( i = 0; i < hasScripts; i++ ) { - node = scripts[ i ]; - if ( rscriptType.test( node.type || "" ) && - !dataPriv.access( node, "globalEval" ) && - jQuery.contains( doc, node ) ) { - - if ( node.src ) { - - // Optional AJAX dependency, but won't run scripts if not present - if ( jQuery._evalUrl ) { - jQuery._evalUrl( node.src ); - } - } else { - DOMEval( node.textContent.replace( rcleanScript, "" ), doc ); - } - } - } - } - } - } - - return collection; -} - -function remove( elem, selector, keepData ) { - var node, - nodes = selector ? 
jQuery.filter( selector, elem ) : elem, - i = 0; - - for ( ; ( node = nodes[ i ] ) != null; i++ ) { - if ( !keepData && node.nodeType === 1 ) { - jQuery.cleanData( getAll( node ) ); - } - - if ( node.parentNode ) { - if ( keepData && jQuery.contains( node.ownerDocument, node ) ) { - setGlobalEval( getAll( node, "script" ) ); - } - node.parentNode.removeChild( node ); - } - } - - return elem; -} - -jQuery.extend( { - htmlPrefilter: function( html ) { - return html.replace( rxhtmlTag, "<$1>" ); - }, - - clone: function( elem, dataAndEvents, deepDataAndEvents ) { - var i, l, srcElements, destElements, - clone = elem.cloneNode( true ), - inPage = jQuery.contains( elem.ownerDocument, elem ); - - // Fix IE cloning issues - if ( !support.noCloneChecked && ( elem.nodeType === 1 || elem.nodeType === 11 ) && - !jQuery.isXMLDoc( elem ) ) { - - // We eschew Sizzle here for performance reasons: https://jsperf.com/getall-vs-sizzle/2 - destElements = getAll( clone ); - srcElements = getAll( elem ); - - for ( i = 0, l = srcElements.length; i < l; i++ ) { - fixInput( srcElements[ i ], destElements[ i ] ); - } - } - - // Copy the events from the original to the clone - if ( dataAndEvents ) { - if ( deepDataAndEvents ) { - srcElements = srcElements || getAll( elem ); - destElements = destElements || getAll( clone ); - - for ( i = 0, l = srcElements.length; i < l; i++ ) { - cloneCopyEvent( srcElements[ i ], destElements[ i ] ); - } - } else { - cloneCopyEvent( elem, clone ); - } - } - - // Preserve script evaluation history - destElements = getAll( clone, "script" ); - if ( destElements.length > 0 ) { - setGlobalEval( destElements, !inPage && getAll( elem, "script" ) ); - } - - // Return the cloned set - return clone; - }, - - cleanData: function( elems ) { - var data, elem, type, - special = jQuery.event.special, - i = 0; - - for ( ; ( elem = elems[ i ] ) !== undefined; i++ ) { - if ( acceptData( elem ) ) { - if ( ( data = elem[ dataPriv.expando ] ) ) { - if ( data.events ) { - for ( type in data.events ) { - if ( special[ type ] ) { - jQuery.event.remove( elem, type ); - - // This is a shortcut to avoid jQuery.event.remove's overhead - } else { - jQuery.removeEvent( elem, type, data.handle ); - } - } - } - - // Support: Chrome <=35 - 45+ - // Assign undefined instead of using delete, see Data#remove - elem[ dataPriv.expando ] = undefined; - } - if ( elem[ dataUser.expando ] ) { - - // Support: Chrome <=35 - 45+ - // Assign undefined instead of using delete, see Data#remove - elem[ dataUser.expando ] = undefined; - } - } - } - } -} ); - -jQuery.fn.extend( { - detach: function( selector ) { - return remove( this, selector, true ); - }, - - remove: function( selector ) { - return remove( this, selector ); - }, - - text: function( value ) { - return access( this, function( value ) { - return value === undefined ? 
- jQuery.text( this ) : - this.empty().each( function() { - if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { - this.textContent = value; - } - } ); - }, null, value, arguments.length ); - }, - - append: function() { - return domManip( this, arguments, function( elem ) { - if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { - var target = manipulationTarget( this, elem ); - target.appendChild( elem ); - } - } ); - }, - - prepend: function() { - return domManip( this, arguments, function( elem ) { - if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { - var target = manipulationTarget( this, elem ); - target.insertBefore( elem, target.firstChild ); - } - } ); - }, - - before: function() { - return domManip( this, arguments, function( elem ) { - if ( this.parentNode ) { - this.parentNode.insertBefore( elem, this ); - } - } ); - }, - - after: function() { - return domManip( this, arguments, function( elem ) { - if ( this.parentNode ) { - this.parentNode.insertBefore( elem, this.nextSibling ); - } - } ); - }, - - empty: function() { - var elem, - i = 0; - - for ( ; ( elem = this[ i ] ) != null; i++ ) { - if ( elem.nodeType === 1 ) { - - // Prevent memory leaks - jQuery.cleanData( getAll( elem, false ) ); - - // Remove any remaining nodes - elem.textContent = ""; - } - } - - return this; - }, - - clone: function( dataAndEvents, deepDataAndEvents ) { - dataAndEvents = dataAndEvents == null ? false : dataAndEvents; - deepDataAndEvents = deepDataAndEvents == null ? dataAndEvents : deepDataAndEvents; - - return this.map( function() { - return jQuery.clone( this, dataAndEvents, deepDataAndEvents ); - } ); - }, - - html: function( value ) { - return access( this, function( value ) { - var elem = this[ 0 ] || {}, - i = 0, - l = this.length; - - if ( value === undefined && elem.nodeType === 1 ) { - return elem.innerHTML; - } - - // See if we can take a shortcut and just use innerHTML - if ( typeof value === "string" && !rnoInnerhtml.test( value ) && - !wrapMap[ ( rtagName.exec( value ) || [ "", "" ] )[ 1 ].toLowerCase() ] ) { - - value = jQuery.htmlPrefilter( value ); - - try { - for ( ; i < l; i++ ) { - elem = this[ i ] || {}; - - // Remove element nodes and prevent memory leaks - if ( elem.nodeType === 1 ) { - jQuery.cleanData( getAll( elem, false ) ); - elem.innerHTML = value; - } - } - - elem = 0; - - // If using innerHTML throws an exception, use the fallback method - } catch ( e ) {} - } - - if ( elem ) { - this.empty().append( value ); - } - }, null, value, arguments.length ); - }, - - replaceWith: function() { - var ignored = []; - - // Make the changes, replacing each non-ignored context element with the new content - return domManip( this, arguments, function( elem ) { - var parent = this.parentNode; - - if ( jQuery.inArray( this, ignored ) < 0 ) { - jQuery.cleanData( getAll( this ) ); - if ( parent ) { - parent.replaceChild( elem, this ); - } - } - - // Force callback invocation - }, ignored ); - } -} ); - -jQuery.each( { - appendTo: "append", - prependTo: "prepend", - insertBefore: "before", - insertAfter: "after", - replaceAll: "replaceWith" -}, function( name, original ) { - jQuery.fn[ name ] = function( selector ) { - var elems, - ret = [], - insert = jQuery( selector ), - last = insert.length - 1, - i = 0; - - for ( ; i <= last; i++ ) { - elems = i === last ? 
this : this.clone( true ); - jQuery( insert[ i ] )[ original ]( elems ); - - // Support: Android <=4.0 only, PhantomJS 1 only - // .get() because push.apply(_, arraylike) throws on ancient WebKit - push.apply( ret, elems.get() ); - } - - return this.pushStack( ret ); - }; -} ); -var rmargin = ( /^margin/ ); - -var rnumnonpx = new RegExp( "^(" + pnum + ")(?!px)[a-z%]+$", "i" ); - -var getStyles = function( elem ) { - - // Support: IE <=11 only, Firefox <=30 (#15098, #14150) - // IE throws on elements created in popups - // FF meanwhile throws on frame elements through "defaultView.getComputedStyle" - var view = elem.ownerDocument.defaultView; - - if ( !view || !view.opener ) { - view = window; - } - - return view.getComputedStyle( elem ); - }; - - - -( function() { - - // Executing both pixelPosition & boxSizingReliable tests require only one layout - // so they're executed at the same time to save the second computation. - function computeStyleTests() { - - // This is a singleton, we need to execute it only once - if ( !div ) { - return; - } - - div.style.cssText = - "box-sizing:border-box;" + - "position:relative;display:block;" + - "margin:auto;border:1px;padding:1px;" + - "top:1%;width:50%"; - div.innerHTML = ""; - documentElement.appendChild( container ); - - var divStyle = window.getComputedStyle( div ); - pixelPositionVal = divStyle.top !== "1%"; - - // Support: Android 4.0 - 4.3 only, Firefox <=3 - 44 - reliableMarginLeftVal = divStyle.marginLeft === "2px"; - boxSizingReliableVal = divStyle.width === "4px"; - - // Support: Android 4.0 - 4.3 only - // Some styles come back with percentage values, even though they shouldn't - div.style.marginRight = "50%"; - pixelMarginRightVal = divStyle.marginRight === "4px"; - - documentElement.removeChild( container ); - - // Nullify the div so it wouldn't be stored in the memory and - // it will also be a sign that checks already performed - div = null; - } - - var pixelPositionVal, boxSizingReliableVal, pixelMarginRightVal, reliableMarginLeftVal, - container = document.createElement( "div" ), - div = document.createElement( "div" ); - - // Finish early in limited (non-browser) environments - if ( !div.style ) { - return; - } - - // Support: IE <=9 - 11 only - // Style of cloned element affects source element cloned (#8908) - div.style.backgroundClip = "content-box"; - div.cloneNode( true ).style.backgroundClip = ""; - support.clearCloneStyle = div.style.backgroundClip === "content-box"; - - container.style.cssText = "border:0;width:8px;height:0;top:0;left:-9999px;" + - "padding:0;margin-top:1px;position:absolute"; - container.appendChild( div ); - - jQuery.extend( support, { - pixelPosition: function() { - computeStyleTests(); - return pixelPositionVal; - }, - boxSizingReliable: function() { - computeStyleTests(); - return boxSizingReliableVal; - }, - pixelMarginRight: function() { - computeStyleTests(); - return pixelMarginRightVal; - }, - reliableMarginLeft: function() { - computeStyleTests(); - return reliableMarginLeftVal; - } - } ); -} )(); - - -function curCSS( elem, name, computed ) { - var width, minWidth, maxWidth, ret, - - // Support: Firefox 51+ - // Retrieving style before computed somehow - // fixes an issue with getting wrong values - // on detached elements - style = elem.style; - - computed = computed || getStyles( elem ); - - // getPropertyValue is needed for: - // .css('filter') (IE 9 only, #12537) - // .css('--customProperty) (#3144) - if ( computed ) { - ret = computed.getPropertyValue( name ) || computed[ name ]; - - if ( 
ret === "" && !jQuery.contains( elem.ownerDocument, elem ) ) { - ret = jQuery.style( elem, name ); - } - - // A tribute to the "awesome hack by Dean Edwards" - // Android Browser returns percentage for some values, - // but width seems to be reliably pixels. - // This is against the CSSOM draft spec: - // https://drafts.csswg.org/cssom/#resolved-values - if ( !support.pixelMarginRight() && rnumnonpx.test( ret ) && rmargin.test( name ) ) { - - // Remember the original values - width = style.width; - minWidth = style.minWidth; - maxWidth = style.maxWidth; - - // Put in the new values to get a computed value out - style.minWidth = style.maxWidth = style.width = ret; - ret = computed.width; - - // Revert the changed values - style.width = width; - style.minWidth = minWidth; - style.maxWidth = maxWidth; - } - } - - return ret !== undefined ? - - // Support: IE <=9 - 11 only - // IE returns zIndex value as an integer. - ret + "" : - ret; -} - - -function addGetHookIf( conditionFn, hookFn ) { - - // Define the hook, we'll check on the first run if it's really needed. - return { - get: function() { - if ( conditionFn() ) { - - // Hook not needed (or it's not possible to use it due - // to missing dependency), remove it. - delete this.get; - return; - } - - // Hook needed; redefine it so that the support test is not executed again. - return ( this.get = hookFn ).apply( this, arguments ); - } - }; -} - - -var - - // Swappable if display is none or starts with table - // except "table", "table-cell", or "table-caption" - // See here for display values: https://developer.mozilla.org/en-US/docs/CSS/display - rdisplayswap = /^(none|table(?!-c[ea]).+)/, - rcustomProp = /^--/, - cssShow = { position: "absolute", visibility: "hidden", display: "block" }, - cssNormalTransform = { - letterSpacing: "0", - fontWeight: "400" - }, - - cssPrefixes = [ "Webkit", "Moz", "ms" ], - emptyStyle = document.createElement( "div" ).style; - -// Return a css property mapped to a potentially vendor prefixed property -function vendorPropName( name ) { - - // Shortcut for names that are not vendor prefixed - if ( name in emptyStyle ) { - return name; - } - - // Check for vendor prefixed names - var capName = name[ 0 ].toUpperCase() + name.slice( 1 ), - i = cssPrefixes.length; - - while ( i-- ) { - name = cssPrefixes[ i ] + capName; - if ( name in emptyStyle ) { - return name; - } - } -} - -// Return a property mapped along what jQuery.cssProps suggests or to -// a vendor prefixed property. -function finalPropName( name ) { - var ret = jQuery.cssProps[ name ]; - if ( !ret ) { - ret = jQuery.cssProps[ name ] = vendorPropName( name ) || name; - } - return ret; -} - -function setPositiveNumber( elem, value, subtract ) { - - // Any relative (+/-) values have already been - // normalized at this point - var matches = rcssNum.exec( value ); - return matches ? - - // Guard against undefined "subtract", e.g., when used as in cssHooks - Math.max( 0, matches[ 2 ] - ( subtract || 0 ) ) + ( matches[ 3 ] || "px" ) : - value; -} - -function augmentWidthOrHeight( elem, name, extra, isBorderBox, styles ) { - var i, - val = 0; - - // If we already have the right measurement, avoid augmentation - if ( extra === ( isBorderBox ? "border" : "content" ) ) { - i = 4; - - // Otherwise initialize for horizontal or vertical properties - } else { - i = name === "width" ? 
1 : 0; - } - - for ( ; i < 4; i += 2 ) { - - // Both box models exclude margin, so add it if we want it - if ( extra === "margin" ) { - val += jQuery.css( elem, extra + cssExpand[ i ], true, styles ); - } - - if ( isBorderBox ) { - - // border-box includes padding, so remove it if we want content - if ( extra === "content" ) { - val -= jQuery.css( elem, "padding" + cssExpand[ i ], true, styles ); - } - - // At this point, extra isn't border nor margin, so remove border - if ( extra !== "margin" ) { - val -= jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); - } - } else { - - // At this point, extra isn't content, so add padding - val += jQuery.css( elem, "padding" + cssExpand[ i ], true, styles ); - - // At this point, extra isn't content nor padding, so add border - if ( extra !== "padding" ) { - val += jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); - } - } - } - - return val; -} - -function getWidthOrHeight( elem, name, extra ) { - - // Start with computed style - var valueIsBorderBox, - styles = getStyles( elem ), - val = curCSS( elem, name, styles ), - isBorderBox = jQuery.css( elem, "boxSizing", false, styles ) === "border-box"; - - // Computed unit is not pixels. Stop here and return. - if ( rnumnonpx.test( val ) ) { - return val; - } - - // Check for style in case a browser which returns unreliable values - // for getComputedStyle silently falls back to the reliable elem.style - valueIsBorderBox = isBorderBox && - ( support.boxSizingReliable() || val === elem.style[ name ] ); - - // Fall back to offsetWidth/Height when value is "auto" - // This happens for inline elements with no explicit setting (gh-3571) - if ( val === "auto" ) { - val = elem[ "offset" + name[ 0 ].toUpperCase() + name.slice( 1 ) ]; - } - - // Normalize "", auto, and prepare for extra - val = parseFloat( val ) || 0; - - // Use the active box-sizing model to add/subtract irrelevant styles - return ( val + - augmentWidthOrHeight( - elem, - name, - extra || ( isBorderBox ? "border" : "content" ), - valueIsBorderBox, - styles - ) - ) + "px"; -} - -jQuery.extend( { - - // Add in style property hooks for overriding the default - // behavior of getting and setting a style property - cssHooks: { - opacity: { - get: function( elem, computed ) { - if ( computed ) { - - // We should always get a number back from opacity - var ret = curCSS( elem, "opacity" ); - return ret === "" ? "1" : ret; - } - } - } - }, - - // Don't automatically add "px" to these possibly-unitless properties - cssNumber: { - "animationIterationCount": true, - "columnCount": true, - "fillOpacity": true, - "flexGrow": true, - "flexShrink": true, - "fontWeight": true, - "lineHeight": true, - "opacity": true, - "order": true, - "orphans": true, - "widows": true, - "zIndex": true, - "zoom": true - }, - - // Add in properties whose names you wish to fix before - // setting or getting the value - cssProps: { - "float": "cssFloat" - }, - - // Get and set the style property on a DOM Node - style: function( elem, name, value, extra ) { - - // Don't set styles on text and comment nodes - if ( !elem || elem.nodeType === 3 || elem.nodeType === 8 || !elem.style ) { - return; - } - - // Make sure that we're working with the right name - var ret, type, hooks, - origName = jQuery.camelCase( name ), - isCustomProp = rcustomProp.test( name ), - style = elem.style; - - // Make sure that we're working with the right name. We don't - // want to query the value if it is a CSS custom property - // since they are user-defined. 
- if ( !isCustomProp ) { - name = finalPropName( origName ); - } - - // Gets hook for the prefixed version, then unprefixed version - hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ]; - - // Check if we're setting a value - if ( value !== undefined ) { - type = typeof value; - - // Convert "+=" or "-=" to relative numbers (#7345) - if ( type === "string" && ( ret = rcssNum.exec( value ) ) && ret[ 1 ] ) { - value = adjustCSS( elem, name, ret ); - - // Fixes bug #9237 - type = "number"; - } - - // Make sure that null and NaN values aren't set (#7116) - if ( value == null || value !== value ) { - return; - } - - // If a number was passed in, add the unit (except for certain CSS properties) - if ( type === "number" ) { - value += ret && ret[ 3 ] || ( jQuery.cssNumber[ origName ] ? "" : "px" ); - } - - // background-* props affect original clone's values - if ( !support.clearCloneStyle && value === "" && name.indexOf( "background" ) === 0 ) { - style[ name ] = "inherit"; - } - - // If a hook was provided, use that value, otherwise just set the specified value - if ( !hooks || !( "set" in hooks ) || - ( value = hooks.set( elem, value, extra ) ) !== undefined ) { - - if ( isCustomProp ) { - style.setProperty( name, value ); - } else { - style[ name ] = value; - } - } - - } else { - - // If a hook was provided get the non-computed value from there - if ( hooks && "get" in hooks && - ( ret = hooks.get( elem, false, extra ) ) !== undefined ) { - - return ret; - } - - // Otherwise just get the value from the style object - return style[ name ]; - } - }, - - css: function( elem, name, extra, styles ) { - var val, num, hooks, - origName = jQuery.camelCase( name ), - isCustomProp = rcustomProp.test( name ); - - // Make sure that we're working with the right name. We don't - // want to modify the value if it is a CSS custom property - // since they are user-defined. - if ( !isCustomProp ) { - name = finalPropName( origName ); - } - - // Try prefixed name followed by the unprefixed name - hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ]; - - // If a hook was provided get the computed value from there - if ( hooks && "get" in hooks ) { - val = hooks.get( elem, true, extra ); - } - - // Otherwise, if a way to get the computed value exists, use that - if ( val === undefined ) { - val = curCSS( elem, name, styles ); - } - - // Convert "normal" to computed value - if ( val === "normal" && name in cssNormalTransform ) { - val = cssNormalTransform[ name ]; - } - - // Make numeric if forced or a qualifier was provided and val looks numeric - if ( extra === "" || extra ) { - num = parseFloat( val ); - return extra === true || isFinite( num ) ? num || 0 : val; - } - - return val; - } -} ); - -jQuery.each( [ "height", "width" ], function( i, name ) { - jQuery.cssHooks[ name ] = { - get: function( elem, computed, extra ) { - if ( computed ) { - - // Certain elements can have dimension info if we invisibly show them - // but it must have a current display style that would benefit - return rdisplayswap.test( jQuery.css( elem, "display" ) ) && - - // Support: Safari 8+ - // Table columns in Safari have non-zero offsetWidth & zero - // getBoundingClientRect().width unless display is changed. - // Support: IE <=11 only - // Running getBoundingClientRect on a disconnected node - // in IE throws an error. - ( !elem.getClientRects().length || !elem.getBoundingClientRect().width ) ? 
- swap( elem, cssShow, function() { - return getWidthOrHeight( elem, name, extra ); - } ) : - getWidthOrHeight( elem, name, extra ); - } - }, - - set: function( elem, value, extra ) { - var matches, - styles = extra && getStyles( elem ), - subtract = extra && augmentWidthOrHeight( - elem, - name, - extra, - jQuery.css( elem, "boxSizing", false, styles ) === "border-box", - styles - ); - - // Convert to pixels if value adjustment is needed - if ( subtract && ( matches = rcssNum.exec( value ) ) && - ( matches[ 3 ] || "px" ) !== "px" ) { - - elem.style[ name ] = value; - value = jQuery.css( elem, name ); - } - - return setPositiveNumber( elem, value, subtract ); - } - }; -} ); - -jQuery.cssHooks.marginLeft = addGetHookIf( support.reliableMarginLeft, - function( elem, computed ) { - if ( computed ) { - return ( parseFloat( curCSS( elem, "marginLeft" ) ) || - elem.getBoundingClientRect().left - - swap( elem, { marginLeft: 0 }, function() { - return elem.getBoundingClientRect().left; - } ) - ) + "px"; - } - } -); - -// These hooks are used by animate to expand properties -jQuery.each( { - margin: "", - padding: "", - border: "Width" -}, function( prefix, suffix ) { - jQuery.cssHooks[ prefix + suffix ] = { - expand: function( value ) { - var i = 0, - expanded = {}, - - // Assumes a single number if not a string - parts = typeof value === "string" ? value.split( " " ) : [ value ]; - - for ( ; i < 4; i++ ) { - expanded[ prefix + cssExpand[ i ] + suffix ] = - parts[ i ] || parts[ i - 2 ] || parts[ 0 ]; - } - - return expanded; - } - }; - - if ( !rmargin.test( prefix ) ) { - jQuery.cssHooks[ prefix + suffix ].set = setPositiveNumber; - } -} ); - -jQuery.fn.extend( { - css: function( name, value ) { - return access( this, function( elem, name, value ) { - var styles, len, - map = {}, - i = 0; - - if ( Array.isArray( name ) ) { - styles = getStyles( elem ); - len = name.length; - - for ( ; i < len; i++ ) { - map[ name[ i ] ] = jQuery.css( elem, name[ i ], false, styles ); - } - - return map; - } - - return value !== undefined ? - jQuery.style( elem, name, value ) : - jQuery.css( elem, name ); - }, name, value, arguments.length > 1 ); - } -} ); - - -function Tween( elem, options, prop, end, easing ) { - return new Tween.prototype.init( elem, options, prop, end, easing ); -} -jQuery.Tween = Tween; - -Tween.prototype = { - constructor: Tween, - init: function( elem, options, prop, end, easing, unit ) { - this.elem = elem; - this.prop = prop; - this.easing = easing || jQuery.easing._default; - this.options = options; - this.start = this.now = this.cur(); - this.end = end; - this.unit = unit || ( jQuery.cssNumber[ prop ] ? "" : "px" ); - }, - cur: function() { - var hooks = Tween.propHooks[ this.prop ]; - - return hooks && hooks.get ? 
- hooks.get( this ) : - Tween.propHooks._default.get( this ); - }, - run: function( percent ) { - var eased, - hooks = Tween.propHooks[ this.prop ]; - - if ( this.options.duration ) { - this.pos = eased = jQuery.easing[ this.easing ]( - percent, this.options.duration * percent, 0, 1, this.options.duration - ); - } else { - this.pos = eased = percent; - } - this.now = ( this.end - this.start ) * eased + this.start; - - if ( this.options.step ) { - this.options.step.call( this.elem, this.now, this ); - } - - if ( hooks && hooks.set ) { - hooks.set( this ); - } else { - Tween.propHooks._default.set( this ); - } - return this; - } -}; - -Tween.prototype.init.prototype = Tween.prototype; - -Tween.propHooks = { - _default: { - get: function( tween ) { - var result; - - // Use a property on the element directly when it is not a DOM element, - // or when there is no matching style property that exists. - if ( tween.elem.nodeType !== 1 || - tween.elem[ tween.prop ] != null && tween.elem.style[ tween.prop ] == null ) { - return tween.elem[ tween.prop ]; - } - - // Passing an empty string as a 3rd parameter to .css will automatically - // attempt a parseFloat and fallback to a string if the parse fails. - // Simple values such as "10px" are parsed to Float; - // complex values such as "rotate(1rad)" are returned as-is. - result = jQuery.css( tween.elem, tween.prop, "" ); - - // Empty strings, null, undefined and "auto" are converted to 0. - return !result || result === "auto" ? 0 : result; - }, - set: function( tween ) { - - // Use step hook for back compat. - // Use cssHook if its there. - // Use .style if available and use plain properties where available. - if ( jQuery.fx.step[ tween.prop ] ) { - jQuery.fx.step[ tween.prop ]( tween ); - } else if ( tween.elem.nodeType === 1 && - ( tween.elem.style[ jQuery.cssProps[ tween.prop ] ] != null || - jQuery.cssHooks[ tween.prop ] ) ) { - jQuery.style( tween.elem, tween.prop, tween.now + tween.unit ); - } else { - tween.elem[ tween.prop ] = tween.now; - } - } - } -}; - -// Support: IE <=9 only -// Panic based approach to setting things on disconnected nodes -Tween.propHooks.scrollTop = Tween.propHooks.scrollLeft = { - set: function( tween ) { - if ( tween.elem.nodeType && tween.elem.parentNode ) { - tween.elem[ tween.prop ] = tween.now; - } - } -}; - -jQuery.easing = { - linear: function( p ) { - return p; - }, - swing: function( p ) { - return 0.5 - Math.cos( p * Math.PI ) / 2; - }, - _default: "swing" -}; - -jQuery.fx = Tween.prototype.init; - -// Back compat <1.8 extension point -jQuery.fx.step = {}; - - - - -var - fxNow, inProgress, - rfxtypes = /^(?:toggle|show|hide)$/, - rrun = /queueHooks$/; - -function schedule() { - if ( inProgress ) { - if ( document.hidden === false && window.requestAnimationFrame ) { - window.requestAnimationFrame( schedule ); - } else { - window.setTimeout( schedule, jQuery.fx.interval ); - } - - jQuery.fx.tick(); - } -} - -// Animations created synchronously will run synchronously -function createFxNow() { - window.setTimeout( function() { - fxNow = undefined; - } ); - return ( fxNow = jQuery.now() ); -} - -// Generate parameters to create a standard animation -function genFx( type, includeWidth ) { - var which, - i = 0, - attrs = { height: type }; - - // If we include width, step value is 1 to do all cssExpand values, - // otherwise step value is 2 to skip over Left and Right - includeWidth = includeWidth ? 
1 : 0; - for ( ; i < 4; i += 2 - includeWidth ) { - which = cssExpand[ i ]; - attrs[ "margin" + which ] = attrs[ "padding" + which ] = type; - } - - if ( includeWidth ) { - attrs.opacity = attrs.width = type; - } - - return attrs; -} - -function createTween( value, prop, animation ) { - var tween, - collection = ( Animation.tweeners[ prop ] || [] ).concat( Animation.tweeners[ "*" ] ), - index = 0, - length = collection.length; - for ( ; index < length; index++ ) { - if ( ( tween = collection[ index ].call( animation, prop, value ) ) ) { - - // We're done with this property - return tween; - } - } -} - -function defaultPrefilter( elem, props, opts ) { - var prop, value, toggle, hooks, oldfire, propTween, restoreDisplay, display, - isBox = "width" in props || "height" in props, - anim = this, - orig = {}, - style = elem.style, - hidden = elem.nodeType && isHiddenWithinTree( elem ), - dataShow = dataPriv.get( elem, "fxshow" ); - - // Queue-skipping animations hijack the fx hooks - if ( !opts.queue ) { - hooks = jQuery._queueHooks( elem, "fx" ); - if ( hooks.unqueued == null ) { - hooks.unqueued = 0; - oldfire = hooks.empty.fire; - hooks.empty.fire = function() { - if ( !hooks.unqueued ) { - oldfire(); - } - }; - } - hooks.unqueued++; - - anim.always( function() { - - // Ensure the complete handler is called before this completes - anim.always( function() { - hooks.unqueued--; - if ( !jQuery.queue( elem, "fx" ).length ) { - hooks.empty.fire(); - } - } ); - } ); - } - - // Detect show/hide animations - for ( prop in props ) { - value = props[ prop ]; - if ( rfxtypes.test( value ) ) { - delete props[ prop ]; - toggle = toggle || value === "toggle"; - if ( value === ( hidden ? "hide" : "show" ) ) { - - // Pretend to be hidden if this is a "show" and - // there is still data from a stopped show/hide - if ( value === "show" && dataShow && dataShow[ prop ] !== undefined ) { - hidden = true; - - // Ignore all other no-op show/hide data - } else { - continue; - } - } - orig[ prop ] = dataShow && dataShow[ prop ] || jQuery.style( elem, prop ); - } - } - - // Bail out if this is a no-op like .hide().hide() - propTween = !jQuery.isEmptyObject( props ); - if ( !propTween && jQuery.isEmptyObject( orig ) ) { - return; - } - - // Restrict "overflow" and "display" styles during box animations - if ( isBox && elem.nodeType === 1 ) { - - // Support: IE <=9 - 11, Edge 12 - 13 - // Record all 3 overflow attributes because IE does not infer the shorthand - // from identically-valued overflowX and overflowY - opts.overflow = [ style.overflow, style.overflowX, style.overflowY ]; - - // Identify a display type, preferring old show/hide data over the CSS cascade - restoreDisplay = dataShow && dataShow.display; - if ( restoreDisplay == null ) { - restoreDisplay = dataPriv.get( elem, "display" ); - } - display = jQuery.css( elem, "display" ); - if ( display === "none" ) { - if ( restoreDisplay ) { - display = restoreDisplay; - } else { - - // Get nonempty value(s) by temporarily forcing visibility - showHide( [ elem ], true ); - restoreDisplay = elem.style.display || restoreDisplay; - display = jQuery.css( elem, "display" ); - showHide( [ elem ] ); - } - } - - // Animate inline elements as inline-block - if ( display === "inline" || display === "inline-block" && restoreDisplay != null ) { - if ( jQuery.css( elem, "float" ) === "none" ) { - - // Restore the original display value at the end of pure show/hide animations - if ( !propTween ) { - anim.done( function() { - style.display = restoreDisplay; - } ); - if ( 
restoreDisplay == null ) { - display = style.display; - restoreDisplay = display === "none" ? "" : display; - } - } - style.display = "inline-block"; - } - } - } - - if ( opts.overflow ) { - style.overflow = "hidden"; - anim.always( function() { - style.overflow = opts.overflow[ 0 ]; - style.overflowX = opts.overflow[ 1 ]; - style.overflowY = opts.overflow[ 2 ]; - } ); - } - - // Implement show/hide animations - propTween = false; - for ( prop in orig ) { - - // General show/hide setup for this element animation - if ( !propTween ) { - if ( dataShow ) { - if ( "hidden" in dataShow ) { - hidden = dataShow.hidden; - } - } else { - dataShow = dataPriv.access( elem, "fxshow", { display: restoreDisplay } ); - } - - // Store hidden/visible for toggle so `.stop().toggle()` "reverses" - if ( toggle ) { - dataShow.hidden = !hidden; - } - - // Show elements before animating them - if ( hidden ) { - showHide( [ elem ], true ); - } - - /* eslint-disable no-loop-func */ - - anim.done( function() { - - /* eslint-enable no-loop-func */ - - // The final step of a "hide" animation is actually hiding the element - if ( !hidden ) { - showHide( [ elem ] ); - } - dataPriv.remove( elem, "fxshow" ); - for ( prop in orig ) { - jQuery.style( elem, prop, orig[ prop ] ); - } - } ); - } - - // Per-property setup - propTween = createTween( hidden ? dataShow[ prop ] : 0, prop, anim ); - if ( !( prop in dataShow ) ) { - dataShow[ prop ] = propTween.start; - if ( hidden ) { - propTween.end = propTween.start; - propTween.start = 0; - } - } - } -} - -function propFilter( props, specialEasing ) { - var index, name, easing, value, hooks; - - // camelCase, specialEasing and expand cssHook pass - for ( index in props ) { - name = jQuery.camelCase( index ); - easing = specialEasing[ name ]; - value = props[ index ]; - if ( Array.isArray( value ) ) { - easing = value[ 1 ]; - value = props[ index ] = value[ 0 ]; - } - - if ( index !== name ) { - props[ name ] = value; - delete props[ index ]; - } - - hooks = jQuery.cssHooks[ name ]; - if ( hooks && "expand" in hooks ) { - value = hooks.expand( value ); - delete props[ name ]; - - // Not quite $.extend, this won't overwrite existing keys. 
- // Reusing 'index' because we have the correct "name" - for ( index in value ) { - if ( !( index in props ) ) { - props[ index ] = value[ index ]; - specialEasing[ index ] = easing; - } - } - } else { - specialEasing[ name ] = easing; - } - } -} - -function Animation( elem, properties, options ) { - var result, - stopped, - index = 0, - length = Animation.prefilters.length, - deferred = jQuery.Deferred().always( function() { - - // Don't match elem in the :animated selector - delete tick.elem; - } ), - tick = function() { - if ( stopped ) { - return false; - } - var currentTime = fxNow || createFxNow(), - remaining = Math.max( 0, animation.startTime + animation.duration - currentTime ), - - // Support: Android 2.3 only - // Archaic crash bug won't allow us to use `1 - ( 0.5 || 0 )` (#12497) - temp = remaining / animation.duration || 0, - percent = 1 - temp, - index = 0, - length = animation.tweens.length; - - for ( ; index < length; index++ ) { - animation.tweens[ index ].run( percent ); - } - - deferred.notifyWith( elem, [ animation, percent, remaining ] ); - - // If there's more to do, yield - if ( percent < 1 && length ) { - return remaining; - } - - // If this was an empty animation, synthesize a final progress notification - if ( !length ) { - deferred.notifyWith( elem, [ animation, 1, 0 ] ); - } - - // Resolve the animation and report its conclusion - deferred.resolveWith( elem, [ animation ] ); - return false; - }, - animation = deferred.promise( { - elem: elem, - props: jQuery.extend( {}, properties ), - opts: jQuery.extend( true, { - specialEasing: {}, - easing: jQuery.easing._default - }, options ), - originalProperties: properties, - originalOptions: options, - startTime: fxNow || createFxNow(), - duration: options.duration, - tweens: [], - createTween: function( prop, end ) { - var tween = jQuery.Tween( elem, animation.opts, prop, end, - animation.opts.specialEasing[ prop ] || animation.opts.easing ); - animation.tweens.push( tween ); - return tween; - }, - stop: function( gotoEnd ) { - var index = 0, - - // If we are going to the end, we want to run all the tweens - // otherwise we skip this part - length = gotoEnd ? 
animation.tweens.length : 0; - if ( stopped ) { - return this; - } - stopped = true; - for ( ; index < length; index++ ) { - animation.tweens[ index ].run( 1 ); - } - - // Resolve when we played the last frame; otherwise, reject - if ( gotoEnd ) { - deferred.notifyWith( elem, [ animation, 1, 0 ] ); - deferred.resolveWith( elem, [ animation, gotoEnd ] ); - } else { - deferred.rejectWith( elem, [ animation, gotoEnd ] ); - } - return this; - } - } ), - props = animation.props; - - propFilter( props, animation.opts.specialEasing ); - - for ( ; index < length; index++ ) { - result = Animation.prefilters[ index ].call( animation, elem, props, animation.opts ); - if ( result ) { - if ( jQuery.isFunction( result.stop ) ) { - jQuery._queueHooks( animation.elem, animation.opts.queue ).stop = - jQuery.proxy( result.stop, result ); - } - return result; - } - } - - jQuery.map( props, createTween, animation ); - - if ( jQuery.isFunction( animation.opts.start ) ) { - animation.opts.start.call( elem, animation ); - } - - // Attach callbacks from options - animation - .progress( animation.opts.progress ) - .done( animation.opts.done, animation.opts.complete ) - .fail( animation.opts.fail ) - .always( animation.opts.always ); - - jQuery.fx.timer( - jQuery.extend( tick, { - elem: elem, - anim: animation, - queue: animation.opts.queue - } ) - ); - - return animation; -} - -jQuery.Animation = jQuery.extend( Animation, { - - tweeners: { - "*": [ function( prop, value ) { - var tween = this.createTween( prop, value ); - adjustCSS( tween.elem, prop, rcssNum.exec( value ), tween ); - return tween; - } ] - }, - - tweener: function( props, callback ) { - if ( jQuery.isFunction( props ) ) { - callback = props; - props = [ "*" ]; - } else { - props = props.match( rnothtmlwhite ); - } - - var prop, - index = 0, - length = props.length; - - for ( ; index < length; index++ ) { - prop = props[ index ]; - Animation.tweeners[ prop ] = Animation.tweeners[ prop ] || []; - Animation.tweeners[ prop ].unshift( callback ); - } - }, - - prefilters: [ defaultPrefilter ], - - prefilter: function( callback, prepend ) { - if ( prepend ) { - Animation.prefilters.unshift( callback ); - } else { - Animation.prefilters.push( callback ); - } - } -} ); - -jQuery.speed = function( speed, easing, fn ) { - var opt = speed && typeof speed === "object" ? 
jQuery.extend( {}, speed ) : { - complete: fn || !fn && easing || - jQuery.isFunction( speed ) && speed, - duration: speed, - easing: fn && easing || easing && !jQuery.isFunction( easing ) && easing - }; - - // Go to the end state if fx are off - if ( jQuery.fx.off ) { - opt.duration = 0; - - } else { - if ( typeof opt.duration !== "number" ) { - if ( opt.duration in jQuery.fx.speeds ) { - opt.duration = jQuery.fx.speeds[ opt.duration ]; - - } else { - opt.duration = jQuery.fx.speeds._default; - } - } - } - - // Normalize opt.queue - true/undefined/null -> "fx" - if ( opt.queue == null || opt.queue === true ) { - opt.queue = "fx"; - } - - // Queueing - opt.old = opt.complete; - - opt.complete = function() { - if ( jQuery.isFunction( opt.old ) ) { - opt.old.call( this ); - } - - if ( opt.queue ) { - jQuery.dequeue( this, opt.queue ); - } - }; - - return opt; -}; - -jQuery.fn.extend( { - fadeTo: function( speed, to, easing, callback ) { - - // Show any hidden elements after setting opacity to 0 - return this.filter( isHiddenWithinTree ).css( "opacity", 0 ).show() - - // Animate to the value specified - .end().animate( { opacity: to }, speed, easing, callback ); - }, - animate: function( prop, speed, easing, callback ) { - var empty = jQuery.isEmptyObject( prop ), - optall = jQuery.speed( speed, easing, callback ), - doAnimation = function() { - - // Operate on a copy of prop so per-property easing won't be lost - var anim = Animation( this, jQuery.extend( {}, prop ), optall ); - - // Empty animations, or finishing resolves immediately - if ( empty || dataPriv.get( this, "finish" ) ) { - anim.stop( true ); - } - }; - doAnimation.finish = doAnimation; - - return empty || optall.queue === false ? - this.each( doAnimation ) : - this.queue( optall.queue, doAnimation ); - }, - stop: function( type, clearQueue, gotoEnd ) { - var stopQueue = function( hooks ) { - var stop = hooks.stop; - delete hooks.stop; - stop( gotoEnd ); - }; - - if ( typeof type !== "string" ) { - gotoEnd = clearQueue; - clearQueue = type; - type = undefined; - } - if ( clearQueue && type !== false ) { - this.queue( type || "fx", [] ); - } - - return this.each( function() { - var dequeue = true, - index = type != null && type + "queueHooks", - timers = jQuery.timers, - data = dataPriv.get( this ); - - if ( index ) { - if ( data[ index ] && data[ index ].stop ) { - stopQueue( data[ index ] ); - } - } else { - for ( index in data ) { - if ( data[ index ] && data[ index ].stop && rrun.test( index ) ) { - stopQueue( data[ index ] ); - } - } - } - - for ( index = timers.length; index--; ) { - if ( timers[ index ].elem === this && - ( type == null || timers[ index ].queue === type ) ) { - - timers[ index ].anim.stop( gotoEnd ); - dequeue = false; - timers.splice( index, 1 ); - } - } - - // Start the next in the queue if the last step wasn't forced. - // Timers currently will call their complete callbacks, which - // will dequeue but only if they were gotoEnd. - if ( dequeue || !gotoEnd ) { - jQuery.dequeue( this, type ); - } - } ); - }, - finish: function( type ) { - if ( type !== false ) { - type = type || "fx"; - } - return this.each( function() { - var index, - data = dataPriv.get( this ), - queue = data[ type + "queue" ], - hooks = data[ type + "queueHooks" ], - timers = jQuery.timers, - length = queue ? 
queue.length : 0; - - // Enable finishing flag on private data - data.finish = true; - - // Empty the queue first - jQuery.queue( this, type, [] ); - - if ( hooks && hooks.stop ) { - hooks.stop.call( this, true ); - } - - // Look for any active animations, and finish them - for ( index = timers.length; index--; ) { - if ( timers[ index ].elem === this && timers[ index ].queue === type ) { - timers[ index ].anim.stop( true ); - timers.splice( index, 1 ); - } - } - - // Look for any animations in the old queue and finish them - for ( index = 0; index < length; index++ ) { - if ( queue[ index ] && queue[ index ].finish ) { - queue[ index ].finish.call( this ); - } - } - - // Turn off finishing flag - delete data.finish; - } ); - } -} ); - -jQuery.each( [ "toggle", "show", "hide" ], function( i, name ) { - var cssFn = jQuery.fn[ name ]; - jQuery.fn[ name ] = function( speed, easing, callback ) { - return speed == null || typeof speed === "boolean" ? - cssFn.apply( this, arguments ) : - this.animate( genFx( name, true ), speed, easing, callback ); - }; -} ); - -// Generate shortcuts for custom animations -jQuery.each( { - slideDown: genFx( "show" ), - slideUp: genFx( "hide" ), - slideToggle: genFx( "toggle" ), - fadeIn: { opacity: "show" }, - fadeOut: { opacity: "hide" }, - fadeToggle: { opacity: "toggle" } -}, function( name, props ) { - jQuery.fn[ name ] = function( speed, easing, callback ) { - return this.animate( props, speed, easing, callback ); - }; -} ); - -jQuery.timers = []; -jQuery.fx.tick = function() { - var timer, - i = 0, - timers = jQuery.timers; - - fxNow = jQuery.now(); - - for ( ; i < timers.length; i++ ) { - timer = timers[ i ]; - - // Run the timer and safely remove it when done (allowing for external removal) - if ( !timer() && timers[ i ] === timer ) { - timers.splice( i--, 1 ); - } - } - - if ( !timers.length ) { - jQuery.fx.stop(); - } - fxNow = undefined; -}; - -jQuery.fx.timer = function( timer ) { - jQuery.timers.push( timer ); - jQuery.fx.start(); -}; - -jQuery.fx.interval = 13; -jQuery.fx.start = function() { - if ( inProgress ) { - return; - } - - inProgress = true; - schedule(); -}; - -jQuery.fx.stop = function() { - inProgress = null; -}; - -jQuery.fx.speeds = { - slow: 600, - fast: 200, - - // Default speed - _default: 400 -}; - - -// Based off of the plugin by Clint Helfers, with permission. -// https://web.archive.org/web/20100324014747/https://blindsignals.com/index.php/2009/07/jquery-delay/ -jQuery.fn.delay = function( time, type ) { - time = jQuery.fx ? 
jQuery.fx.speeds[ time ] || time : time; - type = type || "fx"; - - return this.queue( type, function( next, hooks ) { - var timeout = window.setTimeout( next, time ); - hooks.stop = function() { - window.clearTimeout( timeout ); - }; - } ); -}; - - -( function() { - var input = document.createElement( "input" ), - select = document.createElement( "select" ), - opt = select.appendChild( document.createElement( "option" ) ); - - input.type = "checkbox"; - - // Support: Android <=4.3 only - // Default value for a checkbox should be "on" - support.checkOn = input.value !== ""; - - // Support: IE <=11 only - // Must access selectedIndex to make default options select - support.optSelected = opt.selected; - - // Support: IE <=11 only - // An input loses its value after becoming a radio - input = document.createElement( "input" ); - input.value = "t"; - input.type = "radio"; - support.radioValue = input.value === "t"; -} )(); - - -var boolHook, - attrHandle = jQuery.expr.attrHandle; - -jQuery.fn.extend( { - attr: function( name, value ) { - return access( this, jQuery.attr, name, value, arguments.length > 1 ); - }, - - removeAttr: function( name ) { - return this.each( function() { - jQuery.removeAttr( this, name ); - } ); - } -} ); - -jQuery.extend( { - attr: function( elem, name, value ) { - var ret, hooks, - nType = elem.nodeType; - - // Don't get/set attributes on text, comment and attribute nodes - if ( nType === 3 || nType === 8 || nType === 2 ) { - return; - } - - // Fallback to prop when attributes are not supported - if ( typeof elem.getAttribute === "undefined" ) { - return jQuery.prop( elem, name, value ); - } - - // Attribute hooks are determined by the lowercase version - // Grab necessary hook if one is defined - if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) { - hooks = jQuery.attrHooks[ name.toLowerCase() ] || - ( jQuery.expr.match.bool.test( name ) ? boolHook : undefined ); - } - - if ( value !== undefined ) { - if ( value === null ) { - jQuery.removeAttr( elem, name ); - return; - } - - if ( hooks && "set" in hooks && - ( ret = hooks.set( elem, value, name ) ) !== undefined ) { - return ret; - } - - elem.setAttribute( name, value + "" ); - return value; - } - - if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) { - return ret; - } - - ret = jQuery.find.attr( elem, name ); - - // Non-existent attributes return null, we normalize to undefined - return ret == null ? 
undefined : ret; - }, - - attrHooks: { - type: { - set: function( elem, value ) { - if ( !support.radioValue && value === "radio" && - nodeName( elem, "input" ) ) { - var val = elem.value; - elem.setAttribute( "type", value ); - if ( val ) { - elem.value = val; - } - return value; - } - } - } - }, - - removeAttr: function( elem, value ) { - var name, - i = 0, - - // Attribute names can contain non-HTML whitespace characters - // https://html.spec.whatwg.org/multipage/syntax.html#attributes-2 - attrNames = value && value.match( rnothtmlwhite ); - - if ( attrNames && elem.nodeType === 1 ) { - while ( ( name = attrNames[ i++ ] ) ) { - elem.removeAttribute( name ); - } - } - } -} ); - -// Hooks for boolean attributes -boolHook = { - set: function( elem, value, name ) { - if ( value === false ) { - - // Remove boolean attributes when set to false - jQuery.removeAttr( elem, name ); - } else { - elem.setAttribute( name, name ); - } - return name; - } -}; - -jQuery.each( jQuery.expr.match.bool.source.match( /\w+/g ), function( i, name ) { - var getter = attrHandle[ name ] || jQuery.find.attr; - - attrHandle[ name ] = function( elem, name, isXML ) { - var ret, handle, - lowercaseName = name.toLowerCase(); - - if ( !isXML ) { - - // Avoid an infinite loop by temporarily removing this function from the getter - handle = attrHandle[ lowercaseName ]; - attrHandle[ lowercaseName ] = ret; - ret = getter( elem, name, isXML ) != null ? - lowercaseName : - null; - attrHandle[ lowercaseName ] = handle; - } - return ret; - }; -} ); - - - - -var rfocusable = /^(?:input|select|textarea|button)$/i, - rclickable = /^(?:a|area)$/i; - -jQuery.fn.extend( { - prop: function( name, value ) { - return access( this, jQuery.prop, name, value, arguments.length > 1 ); - }, - - removeProp: function( name ) { - return this.each( function() { - delete this[ jQuery.propFix[ name ] || name ]; - } ); - } -} ); - -jQuery.extend( { - prop: function( elem, name, value ) { - var ret, hooks, - nType = elem.nodeType; - - // Don't get/set properties on text, comment and attribute nodes - if ( nType === 3 || nType === 8 || nType === 2 ) { - return; - } - - if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) { - - // Fix name and attach hooks - name = jQuery.propFix[ name ] || name; - hooks = jQuery.propHooks[ name ]; - } - - if ( value !== undefined ) { - if ( hooks && "set" in hooks && - ( ret = hooks.set( elem, value, name ) ) !== undefined ) { - return ret; - } - - return ( elem[ name ] = value ); - } - - if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) { - return ret; - } - - return elem[ name ]; - }, - - propHooks: { - tabIndex: { - get: function( elem ) { - - // Support: IE <=9 - 11 only - // elem.tabIndex doesn't always return the - // correct value when it hasn't been explicitly set - // https://web.archive.org/web/20141116233347/https://fluidproject.org/blog/2008/01/09/getting-setting-and-removing-tabindex-values-with-javascript/ - // Use proper attribute retrieval(#12072) - var tabindex = jQuery.find.attr( elem, "tabindex" ); - - if ( tabindex ) { - return parseInt( tabindex, 10 ); - } - - if ( - rfocusable.test( elem.nodeName ) || - rclickable.test( elem.nodeName ) && - elem.href - ) { - return 0; - } - - return -1; - } - } - }, - - propFix: { - "for": "htmlFor", - "class": "className" - } -} ); - -// Support: IE <=11 only -// Accessing the selectedIndex property -// forces the browser to respect setting selected -// on the option -// The getter ensures a default option is selected -// when in an 
optgroup -// eslint rule "no-unused-expressions" is disabled for this code -// since it considers such accessions noop -if ( !support.optSelected ) { - jQuery.propHooks.selected = { - get: function( elem ) { - - /* eslint no-unused-expressions: "off" */ - - var parent = elem.parentNode; - if ( parent && parent.parentNode ) { - parent.parentNode.selectedIndex; - } - return null; - }, - set: function( elem ) { - - /* eslint no-unused-expressions: "off" */ - - var parent = elem.parentNode; - if ( parent ) { - parent.selectedIndex; - - if ( parent.parentNode ) { - parent.parentNode.selectedIndex; - } - } - } - }; -} - -jQuery.each( [ - "tabIndex", - "readOnly", - "maxLength", - "cellSpacing", - "cellPadding", - "rowSpan", - "colSpan", - "useMap", - "frameBorder", - "contentEditable" -], function() { - jQuery.propFix[ this.toLowerCase() ] = this; -} ); - - - - - // Strip and collapse whitespace according to HTML spec - // https://html.spec.whatwg.org/multipage/infrastructure.html#strip-and-collapse-whitespace - function stripAndCollapse( value ) { - var tokens = value.match( rnothtmlwhite ) || []; - return tokens.join( " " ); - } - - -function getClass( elem ) { - return elem.getAttribute && elem.getAttribute( "class" ) || ""; -} - -jQuery.fn.extend( { - addClass: function( value ) { - var classes, elem, cur, curValue, clazz, j, finalValue, - i = 0; - - if ( jQuery.isFunction( value ) ) { - return this.each( function( j ) { - jQuery( this ).addClass( value.call( this, j, getClass( this ) ) ); - } ); - } - - if ( typeof value === "string" && value ) { - classes = value.match( rnothtmlwhite ) || []; - - while ( ( elem = this[ i++ ] ) ) { - curValue = getClass( elem ); - cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " ); - - if ( cur ) { - j = 0; - while ( ( clazz = classes[ j++ ] ) ) { - if ( cur.indexOf( " " + clazz + " " ) < 0 ) { - cur += clazz + " "; - } - } - - // Only assign if different to avoid unneeded rendering. - finalValue = stripAndCollapse( cur ); - if ( curValue !== finalValue ) { - elem.setAttribute( "class", finalValue ); - } - } - } - } - - return this; - }, - - removeClass: function( value ) { - var classes, elem, cur, curValue, clazz, j, finalValue, - i = 0; - - if ( jQuery.isFunction( value ) ) { - return this.each( function( j ) { - jQuery( this ).removeClass( value.call( this, j, getClass( this ) ) ); - } ); - } - - if ( !arguments.length ) { - return this.attr( "class", "" ); - } - - if ( typeof value === "string" && value ) { - classes = value.match( rnothtmlwhite ) || []; - - while ( ( elem = this[ i++ ] ) ) { - curValue = getClass( elem ); - - // This expression is here for better compressibility (see addClass) - cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " ); - - if ( cur ) { - j = 0; - while ( ( clazz = classes[ j++ ] ) ) { - - // Remove *all* instances - while ( cur.indexOf( " " + clazz + " " ) > -1 ) { - cur = cur.replace( " " + clazz + " ", " " ); - } - } - - // Only assign if different to avoid unneeded rendering. - finalValue = stripAndCollapse( cur ); - if ( curValue !== finalValue ) { - elem.setAttribute( "class", finalValue ); - } - } - } - } - - return this; - }, - - toggleClass: function( value, stateVal ) { - var type = typeof value; - - if ( typeof stateVal === "boolean" && type === "string" ) { - return stateVal ? 
this.addClass( value ) : this.removeClass( value ); - } - - if ( jQuery.isFunction( value ) ) { - return this.each( function( i ) { - jQuery( this ).toggleClass( - value.call( this, i, getClass( this ), stateVal ), - stateVal - ); - } ); - } - - return this.each( function() { - var className, i, self, classNames; - - if ( type === "string" ) { - - // Toggle individual class names - i = 0; - self = jQuery( this ); - classNames = value.match( rnothtmlwhite ) || []; - - while ( ( className = classNames[ i++ ] ) ) { - - // Check each className given, space separated list - if ( self.hasClass( className ) ) { - self.removeClass( className ); - } else { - self.addClass( className ); - } - } - - // Toggle whole class name - } else if ( value === undefined || type === "boolean" ) { - className = getClass( this ); - if ( className ) { - - // Store className if set - dataPriv.set( this, "__className__", className ); - } - - // If the element has a class name or if we're passed `false`, - // then remove the whole classname (if there was one, the above saved it). - // Otherwise bring back whatever was previously saved (if anything), - // falling back to the empty string if nothing was stored. - if ( this.setAttribute ) { - this.setAttribute( "class", - className || value === false ? - "" : - dataPriv.get( this, "__className__" ) || "" - ); - } - } - } ); - }, - - hasClass: function( selector ) { - var className, elem, - i = 0; - - className = " " + selector + " "; - while ( ( elem = this[ i++ ] ) ) { - if ( elem.nodeType === 1 && - ( " " + stripAndCollapse( getClass( elem ) ) + " " ).indexOf( className ) > -1 ) { - return true; - } - } - - return false; - } -} ); - - - - -var rreturn = /\r/g; - -jQuery.fn.extend( { - val: function( value ) { - var hooks, ret, isFunction, - elem = this[ 0 ]; - - if ( !arguments.length ) { - if ( elem ) { - hooks = jQuery.valHooks[ elem.type ] || - jQuery.valHooks[ elem.nodeName.toLowerCase() ]; - - if ( hooks && - "get" in hooks && - ( ret = hooks.get( elem, "value" ) ) !== undefined - ) { - return ret; - } - - ret = elem.value; - - // Handle most common string cases - if ( typeof ret === "string" ) { - return ret.replace( rreturn, "" ); - } - - // Handle cases where value is null/undef or number - return ret == null ? "" : ret; - } - - return; - } - - isFunction = jQuery.isFunction( value ); - - return this.each( function( i ) { - var val; - - if ( this.nodeType !== 1 ) { - return; - } - - if ( isFunction ) { - val = value.call( this, i, jQuery( this ).val() ); - } else { - val = value; - } - - // Treat null/undefined as ""; convert numbers to string - if ( val == null ) { - val = ""; - - } else if ( typeof val === "number" ) { - val += ""; - - } else if ( Array.isArray( val ) ) { - val = jQuery.map( val, function( value ) { - return value == null ? "" : value + ""; - } ); - } - - hooks = jQuery.valHooks[ this.type ] || jQuery.valHooks[ this.nodeName.toLowerCase() ]; - - // If set returns undefined, fall back to normal setting - if ( !hooks || !( "set" in hooks ) || hooks.set( this, val, "value" ) === undefined ) { - this.value = val; - } - } ); - } -} ); - -jQuery.extend( { - valHooks: { - option: { - get: function( elem ) { - - var val = jQuery.find.attr( elem, "value" ); - return val != null ? 
- val : - - // Support: IE <=10 - 11 only - // option.text throws exceptions (#14686, #14858) - // Strip and collapse whitespace - // https://html.spec.whatwg.org/#strip-and-collapse-whitespace - stripAndCollapse( jQuery.text( elem ) ); - } - }, - select: { - get: function( elem ) { - var value, option, i, - options = elem.options, - index = elem.selectedIndex, - one = elem.type === "select-one", - values = one ? null : [], - max = one ? index + 1 : options.length; - - if ( index < 0 ) { - i = max; - - } else { - i = one ? index : 0; - } - - // Loop through all the selected options - for ( ; i < max; i++ ) { - option = options[ i ]; - - // Support: IE <=9 only - // IE8-9 doesn't update selected after form reset (#2551) - if ( ( option.selected || i === index ) && - - // Don't return options that are disabled or in a disabled optgroup - !option.disabled && - ( !option.parentNode.disabled || - !nodeName( option.parentNode, "optgroup" ) ) ) { - - // Get the specific value for the option - value = jQuery( option ).val(); - - // We don't need an array for one selects - if ( one ) { - return value; - } - - // Multi-Selects return an array - values.push( value ); - } - } - - return values; - }, - - set: function( elem, value ) { - var optionSet, option, - options = elem.options, - values = jQuery.makeArray( value ), - i = options.length; - - while ( i-- ) { - option = options[ i ]; - - /* eslint-disable no-cond-assign */ - - if ( option.selected = - jQuery.inArray( jQuery.valHooks.option.get( option ), values ) > -1 - ) { - optionSet = true; - } - - /* eslint-enable no-cond-assign */ - } - - // Force browsers to behave consistently when non-matching value is set - if ( !optionSet ) { - elem.selectedIndex = -1; - } - return values; - } - } - } -} ); - -// Radios and checkboxes getter/setter -jQuery.each( [ "radio", "checkbox" ], function() { - jQuery.valHooks[ this ] = { - set: function( elem, value ) { - if ( Array.isArray( value ) ) { - return ( elem.checked = jQuery.inArray( jQuery( elem ).val(), value ) > -1 ); - } - } - }; - if ( !support.checkOn ) { - jQuery.valHooks[ this ].get = function( elem ) { - return elem.getAttribute( "value" ) === null ? "on" : elem.value; - }; - } -} ); - - - - -// Return jQuery for attributes-only inclusion - - -var rfocusMorph = /^(?:focusinfocus|focusoutblur)$/; - -jQuery.extend( jQuery.event, { - - trigger: function( event, data, elem, onlyHandlers ) { - - var i, cur, tmp, bubbleType, ontype, handle, special, - eventPath = [ elem || document ], - type = hasOwn.call( event, "type" ) ? event.type : event, - namespaces = hasOwn.call( event, "namespace" ) ? event.namespace.split( "." ) : []; - - cur = tmp = elem = elem || document; - - // Don't do events on text and comment nodes - if ( elem.nodeType === 3 || elem.nodeType === 8 ) { - return; - } - - // focus/blur morphs to focusin/out; ensure we're not firing them right now - if ( rfocusMorph.test( type + jQuery.event.triggered ) ) { - return; - } - - if ( type.indexOf( "." ) > -1 ) { - - // Namespaced trigger; create a regexp to match event type in handle() - namespaces = type.split( "." ); - type = namespaces.shift(); - namespaces.sort(); - } - ontype = type.indexOf( ":" ) < 0 && "on" + type; - - // Caller can pass in a jQuery.Event object, Object, or just an event type string - event = event[ jQuery.expando ] ? - event : - new jQuery.Event( type, typeof event === "object" && event ); - - // Trigger bitmask: & 1 for native handlers; & 2 for jQuery (always true) - event.isTrigger = onlyHandlers ? 
2 : 3; - event.namespace = namespaces.join( "." ); - event.rnamespace = event.namespace ? - new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" ) : - null; - - // Clean up the event in case it is being reused - event.result = undefined; - if ( !event.target ) { - event.target = elem; - } - - // Clone any incoming data and prepend the event, creating the handler arg list - data = data == null ? - [ event ] : - jQuery.makeArray( data, [ event ] ); - - // Allow special events to draw outside the lines - special = jQuery.event.special[ type ] || {}; - if ( !onlyHandlers && special.trigger && special.trigger.apply( elem, data ) === false ) { - return; - } - - // Determine event propagation path in advance, per W3C events spec (#9951) - // Bubble up to document, then to window; watch for a global ownerDocument var (#9724) - if ( !onlyHandlers && !special.noBubble && !jQuery.isWindow( elem ) ) { - - bubbleType = special.delegateType || type; - if ( !rfocusMorph.test( bubbleType + type ) ) { - cur = cur.parentNode; - } - for ( ; cur; cur = cur.parentNode ) { - eventPath.push( cur ); - tmp = cur; - } - - // Only add window if we got to document (e.g., not plain obj or detached DOM) - if ( tmp === ( elem.ownerDocument || document ) ) { - eventPath.push( tmp.defaultView || tmp.parentWindow || window ); - } - } - - // Fire handlers on the event path - i = 0; - while ( ( cur = eventPath[ i++ ] ) && !event.isPropagationStopped() ) { - - event.type = i > 1 ? - bubbleType : - special.bindType || type; - - // jQuery handler - handle = ( dataPriv.get( cur, "events" ) || {} )[ event.type ] && - dataPriv.get( cur, "handle" ); - if ( handle ) { - handle.apply( cur, data ); - } - - // Native handler - handle = ontype && cur[ ontype ]; - if ( handle && handle.apply && acceptData( cur ) ) { - event.result = handle.apply( cur, data ); - if ( event.result === false ) { - event.preventDefault(); - } - } - } - event.type = type; - - // If nobody prevented the default action, do it now - if ( !onlyHandlers && !event.isDefaultPrevented() ) { - - if ( ( !special._default || - special._default.apply( eventPath.pop(), data ) === false ) && - acceptData( elem ) ) { - - // Call a native DOM method on the target with the same name as the event. 
- // Don't do default actions on window, that's where global variables be (#6170) - if ( ontype && jQuery.isFunction( elem[ type ] ) && !jQuery.isWindow( elem ) ) { - - // Don't re-trigger an onFOO event when we call its FOO() method - tmp = elem[ ontype ]; - - if ( tmp ) { - elem[ ontype ] = null; - } - - // Prevent re-triggering of the same event, since we already bubbled it above - jQuery.event.triggered = type; - elem[ type ](); - jQuery.event.triggered = undefined; - - if ( tmp ) { - elem[ ontype ] = tmp; - } - } - } - } - - return event.result; - }, - - // Piggyback on a donor event to simulate a different one - // Used only for `focus(in | out)` events - simulate: function( type, elem, event ) { - var e = jQuery.extend( - new jQuery.Event(), - event, - { - type: type, - isSimulated: true - } - ); - - jQuery.event.trigger( e, null, elem ); - } - -} ); - -jQuery.fn.extend( { - - trigger: function( type, data ) { - return this.each( function() { - jQuery.event.trigger( type, data, this ); - } ); - }, - triggerHandler: function( type, data ) { - var elem = this[ 0 ]; - if ( elem ) { - return jQuery.event.trigger( type, data, elem, true ); - } - } -} ); - - -jQuery.each( ( "blur focus focusin focusout resize scroll click dblclick " + - "mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave " + - "change select submit keydown keypress keyup contextmenu" ).split( " " ), - function( i, name ) { - - // Handle event binding - jQuery.fn[ name ] = function( data, fn ) { - return arguments.length > 0 ? - this.on( name, null, data, fn ) : - this.trigger( name ); - }; -} ); - -jQuery.fn.extend( { - hover: function( fnOver, fnOut ) { - return this.mouseenter( fnOver ).mouseleave( fnOut || fnOver ); - } -} ); - - - - -support.focusin = "onfocusin" in window; - - -// Support: Firefox <=44 -// Firefox doesn't have focus(in | out) events -// Related ticket - https://bugzilla.mozilla.org/show_bug.cgi?id=687787 -// -// Support: Chrome <=48 - 49, Safari <=9.0 - 9.1 -// focus(in | out) events fire after focus & blur events, -// which is spec violation - https://www.w3.org/TR/DOM-Level-3-Events/#events-focusevent-event-order -// Related ticket - https://bugs.chromium.org/p/chromium/issues/detail?id=449857 -if ( !support.focusin ) { - jQuery.each( { focus: "focusin", blur: "focusout" }, function( orig, fix ) { - - // Attach a single capturing handler on the document while someone wants focusin/focusout - var handler = function( event ) { - jQuery.event.simulate( fix, event.target, jQuery.event.fix( event ) ); - }; - - jQuery.event.special[ fix ] = { - setup: function() { - var doc = this.ownerDocument || this, - attaches = dataPriv.access( doc, fix ); - - if ( !attaches ) { - doc.addEventListener( orig, handler, true ); - } - dataPriv.access( doc, fix, ( attaches || 0 ) + 1 ); - }, - teardown: function() { - var doc = this.ownerDocument || this, - attaches = dataPriv.access( doc, fix ) - 1; - - if ( !attaches ) { - doc.removeEventListener( orig, handler, true ); - dataPriv.remove( doc, fix ); - - } else { - dataPriv.access( doc, fix, attaches ); - } - } - }; - } ); -} -var location = window.location; - -var nonce = jQuery.now(); - -var rquery = ( /\?/ ); - - - -// Cross-browser xml parsing -jQuery.parseXML = function( data ) { - var xml; - if ( !data || typeof data !== "string" ) { - return null; - } - - // Support: IE 9 - 11 only - // IE throws on parseFromString with invalid input. 
- try { - xml = ( new window.DOMParser() ).parseFromString( data, "text/xml" ); - } catch ( e ) { - xml = undefined; - } - - if ( !xml || xml.getElementsByTagName( "parsererror" ).length ) { - jQuery.error( "Invalid XML: " + data ); - } - return xml; -}; - - -var - rbracket = /\[\]$/, - rCRLF = /\r?\n/g, - rsubmitterTypes = /^(?:submit|button|image|reset|file)$/i, - rsubmittable = /^(?:input|select|textarea|keygen)/i; - -function buildParams( prefix, obj, traditional, add ) { - var name; - - if ( Array.isArray( obj ) ) { - - // Serialize array item. - jQuery.each( obj, function( i, v ) { - if ( traditional || rbracket.test( prefix ) ) { - - // Treat each array item as a scalar. - add( prefix, v ); - - } else { - - // Item is non-scalar (array or object), encode its numeric index. - buildParams( - prefix + "[" + ( typeof v === "object" && v != null ? i : "" ) + "]", - v, - traditional, - add - ); - } - } ); - - } else if ( !traditional && jQuery.type( obj ) === "object" ) { - - // Serialize object item. - for ( name in obj ) { - buildParams( prefix + "[" + name + "]", obj[ name ], traditional, add ); - } - - } else { - - // Serialize scalar item. - add( prefix, obj ); - } -} - -// Serialize an array of form elements or a set of -// key/values into a query string -jQuery.param = function( a, traditional ) { - var prefix, - s = [], - add = function( key, valueOrFunction ) { - - // If value is a function, invoke it and use its return value - var value = jQuery.isFunction( valueOrFunction ) ? - valueOrFunction() : - valueOrFunction; - - s[ s.length ] = encodeURIComponent( key ) + "=" + - encodeURIComponent( value == null ? "" : value ); - }; - - // If an array was passed in, assume that it is an array of form elements. - if ( Array.isArray( a ) || ( a.jquery && !jQuery.isPlainObject( a ) ) ) { - - // Serialize the form elements - jQuery.each( a, function() { - add( this.name, this.value ); - } ); - - } else { - - // If traditional, encode the "old" way (the way 1.3.2 or older - // did it), otherwise encode params recursively. - for ( prefix in a ) { - buildParams( prefix, a[ prefix ], traditional, add ); - } - } - - // Return the resulting serialization - return s.join( "&" ); -}; - -jQuery.fn.extend( { - serialize: function() { - return jQuery.param( this.serializeArray() ); - }, - serializeArray: function() { - return this.map( function() { - - // Can add propHook for "elements" to filter or add form elements - var elements = jQuery.prop( this, "elements" ); - return elements ? 
jQuery.makeArray( elements ) : this; - } ) - .filter( function() { - var type = this.type; - - // Use .is( ":disabled" ) so that fieldset[disabled] works - return this.name && !jQuery( this ).is( ":disabled" ) && - rsubmittable.test( this.nodeName ) && !rsubmitterTypes.test( type ) && - ( this.checked || !rcheckableType.test( type ) ); - } ) - .map( function( i, elem ) { - var val = jQuery( this ).val(); - - if ( val == null ) { - return null; - } - - if ( Array.isArray( val ) ) { - return jQuery.map( val, function( val ) { - return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; - } ); - } - - return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; - } ).get(); - } -} ); - - -var - r20 = /%20/g, - rhash = /#.*$/, - rantiCache = /([?&])_=[^&]*/, - rheaders = /^(.*?):[ \t]*([^\r\n]*)$/mg, - - // #7653, #8125, #8152: local protocol detection - rlocalProtocol = /^(?:about|app|app-storage|.+-extension|file|res|widget):$/, - rnoContent = /^(?:GET|HEAD)$/, - rprotocol = /^\/\//, - - /* Prefilters - * 1) They are useful to introduce custom dataTypes (see ajax/jsonp.js for an example) - * 2) These are called: - * - BEFORE asking for a transport - * - AFTER param serialization (s.data is a string if s.processData is true) - * 3) key is the dataType - * 4) the catchall symbol "*" can be used - * 5) execution will start with transport dataType and THEN continue down to "*" if needed - */ - prefilters = {}, - - /* Transports bindings - * 1) key is the dataType - * 2) the catchall symbol "*" can be used - * 3) selection will start with transport dataType and THEN go to "*" if needed - */ - transports = {}, - - // Avoid comment-prolog char sequence (#10098); must appease lint and evade compression - allTypes = "*/".concat( "*" ), - - // Anchor tag for parsing the document origin - originAnchor = document.createElement( "a" ); - originAnchor.href = location.href; - -// Base "constructor" for jQuery.ajaxPrefilter and jQuery.ajaxTransport -function addToPrefiltersOrTransports( structure ) { - - // dataTypeExpression is optional and defaults to "*" - return function( dataTypeExpression, func ) { - - if ( typeof dataTypeExpression !== "string" ) { - func = dataTypeExpression; - dataTypeExpression = "*"; - } - - var dataType, - i = 0, - dataTypes = dataTypeExpression.toLowerCase().match( rnothtmlwhite ) || []; - - if ( jQuery.isFunction( func ) ) { - - // For each dataType in the dataTypeExpression - while ( ( dataType = dataTypes[ i++ ] ) ) { - - // Prepend if requested - if ( dataType[ 0 ] === "+" ) { - dataType = dataType.slice( 1 ) || "*"; - ( structure[ dataType ] = structure[ dataType ] || [] ).unshift( func ); - - // Otherwise append - } else { - ( structure[ dataType ] = structure[ dataType ] || [] ).push( func ); - } - } - } - }; -} - -// Base inspection function for prefilters and transports -function inspectPrefiltersOrTransports( structure, options, originalOptions, jqXHR ) { - - var inspected = {}, - seekingTransport = ( structure === transports ); - - function inspect( dataType ) { - var selected; - inspected[ dataType ] = true; - jQuery.each( structure[ dataType ] || [], function( _, prefilterOrFactory ) { - var dataTypeOrTransport = prefilterOrFactory( options, originalOptions, jqXHR ); - if ( typeof dataTypeOrTransport === "string" && - !seekingTransport && !inspected[ dataTypeOrTransport ] ) { - - options.dataTypes.unshift( dataTypeOrTransport ); - inspect( dataTypeOrTransport ); - return false; - } else if ( seekingTransport ) { - return !( selected = dataTypeOrTransport 
); - } - } ); - return selected; - } - - return inspect( options.dataTypes[ 0 ] ) || !inspected[ "*" ] && inspect( "*" ); -} - -// A special extend for ajax options -// that takes "flat" options (not to be deep extended) -// Fixes #9887 -function ajaxExtend( target, src ) { - var key, deep, - flatOptions = jQuery.ajaxSettings.flatOptions || {}; - - for ( key in src ) { - if ( src[ key ] !== undefined ) { - ( flatOptions[ key ] ? target : ( deep || ( deep = {} ) ) )[ key ] = src[ key ]; - } - } - if ( deep ) { - jQuery.extend( true, target, deep ); - } - - return target; -} - -/* Handles responses to an ajax request: - * - finds the right dataType (mediates between content-type and expected dataType) - * - returns the corresponding response - */ -function ajaxHandleResponses( s, jqXHR, responses ) { - - var ct, type, finalDataType, firstDataType, - contents = s.contents, - dataTypes = s.dataTypes; - - // Remove auto dataType and get content-type in the process - while ( dataTypes[ 0 ] === "*" ) { - dataTypes.shift(); - if ( ct === undefined ) { - ct = s.mimeType || jqXHR.getResponseHeader( "Content-Type" ); - } - } - - // Check if we're dealing with a known content-type - if ( ct ) { - for ( type in contents ) { - if ( contents[ type ] && contents[ type ].test( ct ) ) { - dataTypes.unshift( type ); - break; - } - } - } - - // Check to see if we have a response for the expected dataType - if ( dataTypes[ 0 ] in responses ) { - finalDataType = dataTypes[ 0 ]; - } else { - - // Try convertible dataTypes - for ( type in responses ) { - if ( !dataTypes[ 0 ] || s.converters[ type + " " + dataTypes[ 0 ] ] ) { - finalDataType = type; - break; - } - if ( !firstDataType ) { - firstDataType = type; - } - } - - // Or just use first one - finalDataType = finalDataType || firstDataType; - } - - // If we found a dataType - // We add the dataType to the list if needed - // and return the corresponding response - if ( finalDataType ) { - if ( finalDataType !== dataTypes[ 0 ] ) { - dataTypes.unshift( finalDataType ); - } - return responses[ finalDataType ]; - } -} - -/* Chain conversions given the request and the original response - * Also sets the responseXXX fields on the jqXHR instance - */ -function ajaxConvert( s, response, jqXHR, isSuccess ) { - var conv2, current, conv, tmp, prev, - converters = {}, - - // Work with a copy of dataTypes in case we need to modify it for conversion - dataTypes = s.dataTypes.slice(); - - // Create converters map with lowercased keys - if ( dataTypes[ 1 ] ) { - for ( conv in s.converters ) { - converters[ conv.toLowerCase() ] = s.converters[ conv ]; - } - } - - current = dataTypes.shift(); - - // Convert to each sequential dataType - while ( current ) { - - if ( s.responseFields[ current ] ) { - jqXHR[ s.responseFields[ current ] ] = response; - } - - // Apply the dataFilter if provided - if ( !prev && isSuccess && s.dataFilter ) { - response = s.dataFilter( response, s.dataType ); - } - - prev = current; - current = dataTypes.shift(); - - if ( current ) { - - // There's only work to do if current dataType is non-auto - if ( current === "*" ) { - - current = prev; - - // Convert response if prev dataType is non-auto and differs from current - } else if ( prev !== "*" && prev !== current ) { - - // Seek a direct converter - conv = converters[ prev + " " + current ] || converters[ "* " + current ]; - - // If none found, seek a pair - if ( !conv ) { - for ( conv2 in converters ) { - - // If conv2 outputs current - tmp = conv2.split( " " ); - if ( tmp[ 1 ] === current ) { - - 
// If prev can be converted to accepted input - conv = converters[ prev + " " + tmp[ 0 ] ] || - converters[ "* " + tmp[ 0 ] ]; - if ( conv ) { - - // Condense equivalence converters - if ( conv === true ) { - conv = converters[ conv2 ]; - - // Otherwise, insert the intermediate dataType - } else if ( converters[ conv2 ] !== true ) { - current = tmp[ 0 ]; - dataTypes.unshift( tmp[ 1 ] ); - } - break; - } - } - } - } - - // Apply converter (if not an equivalence) - if ( conv !== true ) { - - // Unless errors are allowed to bubble, catch and return them - if ( conv && s.throws ) { - response = conv( response ); - } else { - try { - response = conv( response ); - } catch ( e ) { - return { - state: "parsererror", - error: conv ? e : "No conversion from " + prev + " to " + current - }; - } - } - } - } - } - } - - return { state: "success", data: response }; -} - -jQuery.extend( { - - // Counter for holding the number of active queries - active: 0, - - // Last-Modified header cache for next request - lastModified: {}, - etag: {}, - - ajaxSettings: { - url: location.href, - type: "GET", - isLocal: rlocalProtocol.test( location.protocol ), - global: true, - processData: true, - async: true, - contentType: "application/x-www-form-urlencoded; charset=UTF-8", - - /* - timeout: 0, - data: null, - dataType: null, - username: null, - password: null, - cache: null, - throws: false, - traditional: false, - headers: {}, - */ - - accepts: { - "*": allTypes, - text: "text/plain", - html: "text/html", - xml: "application/xml, text/xml", - json: "application/json, text/javascript" - }, - - contents: { - xml: /\bxml\b/, - html: /\bhtml/, - json: /\bjson\b/ - }, - - responseFields: { - xml: "responseXML", - text: "responseText", - json: "responseJSON" - }, - - // Data converters - // Keys separate source (or catchall "*") and destination types with a single space - converters: { - - // Convert anything to text - "* text": String, - - // Text to html (true = no transformation) - "text html": true, - - // Evaluate text as a json expression - "text json": JSON.parse, - - // Parse text as xml - "text xml": jQuery.parseXML - }, - - // For options that shouldn't be deep extended: - // you can add your own custom options here if - // and when you create one that shouldn't be - // deep extended (see ajaxExtend) - flatOptions: { - url: true, - context: true - } - }, - - // Creates a full fledged settings object into target - // with both ajaxSettings and settings fields. - // If target is omitted, writes into ajaxSettings. - ajaxSetup: function( target, settings ) { - return settings ? 
- - // Building a settings object - ajaxExtend( ajaxExtend( target, jQuery.ajaxSettings ), settings ) : - - // Extending ajaxSettings - ajaxExtend( jQuery.ajaxSettings, target ); - }, - - ajaxPrefilter: addToPrefiltersOrTransports( prefilters ), - ajaxTransport: addToPrefiltersOrTransports( transports ), - - // Main method - ajax: function( url, options ) { - - // If url is an object, simulate pre-1.5 signature - if ( typeof url === "object" ) { - options = url; - url = undefined; - } - - // Force options to be an object - options = options || {}; - - var transport, - - // URL without anti-cache param - cacheURL, - - // Response headers - responseHeadersString, - responseHeaders, - - // timeout handle - timeoutTimer, - - // Url cleanup var - urlAnchor, - - // Request state (becomes false upon send and true upon completion) - completed, - - // To know if global events are to be dispatched - fireGlobals, - - // Loop variable - i, - - // uncached part of the url - uncached, - - // Create the final options object - s = jQuery.ajaxSetup( {}, options ), - - // Callbacks context - callbackContext = s.context || s, - - // Context for global events is callbackContext if it is a DOM node or jQuery collection - globalEventContext = s.context && - ( callbackContext.nodeType || callbackContext.jquery ) ? - jQuery( callbackContext ) : - jQuery.event, - - // Deferreds - deferred = jQuery.Deferred(), - completeDeferred = jQuery.Callbacks( "once memory" ), - - // Status-dependent callbacks - statusCode = s.statusCode || {}, - - // Headers (they are sent all at once) - requestHeaders = {}, - requestHeadersNames = {}, - - // Default abort message - strAbort = "canceled", - - // Fake xhr - jqXHR = { - readyState: 0, - - // Builds headers hashtable if needed - getResponseHeader: function( key ) { - var match; - if ( completed ) { - if ( !responseHeaders ) { - responseHeaders = {}; - while ( ( match = rheaders.exec( responseHeadersString ) ) ) { - responseHeaders[ match[ 1 ].toLowerCase() ] = match[ 2 ]; - } - } - match = responseHeaders[ key.toLowerCase() ]; - } - return match == null ? null : match; - }, - - // Raw string - getAllResponseHeaders: function() { - return completed ? 
responseHeadersString : null; - }, - - // Caches the header - setRequestHeader: function( name, value ) { - if ( completed == null ) { - name = requestHeadersNames[ name.toLowerCase() ] = - requestHeadersNames[ name.toLowerCase() ] || name; - requestHeaders[ name ] = value; - } - return this; - }, - - // Overrides response content-type header - overrideMimeType: function( type ) { - if ( completed == null ) { - s.mimeType = type; - } - return this; - }, - - // Status-dependent callbacks - statusCode: function( map ) { - var code; - if ( map ) { - if ( completed ) { - - // Execute the appropriate callbacks - jqXHR.always( map[ jqXHR.status ] ); - } else { - - // Lazy-add the new callbacks in a way that preserves old ones - for ( code in map ) { - statusCode[ code ] = [ statusCode[ code ], map[ code ] ]; - } - } - } - return this; - }, - - // Cancel the request - abort: function( statusText ) { - var finalText = statusText || strAbort; - if ( transport ) { - transport.abort( finalText ); - } - done( 0, finalText ); - return this; - } - }; - - // Attach deferreds - deferred.promise( jqXHR ); - - // Add protocol if not provided (prefilters might expect it) - // Handle falsy url in the settings object (#10093: consistency with old signature) - // We also use the url parameter if available - s.url = ( ( url || s.url || location.href ) + "" ) - .replace( rprotocol, location.protocol + "//" ); - - // Alias method option to type as per ticket #12004 - s.type = options.method || options.type || s.method || s.type; - - // Extract dataTypes list - s.dataTypes = ( s.dataType || "*" ).toLowerCase().match( rnothtmlwhite ) || [ "" ]; - - // A cross-domain request is in order when the origin doesn't match the current origin. - if ( s.crossDomain == null ) { - urlAnchor = document.createElement( "a" ); - - // Support: IE <=8 - 11, Edge 12 - 13 - // IE throws exception on accessing the href property if url is malformed, - // e.g. 
https://example.com:80x/ - try { - urlAnchor.href = s.url; - - // Support: IE <=8 - 11 only - // Anchor's host property isn't correctly set when s.url is relative - urlAnchor.href = urlAnchor.href; - s.crossDomain = originAnchor.protocol + "//" + originAnchor.host !== - urlAnchor.protocol + "//" + urlAnchor.host; - } catch ( e ) { - - // If there is an error parsing the URL, assume it is crossDomain, - // it can be rejected by the transport if it is invalid - s.crossDomain = true; - } - } - - // Convert data if not already a string - if ( s.data && s.processData && typeof s.data !== "string" ) { - s.data = jQuery.param( s.data, s.traditional ); - } - - // Apply prefilters - inspectPrefiltersOrTransports( prefilters, s, options, jqXHR ); - - // If request was aborted inside a prefilter, stop there - if ( completed ) { - return jqXHR; - } - - // We can fire global events as of now if asked to - // Don't fire events if jQuery.event is undefined in an AMD-usage scenario (#15118) - fireGlobals = jQuery.event && s.global; - - // Watch for a new set of requests - if ( fireGlobals && jQuery.active++ === 0 ) { - jQuery.event.trigger( "ajaxStart" ); - } - - // Uppercase the type - s.type = s.type.toUpperCase(); - - // Determine if request has content - s.hasContent = !rnoContent.test( s.type ); - - // Save the URL in case we're toying with the If-Modified-Since - // and/or If-None-Match header later on - // Remove hash to simplify url manipulation - cacheURL = s.url.replace( rhash, "" ); - - // More options handling for requests with no content - if ( !s.hasContent ) { - - // Remember the hash so we can put it back - uncached = s.url.slice( cacheURL.length ); - - // If data is available, append data to url - if ( s.data ) { - cacheURL += ( rquery.test( cacheURL ) ? "&" : "?" ) + s.data; - - // #9682: remove data so that it's not used in an eventual retry - delete s.data; - } - - // Add or update anti-cache param if needed - if ( s.cache === false ) { - cacheURL = cacheURL.replace( rantiCache, "$1" ); - uncached = ( rquery.test( cacheURL ) ? "&" : "?" ) + "_=" + ( nonce++ ) + uncached; - } - - // Put hash and anti-cache on the URL that will be requested (gh-1732) - s.url = cacheURL + uncached; - - // Change '%20' to '+' if this is encoded form body content (gh-2658) - } else if ( s.data && s.processData && - ( s.contentType || "" ).indexOf( "application/x-www-form-urlencoded" ) === 0 ) { - s.data = s.data.replace( r20, "+" ); - } - - // Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode. - if ( s.ifModified ) { - if ( jQuery.lastModified[ cacheURL ] ) { - jqXHR.setRequestHeader( "If-Modified-Since", jQuery.lastModified[ cacheURL ] ); - } - if ( jQuery.etag[ cacheURL ] ) { - jqXHR.setRequestHeader( "If-None-Match", jQuery.etag[ cacheURL ] ); - } - } - - // Set the correct header, if data is being sent - if ( s.data && s.hasContent && s.contentType !== false || options.contentType ) { - jqXHR.setRequestHeader( "Content-Type", s.contentType ); - } - - // Set the Accepts header for the server, depending on the dataType - jqXHR.setRequestHeader( - "Accept", - s.dataTypes[ 0 ] && s.accepts[ s.dataTypes[ 0 ] ] ? - s.accepts[ s.dataTypes[ 0 ] ] + - ( s.dataTypes[ 0 ] !== "*" ? 
", " + allTypes + "; q=0.01" : "" ) : - s.accepts[ "*" ] - ); - - // Check for headers option - for ( i in s.headers ) { - jqXHR.setRequestHeader( i, s.headers[ i ] ); - } - - // Allow custom headers/mimetypes and early abort - if ( s.beforeSend && - ( s.beforeSend.call( callbackContext, jqXHR, s ) === false || completed ) ) { - - // Abort if not done already and return - return jqXHR.abort(); - } - - // Aborting is no longer a cancellation - strAbort = "abort"; - - // Install callbacks on deferreds - completeDeferred.add( s.complete ); - jqXHR.done( s.success ); - jqXHR.fail( s.error ); - - // Get transport - transport = inspectPrefiltersOrTransports( transports, s, options, jqXHR ); - - // If no transport, we auto-abort - if ( !transport ) { - done( -1, "No Transport" ); - } else { - jqXHR.readyState = 1; - - // Send global event - if ( fireGlobals ) { - globalEventContext.trigger( "ajaxSend", [ jqXHR, s ] ); - } - - // If request was aborted inside ajaxSend, stop there - if ( completed ) { - return jqXHR; - } - - // Timeout - if ( s.async && s.timeout > 0 ) { - timeoutTimer = window.setTimeout( function() { - jqXHR.abort( "timeout" ); - }, s.timeout ); - } - - try { - completed = false; - transport.send( requestHeaders, done ); - } catch ( e ) { - - // Rethrow post-completion exceptions - if ( completed ) { - throw e; - } - - // Propagate others as results - done( -1, e ); - } - } - - // Callback for when everything is done - function done( status, nativeStatusText, responses, headers ) { - var isSuccess, success, error, response, modified, - statusText = nativeStatusText; - - // Ignore repeat invocations - if ( completed ) { - return; - } - - completed = true; - - // Clear timeout if it exists - if ( timeoutTimer ) { - window.clearTimeout( timeoutTimer ); - } - - // Dereference transport for early garbage collection - // (no matter how long the jqXHR object will be used) - transport = undefined; - - // Cache response headers - responseHeadersString = headers || ""; - - // Set readyState - jqXHR.readyState = status > 0 ? 4 : 0; - - // Determine if successful - isSuccess = status >= 200 && status < 300 || status === 304; - - // Get response data - if ( responses ) { - response = ajaxHandleResponses( s, jqXHR, responses ); - } - - // Convert no matter what (that way responseXXX fields are always set) - response = ajaxConvert( s, response, jqXHR, isSuccess ); - - // If successful, handle type chaining - if ( isSuccess ) { - - // Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode. 
- if ( s.ifModified ) { - modified = jqXHR.getResponseHeader( "Last-Modified" ); - if ( modified ) { - jQuery.lastModified[ cacheURL ] = modified; - } - modified = jqXHR.getResponseHeader( "etag" ); - if ( modified ) { - jQuery.etag[ cacheURL ] = modified; - } - } - - // if no content - if ( status === 204 || s.type === "HEAD" ) { - statusText = "nocontent"; - - // if not modified - } else if ( status === 304 ) { - statusText = "notmodified"; - - // If we have data, let's convert it - } else { - statusText = response.state; - success = response.data; - error = response.error; - isSuccess = !error; - } - } else { - - // Extract error from statusText and normalize for non-aborts - error = statusText; - if ( status || !statusText ) { - statusText = "error"; - if ( status < 0 ) { - status = 0; - } - } - } - - // Set data for the fake xhr object - jqXHR.status = status; - jqXHR.statusText = ( nativeStatusText || statusText ) + ""; - - // Success/Error - if ( isSuccess ) { - deferred.resolveWith( callbackContext, [ success, statusText, jqXHR ] ); - } else { - deferred.rejectWith( callbackContext, [ jqXHR, statusText, error ] ); - } - - // Status-dependent callbacks - jqXHR.statusCode( statusCode ); - statusCode = undefined; - - if ( fireGlobals ) { - globalEventContext.trigger( isSuccess ? "ajaxSuccess" : "ajaxError", - [ jqXHR, s, isSuccess ? success : error ] ); - } - - // Complete - completeDeferred.fireWith( callbackContext, [ jqXHR, statusText ] ); - - if ( fireGlobals ) { - globalEventContext.trigger( "ajaxComplete", [ jqXHR, s ] ); - - // Handle the global AJAX counter - if ( !( --jQuery.active ) ) { - jQuery.event.trigger( "ajaxStop" ); - } - } - } - - return jqXHR; - }, - - getJSON: function( url, data, callback ) { - return jQuery.get( url, data, callback, "json" ); - }, - - getScript: function( url, callback ) { - return jQuery.get( url, undefined, callback, "script" ); - } -} ); - -jQuery.each( [ "get", "post" ], function( i, method ) { - jQuery[ method ] = function( url, data, callback, type ) { - - // Shift arguments if data argument was omitted - if ( jQuery.isFunction( data ) ) { - type = type || callback; - callback = data; - data = undefined; - } - - // The url can be an options object (which then must have .url) - return jQuery.ajax( jQuery.extend( { - url: url, - type: method, - dataType: type, - data: data, - success: callback - }, jQuery.isPlainObject( url ) && url ) ); - }; -} ); - - -jQuery._evalUrl = function( url ) { - return jQuery.ajax( { - url: url, - - // Make this explicit, since user can override this through ajaxSetup (#11264) - type: "GET", - dataType: "script", - cache: true, - async: false, - global: false, - "throws": true - } ); -}; - - -jQuery.fn.extend( { - wrapAll: function( html ) { - var wrap; - - if ( this[ 0 ] ) { - if ( jQuery.isFunction( html ) ) { - html = html.call( this[ 0 ] ); - } - - // The elements to wrap the target around - wrap = jQuery( html, this[ 0 ].ownerDocument ).eq( 0 ).clone( true ); - - if ( this[ 0 ].parentNode ) { - wrap.insertBefore( this[ 0 ] ); - } - - wrap.map( function() { - var elem = this; - - while ( elem.firstElementChild ) { - elem = elem.firstElementChild; - } - - return elem; - } ).append( this ); - } - - return this; - }, - - wrapInner: function( html ) { - if ( jQuery.isFunction( html ) ) { - return this.each( function( i ) { - jQuery( this ).wrapInner( html.call( this, i ) ); - } ); - } - - return this.each( function() { - var self = jQuery( this ), - contents = self.contents(); - - if ( contents.length ) { - 
contents.wrapAll( html ); - - } else { - self.append( html ); - } - } ); - }, - - wrap: function( html ) { - var isFunction = jQuery.isFunction( html ); - - return this.each( function( i ) { - jQuery( this ).wrapAll( isFunction ? html.call( this, i ) : html ); - } ); - }, - - unwrap: function( selector ) { - this.parent( selector ).not( "body" ).each( function() { - jQuery( this ).replaceWith( this.childNodes ); - } ); - return this; - } -} ); - - -jQuery.expr.pseudos.hidden = function( elem ) { - return !jQuery.expr.pseudos.visible( elem ); -}; -jQuery.expr.pseudos.visible = function( elem ) { - return !!( elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length ); -}; - - - - -jQuery.ajaxSettings.xhr = function() { - try { - return new window.XMLHttpRequest(); - } catch ( e ) {} -}; - -var xhrSuccessStatus = { - - // File protocol always yields status code 0, assume 200 - 0: 200, - - // Support: IE <=9 only - // #1450: sometimes IE returns 1223 when it should be 204 - 1223: 204 - }, - xhrSupported = jQuery.ajaxSettings.xhr(); - -support.cors = !!xhrSupported && ( "withCredentials" in xhrSupported ); -support.ajax = xhrSupported = !!xhrSupported; - -jQuery.ajaxTransport( function( options ) { - var callback, errorCallback; - - // Cross domain only allowed if supported through XMLHttpRequest - if ( support.cors || xhrSupported && !options.crossDomain ) { - return { - send: function( headers, complete ) { - var i, - xhr = options.xhr(); - - xhr.open( - options.type, - options.url, - options.async, - options.username, - options.password - ); - - // Apply custom fields if provided - if ( options.xhrFields ) { - for ( i in options.xhrFields ) { - xhr[ i ] = options.xhrFields[ i ]; - } - } - - // Override mime type if needed - if ( options.mimeType && xhr.overrideMimeType ) { - xhr.overrideMimeType( options.mimeType ); - } - - // X-Requested-With header - // For cross-domain requests, seeing as conditions for a preflight are - // akin to a jigsaw puzzle, we simply never set it to be sure. - // (it can always be set on a per-request basis or even using ajaxSetup) - // For same-domain requests, won't change header if already provided. - if ( !options.crossDomain && !headers[ "X-Requested-With" ] ) { - headers[ "X-Requested-With" ] = "XMLHttpRequest"; - } - - // Set headers - for ( i in headers ) { - xhr.setRequestHeader( i, headers[ i ] ); - } - - // Callback - callback = function( type ) { - return function() { - if ( callback ) { - callback = errorCallback = xhr.onload = - xhr.onerror = xhr.onabort = xhr.onreadystatechange = null; - - if ( type === "abort" ) { - xhr.abort(); - } else if ( type === "error" ) { - - // Support: IE <=9 only - // On a manual native abort, IE9 throws - // errors on any property access that is not readyState - if ( typeof xhr.status !== "number" ) { - complete( 0, "error" ); - } else { - complete( - - // File: protocol always yields status 0; see #8605, #14207 - xhr.status, - xhr.statusText - ); - } - } else { - complete( - xhrSuccessStatus[ xhr.status ] || xhr.status, - xhr.statusText, - - // Support: IE <=9 only - // IE9 has no XHR2 but throws on binary (trac-11426) - // For XHR2 non-text, let the caller handle it (gh-2498) - ( xhr.responseType || "text" ) !== "text" || - typeof xhr.responseText !== "string" ? 
- { binary: xhr.response } : - { text: xhr.responseText }, - xhr.getAllResponseHeaders() - ); - } - } - }; - }; - - // Listen to events - xhr.onload = callback(); - errorCallback = xhr.onerror = callback( "error" ); - - // Support: IE 9 only - // Use onreadystatechange to replace onabort - // to handle uncaught aborts - if ( xhr.onabort !== undefined ) { - xhr.onabort = errorCallback; - } else { - xhr.onreadystatechange = function() { - - // Check readyState before timeout as it changes - if ( xhr.readyState === 4 ) { - - // Allow onerror to be called first, - // but that will not handle a native abort - // Also, save errorCallback to a variable - // as xhr.onerror cannot be accessed - window.setTimeout( function() { - if ( callback ) { - errorCallback(); - } - } ); - } - }; - } - - // Create the abort callback - callback = callback( "abort" ); - - try { - - // Do send the request (this may raise an exception) - xhr.send( options.hasContent && options.data || null ); - } catch ( e ) { - - // #14683: Only rethrow if this hasn't been notified as an error yet - if ( callback ) { - throw e; - } - } - }, - - abort: function() { - if ( callback ) { - callback(); - } - } - }; - } -} ); - - - - -// Prevent auto-execution of scripts when no explicit dataType was provided (See gh-2432) -jQuery.ajaxPrefilter( function( s ) { - if ( s.crossDomain ) { - s.contents.script = false; - } -} ); - -// Install script dataType -jQuery.ajaxSetup( { - accepts: { - script: "text/javascript, application/javascript, " + - "application/ecmascript, application/x-ecmascript" - }, - contents: { - script: /\b(?:java|ecma)script\b/ - }, - converters: { - "text script": function( text ) { - jQuery.globalEval( text ); - return text; - } - } -} ); - -// Handle cache's special case and crossDomain -jQuery.ajaxPrefilter( "script", function( s ) { - if ( s.cache === undefined ) { - s.cache = false; - } - if ( s.crossDomain ) { - s.type = "GET"; - } -} ); - -// Bind script tag hack transport -jQuery.ajaxTransport( "script", function( s ) { - - // This transport only deals with cross domain requests - if ( s.crossDomain ) { - var script, callback; - return { - send: function( _, complete ) { - script = jQuery( " -{%- endblock %} - - -{% block footer %} -{{ super() }} - - - - - - - -{% endblock %} diff --git a/advanced_source/ONNXLive.rst b/advanced_source/ONNXLive.rst index 21380e43405..7177522c968 100644 --- a/advanced_source/ONNXLive.rst +++ b/advanced_source/ONNXLive.rst @@ -2,172 +2,11 @@ ONNX Live Tutorial ================== -This tutorial will show you to convert a neural style transfer model that has been exported from PyTorch into the Apple CoreML format using ONNX. This will allow you to easily run deep learning models on Apple devices and, in this case, live stream from the camera. +This tutorial has been deprecated. -What is ONNX? -------------- +Redirecting in 3 seconds... -ONNX (Open Neural Network Exchange) is an open format to represent deep learning models. With ONNX, AI developers can more easily move models between state-of-the-art tools and choose the combination that is best for them. ONNX is developed and supported by a community of partners. You can learn more about ONNX and what tools are supported by going to `onnx.ai `_. -Tutorial Overview ------------------ +.. raw:: html -This tutorial will walk you through 4 main steps: - - -#. `Download (or train) PyTorch style transfer models`_ -#. `Convert the PyTorch models to ONNX models`_ -#. `Convert the ONNX models to CoreML models`_ -#. 
`Run the CoreML models in a style transfer iOS App`_ - -Preparing the Environment -------------------------- - -We will be working in a virtualenv in order to avoid conflicts with your local packages. -We are also using Python 3.6 for this tutorial, but other versions should work as well. - -.. code-block:: python - - python3.6 -m venv venv - source ./venv/bin/activate - - -You need to install pytorch and the onnx->coreml converter: - -.. code-block:: bash - - pip install torchvision onnx-coreml - - -You will also need to install XCode if you want to run the iOS style transfer app on your iPhone. -You can also convert models in Linux, however to run the iOS app itself, you will need a Mac. - -Download (or train) PyTorch style transfer models -------------------------------------------------- - -For this tutorial, we will use the style transfer models that are published with pytorch in https://github.com/pytorch/examples/tree/master/fast_neural_style . -If you would like to use a different PyTorch or ONNX model, feel free to skip this step. - -These models are meant for applying style transfer on still images and really not optimized to be fast enough for video. However if we reduce the resolution low enough, they can also work well on videos. - -Let's download the models: - -.. code-block:: bash - - git clone https://github.com/pytorch/examples - cd examples/fast_neural_style - - -If you would like to train the models yourself, the pytorch/examples repository you just cloned has more information on how to do this. -For now, we'll just download pre-trained models with the script provided by the repository: - -.. code-block:: bash - - python download_saved_models.py - - -This script downloads the pre-trained PyTorch models and puts them into the ``saved_models`` folder. -There should now be 4 files, ``candy.pth``\ , ``mosaic.pth``\ , ``rain_princess.pth`` and ``udnie.pth`` in your directory. - -Convert the PyTorch models to ONNX models ------------------------------------------ - -Now that we have the pre-trained PyTorch models as ``.pth`` files in the ``saved_models`` folder, we will need to convert them to ONNX format. -The model definition is in the pytorch/examples repository we cloned previously, and with a few lines of python we can export it to ONNX. -In this case, instead of actually running the neural net, we will call ``torch.onnx._export``\ , which is provided with PyTorch as an api to directly export ONNX formatted models from PyTorch. -However, in this case we don't even need to do that, because a script already exists ``neural_style/neural_style.py`` that will do this for us. -You can also take a look at that script if you would like to apply it to other models. - -Exporting the ONNX format from PyTorch is essentially tracing your neural network so this api call will internally run the network on 'dummy data' in order to generate the graph. -For this, it needs an input image to apply the style transfer to which can simply be a blank image. -However, the pixel size of this image is important, as this will be the size for the exported style transfer model. -To get good performance, we'll use a resolution of 250x540. Feel free to take a larger resolution if you care less about -FPS and more about style transfer quality. - -Let's use `ImageMagick `_ to create a blank image of the resolution we want: - -.. code-block:: bash - - convert -size 250x540 xc:white png24:dummy.jpg - - -and use that to export the PyTorch models: - -.. 
code-block:: bash - - python ./neural_style/neural_style.py eval --content-image dummy.jpg --output-image dummy-out.jpg --model ./saved_models/candy.pth --cuda 0 --export_onnx ./saved_models/candy.onnx - python ./neural_style/neural_style.py eval --content-image dummy.jpg --output-image dummy-out.jpg --model ./saved_models/udnie.pth --cuda 0 --export_onnx ./saved_models/udnie.onnx - python ./neural_style/neural_style.py eval --content-image dummy.jpg --output-image dummy-out.jpg --model ./saved_models/rain_princess.pth --cuda 0 --export_onnx ./saved_models/rain_princess.onnx - python ./neural_style/neural_style.py eval --content-image dummy.jpg --output-image dummy-out.jpg --model ./saved_models/mosaic.pth --cuda 0 --export_onnx ./saved_models/mosaic.onnx - - -You should end up with 4 files, ``candy.onnx``\ , ``mosaic.onnx``\ , ``rain_princess.onnx`` and ``udnie.onnx``\ , -created from the corresponding ``.pth`` files. - -Convert the ONNX models to CoreML models ----------------------------------------- - -Now that we have ONNX models, we can convert them to CoreML models in order to run them on Apple devices. -For this, we use the onnx-coreml converter we installed previously. -The converter comes with a ``convert-onnx-to-coreml`` script, which the installation steps above added to our path. Unfortunately that won't work for us as we need to mark the input and output of the network as an image -and, while this is supported by the converter, it is only supported when calling the converter from python. - -Looking at the style transfer model (for example opening the .onnx file in an application like `Netron `_\ ), -we see that the input is named '0' and the output is named '186'. These are just numeric ids assigned by PyTorch. -We will need to mark these as images. - -So let's create a small python file and call it ``onnx_to_coreml.py``. This can be created by using the touch command and edited with your favorite editor to add the following lines of code. - -.. code-block:: python - - import sys - from onnx import onnx_pb - from onnx_coreml import convert - - model_in = sys.argv[1] - model_out = sys.argv[2] - - model_file = open(model_in, 'rb') - model_proto = onnx_pb.ModelProto() - model_proto.ParseFromString(model_file.read()) - coreml_model = convert(model_proto, image_input_names=['0'], image_output_names=['186']) - coreml_model.save(model_out) - - -we now run it: - -.. code-block:: bash - - python onnx_to_coreml.py ./saved_models/candy.onnx ./saved_models/candy.mlmodel - python onnx_to_coreml.py ./saved_models/udnie.onnx ./saved_models/udnie.mlmodel - python onnx_to_coreml.py ./saved_models/rain_princess.onnx ./saved_models/rain_princess.mlmodel - python onnx_to_coreml.py ./saved_models/mosaic.onnx ./saved_models/mosaic.mlmodel - - -Now, there should be 4 CoreML models in your ``saved_models`` directory: ``candy.mlmodel``\ , ``mosaic.mlmodel``\ , ``rain_princess.mlmodel`` and ``udnie.mlmodel``. - -Run the CoreML models in a style transfer iOS App -------------------------------------------------- - -This repository (i.e. the one you're currently reading the README.md of) contains an iOS app able to run CoreML style transfer models on a live camera stream from your phone camera. Let's clone the repository: - -.. code-block:: bash - - git clone https://github.com/onnx/tutorials - - -and open the ``tutorials/examples/CoreML/ONNXLive/ONNXLive.xcodeproj`` project in XCode. -We recommend using XCode 9.3 and an iPhone X. There might be issues running on older devices or XCode versions. 
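
Before replacing the bundled models, it can help to double check that the converted ``.mlmodel`` files are well formed and expose the image input and output we marked earlier. The snippet below is only a quick sanity check, a sketch assuming ``coremltools`` (a dependency of the ``onnx-coreml`` converter used above) is importable; it is not required for the app itself.

.. code-block:: python

    import coremltools

    # Load the protobuf spec of one converted model and list the declared
    # inputs and outputs; these should match the names we marked as images.
    spec = coremltools.utils.load_spec("./saved_models/candy.mlmodel")
    print([inp.name for inp in spec.description.input])
    print([out.name for out in spec.description.output])
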
- -In the ``Models/`` folder, the project contains some .mlmodel files. We're going to replace them with the models we just created. - -You then run the app on your iPhone and you are all set. Tapping on the screen switches through the models. - -Conclusion ----------- - -We hope this tutorial gave you an overview of what ONNX is about and how you can use it to convert neural networks -between frameworks, in this case neural style transfer models moving from PyTorch to CoreML. - -Feel free to experiment with these steps and test them on your own models. -Please let us know if you hit any issues or want to give feedback. We'd like to hear what you think. + diff --git a/advanced_source/README.txt b/advanced_source/README.txt index 0dbaffef5f7..56f01688089 100644 --- a/advanced_source/README.txt +++ b/advanced_source/README.txt @@ -8,11 +8,3 @@ Advanced Tutorials 2. numpy_extensions_tutorial.py Creating Extensions Using numpy and scipy https://pytorch.org/tutorials/advanced/numpy_extensions_tutorial.html - -3. c_extension.rst - Custom C Extensions for PyTorch - https://pytorch.org/tutorials/advanced/c_extension.html - -4. super_resolution_with_onnxruntime.py - Exporting a Model from PyTorch to ONNX and Running it using ONNXRuntime - https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html diff --git a/advanced_source/coding_ddpg.py b/advanced_source/coding_ddpg.py index 7dd3acf238d..90ea4565dab 100644 --- a/advanced_source/coding_ddpg.py +++ b/advanced_source/coding_ddpg.py @@ -182,7 +182,7 @@ # Later, we will see how the target parameters should be updated in TorchRL. # -from tensordict.nn import TensorDictModule +from tensordict.nn import TensorDictModule, TensorDictSequential def _init( @@ -290,12 +290,11 @@ def _loss_actor( ) -> torch.Tensor: td_copy = tensordict.select(*self.actor_in_keys) # Get an action from the actor network: since we made it functional, we need to pass the params - td_copy = self.actor_network(td_copy, params=self.actor_network_params) + with self.actor_network_params.to_module(self.actor_network): + td_copy = self.actor_network(td_copy) # get the value associated with that action - td_copy = self.value_network( - td_copy, - params=self.value_network_params.detach(), - ) + with self.value_network_params.detach().to_module(self.value_network): + td_copy = self.value_network(td_copy) return -td_copy.get("state_action_value") @@ -317,7 +316,8 @@ def _loss_value( td_copy = tensordict.clone() # V(s, a) - self.value_network(td_copy, params=self.value_network_params) + with self.value_network_params.to_module(self.value_network): + self.value_network(td_copy) pred_val = td_copy.get("state_action_value").squeeze(-1) # we manually reconstruct the parameters of the actor-critic, where the first @@ -332,9 +332,8 @@ def _loss_value( batch_size=self.target_actor_network_params.batch_size, device=self.target_actor_network_params.device, ) - target_value = self.value_estimator.value_estimate( - tensordict, target_params=target_params - ).squeeze(-1) + with target_params.to_module(self.actor_critic): + target_value = self.value_estimator.value_estimate(tensordict).squeeze(-1) # Computes the value loss: L2, L1 or smooth L1 depending on `self.loss_function` loss_value = distance_loss(pred_val, target_value, loss_function=self.loss_function) @@ -717,7 +716,7 @@ def get_env_stats(): ActorCriticWrapper, DdpgMlpActor, DdpgMlpQNet, - OrnsteinUhlenbeckProcessWrapper, + OrnsteinUhlenbeckProcessModule, ProbabilisticActor, TanhDelta, ValueOperator, @@ -776,15 +775,18 @@ 
def make_ddpg_actor( # Exploration # ~~~~~~~~~~~ # -# The policy is wrapped in a :class:`~torchrl.modules.OrnsteinUhlenbeckProcessWrapper` +# The policy is passed into a :class:`~torchrl.modules.OrnsteinUhlenbeckProcessModule` # exploration module, as suggested in the original paper. # Let's define the number of frames before OU noise reaches its minimum value annealing_frames = 1_000_000 -actor_model_explore = OrnsteinUhlenbeckProcessWrapper( +actor_model_explore = TensorDictSequential( actor, - annealing_num_steps=annealing_frames, -).to(device) + OrnsteinUhlenbeckProcessModule( + spec=actor.spec.clone(), + annealing_num_steps=annealing_frames, + ).to(device), +) if device == torch.device("cpu"): actor_model_explore.share_memory() @@ -891,7 +893,7 @@ def make_recorder(actor_model_explore, transform_state_dict, record_interval): record_frames=1000, policy_exploration=actor_model_explore, environment=environment, - exploration_type=ExplorationType.MEAN, + exploration_type=ExplorationType.DETERMINISTIC, record_interval=record_interval, ) return recorder_obj @@ -1038,7 +1040,7 @@ def ceil_div(x, y): ############################################################################### # let's use the TD(lambda) estimator! -loss_module.make_value_estimator(ValueEstimators.TDLambda, gamma=gamma, lmbda=lmbda) +loss_module.make_value_estimator(ValueEstimators.TDLambda, gamma=gamma, lmbda=lmbda, device=device) ############################################################################### # .. note:: @@ -1168,7 +1170,7 @@ def ceil_div(x, y): ) # update the exploration strategy - actor_model_explore.step(current_frames) + actor_model_explore[1].step(current_frames) collector.shutdown() del collector diff --git a/advanced_source/cpp_autograd.rst b/advanced_source/cpp_autograd.rst index d09f877e5a2..51e5e0b358f 100644 --- a/advanced_source/cpp_autograd.rst +++ b/advanced_source/cpp_autograd.rst @@ -255,9 +255,9 @@ Out: [ CPUFloatType{3,4} ] Please see the documentation for ``torch::autograd::backward`` -(`link `_) +(`link `_) and ``torch::autograd::grad`` -(`link `_) +(`link `_) for more information on how to use them. 
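
For readers more familiar with the Python API, ``torch::autograd::backward`` and ``torch::autograd::grad`` behave like their Python namesakes, and the translation table later in this tutorial maps them one-to-one. The following is a minimal Python-side sketch for comparison only; it is not part of the C++ example above.

.. code-block:: python

    import torch

    x = torch.randn(3, 4, requires_grad=True)

    # torch.autograd.backward accumulates gradients into x.grad,
    # matching torch::autograd::backward.
    torch.autograd.backward([(x * x).sum()])
    print(x.grad)

    # torch.autograd.grad returns the gradients instead of accumulating them,
    # matching torch::autograd::grad.
    (dx,) = torch.autograd.grad((x * x).sum(), (x,))
    print(dx)
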
Using custom autograd function in C++ @@ -394,9 +394,9 @@ C++ using the following table: +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Python | C++ | +================================+========================================================================================================================================================================+ -| ``torch.autograd.backward`` | ``torch::autograd::backward`` (`link `_) | +| ``torch.autograd.backward`` | ``torch::autograd::backward`` (`link `_) | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ``torch.autograd.grad`` | ``torch::autograd::grad`` (`link `_) | +| ``torch.autograd.grad`` | ``torch::autograd::grad`` (`link `_) | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``torch.Tensor.detach`` | ``torch::Tensor::detach`` (`link `_) | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/advanced_source/cpp_cuda_graphs.rst b/advanced_source/cpp_cuda_graphs.rst deleted file mode 100644 index 494d6426d47..00000000000 --- a/advanced_source/cpp_cuda_graphs.rst +++ /dev/null @@ -1,193 +0,0 @@ -Using CUDA Graphs in PyTorch C++ API -==================================== - -.. note:: - |edit| View and edit this tutorial in `GitHub `__. The full source code is available on `GitHub `__. - -Prerequisites: - -- `Using the PyTorch C++ Frontend <../advanced_source/cpp_frontend.html>`__ -- `CUDA semantics `__ -- Pytorch 2.0 or later -- CUDA 11 or later - -NVIDIA’s CUDA Graphs have been a part of CUDA Toolkit library since the -release of `version 10 `_. -They are capable of greatly reducing the CPU overhead increasing the -performance of applications. - -In this tutorial, we will be focusing on using CUDA Graphs for `C++ -frontend of PyTorch `_. -The C++ frontend is mostly utilized in production and deployment applications which -are important parts of PyTorch use cases. Since `the first appearance -`_ -the CUDA Graphs won users’ and developer’s hearts for being a very performant -and at the same time simple-to-use tool. In fact, CUDA Graphs are used by default -in ``torch.compile`` of PyTorch 2.0 to boost the productivity of training and inference. - -We would like to demonstrate CUDA Graphs usage on PyTorch’s `MNIST -example `_. -The usage of CUDA Graphs in LibTorch (C++ Frontend) is very similar to its -`Python counterpart `_ -but with some differences in syntax and functionality. - -Getting Started ---------------- - -The main training loop consists of the several steps and depicted in the -following code chunk: - -.. code-block:: cpp - - for (auto& batch : data_loader) { - auto data = batch.data.to(device); - auto targets = batch.target.to(device); - optimizer.zero_grad(); - auto output = model.forward(data); - auto loss = torch::nll_loss(output, targets); - loss.backward(); - optimizer.step(); - } - -The example above includes a forward pass, a backward pass, and weight updates. 
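
For orientation, the Python counterpart linked above captures the same kind of whole-network training step through ``torch.cuda.CUDAGraph``. The block below is a self-contained sketch of that pattern (warm-up on a side stream, capture, replay) using a toy linear model and placeholder shapes rather than the MNIST model from this tutorial; it assumes a CUDA-capable GPU and is meant only as a reference point for the C++ code that follows.

.. code-block:: python

    import torch

    device = torch.device("cuda")
    model = torch.nn.Linear(32, 10).to(device)
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    # Static tensors that are reused (copied into) on every iteration.
    static_input = torch.randn(64, 32, device=device)
    static_target = torch.randint(0, 10, (64,), device=device)

    # Warm up on a side stream so CUDA libraries allocate their workspaces
    # before capture begins.
    side_stream = torch.cuda.Stream()
    side_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(side_stream):
        for _ in range(3):
            optimizer.zero_grad(set_to_none=True)
            loss_fn(model(static_input), static_target).backward()
            optimizer.step()
    torch.cuda.current_stream().wait_stream(side_stream)

    # Capture one full training step (forward, backward, optimizer step) ...
    graph = torch.cuda.CUDAGraph()
    optimizer.zero_grad(set_to_none=True)
    with torch.cuda.graph(graph):
        static_loss = loss_fn(model(static_input), static_target)
        static_loss.backward()
        optimizer.step()

    # ... then, in the training loop, copy each new batch into the static
    # tensors and replay the captured step.
    static_input.copy_(torch.randn(64, 32, device=device))
    static_target.copy_(torch.randint(0, 10, (64,), device=device))
    graph.replay()
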
- -In this tutorial, we will be applying CUDA Graph on all the compute steps through the whole-network -graph capture. But before doing so, we need to slightly modify the source code. What we need -to do is preallocate tensors for reusing them in the main training loop. Here is an example -implementation: - -.. code-block:: cpp - - torch::TensorOptions FloatCUDA = - torch::TensorOptions(device).dtype(torch::kFloat); - torch::TensorOptions LongCUDA = - torch::TensorOptions(device).dtype(torch::kLong); - - torch::Tensor data = torch::zeros({kTrainBatchSize, 1, 28, 28}, FloatCUDA); - torch::Tensor targets = torch::zeros({kTrainBatchSize}, LongCUDA); - torch::Tensor output = torch::zeros({1}, FloatCUDA); - torch::Tensor loss = torch::zeros({1}, FloatCUDA); - - for (auto& batch : data_loader) { - data.copy_(batch.data); - targets.copy_(batch.target); - training_step(model, optimizer, data, targets, output, loss); - } - -Where ``training_step`` simply consists of forward and backward passes with corresponding optimizer calls: - -.. code-block:: cpp - - void training_step( - Net& model, - torch::optim::Optimizer& optimizer, - torch::Tensor& data, - torch::Tensor& targets, - torch::Tensor& output, - torch::Tensor& loss) { - optimizer.zero_grad(); - output = model.forward(data); - loss = torch::nll_loss(output, targets); - loss.backward(); - optimizer.step(); - } - -PyTorch’s CUDA Graphs API is relying on Stream Capture which in our case would be used like this: - -.. code-block:: cpp - - at::cuda::CUDAGraph graph; - at::cuda::CUDAStream captureStream = at::cuda::getStreamFromPool(); - at::cuda::setCurrentCUDAStream(captureStream); - - graph.capture_begin(); - training_step(model, optimizer, data, targets, output, loss); - graph.capture_end(); - -Before the actual graph capture, it is important to run several warm-up iterations on side stream to -prepare CUDA cache as well as CUDA libraries (like CUBLAS and CUDNN) that will be used during -the training: - -.. code-block:: cpp - - at::cuda::CUDAStream warmupStream = at::cuda::getStreamFromPool(); - at::cuda::setCurrentCUDAStream(warmupStream); - for (int iter = 0; iter < num_warmup_iters; iter++) { - training_step(model, optimizer, data, targets, output, loss); - } - -After the successful graph capture, we can replace ``training_step(model, optimizer, data, targets, output, loss);`` -call via ``graph.replay();`` to do the training step. - -Training Results ----------------- - -Taking the code for a spin we can see the following output from ordinary non-graphed training: - -.. 
code-block:: shell - - $ time ./mnist - Train Epoch: 1 [59584/60000] Loss: 0.3921 - Test set: Average loss: 0.2051 | Accuracy: 0.938 - Train Epoch: 2 [59584/60000] Loss: 0.1826 - Test set: Average loss: 0.1273 | Accuracy: 0.960 - Train Epoch: 3 [59584/60000] Loss: 0.1796 - Test set: Average loss: 0.1012 | Accuracy: 0.968 - Train Epoch: 4 [59584/60000] Loss: 0.1603 - Test set: Average loss: 0.0869 | Accuracy: 0.973 - Train Epoch: 5 [59584/60000] Loss: 0.2315 - Test set: Average loss: 0.0736 | Accuracy: 0.978 - Train Epoch: 6 [59584/60000] Loss: 0.0511 - Test set: Average loss: 0.0704 | Accuracy: 0.977 - Train Epoch: 7 [59584/60000] Loss: 0.0802 - Test set: Average loss: 0.0654 | Accuracy: 0.979 - Train Epoch: 8 [59584/60000] Loss: 0.0774 - Test set: Average loss: 0.0604 | Accuracy: 0.980 - Train Epoch: 9 [59584/60000] Loss: 0.0669 - Test set: Average loss: 0.0544 | Accuracy: 0.984 - Train Epoch: 10 [59584/60000] Loss: 0.0219 - Test set: Average loss: 0.0517 | Accuracy: 0.983 - - real 0m44.287s - user 0m44.018s - sys 0m1.116s - -While the training with the CUDA Graph produces the following output: - -.. code-block:: shell - - $ time ./mnist --use-train-graph - Train Epoch: 1 [59584/60000] Loss: 0.4092 - Test set: Average loss: 0.2037 | Accuracy: 0.938 - Train Epoch: 2 [59584/60000] Loss: 0.2039 - Test set: Average loss: 0.1274 | Accuracy: 0.961 - Train Epoch: 3 [59584/60000] Loss: 0.1779 - Test set: Average loss: 0.1017 | Accuracy: 0.968 - Train Epoch: 4 [59584/60000] Loss: 0.1559 - Test set: Average loss: 0.0871 | Accuracy: 0.972 - Train Epoch: 5 [59584/60000] Loss: 0.2240 - Test set: Average loss: 0.0735 | Accuracy: 0.977 - Train Epoch: 6 [59584/60000] Loss: 0.0520 - Test set: Average loss: 0.0710 | Accuracy: 0.978 - Train Epoch: 7 [59584/60000] Loss: 0.0935 - Test set: Average loss: 0.0666 | Accuracy: 0.979 - Train Epoch: 8 [59584/60000] Loss: 0.0744 - Test set: Average loss: 0.0603 | Accuracy: 0.981 - Train Epoch: 9 [59584/60000] Loss: 0.0762 - Test set: Average loss: 0.0547 | Accuracy: 0.983 - Train Epoch: 10 [59584/60000] Loss: 0.0207 - Test set: Average loss: 0.0525 | Accuracy: 0.983 - - real 0m6.952s - user 0m7.048s - sys 0m0.619s - -Conclusion ----------- - -As we can see, just by applying a CUDA Graph on the `MNIST example -`_ we were able to gain the performance -by more than six times for training. This kind of large performance improvement was achievable due to -the small model size. In case of larger models with heavy GPU usage, the CPU overhead is less impactful -so the improvement will be smaller. Nevertheless, it is always advantageous to use CUDA Graphs to -gain the performance of GPUs. 
diff --git a/advanced_source/cpp_cuda_graphs/CMakeLists.txt b/advanced_source/cpp_cuda_graphs/CMakeLists.txt deleted file mode 100644 index 76fc5bc6762..00000000000 --- a/advanced_source/cpp_cuda_graphs/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) -project(mnist) -set(CMAKE_CXX_STANDARD 17) - -find_package(Torch REQUIRED) -find_package(Threads REQUIRED) - -option(DOWNLOAD_MNIST "Download the MNIST dataset from the internet" ON) -if (DOWNLOAD_MNIST) - message(STATUS "Downloading MNIST dataset") - execute_process( - COMMAND python ${CMAKE_CURRENT_LIST_DIR}/../tools/download_mnist.py - -d ${CMAKE_BINARY_DIR}/data - ERROR_VARIABLE DOWNLOAD_ERROR) - if (DOWNLOAD_ERROR) - message(FATAL_ERROR "Error downloading MNIST dataset: ${DOWNLOAD_ERROR}") - endif() -endif() - -add_executable(mnist mnist.cpp) -target_compile_features(mnist PUBLIC cxx_range_for) -target_link_libraries(mnist ${TORCH_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) - -if (MSVC) - file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll") - add_custom_command(TARGET mnist - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different - ${TORCH_DLLS} - $) -endif (MSVC) diff --git a/advanced_source/cpp_cuda_graphs/README.md b/advanced_source/cpp_cuda_graphs/README.md deleted file mode 100644 index cbe368d1e90..00000000000 --- a/advanced_source/cpp_cuda_graphs/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# MNIST Example with the PyTorch C++ Frontend - -This folder contains an example of training a computer vision model to recognize -digits in images from the MNIST dataset, using the PyTorch C++ frontend. - -The entire training code is contained in `mnist.cpp`. - -To build the code, run the following commands from your terminal: - -```shell -$ cd mnist -$ mkdir build -$ cd build -$ cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. -$ make -``` - -where `/path/to/libtorch` should be the path to the unzipped _LibTorch_ -distribution, which you can get from the [PyTorch -homepage](https://pytorch.org/get-started/locally/). - -Execute the compiled binary to train the model: - -```shell -$ ./mnist -Train Epoch: 1 [59584/60000] Loss: 0.4232 -Test set: Average loss: 0.1989 | Accuracy: 0.940 -Train Epoch: 2 [59584/60000] Loss: 0.1926 -Test set: Average loss: 0.1338 | Accuracy: 0.959 -Train Epoch: 3 [59584/60000] Loss: 0.1390 -Test set: Average loss: 0.0997 | Accuracy: 0.969 -Train Epoch: 4 [59584/60000] Loss: 0.1239 -Test set: Average loss: 0.0875 | Accuracy: 0.972 -... -``` - -For running with CUDA Graphs add `--use-train-graph` and/or `--use-test-graph` -for training and testing passes respectively. diff --git a/advanced_source/cpp_cuda_graphs/mnist.cpp b/advanced_source/cpp_cuda_graphs/mnist.cpp deleted file mode 100644 index 97c5fb80ca0..00000000000 --- a/advanced_source/cpp_cuda_graphs/mnist.cpp +++ /dev/null @@ -1,372 +0,0 @@ -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -// Where to find the MNIST dataset. -const char* kDataRoot = "./data"; - -// The batch size for training. -const int64_t kTrainBatchSize = 64; - -// The batch size for testing. -const int64_t kTestBatchSize = 1000; - -// The number of epochs to train. -const int64_t kNumberOfEpochs = 10; - -// After how many batches to log a new update with the loss value. 
-const int64_t kLogInterval = 10; - -// Model that we will be training -struct Net : torch::nn::Module { - Net() - : conv1(torch::nn::Conv2dOptions(1, 10, /*kernel_size=*/5)), - conv2(torch::nn::Conv2dOptions(10, 20, /*kernel_size=*/5)), - fc1(320, 50), - fc2(50, 10) { - register_module("conv1", conv1); - register_module("conv2", conv2); - register_module("conv2_drop", conv2_drop); - register_module("fc1", fc1); - register_module("fc2", fc2); - } - - torch::Tensor forward(torch::Tensor x) { - x = torch::relu(torch::max_pool2d(conv1->forward(x), 2)); - x = torch::relu( - torch::max_pool2d(conv2_drop->forward(conv2->forward(x)), 2)); - x = x.view({-1, 320}); - x = torch::relu(fc1->forward(x)); - x = torch::dropout(x, /*p=*/0.5, /*training=*/is_training()); - x = fc2->forward(x); - return torch::log_softmax(x, /*dim=*/1); - } - - torch::nn::Conv2d conv1; - torch::nn::Conv2d conv2; - torch::nn::Dropout2d conv2_drop; - torch::nn::Linear fc1; - torch::nn::Linear fc2; -}; - -void stream_sync( - at::cuda::CUDAStream& dependency, - at::cuda::CUDAStream& dependent) { - at::cuda::CUDAEvent cuda_ev; - cuda_ev.record(dependency); - cuda_ev.block(dependent); -} - -void training_step( - Net& model, - torch::optim::Optimizer& optimizer, - torch::Tensor& data, - torch::Tensor& targets, - torch::Tensor& output, - torch::Tensor& loss) { - optimizer.zero_grad(); - output = model.forward(data); - loss = torch::nll_loss(output, targets); - loss.backward(); - optimizer.step(); -} - -void capture_train_graph( - Net& model, - torch::optim::Optimizer& optimizer, - torch::Tensor& data, - torch::Tensor& targets, - torch::Tensor& output, - torch::Tensor& loss, - at::cuda::CUDAGraph& graph, - const short num_warmup_iters = 7) { - model.train(); - - auto warmupStream = at::cuda::getStreamFromPool(); - auto captureStream = at::cuda::getStreamFromPool(); - auto legacyStream = at::cuda::getCurrentCUDAStream(); - - at::cuda::setCurrentCUDAStream(warmupStream); - - stream_sync(legacyStream, warmupStream); - - for (C10_UNUSED const auto iter : c10::irange(num_warmup_iters)) { - training_step(model, optimizer, data, targets, output, loss); - } - - stream_sync(warmupStream, captureStream); - at::cuda::setCurrentCUDAStream(captureStream); - - graph.capture_begin(); - training_step(model, optimizer, data, targets, output, loss); - graph.capture_end(); - - stream_sync(captureStream, legacyStream); - at::cuda::setCurrentCUDAStream(legacyStream); -} - -template -void train( - size_t epoch, - Net& model, - torch::Device device, - DataLoader& data_loader, - torch::optim::Optimizer& optimizer, - size_t dataset_size, - torch::Tensor& data, - torch::Tensor& targets, - torch::Tensor& output, - torch::Tensor& loss, - at::cuda::CUDAGraph& graph, - bool use_graph) { - model.train(); - - size_t batch_idx = 0; - - for (const auto& batch : data_loader) { - if (batch.data.size(0) != kTrainBatchSize || - batch.target.size(0) != kTrainBatchSize) { - continue; - } - - data.copy_(batch.data); - targets.copy_(batch.target); - - if (use_graph) { - graph.replay(); - } else { - training_step(model, optimizer, data, targets, output, loss); - } - - if (batch_idx++ % kLogInterval == 0) { - float train_loss = loss.item(); - std::cout << "\rTrain Epoch:" << epoch << " [" - << batch_idx * batch.data.size(0) << "/" << dataset_size - << "] Loss: " << train_loss; - } - } -} - -void test_step( - Net& model, - torch::Tensor& data, - torch::Tensor& targets, - torch::Tensor& output, - torch::Tensor& loss) { - output = model.forward(data); - loss = 
torch::nll_loss(output, targets, {}, torch::Reduction::Sum); -} - -void capture_test_graph( - Net& model, - torch::Tensor& data, - torch::Tensor& targets, - torch::Tensor& output, - torch::Tensor& loss, - torch::Tensor& total_loss, - torch::Tensor& total_correct, - at::cuda::CUDAGraph& graph, - const int num_warmup_iters = 7) { - torch::NoGradGuard no_grad; - model.eval(); - - auto warmupStream = at::cuda::getStreamFromPool(); - auto captureStream = at::cuda::getStreamFromPool(); - auto legacyStream = at::cuda::getCurrentCUDAStream(); - - at::cuda::setCurrentCUDAStream(warmupStream); - stream_sync(captureStream, legacyStream); - - for (C10_UNUSED const auto iter : c10::irange(num_warmup_iters)) { - test_step(model, data, targets, output, loss); - total_loss += loss; - total_correct += output.argmax(1).eq(targets).sum(); - } - - stream_sync(warmupStream, captureStream); - at::cuda::setCurrentCUDAStream(captureStream); - - graph.capture_begin(); - test_step(model, data, targets, output, loss); - graph.capture_end(); - - stream_sync(captureStream, legacyStream); - at::cuda::setCurrentCUDAStream(legacyStream); -} - -template -void test( - Net& model, - torch::Device device, - DataLoader& data_loader, - size_t dataset_size, - torch::Tensor& data, - torch::Tensor& targets, - torch::Tensor& output, - torch::Tensor& loss, - torch::Tensor& total_loss, - torch::Tensor& total_correct, - at::cuda::CUDAGraph& graph, - bool use_graph) { - torch::NoGradGuard no_grad; - - model.eval(); - loss.zero_(); - total_loss.zero_(); - total_correct.zero_(); - - for (const auto& batch : data_loader) { - if (batch.data.size(0) != kTestBatchSize || - batch.target.size(0) != kTestBatchSize) { - continue; - } - data.copy_(batch.data); - targets.copy_(batch.target); - - if (use_graph) { - graph.replay(); - } else { - test_step(model, data, targets, output, loss); - } - total_loss += loss; - total_correct += output.argmax(1).eq(targets).sum(); - } - - float test_loss = total_loss.item() / dataset_size; - float test_accuracy = - static_cast(total_correct.item()) / dataset_size; - - std::cout << std::endl - << "Test set: Average loss: " << test_loss - << " | Accuracy: " << test_accuracy << std::endl; -} - -int main(int argc, char* argv[]) { - if (!torch::cuda::is_available()) { - std::cout << "CUDA is not available!" << std::endl; - return -1; - } - - bool use_train_graph = false; - bool use_test_graph = false; - - std::vector arguments(argv + 1, argv + argc); - for (std::string& arg : arguments) { - if (arg == "--use-train-graph") { - std::cout << "Using CUDA Graph for training." << std::endl; - use_train_graph = true; - } - if (arg == "--use-test-graph") { - std::cout << "Using CUDA Graph for testing." 
<< std::endl; - use_test_graph = true; - } - } - - torch::manual_seed(1); - torch::cuda::manual_seed(1); - torch::Device device(torch::kCUDA); - - Net model; - model.to(device); - - auto train_dataset = - torch::data::datasets::MNIST(kDataRoot) - .map(torch::data::transforms::Normalize<>(0.1307, 0.3081)) - .map(torch::data::transforms::Stack<>()); - const size_t train_dataset_size = train_dataset.size().value(); - auto train_loader = - torch::data::make_data_loader( - std::move(train_dataset), kTrainBatchSize); - - auto test_dataset = - torch::data::datasets::MNIST( - kDataRoot, torch::data::datasets::MNIST::Mode::kTest) - .map(torch::data::transforms::Normalize<>(0.1307, 0.3081)) - .map(torch::data::transforms::Stack<>()); - const size_t test_dataset_size = test_dataset.size().value(); - auto test_loader = - torch::data::make_data_loader(std::move(test_dataset), kTestBatchSize); - - torch::optim::SGD optimizer( - model.parameters(), torch::optim::SGDOptions(0.01).momentum(0.5)); - - torch::TensorOptions FloatCUDA = - torch::TensorOptions(device).dtype(torch::kFloat); - torch::TensorOptions LongCUDA = - torch::TensorOptions(device).dtype(torch::kLong); - - torch::Tensor train_data = - torch::zeros({kTrainBatchSize, 1, 28, 28}, FloatCUDA); - torch::Tensor train_targets = torch::zeros({kTrainBatchSize}, LongCUDA); - torch::Tensor train_output = torch::zeros({1}, FloatCUDA); - torch::Tensor train_loss = torch::zeros({1}, FloatCUDA); - - torch::Tensor test_data = - torch::zeros({kTestBatchSize, 1, 28, 28}, FloatCUDA); - torch::Tensor test_targets = torch::zeros({kTestBatchSize}, LongCUDA); - torch::Tensor test_output = torch::zeros({1}, FloatCUDA); - torch::Tensor test_loss = torch::zeros({1}, FloatCUDA); - torch::Tensor test_total_loss = torch::zeros({1}, FloatCUDA); - torch::Tensor test_total_correct = torch::zeros({1}, LongCUDA); - - at::cuda::CUDAGraph train_graph; - at::cuda::CUDAGraph test_graph; - - capture_train_graph( - model, - optimizer, - train_data, - train_targets, - train_output, - train_loss, - train_graph); - - capture_test_graph( - model, - test_data, - test_targets, - test_output, - test_loss, - test_total_loss, - test_total_correct, - test_graph); - - for (size_t epoch = 1; epoch <= kNumberOfEpochs; ++epoch) { - train( - epoch, - model, - device, - *train_loader, - optimizer, - train_dataset_size, - train_data, - train_targets, - train_output, - train_loss, - train_graph, - use_train_graph); - test( - model, - device, - *test_loader, - test_dataset_size, - test_data, - test_targets, - test_output, - test_loss, - test_total_loss, - test_total_correct, - test_graph, - use_test_graph); - } - - std::cout << " Training/testing complete" << std::endl; - return 0; -} diff --git a/advanced_source/cpp_custom_ops.rst b/advanced_source/cpp_custom_ops.rst index fa56a0cc219..512c39b2a68 100644 --- a/advanced_source/cpp_custom_ops.rst +++ b/advanced_source/cpp_custom_ops.rst @@ -19,6 +19,10 @@ Custom C++ and CUDA Operators * PyTorch 2.4 or later * Basic understanding of C++ and CUDA programming +.. note:: + + This tutorial will also work on AMD ROCm with no additional modifications. + PyTorch offers a large library of operators that work on Tensors (e.g. torch.add, torch.sum, etc). However, you may wish to bring a new custom operator to PyTorch. This tutorial demonstrates the blessed path to authoring a custom operator written in C++/CUDA. 
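
Since the note above states that the same code path applies on AMD ROCm, it can be handy to confirm which GPU backend your PyTorch build targets before compiling the extension. The check below is a small illustrative sketch, not part of the extension itself; on ROCm wheels the ``torch.cuda`` namespace is reused and ``torch.version.hip`` is populated.

.. code-block:: python

    import torch

    # On CUDA builds torch.version.cuda is set; on ROCm builds torch.version.hip is.
    # Either way, the extension code in this tutorial compiles unchanged.
    if torch.version.hip is not None:
        print("ROCm build:", torch.version.hip)
    elif torch.version.cuda is not None:
        print("CUDA build:", torch.version.cuda)
    else:
        print("CPU-only build; only the CPU parts of this tutorial apply.")
    print("GPU available:", torch.cuda.is_available())
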
@@ -58,14 +62,92 @@ Using ``cpp_extension`` is as simple as writing the following ``setup.py``: setup(name="extension_cpp", ext_modules=[ - cpp_extension.CppExtension("extension_cpp", ["muladd.cpp"])], - cmdclass={'build_ext': cpp_extension.BuildExtension}) + cpp_extension.CppExtension( + "extension_cpp", + ["muladd.cpp"], + # define Py_LIMITED_API with min version 3.9 to expose only the stable + # limited API subset from Python.h + extra_compile_args={"cxx": ["-DPy_LIMITED_API=0x03090000"]}, + py_limited_api=True)], # Build 1 wheel across multiple Python versions + cmdclass={'build_ext': cpp_extension.BuildExtension}, + options={"bdist_wheel": {"py_limited_api": "cp39"}} # 3.9 is minimum supported Python version + ) If you need to compile CUDA code (for example, ``.cu`` files), then instead use `torch.utils.cpp_extension.CUDAExtension `_. -Please see how -`extension-cpp `_ for an example for -how this is set up. +Please see `extension-cpp `_ for an +example for how this is set up. + +The above example represents what we refer to as a CPython agnostic wheel, meaning we are +building a single wheel that can be run across multiple CPython versions (similar to pure +Python packages). CPython agnosticism is desirable in minimizing the number of wheels your +custom library needs to support and release. The minimum version we'd like to support is +3.9, since it is the oldest supported version currently, so we use the corresponding hexcode +and specifier throughout the setup code. We suggest building the extension in the same +environment as the minimum CPython version you'd like to support to minimize unknown behavior, +so, here, we build the extension in a CPython 3.9 environment. When built, this single wheel +will be runnable in any CPython environment 3.9+. To achieve this, there are three key lines +to note. + +The first is the specification of ``Py_LIMITED_API`` in ``extra_compile_args`` to the +minimum CPython version you would like to support: + +.. code-block:: python + + extra_compile_args={"cxx": ["-DPy_LIMITED_API=0x03090000"]}, + +Defining the ``Py_LIMITED_API`` flag helps verify that the extension is in fact +only using the `CPython Stable Limited API `_, +which is a requirement for the building a CPython agnostic wheel. If this requirement +is not met, it is possible to build a wheel that looks CPython agnostic but will crash, +or worse, be silently incorrect, in another CPython environment. Take care to avoid +using unstable CPython APIs, for example APIs from libtorch_python (in particular +pytorch/python bindings,) and to only use APIs from libtorch (ATen objects, operators +and the dispatcher). We strongly recommend defining the ``Py_LIMITED_API`` flag to +help ascertain the extension is compliant and safe as a CPython agnostic wheel. Note that +defining this flag is not a full guarantee that the built wheel is CPython agnostic, but +it is better than the wild wild west. There are several caveats mentioned in the +`Python docs `_, +and you should test and verify yourself that the wheel is truly agnostic for the relevant +CPython versions. + +The second and third lines specifying ``py_limited_api`` inform setuptools that you intend +to build a CPython agnostic wheel and will influence the naming of the wheel accordingly: + +.. 
code-block:: python + + setup(name="extension_cpp", + ext_modules=[ + cpp_extension.CppExtension( + ..., + py_limited_api=True)], # Build 1 wheel across multiple Python versions + ..., + options={"bdist_wheel": {"py_limited_api": "cp39"}} # 3.9 is minimum supported Python version + ) + +It is necessary to specify ``py_limited_api=True`` as an argument to CppExtension/ +CUDAExtension and also as an option to the ``"bdist_wheel"`` command with the minimal +supported CPython version (in this case, 3.9). Consequently, the ``setup`` in our +tutorial would build one properly named wheel that could be installed across multiple +CPython versions ``>=3.9``. + +If your extension uses CPython APIs outside the stable limited set, then you cannot +build a CPython agnostic wheel! You should build one wheel per CPython version instead, +like so: + +.. code-block:: python + + from setuptools import setup, Extension + from torch.utils import cpp_extension + + setup(name="extension_cpp", + ext_modules=[ + cpp_extension.CppExtension( + "extension_cpp", + ["muladd.cpp"])], + cmdclass={'build_ext': cpp_extension.BuildExtension}, + ) + Defining the custom op and adding backend implementations --------------------------------------------------------- @@ -174,8 +256,10 @@ To add ``torch.compile`` support for an operator, we must add a FakeTensor kerne known as a "meta kernel" or "abstract impl"). FakeTensors are Tensors that have metadata (such as shape, dtype, device) but no data: the FakeTensor kernel for an operator specifies how to compute the metadata of output tensors given the metadata of input tensors. +The FakeTensor kernel should return dummy Tensors of your choice with +the correct Tensor metadata (shape/strides/``dtype``/device). -We recommend that this be done from Python via the `torch.library.register_fake` API, +We recommend that this be done from Python via the ``torch.library.register_fake`` API, though it is possible to do this from C++ as well (see `The Custom Operators Manual `_ for more details). @@ -186,7 +270,9 @@ for more details). # before calling ``torch.library`` APIs that add registrations for the # C++ custom operator(s). The following import loads our # C++ custom operator definitions. - # See the next section for more details. + # Note that if you are striving for Python agnosticism, you should use + # the ``load_library(...)`` API call instead. See the next section for + # more details. from . import _C @torch.library.register_fake("extension_cpp::mymuladd") @@ -206,13 +292,89 @@ matters (importing in the wrong order will lead to an error). To use the custom operator with hybrid Python/C++ registrations, we must first load the C++ library that holds the custom operator definition -and then call the ``torch.library`` registration APIs. This can happen in one -of two ways: +and then call the ``torch.library`` registration APIs. This can happen in +three ways: + + +1. The first way to load the C++ library that holds the custom operator definition + is to define a dummy Python module for _C. Then, in Python, when you import the + module with ``import _C``, the ``.so`` files corresponding to the extension will + be loaded and the ``TORCH_LIBRARY`` and ``TORCH_LIBRARY_IMPL`` static initializers + will run. One can create a dummy Python module with ``PYBIND11_MODULE`` like below, + but you will notice that this does not compile with ``Py_LIMITED_API``, because + ``pybind11`` does not promise to only use the stable limited CPython API! 
With + the below code, you sadly cannot build a CPython agnostic wheel for your extension! + (Foreshadowing: I wonder what the second way is ;) ). + +.. code-block:: cpp + + // in, say, not_agnostic/csrc/extension_BAD.cpp + #include + + PYBIND11_MODULE("_C", m) {} + +.. code-block:: python + + # in, say, extension/__init__.py + from . import _C + +2. In this tutorial, because we value being able to build a single wheel across multiple + CPython versions, we will replace the unstable ``PYBIND11`` call with stable API calls. + The below code compiles with ``-DPy_LIMITED_API=0x03090000`` and successfully creates + a dummy Python module for our ``_C`` extension so that it can be imported from Python. + See `extension_cpp/__init__.py `_ + and `extension_cpp/csrc/muladd.cpp `_ + for more details: + +.. code-block:: cpp + + #include + + extern "C" { + /* Creates a dummy empty _C module that can be imported from Python. + The import from Python will load the .so consisting of this file + in this extension, so that the TORCH_LIBRARY static initializers + below are run. */ + PyObject* PyInit__C(void) + { + static struct PyModuleDef module_def = { + PyModuleDef_HEAD_INIT, + "_C", /* name of module */ + NULL, /* module documentation, may be NULL */ + -1, /* size of per-interpreter state of the module, + or -1 if the module keeps state in global variables. */ + NULL, /* methods */ + }; + return PyModule_Create(&module_def); + } + } + +.. code-block:: python + + # in, say, extension/__init__.py + from . import _C + +3. If you want to avoid ``Python.h`` entirely in your C++ custom operator, you may + use ``torch.ops.load_library("/path/to/library.so")`` in Python to load the ``.so`` + file(s) compiled from the extension. Note that, with this method, there is no ``_C`` + Python module created for the extension so you cannot call ``import _C`` from Python. + Instead of relying on the import statement to trigger the custom operators to be + registered, ``torch.ops.load_library("/path/to/library.so")`` will do the trick. + The challenge then is shifted towards understanding where the ``.so`` files are + located so that you can load them, which is not always trivial: + +.. code-block:: python + + import torch + from pathlib import Path + + so_files = list(Path(__file__).parent.glob("_C*.so")) + assert ( + len(so_files) == 1 + ), f"Expected one _C*.so file, found {len(so_files)}" + torch.ops.load_library(so_files[0]) -1. If you're following this tutorial, importing the Python C extension module - we created will load the C++ custom operator definitions. -2. If your C++ custom operator is located in a shared library object, you can - also use ``torch.ops.load_library("/path/to/library.so")`` to load it. + from . import ops Adding training (autograd) support for an operator @@ -417,4 +579,4 @@ Conclusion In this tutorial, we went over the recommended approach to integrating Custom C++ and CUDA operators with PyTorch. The ``TORCH_LIBRARY/torch.library`` APIs are fairly low-level. For more information about how to use the API, see -`The Custom Operators Manual `_. +`The Custom Operators Manual `_. diff --git a/advanced_source/cpp_custom_ops_sycl.rst b/advanced_source/cpp_custom_ops_sycl.rst new file mode 100644 index 00000000000..3b3ad069b58 --- /dev/null +++ b/advanced_source/cpp_custom_ops_sycl.rst @@ -0,0 +1,274 @@ +.. _cpp-custom-ops-tutorial-sycl: + +Custom SYCL Operators +===================== + +.. grid:: 2 + + .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How to integrate custom operators written in SYCL with PyTorch + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch 2.8 or later + * Basic understanding of SYCL programming + +.. note:: + + ``SYCL`` serves as the backend programming language for Intel GPUs (device label ``xpu``). For configuration details, see: + `Getting Started on Intel GPUs `_. The Intel Compiler, which comes bundled with Intel Deep Learning Essentials, handles ``SYCL`` compilation. Ensure you install and activate the compiler environment prior to executing the code examples in this tutorial. + +PyTorch offers a large library of operators that work on Tensors (e.g. torch.add, torch.sum, etc). +However, you may wish to bring a new custom operator to PyTorch. This tutorial demonstrates the +best path to authoring a custom operator written in SYCL. Tutorials for C++ and CUDA operators are available in the :ref:`cpp-custom-ops-tutorial`. + +Follow the structure to create a custom SYCL operator: + +.. code-block:: text + + sycl_example/ + ├── setup.py + ├── sycl_extension + │ ├── __init__.py + │ ├── muladd.sycl + │ └── ops.py + └── test_sycl_extension.py + +Setting up the Build System +--------------------------- + +If you need to compile **SYCL** code (for example, ``.sycl`` files), use `torch.utils.cpp_extension.SyclExtension `_. +The setup process is very similar to C++/CUDA, except the compilation arguments need to be adjusted for SYCL. + +Using ``sycl_extension`` is as straightforward as writing the following ``setup.py``: + +.. code-block:: python + + import os + import torch + import glob + from setuptools import find_packages, setup + from torch.utils.cpp_extension import SyclExtension, BuildExtension + + library_name = "sycl_extension" + py_limited_api = True + extra_compile_args = { + "cxx": ["-O3", + "-fdiagnostics-color=always", + "-DPy_LIMITED_API=0x03090000"], + "sycl": ["-O3" ] + } + + assert(torch.xpu.is_available()), "XPU is not available, please check your environment" + # Source files collection + this_dir = os.path.dirname(os.path.curdir) + extensions_dir = os.path.join(this_dir, library_name) + sources = list(glob.glob(os.path.join(extensions_dir, "*.sycl"))) + # Construct extension + ext_modules = [ + SyclExtension( + f"{library_name}._C", + sources, + extra_compile_args=extra_compile_args, + py_limited_api=py_limited_api, + ) + ] + setup( + name=library_name, + packages=find_packages(), + ext_modules=ext_modules, + install_requires=["torch"], + description="Simple Example of PyTorch Sycl extensions", + cmdclass={"build_ext": BuildExtension}, + options={"bdist_wheel": {"py_limited_api": "cp39"}} if py_limited_api else {}, + ) + + +Defining the custom op and adding backend implementations +--------------------------------------------------------- +First, let's write a SYCL function that computes ``mymuladd``: + +In order to use this from PyTorch’s Python frontend, we need to register it +as a PyTorch operator using the ``TORCH_LIBRARY`` API. This will automatically +bind the operator to Python. + + +If you also have a SYCL implementation of ``myaddmul``, you can also register it +in a separate ``TORCH_LIBRARY_IMPL`` block: + +.. 
code-block:: cpp + + #include + #include + #include + #include + #include + + namespace sycl_extension { + // MulAdd Kernel: result = a * b + c + static void muladd_kernel( + int numel, const float* a, const float* b, float c, float* result, + const sycl::nd_item<1>& item) { + int idx = item.get_global_id(0); + if (idx < numel) { + result[idx] = a[idx] * b[idx] + c; + } + } + + class MulAddKernelFunctor { + public: + MulAddKernelFunctor(int _numel, const float* _a, const float* _b, float _c, float* _result) + : numel(_numel), a(_a), b(_b), c(_c), result(_result) {} + void operator()(const sycl::nd_item<1>& item) const { + muladd_kernel(numel, a, b, c, result, item); + } + + private: + int numel; + const float* a; + const float* b; + float c; + float* result; + }; + + at::Tensor mymuladd_xpu(const at::Tensor& a, const at::Tensor& b, double c) { + TORCH_CHECK(a.sizes() == b.sizes(), "a and b must have the same shape"); + TORCH_CHECK(a.dtype() == at::kFloat, "a must be a float tensor"); + TORCH_CHECK(b.dtype() == at::kFloat, "b must be a float tensor"); + TORCH_CHECK(a.device().is_xpu(), "a must be an XPU tensor"); + TORCH_CHECK(b.device().is_xpu(), "b must be an XPU tensor"); + + at::Tensor a_contig = a.contiguous(); + at::Tensor b_contig = b.contiguous(); + at::Tensor result = at::empty_like(a_contig); + + const float* a_ptr = a_contig.data_ptr(); + const float* b_ptr = b_contig.data_ptr(); + float* res_ptr = result.data_ptr(); + int numel = a_contig.numel(); + + sycl::queue& queue = c10::xpu::getCurrentXPUStream().queue(); + constexpr int threads = 256; + int blocks = (numel + threads - 1) / threads; + + queue.submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl::nd_range<1>(blocks * threads, threads), + MulAddKernelFunctor(numel, a_ptr, b_ptr, static_cast(c), res_ptr) + ); + }); + + return result; + } + // Defines the operators + TORCH_LIBRARY(sycl_extension, m) { + m.def("mymuladd(Tensor a, Tensor b, float c) -> Tensor"); + } + + // ================================================== + // Register SYCL Implementations to Torch Library + // ================================================== + TORCH_LIBRARY_IMPL(sycl_extension, XPU, m) { + m.impl("mymuladd", &mymuladd_xpu); + } + + } // namespace sycl_extension + + + +Create a Python Interface +------------------------- + +Create a Python interface for our operator in the ``sycl_extension/ops.py`` file: + +.. code-block:: python + + import torch + from torch import Tensor + __all__ = ["mymuladd"] + + def mymuladd(a: Tensor, b: Tensor, c: float) -> Tensor: + """Performs a * b + c in an efficient fused kernel""" + return torch.ops.sycl_extension.mymuladd.default(a, b, c) + +Initialize Package +------------------ + +Create ``sycl_extension/__init__.py`` file to make the package importable: + +.. code-block:: python + + import ctypes + from pathlib import Path + + import torch + + current_dir = Path(__file__).parent.parent + build_dir = current_dir / "build" + so_files = list(build_dir.glob("**/*.so")) + + assert len(so_files) == 1, f"Expected one _C*.so file, found {len(so_files)}" + + with torch._ops.dl_open_guard(): + loaded_lib = ctypes.CDLL(so_files[0]) + + from . import ops + + __all__ = [ + "loaded_lib", + "ops", + ] + +Testing SYCL extension operator +------------------- + +Use simple test to verify that the operator works correctly. + +.. 
code-block:: python + + import torch + from torch.testing._internal.common_utils import TestCase + import unittest + import sycl_extension + + def reference_muladd(a, b, c): + return a * b + c + + class TestMyMulAdd(TestCase): + def sample_inputs(self, device, *, requires_grad=False): + def make_tensor(*size): + return torch.randn(size, device=device, requires_grad=requires_grad) + + def make_nondiff_tensor(*size): + return torch.randn(size, device=device, requires_grad=False) + + return [ + [make_tensor(3), make_tensor(3), 1], + [make_tensor(20), make_tensor(20), 3.14], + [make_tensor(20), make_nondiff_tensor(20), -123], + [make_nondiff_tensor(2, 3), make_tensor(2, 3), -0.3], + ] + + def _test_correctness(self, device): + samples = self.sample_inputs(device) + for args in samples: + result = sycl_extension.ops.mymuladd(*args) + expected = reference_muladd(*args) + torch.testing.assert_close(result, expected) + + @unittest.skipIf(not torch.xpu.is_available(), "requires Intel GPU") + def test_correctness_xpu(self): + self._test_correctness("xpu") + + if __name__ == "__main__": + unittest.main() + +This test checks the correctness of the custom operator by comparing its output against a reference implementation. + +Conclusion +---------- + +In this tutorial, we demonstrated how to implement and compile custom SYCL operators for PyTorch. We specifically showcased an inference operation ``muladd``. For adding backward support or enabling torch.compile compatibility, please refer to :ref:`cpp-custom-ops-tutorial`. diff --git a/advanced_source/cpp_export.rst b/advanced_source/cpp_export.rst index 45556a5320f..56c4bcbaae7 100644 --- a/advanced_source/cpp_export.rst +++ b/advanced_source/cpp_export.rst @@ -1,387 +1,3 @@ -Loading a TorchScript Model in C++ -===================================== - -As its name suggests, the primary interface to PyTorch is the Python -programming language. While Python is a suitable and preferred language for -many scenarios requiring dynamism and ease of iteration, there are equally many -situations where precisely these properties of Python are unfavorable. One -environment in which the latter often applies is *production* -- the land of -low latencies and strict deployment requirements. For production scenarios, C++ -is very often the language of choice, even if only to bind it into another -language like Java, Rust or Go. The following paragraphs will outline the path -PyTorch provides to go from an existing Python model to a serialized -representation that can be *loaded* and *executed* purely from C++, with no -dependency on Python. - -Step 1: Converting Your PyTorch Model to Torch Script ------------------------------------------------------ - -A PyTorch model's journey from Python to C++ is enabled by `Torch Script -`_, a representation of a PyTorch -model that can be understood, compiled and serialized by the Torch Script -compiler. If you are starting out from an existing PyTorch model written in the -vanilla "eager" API, you must first convert your model to Torch Script. In the -most common cases, discussed below, this requires only little effort. If you -already have a Torch Script module, you can skip to the next section of this -tutorial. - -There exist two ways of converting a PyTorch model to Torch Script. The first -is known as *tracing*, a mechanism in which the structure of the model is -captured by evaluating it once using example inputs, and recording the flow of -those inputs through the model. 
This is suitable for models that make limited -use of control flow. The second approach is to add explicit annotations to your -model that inform the Torch Script compiler that it may directly parse and -compile your model code, subject to the constraints imposed by the Torch Script -language. - -.. tip:: - - You can find the complete documentation for both of these methods, as well as - further guidance on which to use, in the official `Torch Script - reference `_. - -Converting to Torch Script via Tracing -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To convert a PyTorch model to Torch Script via tracing, you must pass an -instance of your model along with an example input to the ``torch.jit.trace`` -function. This will produce a ``torch.jit.ScriptModule`` object with the trace -of your model evaluation embedded in the module's ``forward`` method:: - - import torch - import torchvision - - # An instance of your model. - model = torchvision.models.resnet18() - - # An example input you would normally provide to your model's forward() method. - example = torch.rand(1, 3, 224, 224) - - # Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing. - traced_script_module = torch.jit.trace(model, example) - -The traced ``ScriptModule`` can now be evaluated identically to a regular -PyTorch module:: - - In[1]: output = traced_script_module(torch.ones(1, 3, 224, 224)) - In[2]: output[0, :5] - Out[2]: tensor([-0.2698, -0.0381, 0.4023, -0.3010, -0.0448], grad_fn=) - -Converting to Torch Script via Annotation -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Under certain circumstances, such as if your model employs particular forms of -control flow, you may want to write your model in Torch Script directly and -annotate your model accordingly. For example, say you have the following -vanilla Pytorch model:: - - import torch - - class MyModule(torch.nn.Module): - def __init__(self, N, M): - super(MyModule, self).__init__() - self.weight = torch.nn.Parameter(torch.rand(N, M)) - - def forward(self, input): - if input.sum() > 0: - output = self.weight.mv(input) - else: - output = self.weight + input - return output - - -Because the ``forward`` method of this module uses control flow that is -dependent on the input, it is not suitable for tracing. Instead, we can convert -it to a ``ScriptModule``. -In order to convert the module to the ``ScriptModule``, one needs to -compile the module with ``torch.jit.script`` as follows:: - - class MyModule(torch.nn.Module): - def __init__(self, N, M): - super(MyModule, self).__init__() - self.weight = torch.nn.Parameter(torch.rand(N, M)) - - def forward(self, input): - if input.sum() > 0: - output = self.weight.mv(input) - else: - output = self.weight + input - return output - - my_module = MyModule(10,20) - sm = torch.jit.script(my_module) - -If you need to exclude some methods in your ``nn.Module`` -because they use Python features that TorchScript doesn't support yet, -you could annotate those with ``@torch.jit.ignore`` - -``sm`` is an instance of -``ScriptModule`` that is ready for serialization. - -Step 2: Serializing Your Script Module to a File -------------------------------------------------- - -Once you have a ``ScriptModule`` in your hands, either from tracing or -annotating a PyTorch model, you are ready to serialize it to a file. Later on, -you'll be able to load the module from this file in C++ and execute it without -any dependency on Python. Say we want to serialize the ``ResNet18`` model shown -earlier in the tracing example. 
To perform this serialization, simply call -`save `_ -on the module and pass it a filename:: - - traced_script_module.save("traced_resnet_model.pt") - -This will produce a ``traced_resnet_model.pt`` file in your working directory. -If you also would like to serialize ``sm``, call ``sm.save("my_module_model.pt")`` -We have now officially left the realm of Python and are ready to cross over to the sphere -of C++. - -Step 3: Loading Your Script Module in C++ ------------------------------------------- - -To load your serialized PyTorch model in C++, your application must depend on -the PyTorch C++ API -- also known as *LibTorch*. The LibTorch distribution -encompasses a collection of shared libraries, header files and CMake build -configuration files. While CMake is not a requirement for depending on -LibTorch, it is the recommended approach and will be well supported into the -future. For this tutorial, we will be building a minimal C++ application using -CMake and LibTorch that simply loads and executes a serialized PyTorch model. - -A Minimal C++ Application -^^^^^^^^^^^^^^^^^^^^^^^^^ - -Let's begin by discussing the code to load a module. The following will already -do: - -.. code-block:: cpp - - #include // One-stop header. - - #include - #include - - int main(int argc, const char* argv[]) { - if (argc != 2) { - std::cerr << "usage: example-app \n"; - return -1; - } - - - torch::jit::script::Module module; - try { - // Deserialize the ScriptModule from a file using torch::jit::load(). - module = torch::jit::load(argv[1]); - } - catch (const c10::Error& e) { - std::cerr << "error loading the model\n"; - return -1; - } - - std::cout << "ok\n"; - } - - -The ```` header encompasses all relevant includes from the -LibTorch library necessary to run the example. Our application accepts the file -path to a serialized PyTorch ``ScriptModule`` as its only command line argument -and then proceeds to deserialize the module using the ``torch::jit::load()`` -function, which takes this file path as input. In return we receive a ``torch::jit::script::Module`` -object. We will examine how to execute it in a moment. - -Depending on LibTorch and Building the Application -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Assume we stored the above code into a file called ``example-app.cpp``. A -minimal ``CMakeLists.txt`` to build it could look as simple as: - -.. code-block:: cmake - - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) - project(custom_ops) - - find_package(Torch REQUIRED) - - add_executable(example-app example-app.cpp) - target_link_libraries(example-app "${TORCH_LIBRARIES}") - set_property(TARGET example-app PROPERTY CXX_STANDARD 17) - -The last thing we need to build the example application is the LibTorch -distribution. You can always grab the latest stable release from the `download -page `_ on the PyTorch website. If you download and unzip -the latest archive, you should receive a folder with the following directory -structure: - -.. code-block:: sh - - libtorch/ - bin/ - include/ - lib/ - share/ - -- The ``lib/`` folder contains the shared libraries you must link against, -- The ``include/`` folder contains header files your program will need to include, -- The ``share/`` folder contains the necessary CMake configuration to enable the simple ``find_package(Torch)`` command above. - -.. tip:: - On Windows, debug and release builds are not ABI-compatible. If you plan to - build your project in debug mode, please try the debug version of LibTorch. 
- Also, make sure you specify the correct configuration in the ``cmake --build .`` - line below. - -The last step is building the application. For this, assume our example -directory is laid out like this: - -.. code-block:: sh - - example-app/ - CMakeLists.txt - example-app.cpp - -We can now run the following commands to build the application from within the -``example-app/`` folder: - -.. code-block:: sh - - mkdir build - cd build - cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. - cmake --build . --config Release - -where ``/path/to/libtorch`` should be the full path to the unzipped LibTorch -distribution. If all goes well, it will look something like this: - -.. code-block:: sh - - root@4b5a67132e81:/example-app# mkdir build - root@4b5a67132e81:/example-app# cd build - root@4b5a67132e81:/example-app/build# cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. - -- The C compiler identification is GNU 5.4.0 - -- The CXX compiler identification is GNU 5.4.0 - -- Check for working C compiler: /usr/bin/cc - -- Check for working C compiler: /usr/bin/cc -- works - -- Detecting C compiler ABI info - -- Detecting C compiler ABI info - done - -- Detecting C compile features - -- Detecting C compile features - done - -- Check for working CXX compiler: /usr/bin/c++ - -- Check for working CXX compiler: /usr/bin/c++ -- works - -- Detecting CXX compiler ABI info - -- Detecting CXX compiler ABI info - done - -- Detecting CXX compile features - -- Detecting CXX compile features - done - -- Looking for pthread.h - -- Looking for pthread.h - found - -- Looking for pthread_create - -- Looking for pthread_create - not found - -- Looking for pthread_create in pthreads - -- Looking for pthread_create in pthreads - not found - -- Looking for pthread_create in pthread - -- Looking for pthread_create in pthread - found - -- Found Threads: TRUE - -- Configuring done - -- Generating done - -- Build files have been written to: /example-app/build - root@4b5a67132e81:/example-app/build# make - Scanning dependencies of target example-app - [ 50%] Building CXX object CMakeFiles/example-app.dir/example-app.cpp.o - [100%] Linking CXX executable example-app - [100%] Built target example-app - -If we supply the path to the traced ``ResNet18`` model ``traced_resnet_model.pt`` we created earlier -to the resulting ``example-app`` binary, we should be rewarded with a friendly -"ok". Please note, if try to run this example with ``my_module_model.pt`` you will get an error saying that -your input is of an incompatible shape. ``my_module_model.pt`` expects 1D instead of 4D. - -.. code-block:: sh - - root@4b5a67132e81:/example-app/build# ./example-app /traced_resnet_model.pt - ok - -Step 4: Executing the Script Module in C++ ------------------------------------------- - -Having successfully loaded our serialized ``ResNet18`` in C++, we are now just a -couple lines of code away from executing it! Let's add those lines to our C++ -application's ``main()`` function: - -.. code-block:: cpp - - // Create a vector of inputs. - std::vector inputs; - inputs.push_back(torch::ones({1, 3, 224, 224})); - - // Execute the model and turn its output into a tensor. - at::Tensor output = module.forward(inputs).toTensor(); - std::cout << output.slice(/*dim=*/1, /*start=*/0, /*end=*/5) << '\n'; - -The first two lines set up the inputs to our model. We create a vector of -``torch::jit::IValue`` (a type-erased value type ``script::Module`` methods -accept and return) and add a single input. 
To create the input tensor, we use -``torch::ones()``, the equivalent to ``torch.ones`` in the C++ API. We then -run the ``script::Module``'s ``forward`` method, passing it the input vector we -created. In return we get a new ``IValue``, which we convert to a tensor by -calling ``toTensor()``. - -.. tip:: - - To learn more about functions like ``torch::ones`` and the PyTorch C++ API in - general, refer to its documentation at https://pytorch.org/cppdocs. The - PyTorch C++ API provides near feature parity with the Python API, allowing - you to further manipulate and process tensors just like in Python. - -In the last line, we print the first five entries of the output. Since we -supplied the same input to our model in Python earlier in this tutorial, we -should ideally see the same output. Let's try it out by re-compiling our -application and running it with the same serialized model: - -.. code-block:: sh - - root@4b5a67132e81:/example-app/build# make - Scanning dependencies of target example-app - [ 50%] Building CXX object CMakeFiles/example-app.dir/example-app.cpp.o - [100%] Linking CXX executable example-app - [100%] Built target example-app - root@4b5a67132e81:/example-app/build# ./example-app traced_resnet_model.pt - -0.2698 -0.0381 0.4023 -0.3010 -0.0448 - [ Variable[CPUFloatType]{1,5} ] - - -For reference, the output in Python previously was:: - - tensor([-0.2698, -0.0381, 0.4023, -0.3010, -0.0448], grad_fn=) - -Looks like a good match! - -.. tip:: - - To move your model to GPU memory, you can write ``model.to(at::kCUDA);``. - Make sure the inputs to a model are also living in CUDA memory - by calling ``tensor.to(at::kCUDA)``, which will return a new tensor in CUDA - memory. - -Step 5: Getting Help and Exploring the API ------------------------------------------- - -This tutorial has hopefully equipped you with a general understanding of a -PyTorch model's path from Python to C++. With the concepts described in this -tutorial, you should be able to go from a vanilla, "eager" PyTorch model, to a -compiled ``ScriptModule`` in Python, to a serialized file on disk and -- to -close the loop -- to an executable ``script::Module`` in C++. - -Of course, there are many concepts we did not cover. For example, you may find -yourself wanting to extend your ``ScriptModule`` with a custom operator -implemented in C++ or CUDA, and executing this custom operator inside your -``ScriptModule`` loaded in your pure C++ production environment. The good news -is: this is possible, and well supported! For now, you can explore `this -`_ folder -for examples, and we will follow up with a tutorial shortly. In the time being, -the following links may be generally helpful: - -- The Torch Script reference: https://pytorch.org/docs/master/jit.html -- The PyTorch C++ API documentation: https://pytorch.org/cppdocs/ -- The PyTorch Python API documentation: https://pytorch.org/docs/ - -As always, if you run into any problems or have questions, you can use our -`forum `_ or `GitHub issues -`_ to get in touch. +.. warning:: + TorchScript is deprecated, please use + `torch.export `__ instead. 
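For readers migrating from the workflow described above, a minimal sketch of the suggested ``torch.export`` path might look like the following (an illustrative example, not part of the original tutorial; it assumes ``torchvision`` is installed):

.. code-block:: python

    import torch
    import torchvision

    model = torchvision.models.resnet18().eval()
    example_inputs = (torch.rand(1, 3, 224, 224),)

    # Capture the model as an ExportedProgram and serialize it.
    exported = torch.export.export(model, example_inputs)
    torch.export.save(exported, "resnet18.pt2")

    # The archive can later be reloaded and executed without the original class definition.
    reloaded = torch.export.load("resnet18.pt2")
    output = reloaded.module()(*example_inputs)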
\ No newline at end of file diff --git a/advanced_source/cpp_extension.rst b/advanced_source/cpp_extension.rst deleted file mode 100644 index cb0e990797e..00000000000 --- a/advanced_source/cpp_extension.rst +++ /dev/null @@ -1,1205 +0,0 @@ -Custom C++ and CUDA Extensions -============================== -**Author**: `Peter Goldsborough `_ - - -PyTorch provides a plethora of operations related to neural networks, arbitrary -tensor algebra, data wrangling and other purposes. However, you may still find -yourself in need of a more customized operation. For example, you might want to -use a novel activation function you found in a paper, or implement an operation -you developed as part of your research. - -The easiest way of integrating such a custom operation in PyTorch is to write it -in Python by extending :class:`Function` and :class:`Module` as outlined `here -`_. This gives you the full -power of automatic differentiation (spares you from writing derivative -functions) as well as the usual expressiveness of Python. However, there may be -times when your operation is better implemented in C++. For example, your code -may need to be *really* fast because it is called very frequently in your model -or is very expensive even for few calls. Another plausible reason is that it -depends on or interacts with other C or C++ libraries. To address such cases, -PyTorch provides a very easy way of writing custom *C++ extensions*. - -C++ extensions are a mechanism we have developed to allow users (you) to create -PyTorch operators defined *out-of-source*, i.e. separate from the PyTorch -backend. This approach is *different* from the way native PyTorch operations are -implemented. C++ extensions are intended to spare you much of the boilerplate -associated with integrating an operation with PyTorch's backend while providing -you with a high degree of flexibility for your PyTorch-based projects. -Nevertheless, once you have defined your operation as a C++ extension, turning -it into a native PyTorch function is largely a matter of code organization, -which you can tackle after the fact if you decide to contribute your operation -upstream. - -Motivation and Example ----------------------- - -The rest of this note will walk through a practical example of writing and using -a C++ (and CUDA) extension. If you are being chased or someone will fire you if -you don't get that op done by the end of the day, you can skip this section and -head straight to the implementation details in the next section. - -Let's say you've come up with a new kind of recurrent unit that you found to -have superior properties compared to the state of the art. This recurrent unit -is similar to an LSTM, but differs in that it lacks a *forget gate* and uses an -*Exponential Linear Unit* (ELU) as its internal activation function. Because -this unit never forgets, we'll call it *LLTM*, or *Long-Long-Term-Memory* unit. - -The two ways in which LLTMs differ from vanilla LSTMs are significant enough -that we can't configure PyTorch's ``LSTMCell`` for our purposes, so we'll have to -create a custom cell. The first and easiest approach for this -- and likely in -all cases a good first step -- is to implement our desired functionality in -plain PyTorch with Python. For this, we need to subclass -:class:`torch.nn.Module` and implement the forward pass of the LLTM. 
This would -look something like this:: - - class LLTM(torch.nn.Module): - def __init__(self, input_features, state_size): - super(LLTM, self).__init__() - self.input_features = input_features - self.state_size = state_size - # 3 * state_size for input gate, output gate and candidate cell gate. - # input_features + state_size because we will multiply with [input, h]. - self.weights = torch.nn.Parameter( - torch.empty(3 * state_size, input_features + state_size)) - self.bias = torch.nn.Parameter(torch.empty(3 * state_size)) - self.reset_parameters() - - def reset_parameters(self): - stdv = 1.0 / math.sqrt(self.state_size) - for weight in self.parameters(): - weight.data.uniform_(-stdv, +stdv) - - def forward(self, input, state): - old_h, old_cell = state - X = torch.cat([old_h, input], dim=1) - - # Compute the input, output and candidate cell gates with one MM. - gate_weights = F.linear(X, self.weights, self.bias) - # Split the combined gate weight matrix into its components. - gates = gate_weights.chunk(3, dim=1) - - input_gate = torch.sigmoid(gates[0]) - output_gate = torch.sigmoid(gates[1]) - # Here we use an ELU instead of the usual tanh. - candidate_cell = F.elu(gates[2]) - - # Compute the new cell state. - new_cell = old_cell + candidate_cell * input_gate - # Compute the new hidden state and output. - new_h = torch.tanh(new_cell) * output_gate - - return new_h, new_cell - -which we could then use as expected:: - - import torch - - X = torch.randn(batch_size, input_features) - h = torch.randn(batch_size, state_size) - C = torch.randn(batch_size, state_size) - - rnn = LLTM(input_features, state_size) - - new_h, new_C = rnn(X, (h, C)) - -Naturally, if at all possible and plausible, you should use this approach to -extend PyTorch. Since PyTorch has highly optimized implementations of its -operations for CPU *and* GPU, powered by libraries such as `NVIDIA cuDNN -`_, `Intel MKL -`_ or `NNPACK -`_, PyTorch code like above will often be -fast enough. However, we can also see why, under certain circumstances, there is -room for further performance improvements. The most obvious reason is that -PyTorch has no knowledge of the *algorithm* you are implementing. It knows only -of the individual operations you use to compose your algorithm. As such, PyTorch -must execute your operations individually, one after the other. Since each -individual call to the implementation (or *kernel*) of an operation, which may -involve the launch of a CUDA kernel, has a certain amount of overhead, this -overhead may become significant across many function calls. Furthermore, the -Python interpreter that is running our code can itself slow down our program. - -A definite method of speeding things up is therefore to rewrite parts in C++ (or -CUDA) and *fuse* particular groups of operations. Fusing means combining the -implementations of many functions into a single function, which profits from -fewer kernel launches as well as other optimizations we can perform with -increased visibility of the global flow of data. - -Let's see how we can use C++ extensions to implement a *fused* version of the -LLTM. We'll begin by writing it in plain C++, using the `ATen -`_ library that powers much of PyTorch's -backend, and see how easily it lets us translate our Python code. We'll then -speed things up even more by moving parts of the model to CUDA kernel to benefit -from the massive parallelism GPUs provide. 
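Before diving in, one way to make the per-operator dispatch overhead described earlier concrete is to profile a single forward pass of the pure-Python LLTM and count how many individual operator calls it issues. The following is only a sketch: it assumes the ``LLTM`` module defined above and picks arbitrary sizes.

.. code-block:: python

    import torch
    from torch.profiler import profile, ProfilerActivity

    batch_size, input_features, state_size = 16, 32, 128
    X = torch.randn(batch_size, input_features)
    h = torch.randn(batch_size, state_size)
    C = torch.randn(batch_size, state_size)

    rnn = LLTM(input_features, state_size)  # the pure-Python module from above

    with profile(activities=[ProfilerActivity.CPU]) as prof:
        new_h, new_C = rnn(X, (h, C))

    # Each row in the table corresponds to a separate operator invocation.
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))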
- -Writing a C++ Extension ------------------------ - -C++ extensions come in two flavors: They can be built "ahead of time" with -:mod:`setuptools`, or "just in time" via -:func:`torch.utils.cpp_extension.load`. We'll begin with the first approach and -discuss the latter later. - -Building with :mod:`setuptools` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -For the "ahead of time" flavor, we build our C++ extension by writing a -``setup.py`` script that uses setuptools to compile our C++ code. For the LLTM, it -looks as simple as this:: - - from setuptools import setup, Extension - from torch.utils import cpp_extension - - setup(name='lltm_cpp', - ext_modules=[cpp_extension.CppExtension('lltm_cpp', ['lltm.cpp'])], - cmdclass={'build_ext': cpp_extension.BuildExtension}) - -In this code, :class:`CppExtension` is a convenience wrapper around -:class:`setuptools.Extension` that passes the correct include paths and sets -the language of the extension to C++. The equivalent vanilla :mod:`setuptools` -code would simply be:: - - Extension( - name='lltm_cpp', - sources=['lltm.cpp'], - include_dirs=cpp_extension.include_paths(), - language='c++') - -:class:`BuildExtension` performs a number of required configuration steps and -checks and also manages mixed compilation in the case of mixed C++/CUDA -extensions. And that's all we really need to know about building C++ extensions -for now! Let's now take a look at the implementation of our C++ extension, -which goes into ``lltm.cpp``. - -Writing the C++ Op -^^^^^^^^^^^^^^^^^^ - -Let's start implementing the LLTM in C++! One function we'll need for the -backward pass is the derivative of the sigmoid. This is a small enough piece of -code to discuss the overall environment that is available to us when writing C++ -extensions: - -.. code-block:: cpp - - #include - - #include - - torch::Tensor d_sigmoid(torch::Tensor z) { - auto s = torch::sigmoid(z); - return (1 - s) * s; - } - -```` is the one-stop header to include all the necessary PyTorch -bits to write C++ extensions. It includes: - -- The ATen library, which is our primary API for tensor computation, -- `pybind11 `_, which is how we create Python bindings for our C++ code, -- Headers that manage the details of interaction between ATen and pybind11. - -The implementation of :func:`d_sigmoid` shows how to use the ATen API. -PyTorch's tensor and variable interface is generated automatically from the -ATen library, so we can more or less translate our Python implementation 1:1 -into C++. Our primary datatype for all computations will be -:class:`torch::Tensor`. Its full API can be inspected `here -`_. Notice -also that we can include ```` or *any other C or C++ header* -- we have -the full power of C++11 at our disposal. - -Note that CUDA-11.5 nvcc will hit internal compiler error while parsing torch/extension.h on Windows. -To workaround the issue, move python binding logic to pure C++ file. -Example use: - -.. code-block:: cpp - - #include - at::Tensor SigmoidAlphaBlendForwardCuda(....) - -Instead of: - -.. code-block:: cpp - - #include - torch::Tensor SigmoidAlphaBlendForwardCuda(...) - -Currently open issue for nvcc bug `here -`_. -Complete workaround code example `here -`_. - -Forward Pass -************ - -Next we can port our entire forward pass to C++: - -.. 
code-block:: cpp - - #include - - std::vector lltm_forward( - torch::Tensor input, - torch::Tensor weights, - torch::Tensor bias, - torch::Tensor old_h, - torch::Tensor old_cell) { - auto X = torch::cat({old_h, input}, /*dim=*/1); - - auto gate_weights = torch::addmm(bias, X, weights.transpose(0, 1)); - auto gates = gate_weights.chunk(3, /*dim=*/1); - - auto input_gate = torch::sigmoid(gates[0]); - auto output_gate = torch::sigmoid(gates[1]); - auto candidate_cell = torch::elu(gates[2], /*alpha=*/1.0); - - auto new_cell = old_cell + candidate_cell * input_gate; - auto new_h = torch::tanh(new_cell) * output_gate; - - return {new_h, - new_cell, - input_gate, - output_gate, - candidate_cell, - X, - gate_weights}; - } - -Backward Pass -************* - -The C++ extension API currently does not provide a way of automatically -generating a backwards function for us. As such, we have to also implement the -backward pass of our LLTM, which computes the derivative of the loss with -respect to each input of the forward pass. Ultimately, we will plop both the -forward and backward function into a :class:`torch.autograd.Function` to create -a nice Python binding. The backward function is slightly more involved, so -we'll not dig deeper into the code (if you are interested, `Alex Graves' thesis -`_ is a good read for more -information on this): - -.. code-block:: cpp - - // tanh'(z) = 1 - tanh^2(z) - torch::Tensor d_tanh(torch::Tensor z) { - return 1 - z.tanh().pow(2); - } - - // elu'(z) = relu'(z) + { alpha * exp(z) if (alpha * (exp(z) - 1)) < 0, else 0} - torch::Tensor d_elu(torch::Tensor z, torch::Scalar alpha = 1.0) { - auto e = z.exp(); - auto mask = (alpha * (e - 1)) < 0; - return (z > 0).type_as(z) + mask.type_as(z) * (alpha * e); - } - - std::vector lltm_backward( - torch::Tensor grad_h, - torch::Tensor grad_cell, - torch::Tensor new_cell, - torch::Tensor input_gate, - torch::Tensor output_gate, - torch::Tensor candidate_cell, - torch::Tensor X, - torch::Tensor gate_weights, - torch::Tensor weights) { - auto d_output_gate = torch::tanh(new_cell) * grad_h; - auto d_tanh_new_cell = output_gate * grad_h; - auto d_new_cell = d_tanh(new_cell) * d_tanh_new_cell + grad_cell; - - auto d_old_cell = d_new_cell; - auto d_candidate_cell = input_gate * d_new_cell; - auto d_input_gate = candidate_cell * d_new_cell; - - auto gates = gate_weights.chunk(3, /*dim=*/1); - d_input_gate *= d_sigmoid(gates[0]); - d_output_gate *= d_sigmoid(gates[1]); - d_candidate_cell *= d_elu(gates[2]); - - auto d_gates = - torch::cat({d_input_gate, d_output_gate, d_candidate_cell}, /*dim=*/1); - - auto d_weights = d_gates.t().mm(X); - auto d_bias = d_gates.sum(/*dim=*/0, /*keepdim=*/true); - - auto d_X = d_gates.mm(weights); - const auto state_size = grad_h.size(1); - auto d_old_h = d_X.slice(/*dim=*/1, 0, state_size); - auto d_input = d_X.slice(/*dim=*/1, state_size); - - return {d_old_h, d_input, d_weights, d_bias, d_old_cell}; - } - -Binding to Python -^^^^^^^^^^^^^^^^^ - -Once you have your operation written in C++ and ATen, you can use pybind11 to -bind your C++ functions or classes into Python in a very simple manner. -Questions or issues you have about this part of PyTorch C++ extensions will -largely be addressed by `pybind11 documentation -`_. - -For our extensions, the necessary binding code spans only four lines: - -.. 
code-block:: cpp - - PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward", &lltm_forward, "LLTM forward"); - m.def("backward", &lltm_backward, "LLTM backward"); - } - -One bit to note here is the macro ``TORCH_EXTENSION_NAME``. The torch extension -build will define it as the name you give your extension in the ``setup.py`` -script. In this case, the value of ``TORCH_EXTENSION_NAME`` would be "lltm_cpp". -This is to avoid having to maintain the name of the extension in two places -(the build script and your C++ code), as a mismatch between the two can lead to -nasty and hard to track issues. - -Using Your Extension -^^^^^^^^^^^^^^^^^^^^ - -We are now set to import our extension in PyTorch. At this point, your directory -structure could look something like this:: - - pytorch/ - lltm-extension/ - lltm.cpp - setup.py - -Now, run ``python setup.py install`` to build and install your extension. This -should look something like this:: - - running install - running bdist_egg - running egg_info - creating lltm_cpp.egg-info - writing lltm_cpp.egg-info/PKG-INFO - writing dependency_links to lltm_cpp.egg-info/dependency_links.txt - writing top-level names to lltm_cpp.egg-info/top_level.txt - writing manifest file 'lltm_cpp.egg-info/SOURCES.txt' - reading manifest file 'lltm_cpp.egg-info/SOURCES.txt' - writing manifest file 'lltm_cpp.egg-info/SOURCES.txt' - installing library code to build/bdist.linux-x86_64/egg - running install_lib - running build_ext - building 'lltm_cpp' extension - creating build - creating build/temp.linux-x86_64-3.7 - gcc -pthread -B ~/local/miniconda/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I~/local/miniconda/lib/python3.7/site-packages/torch/include -I~/local/miniconda/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I~/local/miniconda/lib/python3.7/site-packages/torch/include/TH -I~/local/miniconda/lib/python3.7/site-packages/torch/include/THC -I~/local/miniconda/include/python3.7m -c lltm.cpp -o build/temp.linux-x86_64-3.7/lltm.o -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=lltm_cpp -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++11 - cc1plus: warning: command line option ‘-Wstrict-prototypes’ is valid for C/ObjC but not for C++ - creating build/lib.linux-x86_64-3.7 - g++ -pthread -shared -B ~/local/miniconda/compiler_compat -L~/local/miniconda/lib -Wl,-rpath=~/local/miniconda/lib -Wl,--no-as-needed -Wl,--sysroot=/ build/temp.linux-x86_64-3.7/lltm.o -o build/lib.linux-x86_64-3.7/lltm_cpp.cpython-37m-x86_64-linux-gnu.so - creating build/bdist.linux-x86_64 - creating build/bdist.linux-x86_64/egg - copying build/lib.linux-x86_64-3.7/lltm_cpp.cpython-37m-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/egg - creating stub loader for lltm_cpp.cpython-37m-x86_64-linux-gnu.so - byte-compiling build/bdist.linux-x86_64/egg/lltm_cpp.py to lltm_cpp.cpython-37.pyc - creating build/bdist.linux-x86_64/egg/EGG-INFO - copying lltm_cpp.egg-info/PKG-INFO -> build/bdist.linux-x86_64/egg/EGG-INFO - copying lltm_cpp.egg-info/SOURCES.txt -> build/bdist.linux-x86_64/egg/EGG-INFO - copying lltm_cpp.egg-info/dependency_links.txt -> build/bdist.linux-x86_64/egg/EGG-INFO - copying lltm_cpp.egg-info/top_level.txt -> build/bdist.linux-x86_64/egg/EGG-INFO - writing build/bdist.linux-x86_64/egg/EGG-INFO/native_libs.txt - zip_safe flag not set; analyzing archive contents... 
- __pycache__.lltm_cpp.cpython-37: module references __file__ - creating 'dist/lltm_cpp-0.0.0-py3.7-linux-x86_64.egg' and adding 'build/bdist.linux-x86_64/egg' to it - removing 'build/bdist.linux-x86_64/egg' (and everything under it) - Processing lltm_cpp-0.0.0-py3.7-linux-x86_64.egg - removing '~/local/miniconda/lib/python3.7/site-packages/lltm_cpp-0.0.0-py3.7-linux-x86_64.egg' (and everything under it) - creating ~/local/miniconda/lib/python3.7/site-packages/lltm_cpp-0.0.0-py3.7-linux-x86_64.egg - Extracting lltm_cpp-0.0.0-py3.7-linux-x86_64.egg to ~/local/miniconda/lib/python3.7/site-packages - lltm-cpp 0.0.0 is already the active version in easy-install.pth - - Installed ~/local/miniconda/lib/python3.7/site-packages/lltm_cpp-0.0.0-py3.7-linux-x86_64.egg - Processing dependencies for lltm-cpp==0.0.0 - Finished processing dependencies for lltm-cpp==0.0.0 - - -A small note on compilers: Due to ABI versioning issues, the compiler you use to -build your C++ extension must be *ABI-compatible* with the compiler PyTorch was -built with. In practice, this means that you must use GCC version 4.9 and above on Linux. -For Ubuntu 16.04 and other more-recent Linux distributions, this should be the -default compiler already. On MacOS, you must use clang (which does not have any ABI versioning issues). In the worst -case, you can build PyTorch from source with your compiler and then build the -extension with that same compiler. - -Once your extension is built, you can simply import it in Python, using the -name you specified in your ``setup.py`` script. Just be sure to ``import -torch`` first, as this will resolve some symbols that the dynamic linker must -see:: - - In [1]: import torch - In [2]: import lltm_cpp - In [3]: lltm_cpp.forward - Out[3]: - -If we call ``help()`` on the function or module, we can see that its signature -matches our C++ code:: - - In[4] help(lltm_cpp.forward) - forward(...) method of builtins.PyCapsule instance - forward(arg0: torch::Tensor, arg1: torch::Tensor, arg2: torch::Tensor, arg3: torch::Tensor, arg4: torch::Tensor) -> List[torch::Tensor] - - LLTM forward - -Since we are now able to call our C++ functions from Python, we can wrap them -with :class:`torch.autograd.Function` and :class:`torch.nn.Module` to make them first -class citizens of PyTorch:: - - import math - import torch - - # Our module! 
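    # The compiled extension only exposes the raw forward/backward computations;
    # the autograd.Function below is what hooks them into PyTorch's autograd engine.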
- import lltm_cpp - - class LLTMFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, input, weights, bias, old_h, old_cell): - outputs = lltm_cpp.forward(input, weights, bias, old_h, old_cell) - new_h, new_cell = outputs[:2] - variables = outputs[1:] + [weights] - ctx.save_for_backward(*variables) - - return new_h, new_cell - - @staticmethod - def backward(ctx, grad_h, grad_cell): - outputs = lltm_cpp.backward( - grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_tensors) - d_old_h, d_input, d_weights, d_bias, d_old_cell = outputs - return d_input, d_weights, d_bias, d_old_h, d_old_cell - - - class LLTM(torch.nn.Module): - def __init__(self, input_features, state_size): - super(LLTM, self).__init__() - self.input_features = input_features - self.state_size = state_size - self.weights = torch.nn.Parameter( - torch.empty(3 * state_size, input_features + state_size)) - self.bias = torch.nn.Parameter(torch.empty(3 * state_size)) - self.reset_parameters() - - def reset_parameters(self): - stdv = 1.0 / math.sqrt(self.state_size) - for weight in self.parameters(): - weight.data.uniform_(-stdv, +stdv) - - def forward(self, input, state): - return LLTMFunction.apply(input, self.weights, self.bias, *state) - -Performance Comparison -********************** - -Now that we are able to use and call our C++ code from PyTorch, we can run a -small benchmark to see how much performance we gained from rewriting our op in -C++. We'll run the LLTM forwards and backwards a few times and measure the -duration:: - - import time - - import torch - - batch_size = 16 - input_features = 32 - state_size = 128 - - X = torch.randn(batch_size, input_features) - h = torch.randn(batch_size, state_size) - C = torch.randn(batch_size, state_size) - - rnn = LLTM(input_features, state_size) - - forward = 0 - backward = 0 - for _ in range(100000): - start = time.time() - new_h, new_C = rnn(X, (h, C)) - forward += time.time() - start - - start = time.time() - (new_h.sum() + new_C.sum()).backward() - backward += time.time() - start - - print('Forward: {:.3f} s | Backward {:.3f} s'.format(forward, backward)) - -If we run this code with the original LLTM we wrote in pure Python at the start -of this post, we get the following numbers (on my machine):: - - Forward: 506.480 us | Backward 444.694 us - -and with our new C++ version:: - - Forward: 349.335 us | Backward 443.523 us - -We can already see a significant speedup for the forward function (more than -30%). For the backward function, a speedup is visible, albeit not a major one. -The backward pass I wrote above was not particularly optimized and could -definitely be improved. Also, PyTorch's automatic differentiation engine can -automatically parallelize computation graphs, may use a more efficient flow of -operations overall, and is also implemented in C++, so it's expected to be -fast. Nevertheless, this is a good start. - -Performance on GPU Devices -************************** - -A wonderful fact about PyTorch's *ATen* backend is that it abstracts the -computing device you are running on. This means the same code we wrote for CPU -can *also* run on GPU, and individual operations will correspondingly dispatch -to GPU-optimized implementations. For certain operations like matrix multiply -(like ``mm`` or ``addmm``), this is a big win. Let's take a look at how much -performance we gain from running our C++ code with CUDA tensors. 
No changes to -our implementation are required, we simply need to put our tensors in GPU -memory from Python, with either adding ``device=cuda_device`` argument at -creation time or using ``.to(cuda_device)`` after creation:: - - import torch - - assert torch.cuda.is_available() - cuda_device = torch.device("cuda") # device object representing GPU - - batch_size = 16 - input_features = 32 - state_size = 128 - - # Note the device=cuda_device arguments here - X = torch.randn(batch_size, input_features, device=cuda_device) - h = torch.randn(batch_size, state_size, device=cuda_device) - C = torch.randn(batch_size, state_size, device=cuda_device) - - rnn = LLTM(input_features, state_size).to(cuda_device) - - forward = 0 - backward = 0 - for _ in range(100000): - start = time.time() - new_h, new_C = rnn(X, (h, C)) - torch.cuda.synchronize() - forward += time.time() - start - - start = time.time() - (new_h.sum() + new_C.sum()).backward() - torch.cuda.synchronize() - backward += time.time() - start - - print('Forward: {:.3f} us | Backward {:.3f} us'.format(forward * 1e6/1e5, backward * 1e6/1e5)) - -Once more comparing our plain PyTorch code with our C++ version, now both -running on CUDA devices, we again see performance gains. For Python/PyTorch:: - - Forward: 187.719 us | Backward 410.815 us - -And C++/ATen:: - - Forward: 149.802 us | Backward 393.458 us - -That's a great overall speedup compared to non-CUDA code. However, we can pull -even more performance out of our C++ code by writing custom CUDA kernels, which -we'll dive into soon. Before that, let's discuss another way of building your C++ -extensions. - -JIT Compiling Extensions -^^^^^^^^^^^^^^^^^^^^^^^^ - -Previously, I mentioned there were two ways of building C++ extensions: using -:mod:`setuptools` or just in time (JIT). Having covered the former, let's -elaborate on the latter. The JIT compilation mechanism provides you with a way -of compiling and loading your extensions on the fly by calling a simple -function in PyTorch's API called :func:`torch.utils.cpp_extension.load`. For -the LLTM, this would look as simple as this:: - - from torch.utils.cpp_extension import load - - lltm_cpp = load(name="lltm_cpp", sources=["lltm.cpp"]) - -Here, we provide the function with the same information as for -:mod:`setuptools`. In the background, this will do the following: - -1. Create a temporary directory ``/tmp/torch_extensions/lltm``, -2. Emit a `Ninja `_ build file into that temporary directory, -3. Compile your source files into a shared library, -4. Import this shared library as a Python module. - -In fact, if you pass ``verbose=True`` to :func:`cpp_extension.load`, you will -be informed about the process:: - - Using /tmp/torch_extensions as PyTorch extensions root... - Emitting ninja build file /tmp/torch_extensions/lltm_cpp/build.ninja... - Building extension module lltm_cpp... - Loading extension module lltm_cpp... - -The resulting Python module will be exactly the same as produced by setuptools, -but removes the requirement of having to maintain a separate ``setup.py`` build -file. If your setup is more complicated and you do need the full power of -:mod:`setuptools`, you *can* write your own ``setup.py`` -- but in many cases -this JIT technique will do just fine. The first time you run through this line, -it will take some time, as the extension is compiling in the background. 
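Once that first compilation has finished, a quick smoke test of the freshly loaded module might look like this (a sketch; the shapes are arbitrary and ``lltm.cpp`` is the file written earlier):

.. code-block:: python

    import torch
    from torch.utils.cpp_extension import load

    lltm_cpp = load(name="lltm_cpp", sources=["lltm.cpp"], verbose=True)

    batch_size, input_features, state_size = 4, 32, 128
    X = torch.randn(batch_size, input_features)
    weights = torch.randn(3 * state_size, input_features + state_size)
    bias = torch.randn(3 * state_size)
    h = torch.randn(batch_size, state_size)
    C = torch.randn(batch_size, state_size)

    # forward returns [new_h, new_cell, input_gate, output_gate, candidate_cell, X, gate_weights]
    new_h, new_cell, *saved = lltm_cpp.forward(X, weights, bias, h, C)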
Since -we use the Ninja build system to build your sources, re-compilation is -incremental and thus re-loading the extension when you run your Python module a -second time is fast and has low overhead if you didn't change the extension's -source files. - -Writing a Mixed C++/CUDA extension ----------------------------------- - -To really take our implementation to the next level, we can hand-write parts of -our forward and backward passes with custom CUDA kernels. For the LLTM, this has -the prospect of being particularly effective, as there are a large number of -pointwise operations in sequence, that can all be fused and parallelized in a -single CUDA kernel. Let's see how we could write such a CUDA kernel and -integrate it with PyTorch using this extension mechanism. - -The general strategy for writing a CUDA extension is to first write a C++ file -which defines the functions that will be called from Python, and binds those -functions to Python with pybind11. Furthermore, this file will also *declare* -functions that are defined in CUDA (``.cu``) files. The C++ functions will then -do some checks and ultimately forward its calls to the CUDA functions. In the -CUDA files, we write our actual CUDA kernels. The :mod:`cpp_extension` package -will then take care of compiling the C++ sources with a C++ compiler like -``gcc`` and the CUDA sources with NVIDIA's ``nvcc`` compiler. This ensures that -each compiler takes care of files it knows best to compile. Ultimately, they -will be linked into one shared library that is available to us from Python -code. - -We'll start with the C++ file, which we'll call ``lltm_cuda.cpp``, for example: - -.. code-block:: cpp - - #include - - #include - - // CUDA forward declarations - - std::vector lltm_cuda_forward( - torch::Tensor input, - torch::Tensor weights, - torch::Tensor bias, - torch::Tensor old_h, - torch::Tensor old_cell); - - std::vector lltm_cuda_backward( - torch::Tensor grad_h, - torch::Tensor grad_cell, - torch::Tensor new_cell, - torch::Tensor input_gate, - torch::Tensor output_gate, - torch::Tensor candidate_cell, - torch::Tensor X, - torch::Tensor gate_weights, - torch::Tensor weights); - - // C++ interface - - #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") - #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") - #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) - - std::vector lltm_forward( - torch::Tensor input, - torch::Tensor weights, - torch::Tensor bias, - torch::Tensor old_h, - torch::Tensor old_cell) { - CHECK_INPUT(input); - CHECK_INPUT(weights); - CHECK_INPUT(bias); - CHECK_INPUT(old_h); - CHECK_INPUT(old_cell); - - return lltm_cuda_forward(input, weights, bias, old_h, old_cell); - } - - std::vector lltm_backward( - torch::Tensor grad_h, - torch::Tensor grad_cell, - torch::Tensor new_cell, - torch::Tensor input_gate, - torch::Tensor output_gate, - torch::Tensor candidate_cell, - torch::Tensor X, - torch::Tensor gate_weights, - torch::Tensor weights) { - CHECK_INPUT(grad_h); - CHECK_INPUT(grad_cell); - CHECK_INPUT(input_gate); - CHECK_INPUT(output_gate); - CHECK_INPUT(candidate_cell); - CHECK_INPUT(X); - CHECK_INPUT(gate_weights); - CHECK_INPUT(weights); - - return lltm_cuda_backward( - grad_h, - grad_cell, - new_cell, - input_gate, - output_gate, - candidate_cell, - X, - gate_weights, - weights); - } - - PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward", &lltm_forward, "LLTM forward (CUDA)"); - m.def("backward", &lltm_backward, "LLTM backward 
(CUDA)"); - } - -As you can see, it is largely boilerplate, checks and forwarding to functions -that we'll define in the CUDA file. We'll name this file -``lltm_cuda_kernel.cu`` (note the ``.cu`` extension!). NVCC can reasonably -compile C++11, thus we still have ATen and the C++ standard library available -to us (but not ``torch.h``). Note that :mod:`setuptools` cannot handle files -with the same name but different extensions, so if you use the ``setup.py`` -method instead of the JIT method, you must give your CUDA file a different name -than your C++ file (for the JIT method, ``lltm.cpp`` and ``lltm.cu`` would work -fine). Let's take a small peek at what this file will look like: - -.. code-block:: cpp - - #include - - #include - #include - - #include - - template - __device__ __forceinline__ scalar_t sigmoid(scalar_t z) { - return 1.0 / (1.0 + exp(-z)); - } - -Here we see the headers I just described, as well as the fact that we are using -CUDA-specific declarations like ``__device__`` and ``__forceinline__`` and -functions like ``exp``. Let's continue with a few more helper functions that -we'll need: - -.. code-block:: cpp - - template - __device__ __forceinline__ scalar_t d_sigmoid(scalar_t z) { - const auto s = sigmoid(z); - return (1.0 - s) * s; - } - - template - __device__ __forceinline__ scalar_t d_tanh(scalar_t z) { - const auto t = tanh(z); - return 1 - (t * t); - } - - template - __device__ __forceinline__ scalar_t elu(scalar_t z, scalar_t alpha = 1.0) { - return fmax(0.0, z) + fmin(0.0, alpha * (exp(z) - 1.0)); - } - - template - __device__ __forceinline__ scalar_t d_elu(scalar_t z, scalar_t alpha = 1.0) { - const auto e = exp(z); - const auto d_relu = z < 0.0 ? 0.0 : 1.0; - return d_relu + (((alpha * (e - 1.0)) < 0.0) ? (alpha * e) : 0.0); - } - -To now actually implement a function, we'll again need two things: one function -that performs operations we don't wish to explicitly write by hand and calls -into CUDA kernels, and then the actual CUDA kernel for the parts we want to -speed up. For the forward pass, the first function should look like this: - -.. code-block:: cpp - - std::vector lltm_cuda_forward( - torch::Tensor input, - torch::Tensor weights, - torch::Tensor bias, - torch::Tensor old_h, - torch::Tensor old_cell) { - auto X = torch::cat({old_h, input}, /*dim=*/1); - auto gates = torch::addmm(bias, X, weights.transpose(0, 1)); - - const auto batch_size = old_cell.size(0); - const auto state_size = old_cell.size(1); - - auto new_h = torch::zeros_like(old_cell); - auto new_cell = torch::zeros_like(old_cell); - auto input_gate = torch::zeros_like(old_cell); - auto output_gate = torch::zeros_like(old_cell); - auto candidate_cell = torch::zeros_like(old_cell); - - const int threads = 1024; - const dim3 blocks((state_size + threads - 1) / threads, batch_size); - - AT_DISPATCH_FLOATING_TYPES(gates.type(), "lltm_forward_cuda", ([&] { - lltm_cuda_forward_kernel<<>>( - gates.data(), - old_cell.data(), - new_h.data(), - new_cell.data(), - input_gate.data(), - output_gate.data(), - candidate_cell.data(), - state_size); - })); - - return {new_h, new_cell, input_gate, output_gate, candidate_cell, X, gates}; - } - -The main point of interest here is the ``AT_DISPATCH_FLOATING_TYPES`` macro and -the kernel launch (indicated by the ``<<<...>>>``). While ATen abstracts away -the device and datatype of the tensors we deal with, a tensor will, at runtime, -still be backed by memory of a concrete type on a concrete device. 
As such, we -need a way of determining at runtime what type a tensor is and then selectively -call functions with the corresponding correct type signature. Done manually, -this would (conceptually) look something like this: - -.. code-block:: cpp - - switch (tensor.type().scalarType()) { - case torch::ScalarType::Double: - return function(tensor.data()); - case torch::ScalarType::Float: - return function(tensor.data()); - ... - } - -The purpose of ``AT_DISPATCH_FLOATING_TYPES`` is to take care of this dispatch -for us. It takes a type (``gates.type()`` in our case), a name (for error -messages) and a lambda function. Inside this lambda function, the type alias -``scalar_t`` is available and is defined as the type that the tensor actually -is at runtime in that context. As such, if we have a template function (which -our CUDA kernel will be), we can instantiate it with this ``scalar_t`` alias, -and the correct function will be called. In this case, we also want to retrieve -the data pointers of the tensors as pointers of that ``scalar_t`` type. If you -wanted to dispatch over all types and not just floating point types (``Float`` -and ``Double``), you can use ``AT_DISPATCH_ALL_TYPES``. - -Note that we perform some operations with plain ATen. These operations will -still run on the GPU, but using ATen's default implementations. This makes -sense because ATen will use highly optimized routines for things like matrix -multiplies (e.g. ``addmm``) or convolutions which would be much harder to -implement and improve ourselves. - -As for the kernel launch itself, we are here specifying that each CUDA block -will have 1024 threads, and that the entire GPU grid is split into as many -blocks of ``1 x 1024`` threads as are required to fill our matrices with one -thread per component. For example, if our state size was 2048 and our batch -size 4, we'd launch a total of ``4 x 2 = 8`` blocks with each 1024 threads. If -you've never heard of CUDA "blocks" or "grids" before, an `introductory read -about CUDA `_ may -help. - -The actual CUDA kernel is fairly simple (if you've ever programmed GPUs before): - -.. code-block:: cpp - - template - __global__ void lltm_cuda_forward_kernel( - const scalar_t* __restrict__ gates, - const scalar_t* __restrict__ old_cell, - scalar_t* __restrict__ new_h, - scalar_t* __restrict__ new_cell, - scalar_t* __restrict__ input_gate, - scalar_t* __restrict__ output_gate, - scalar_t* __restrict__ candidate_cell, - size_t state_size) { - const int column = blockIdx.x * blockDim.x + threadIdx.x; - const int index = blockIdx.y * state_size + column; - const int gates_row = blockIdx.y * (state_size * 3); - if (column < state_size) { - input_gate[index] = sigmoid(gates[gates_row + column]); - output_gate[index] = sigmoid(gates[gates_row + state_size + column]); - candidate_cell[index] = elu(gates[gates_row + 2 * state_size + column]); - new_cell[index] = - old_cell[index] + candidate_cell[index] * input_gate[index]; - new_h[index] = tanh(new_cell[index]) * output_gate[index]; - } - } - -What's primarily interesting here is that we are able to compute all of these -pointwise operations entirely in parallel for each individual component in our -gate matrices. If you imagine having to do this with a giant ``for`` loop over -a million elements in serial, you can see why this would be much faster. - -Using accessors -^^^^^^^^^^^^^^^ - -You can see in the CUDA kernel that we work directly on pointers with the right -type. 
Indeed, working directly with high level type agnostic tensors inside cuda -kernels would be very inefficient. - -However, this comes at a cost of ease of use and readability, especially for -highly dimensional data. In our example, we know for example that the contiguous -``gates`` tensor has 3 dimensions: - -1. batch, size of ``batch_size`` and stride of ``3*state_size`` -2. row, size of ``3`` and stride of ``state_size`` -3. index, size of ``state_size`` and stride of ``1`` - -How can we access the element ``gates[n][row][column]`` inside the kernel then? -It turns out that you need the strides to access your element with some simple -arithmetic. - -.. code-block:: cpp - - gates.data()[n*3*state_size + row*state_size + column] - - -In addition to being verbose, this expression needs stride to be explicitly -known, and thus passed to the kernel function within its arguments. You can see -that in the case of kernel functions accepting multiple tensors with different -sizes you will end up with a very long list of arguments. - -Fortunately for us, ATen provides accessors that are created with a single -dynamic check that a Tensor is the type and number of dimensions. -Accessors then expose an API for accessing the Tensor elements efficiently -without having to convert to a single pointer: - -.. code-block:: cpp - - torch::Tensor foo = torch::rand({12, 12}); - - // assert foo is 2-dimensional and holds floats. - auto foo_a = foo.accessor(); - float trace = 0; - - for(int i = 0; i < foo_a.size(0); i++) { - // use the accessor foo_a to get tensor data. - trace += foo_a[i][i]; - } - -Accessor objects have a relatively high level interface, with ``.size()`` and -``.stride()`` methods and multi-dimensional indexing. The ``.accessor<>`` -interface is designed to access data efficiently on cpu tensor. The equivalent -for cuda tensors are ``packed_accessor64<>`` and ``packed_accessor32<>``, which -produce Packed Accessors with either 64-bit or 32-bit integer indexing. - -The fundamental difference with Accessor is that a Packed Accessor copies size -and stride data inside of its structure instead of pointing to it. It allows us -to pass it to a CUDA kernel function and use its interface inside it. - -We can design a function that takes Packed Accessors instead of pointers. - -.. code-block:: cpp - - __global__ void lltm_cuda_forward_kernel( - const torch::PackedTensorAccessor32 gates, - const torch::PackedTensorAccessor32 old_cell, - torch::PackedTensorAccessor32 new_h, - torch::PackedTensorAccessor32 new_cell, - torch::PackedTensorAccessor32 input_gate, - torch::PackedTensorAccessor32 output_gate, - torch::PackedTensorAccessor32 candidate_cell) - -Let's decompose the template used here. the first two arguments ``scalar_t`` and -``2`` are the same as regular Accessor. The argument -``torch::RestrictPtrTraits`` indicates that the ``__restrict__`` keyword must be -used. Note also that we've used the ``PackedAccessor32`` variant which store the -sizes and strides in an ``int32_t``. This is important as using the 64-bit -variant (``PackedAccessor64``) can make the kernel slower. - -The function declaration becomes - -.. 
code-block:: cpp - - template - __global__ void lltm_cuda_forward_kernel( - const torch::PackedTensorAccessor32 gates, - const torch::PackedTensorAccessor32 old_cell, - torch::PackedTensorAccessor32 new_h, - torch::PackedTensorAccessor32 new_cell, - torch::PackedTensorAccessor32 input_gate, - torch::PackedTensorAccessor32 output_gate, - torch::PackedTensorAccessor32 candidate_cell) { - //batch index - const int n = blockIdx.y; - // column index - const int c = blockIdx.x * blockDim.x + threadIdx.x; - if (c < gates.size(2)){ - input_gate[n][c] = sigmoid(gates[n][0][c]); - output_gate[n][c] = sigmoid(gates[n][1][c]); - candidate_cell[n][c] = elu(gates[n][2][c]); - new_cell[n][c] = - old_cell[n][c] + candidate_cell[n][c] * input_gate[n][c]; - new_h[n][c] = tanh(new_cell[n][c]) * output_gate[n][c]; - } - } - -The implementation is much more readable! This function is then called by -creating Packed Accessors with the ``.packed_accessor32<>`` method within the -host function. - -.. code-block:: cpp - - std::vector lltm_cuda_forward( - torch::Tensor input, - torch::Tensor weights, - torch::Tensor bias, - torch::Tensor old_h, - torch::Tensor old_cell) { - auto X = torch::cat({old_h, input}, /*dim=*/1); - auto gate_weights = torch::addmm(bias, X, weights.transpose(0, 1)); - - const auto batch_size = old_cell.size(0); - const auto state_size = old_cell.size(1); - - auto gates = gate_weights.reshape({batch_size, 3, state_size}); - auto new_h = torch::zeros_like(old_cell); - auto new_cell = torch::zeros_like(old_cell); - auto input_gate = torch::zeros_like(old_cell); - auto output_gate = torch::zeros_like(old_cell); - auto candidate_cell = torch::zeros_like(old_cell); - - const int threads = 1024; - const dim3 blocks((state_size + threads - 1) / threads, batch_size); - - AT_DISPATCH_FLOATING_TYPES(gates.type(), "lltm_forward_cuda", ([&] { - lltm_cuda_forward_kernel<<>>( - gates.packed_accessor32(), - old_cell.packed_accessor32(), - new_h.packed_accessor32(), - new_cell.packed_accessor32(), - input_gate.packed_accessor32(), - output_gate.packed_accessor32(), - candidate_cell.packed_accessor32()); - })); - - return {new_h, new_cell, input_gate, output_gate, candidate_cell, X, gates}; - } - -The backwards pass follows much the same pattern and I won't elaborate further -on it: - -.. 
code-block:: cpp - - template - __global__ void lltm_cuda_backward_kernel( - torch::PackedTensorAccessor32 d_old_cell, - torch::PackedTensorAccessor32 d_gates, - const torch::PackedTensorAccessor32 grad_h, - const torch::PackedTensorAccessor32 grad_cell, - const torch::PackedTensorAccessor32 new_cell, - const torch::PackedTensorAccessor32 input_gate, - const torch::PackedTensorAccessor32 output_gate, - const torch::PackedTensorAccessor32 candidate_cell, - const torch::PackedTensorAccessor32 gate_weights) { - //batch index - const int n = blockIdx.y; - // column index - const int c = blockIdx.x * blockDim.x + threadIdx.x; - if (c < d_gates.size(2)){ - const auto d_output_gate = tanh(new_cell[n][c]) * grad_h[n][c]; - const auto d_tanh_new_cell = output_gate[n][c] * grad_h[n][c]; - const auto d_new_cell = - d_tanh(new_cell[n][c]) * d_tanh_new_cell + grad_cell[n][c]; - - - d_old_cell[n][c] = d_new_cell; - const auto d_candidate_cell = input_gate[n][c] * d_new_cell; - const auto d_input_gate = candidate_cell[n][c] * d_new_cell; - - d_gates[n][0][c] = - d_input_gate * d_sigmoid(gate_weights[n][0][c]); - d_gates[n][1][c] = - d_output_gate * d_sigmoid(gate_weights[n][1][c]); - d_gates[n][2][c] = - d_candidate_cell * d_elu(gate_weights[n][2][c]); - } - } - - std::vector lltm_cuda_backward( - torch::Tensor grad_h, - torch::Tensor grad_cell, - torch::Tensor new_cell, - torch::Tensor input_gate, - torch::Tensor output_gate, - torch::Tensor candidate_cell, - torch::Tensor X, - torch::Tensor gates, - torch::Tensor weights) { - auto d_old_cell = torch::zeros_like(new_cell); - auto d_gates = torch::zeros_like(gates); - - const auto batch_size = new_cell.size(0); - const auto state_size = new_cell.size(1); - - const int threads = 1024; - const dim3 blocks((state_size + threads - 1) / threads, batch_size); - - AT_DISPATCH_FLOATING_TYPES(X.type(), "lltm_backward_cuda", ([&] { - lltm_cuda_backward_kernel<<>>( - d_old_cell.packed_accessor32(), - d_gates.packed_accessor32(), - grad_h.packed_accessor32(), - grad_cell.packed_accessor32(), - new_cell.packed_accessor32(), - input_gate.packed_accessor32(), - output_gate.packed_accessor32(), - candidate_cell.packed_accessor32(), - gates.packed_accessor32()); - })); - - auto d_gate_weights = d_gates.reshape({batch_size, 3*state_size}); - auto d_weights = d_gate_weights.t().mm(X); - auto d_bias = d_gate_weights.sum(/*dim=*/0, /*keepdim=*/true); - - auto d_X = d_gate_weights.mm(weights); - auto d_old_h = d_X.slice(/*dim=*/1, 0, state_size); - auto d_input = d_X.slice(/*dim=*/1, state_size); - - return {d_old_h, d_input, d_weights, d_bias, d_old_cell, d_gates}; - } - - -Integrating a C++/CUDA Operation with PyTorch -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Integration of our CUDA-enabled op with PyTorch is again very straightforward. -If you want to write a ``setup.py`` script, it could look like this:: - - from setuptools import setup - from torch.utils.cpp_extension import BuildExtension, CUDAExtension - - setup( - name='lltm', - ext_modules=[ - CUDAExtension('lltm_cuda', [ - 'lltm_cuda.cpp', - 'lltm_cuda_kernel.cu', - ]) - ], - cmdclass={ - 'build_ext': BuildExtension - }) - -Instead of :func:`CppExtension`, we now use :func:`CUDAExtension`. We can just -specify the ``.cu`` file along with the ``.cpp`` files -- the library takes -care of all the hassle this entails for you. 
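After ``python setup.py install`` completes, a quick way to confirm the CUDA build works is to call it directly with contiguous CUDA tensors. This is only a sketch: the shapes are arbitrary and a CUDA-capable GPU is assumed.

.. code-block:: python

    import torch
    import lltm_cuda  # the extension name given in the setup.py above

    assert torch.cuda.is_available()
    device = torch.device("cuda")
    batch_size, input_features, state_size = 4, 32, 128

    X = torch.randn(batch_size, input_features, device=device)
    weights = torch.randn(3 * state_size, input_features + state_size, device=device)
    bias = torch.randn(3 * state_size, device=device)
    h = torch.randn(batch_size, state_size, device=device)
    C = torch.randn(batch_size, state_size, device=device)

    # The CHECK_INPUT macros require contiguous CUDA tensors, which freshly
    # created torch.randn tensors are.
    new_h, new_cell, *saved = lltm_cuda.forward(X, weights, bias, h, C)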
The JIT mechanism is even -simpler:: - - from torch.utils.cpp_extension import load - - lltm = load(name='lltm', sources=['lltm_cuda.cpp', 'lltm_cuda_kernel.cu']) - -Performance Comparison -********************** - -Our hope was that parallelizing and fusing the pointwise operations of our code -with CUDA would improve the performance of our LLTM. Let's see if that holds -true. We can run the code I listed earlier to run a benchmark. Our fastest -version earlier was the CUDA-based C++ code:: - - Forward: 149.802 us | Backward 393.458 us - - -And now with our custom CUDA kernel:: - - Forward: 129.431 us | Backward 304.641 us - -More performance increases! - -Conclusion ----------- - -You should now be equipped with a good overview of PyTorch's C++ extension -mechanism as well as a motivation for using them. You can find the code -examples displayed in this note `here -`_. If you have questions, please use -`the forums `_. Also be sure to check our `FAQ -`_ in case you run into any issues. diff --git a/advanced_source/cpp_frontend.rst b/advanced_source/cpp_frontend.rst index 901658183c7..968afa01b23 100644 --- a/advanced_source/cpp_frontend.rst +++ b/advanced_source/cpp_frontend.rst @@ -1,11 +1,31 @@ +.. _cpp-frontend-tutorial: + Using the PyTorch C++ Frontend ============================== +**Author:** `Peter Goldsborough `_ + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How to build a C++ application that utilizes the PyTorch C++ frontend + * How to define and train neural networks from C++ using PyTorch abstractions + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch 1.5 or later + * Basic understanding of C++ programming + * Basic Ubuntu Linux environment with CMake >= 3.5; similar commands will work in a MacOS / Windows environment + * (Optional) A CUDA-based GPU for the GPU training sections + The PyTorch C++ frontend is a pure C++ interface to the PyTorch machine learning framework. While the primary interface to PyTorch naturally is Python, this Python API sits atop a substantial C++ codebase providing foundational data structures and functionality such as tensors and automatic differentiation. The -C++ frontend exposes a pure C++11 API that extends this underlying C++ codebase +C++ frontend exposes a pure C++17 API that extends this underlying C++ codebase with tools required for machine learning training and inference. This includes a built-in collection of common components for neural network modeling; an API to extend this collection with custom modules; a library of popular optimization @@ -57,7 +77,7 @@ the right tool for the job. Examples for such environments include: Multiprocessing is an alternative, but not as scalable and has significant shortcomings. C++ has no such constraints and threads are easy to use and create. Models requiring heavy parallelization, like those used in `Deep - Neuroevolution `_, can benefit from + Neuroevolution `_, can benefit from this. - **Existing C++ Codebases**: You may be the owner of an existing C++ application doing anything from serving web pages in a backend server to @@ -137,14 +157,14 @@ on we'll use this ``CMakeLists.txt`` file: .. 
code-block:: cmake - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) + cmake_minimum_required(VERSION 3.5 FATAL_ERROR) project(dcgan) find_package(Torch REQUIRED) add_executable(dcgan dcgan.cpp) target_link_libraries(dcgan "${TORCH_LIBRARIES}") - set_property(TARGET dcgan PROPERTY CXX_STANDARD 14) + set_property(TARGET dcgan PROPERTY CXX_STANDARD 17) .. note:: @@ -662,7 +682,7 @@ Defining the DCGAN Modules We now have the necessary background and introduction to define the modules for the machine learning task we want to solve in this post. To recap: our task is to generate images of digits from the `MNIST dataset -`_. We want to use a `generative adversarial +`_. We want to use a `generative adversarial network (GAN) `_ to solve this task. In particular, we'll use a `DCGAN architecture @@ -859,7 +879,7 @@ stacks them into a single tensor along the first dimension: Note that the MNIST dataset should be located in the ``./mnist`` directory relative to wherever you execute the training binary from. You can use `this -script `_ +script `_ to download the MNIST dataset. Next, we create a data loader and pass it this dataset. To make a new data @@ -969,7 +989,7 @@ the data loader every epoch and then write the GAN training code: discriminator->zero_grad(); torch::Tensor real_images = batch.data; torch::Tensor real_labels = torch::empty(batch.data.size(0)).uniform_(0.8, 1.0); - torch::Tensor real_output = discriminator->forward(real_images); + torch::Tensor real_output = discriminator->forward(real_images).reshape(real_labels.sizes()); torch::Tensor d_loss_real = torch::binary_cross_entropy(real_output, real_labels); d_loss_real.backward(); @@ -977,7 +997,7 @@ the data loader every epoch and then write the GAN training code: torch::Tensor noise = torch::randn({batch.data.size(0), kNoiseSize, 1, 1}); torch::Tensor fake_images = generator->forward(noise); torch::Tensor fake_labels = torch::zeros(batch.data.size(0)); - torch::Tensor fake_output = discriminator->forward(fake_images.detach()); + torch::Tensor fake_output = discriminator->forward(fake_images.detach()).reshape(fake_labels.sizes()); torch::Tensor d_loss_fake = torch::binary_cross_entropy(fake_output, fake_labels); d_loss_fake.backward(); @@ -987,7 +1007,7 @@ the data loader every epoch and then write the GAN training code: // Train generator. generator->zero_grad(); fake_labels.fill_(1); - fake_output = discriminator->forward(fake_images); + fake_output = discriminator->forward(fake_images).reshape(fake_labels.sizes()); torch::Tensor g_loss = torch::binary_cross_entropy(fake_output, fake_labels); g_loss.backward(); generator_optimizer.step(); diff --git a/advanced_source/custom_class_pt2.rst b/advanced_source/custom_class_pt2.rst new file mode 100644 index 00000000000..229a94f2ce9 --- /dev/null +++ b/advanced_source/custom_class_pt2.rst @@ -0,0 +1,275 @@ +Supporting Custom C++ Classes in torch.compile/torch.export +=========================================================== + + +This tutorial is a follow-on to the +:doc:`custom C++ classes ` tutorial, and +introduces additional steps that are needed to support custom C++ classes in +torch.compile/torch.export. + +.. warning:: + + This feature is in prototype status and is subject to backwards compatibility + breaking changes. This tutorial provides a snapshot as of PyTorch 2.8. If + you run into any issues, please reach out to us on Github! + +Concretely, there are a few steps: + +1. 
Implement an ``__obj_flatten__`` method to the C++ custom class + implementation to allow us to inspect its states and guard the changes. The + method should return a tuple of tuple of attribute_name, value + (``tuple[tuple[str, value] * n]``). + +2. Register a python fake class using ``@torch._library.register_fake_class`` + + a. Implement “fake methods” of each of the class’s c++ methods, which should + have the same schema as the C++ implementation. + + b. Additionally, implement an ``__obj_unflatten__`` classmethod in the Python + fake class to tell us how to create a fake class from the flattened + states returned by ``__obj_flatten__``. + +Here is a breakdown of the diff. Following the guide in +:doc:`Extending TorchScript with Custom C++ Classes `, +we can create a thread-safe tensor queue and build it. + +.. code-block:: cpp + + // Thread-safe Tensor Queue + + #include + #include + + #include + #include + #include + + using namespace torch::jit; + + // Thread-safe Tensor Queue + struct TensorQueue : torch::CustomClassHolder { + explicit TensorQueue(at::Tensor t) : init_tensor_(t) {} + + explicit TensorQueue(c10::Dict dict) { + init_tensor_ = dict.at(std::string("init_tensor")); + const std::string key = "queue"; + at::Tensor size_tensor; + size_tensor = dict.at(std::string(key + "/size")).cpu(); + const auto* size_tensor_acc = size_tensor.const_data_ptr(); + int64_t queue_size = size_tensor_acc[0]; + + for (const auto index : c10::irange(queue_size)) { + at::Tensor val; + queue_[index] = dict.at(key + "/" + std::to_string(index)); + queue_.push_back(val); + } + } + + // Push the element to the rear of queue. + // Lock is added for thread safe. + void push(at::Tensor x) { + std::lock_guard guard(mutex_); + queue_.push_back(x); + } + // Pop the front element of queue and return it. + // If empty, return init_tensor_. + // Lock is added for thread safe. + at::Tensor pop() { + std::lock_guard guard(mutex_); + if (!queue_.empty()) { + auto val = queue_.front(); + queue_.pop_front(); + return val; + } else { + return init_tensor_; + } + } + + std::vector get_raw_queue() { + std::vector raw_queue(queue_.begin(), queue_.end()); + return raw_queue; + } + + private: + std::deque queue_; + std::mutex mutex_; + at::Tensor init_tensor_; + }; + + // The torch binding code + TORCH_LIBRARY(MyCustomClass, m) { + m.class_("TensorQueue") + .def(torch::init()) + .def("push", &TensorQueue::push) + .def("pop", &TensorQueue::pop) + .def("get_raw_queue", &TensorQueue::get_raw_queue); + } + +**Step 1**: Add an ``__obj_flatten__`` method to the C++ custom class implementation: + +.. code-block:: cpp + + // Thread-safe Tensor Queue + struct TensorQueue : torch::CustomClassHolder { + ... + std::tuple>, std::tuple> __obj_flatten__() { + return std::tuple(std::tuple("queue", this->get_raw_queue()), std::tuple("init_tensor_", this->init_tensor_.clone())); + } + ... + }; + + TORCH_LIBRARY(MyCustomClass, m) { + m.class_("TensorQueue") + .def(torch::init()) + ... + .def("__obj_flatten__", &TensorQueue::__obj_flatten__); + } + +**Step 2a**: Register a fake class in Python that implements each method. + +.. 
code-block:: python + + # namespace::class_name + @torch._library.register_fake_class("MyCustomClass::TensorQueue") + class FakeTensorQueue: + def __init__( + self, + queue: List[torch.Tensor], + init_tensor_: torch.Tensor + ) -> None: + self.queue = queue + self.init_tensor_ = init_tensor_ + + def push(self, tensor: torch.Tensor) -> None: + self.queue.append(tensor) + + def pop(self) -> torch.Tensor: + if len(self.queue) > 0: + return self.queue.pop(0) + return self.init_tensor_ + +**Step 2b**: Implement an ``__obj_unflatten__`` classmethod in Python. + +.. code-block:: python + + # namespace::class_name + @torch._library.register_fake_class("MyCustomClass::TensorQueue") + class FakeTensorQueue: + ... + @classmethod + def __obj_unflatten__(cls, flattened_tq): + return cls(**dict(flattened_tq)) + + +That’s it! Now we can create a module that uses this object and run it with ``torch.compile`` or ``torch.export``. + +.. code-block:: python + + import torch + + torch.classes.load_library("build/libcustom_class.so") + tq = torch.classes.MyCustomClass.TensorQueue(torch.empty(0).fill_(-1)) + + class Mod(torch.nn.Module): + def forward(self, tq, x): + tq.push(x.sin()) + tq.push(x.cos()) + poped_t = tq.pop() + assert torch.allclose(poped_t, x.sin()) + return tq, poped_t + + tq, poped_t = torch.compile(Mod(), backend="eager", fullgraph=True)(tq, torch.randn(2, 3)) + assert tq.size() == 1 + + exported_program = torch.export.export(Mod(), (tq, torch.randn(2, 3),), strict=False) + exported_program.module()(tq, torch.randn(2, 3)) + +We can also implement custom ops that take custom classes as inputs. For +example, we could register a custom op ``for_each_add_(tq, tensor)`` + +.. code-block:: cpp + + struct TensorQueue : torch::CustomClassHolder { + ... + void for_each_add_(at::Tensor inc) { + for (auto& t : queue_) { + t.add_(inc); + } + } + ... + } + + + TORCH_LIBRARY_FRAGMENT(MyCustomClass, m) { + m.class_("TensorQueue") + ... + .def("for_each_add_", &TensorQueue::for_each_add_); + + m.def( + "for_each_add_(__torch__.torch.classes.MyCustomClass.TensorQueue foo, Tensor inc) -> ()"); + } + + void for_each_add_(c10::intrusive_ptr tq, at::Tensor inc) { + tq->for_each_add_(inc); + } + + TORCH_LIBRARY_IMPL(MyCustomClass, CPU, m) { + m.impl("for_each_add_", for_each_add_); + } + + +Since the fake class is implemented in python, we require the fake +implementation of custom op must also be registered in python: + +.. code-block:: python + + @torch.library.register_fake("MyCustomClass::for_each_add_") + def fake_for_each_add_(tq, inc): + tq.for_each_add_(inc) + +After re-compilation, we can export the custom op with: + +.. code-block:: python + + class ForEachAdd(torch.nn.Module): + def forward(self, tq: torch.ScriptObject, a: torch.Tensor) -> torch.ScriptObject: + torch.ops.MyCustomClass.for_each_add_(tq, a) + return tq + + mod = ForEachAdd() + tq = empty_tensor_queue() + qlen = 10 + for i in range(qlen): + tq.push(torch.zeros(1)) + + ep = torch.export.export(mod, (tq, torch.ones(1)), strict=False) + +Why do we need to make a Fake Class? +------------------------------------ + +Tracing with real custom object has several major downsides: + +1. Operators on real objects can be time consuming e.g. the custom object + might be reading from the network or loading data from the disk. + +2. We don’t want to mutate the real custom object or create side-effects to the environment while tracing. + +3. It cannot support dynamic shapes. + +However, it may be difficult for users to write a fake class, e.g. 
if the +original class uses some third-party library that determines the output shape of +the methods, or is complicated and written by others. In such cases, users can +disable the fakification requirement by defining a ``tracing_mode`` method to +return ``"real"``: + +.. code-block:: cpp + + std::string tracing_mode() { + return "real"; + } + + +A caveat of fakification is regarding **tensor aliasing.** We assume that no +tensors within a torchbind object aliases a tensor outside of the torchbind +object. Therefore, mutating one of these tensors will result in undefined +behavior. diff --git a/advanced_source/torch_script_custom_classes.rst b/advanced_source/custom_classes.rst similarity index 51% rename from advanced_source/torch_script_custom_classes.rst rename to advanced_source/custom_classes.rst index cccb86ff4ce..014bac2eebf 100644 --- a/advanced_source/torch_script_custom_classes.rst +++ b/advanced_source/custom_classes.rst @@ -1,10 +1,9 @@ -Extending TorchScript with Custom C++ Classes +Extending PyTorch with Custom C++ Classes =============================================== -This tutorial is a follow-on to the -:doc:`custom operator ` -tutorial, and introduces the API we've built for binding C++ classes into TorchScript -and Python simultaneously. The API is very similar to + +This tutorial introduces an API for binding C++ classes into PyTorch. +The API is very similar to `pybind11 `_, and most of the concepts will transfer over if you're familiar with that system. @@ -14,14 +13,14 @@ Implementing and Binding the Class in C++ For this tutorial, we are going to define a simple C++ class that maintains persistent state in a member variable. -.. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/class.cpp +.. literalinclude:: ../advanced_source/custom_classes/custom_class_project/class.cpp :language: cpp :start-after: BEGIN class :end-before: END class There are several things to note: -- ``torch/custom_class.h`` is the header you need to include to extend TorchScript +- ``torch/custom_class.h`` is the header you need to include to extend PyTorch with your custom class. - Notice that whenever we are working with instances of the custom class, we do it via instances of ``c10::intrusive_ptr<>``. Think of ``intrusive_ptr`` @@ -34,10 +33,10 @@ There are several things to note: ``torch::CustomClassHolder``. This ensures that the custom class has space to store the reference count. -Now let's take a look at how we will make this class visible to TorchScript, a process called +Now let's take a look at how we will make this class visible to PyTorch, a process called *binding* the class: -.. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/class.cpp +.. literalinclude:: ../advanced_source/custom_classes/custom_class_project/class.cpp :language: cpp :start-after: BEGIN binding :end-before: END binding @@ -56,7 +55,7 @@ we've covered so far and place it in a file called ``class.cpp``. Then, write a simple ``CMakeLists.txt`` file and place it in the same directory. Here is what ``CMakeLists.txt`` should look like: -.. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/CMakeLists.txt +.. literalinclude:: ../advanced_source/custom_classes/custom_class_project/CMakeLists.txt :language: cmake Also, create a ``build`` directory. Your file tree should look like this:: @@ -66,8 +65,6 @@ Also, create a ``build`` directory. 
Your file tree should look like this:: CMakeLists.txt build/ -We assume you've setup your environment in the same way as described in -the :doc:`previous tutorial `. Go ahead and invoke cmake and then make to build the project: .. code-block:: shell @@ -117,137 +114,16 @@ file present in the build directory. On Linux, this is probably named build/ libcustom_class.so -Using the C++ Class from Python and TorchScript +Using the C++ Class from Python ----------------------------------------------- Now that we have our class and its registration compiled into an ``.so`` file, we can load that `.so` into Python and try it out. Here's a script that demonstrates that: -.. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/custom_test.py - :language: python - - -Saving, Loading, and Running TorchScript Code Using Custom Classes ------------------------------------------------------------------- - -We can also use custom-registered C++ classes in a C++ process using -libtorch. As an example, let's define a simple ``nn.Module`` that -instantiates and calls a method on our MyStackClass class: - -.. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/save.py +.. literalinclude:: ../advanced_source/custom_classes/custom_class_project/custom_test.py :language: python -``foo.pt`` in our filesystem now contains the serialized TorchScript -program we've just defined. - -Now, we're going to define a new CMake project to show how you can load -this model and its required .so file. For a full treatment of how to do this, -please have a look at the `Loading a TorchScript Model in C++ Tutorial `_. - -Similarly to before, let's create a file structure containing the following:: - - cpp_inference_example/ - infer.cpp - CMakeLists.txt - foo.pt - build/ - custom_class_project/ - class.cpp - CMakeLists.txt - build/ - -Notice we've copied over the serialized ``foo.pt`` file, as well as the source -tree from the ``custom_class_project`` above. We will be adding the -``custom_class_project`` as a dependency to this C++ project so that we can -build the custom class into the binary. - -Let's populate ``infer.cpp`` with the following: - -.. literalinclude:: ../advanced_source/torch_script_custom_classes/infer.cpp - :language: cpp - -And similarly let's define our CMakeLists.txt file: - -.. literalinclude:: ../advanced_source/torch_script_custom_classes/CMakeLists.txt - :language: cpp - -You know the drill: ``cd build``, ``cmake``, and ``make``: - -.. code-block:: shell - - $ cd build - $ cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" .. 
- -- The C compiler identification is GNU 7.3.1 - -- The CXX compiler identification is GNU 7.3.1 - -- Check for working C compiler: /opt/rh/devtoolset-7/root/usr/bin/cc - -- Check for working C compiler: /opt/rh/devtoolset-7/root/usr/bin/cc -- works - -- Detecting C compiler ABI info - -- Detecting C compiler ABI info - done - -- Detecting C compile features - -- Detecting C compile features - done - -- Check for working CXX compiler: /opt/rh/devtoolset-7/root/usr/bin/c++ - -- Check for working CXX compiler: /opt/rh/devtoolset-7/root/usr/bin/c++ -- works - -- Detecting CXX compiler ABI info - -- Detecting CXX compiler ABI info - done - -- Detecting CXX compile features - -- Detecting CXX compile features - done - -- Looking for pthread.h - -- Looking for pthread.h - found - -- Looking for pthread_create - -- Looking for pthread_create - not found - -- Looking for pthread_create in pthreads - -- Looking for pthread_create in pthreads - not found - -- Looking for pthread_create in pthread - -- Looking for pthread_create in pthread - found - -- Found Threads: TRUE - -- Found torch: /local/miniconda3/lib/python3.7/site-packages/torch/lib/libtorch.so - -- Configuring done - -- Generating done - -- Build files have been written to: /cpp_inference_example/build - $ make -j - Scanning dependencies of target custom_class - [ 25%] Building CXX object custom_class_project/CMakeFiles/custom_class.dir/class.cpp.o - [ 50%] Linking CXX shared library libcustom_class.so - [ 50%] Built target custom_class - Scanning dependencies of target infer - [ 75%] Building CXX object CMakeFiles/infer.dir/infer.cpp.o - [100%] Linking CXX executable infer - [100%] Built target infer - -And now we can run our exciting C++ binary: - -.. code-block:: shell - - $ ./infer - momfoobarbaz - -Incredible! - -Moving Custom Classes To/From IValues -------------------------------------- - -It's also possible that you may need to move custom classes into or out of -``IValue``s, such as when you take or return ``IValue``s from TorchScript methods -or you want to instantiate a custom class attribute in C++. For creating an -``IValue`` from a custom C++ class instance: - -- ``torch::make_custom_class()`` provides an API similar to c10::intrusive_ptr - in that it will take whatever set of arguments you provide to it, call the constructor - of T that matches that set of arguments, and wrap that instance up and return it. - However, instead of returning just a pointer to a custom class object, it returns - an ``IValue`` wrapping the object. You can then pass this ``IValue`` directly to - TorchScript. -- In the event that you already have an ``intrusive_ptr`` pointing to your class, you - can directly construct an IValue from it using the constructor ``IValue(intrusive_ptr)``. - -For converting ``IValue`` back to custom classes: - -- ``IValue::toCustomClass()`` will return an ``intrusive_ptr`` pointing to the - custom class that the ``IValue`` contains. Internally, this function is checking - that ``T`` is registered as a custom class and that the ``IValue`` does in fact contain - a custom class. You can check whether the ``IValue`` contains a custom class manually by - calling ``isCustomClass()``. 
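
Before moving on to serialization, here is a minimal sketch of driving the
bound class from an interactive Python session. It assumes the shared library
was built as ``build/libcustom_class.so`` and that ``class.cpp`` registers the
constructor as ``torch::init<std::vector<std::string>>()``; adjust the path and
constructor arguments to match your build. This is the same
``torch.classes.load_library`` / ``torch.classes.<namespace>.<ClassName>``
pattern that the ``custom_test.py`` script above relies on.

.. code-block:: python

    import torch

    # Running the registration code in class.cpp makes the class visible
    # under the ``my_classes`` namespace.
    torch.classes.load_library("build/libcustom_class.so")

    # Constructor arguments are forwarded to the ``torch::init`` overload
    # registered in C++ (assumed here to take a list of strings).
    s = torch.classes.my_classes.MyStackClass(["foo", "bar"])

    s.push("pushed")
    assert s.top() == "pushed"
    s.pop()
    assert s.top() == "bar"
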
Defining Serialization/Deserialization Methods for Custom C++ Classes --------------------------------------------------------------------- @@ -255,7 +131,7 @@ Defining Serialization/Deserialization Methods for Custom C++ Classes If you try to save a ``ScriptModule`` with a custom-bound C++ class as an attribute, you'll get the following error: -.. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/export_attr.py +.. literalinclude:: ../advanced_source/custom_classes/custom_class_project/export_attr.py :language: python .. code-block:: shell @@ -263,13 +139,13 @@ an attribute, you'll get the following error: $ python export_attr.py RuntimeError: Cannot serialize custom bound C++ class __torch__.torch.classes.my_classes.MyStackClass. Please define serialization methods via def_pickle for this class. (pushIValueImpl at ../torch/csrc/jit/pickler.cpp:128) -This is because TorchScript cannot automatically figure out what information +This is because PyTorch cannot automatically figure out what information save from your C++ class. You must specify that manually. The way to do that is to define ``__getstate__`` and ``__setstate__`` methods on the class using the special ``def_pickle`` method on ``class_``. .. note:: - The semantics of ``__getstate__`` and ``__setstate__`` in TorchScript are + The semantics of ``__getstate__`` and ``__setstate__`` are equivalent to that of the Python pickle module. You can `read more `_ about how we use these methods. @@ -277,7 +153,7 @@ the special ``def_pickle`` method on ``class_``. Here is an example of the ``def_pickle`` call we can add to the registration of ``MyStackClass`` to include serialization methods: -.. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/class.cpp +.. literalinclude:: ../advanced_source/custom_classes/custom_class_project/class.cpp :language: cpp :start-after: BEGIN def_pickle :end-before: END def_pickle @@ -303,7 +179,7 @@ Once you've defined a custom C++ class, you can also use that class as an argument or return from a custom operator (i.e. free functions). Suppose you have the following free function: -.. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/class.cpp +.. literalinclude:: ../advanced_source/custom_classes/custom_class_project/class.cpp :language: cpp :start-after: BEGIN free_function :end-before: END free_function @@ -311,14 +187,11 @@ you have the following free function: You can register it running the following code inside your ``TORCH_LIBRARY`` block: -.. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/class.cpp +.. literalinclude:: ../advanced_source/custom_classes/custom_class_project/class.cpp :language: cpp :start-after: BEGIN def_free :end-before: END def_free -Refer to the `custom op tutorial `_ -for more details on the registration API. - Once this is done, you can use the op like the following example: .. code-block:: python @@ -344,13 +217,12 @@ Once this is done, you can use the op like the following example: Conclusion ---------- -This tutorial walked you through how to expose a C++ class to TorchScript -(and by extension Python), how to register its methods, how to use that -class from Python and TorchScript, and how to save and load code using -the class and run that code in a standalone C++ process. 
You are now ready -to extend your TorchScript models with C++ classes that interface with -third party C++ libraries or implement any other use case that requires the -lines between Python, TorchScript and C++ to blend smoothly. +This tutorial walked you through how to expose a C++ class to PyTorch, how to +register its methods, how to use that class from Python, and how to save and +load code using the class and run that code in a standalone C++ process. You +are now ready to extend your PyTorch models with C++ classes that interface +with third party C++ libraries or implement any other use case that requires +the lines between Python and C++ to blend smoothly. As always, if you run into any problems or have questions, you can use our `forum `_ or `GitHub issues diff --git a/advanced_source/torch_script_custom_classes/CMakeLists.txt b/advanced_source/custom_classes/CMakeLists.txt similarity index 100% rename from advanced_source/torch_script_custom_classes/CMakeLists.txt rename to advanced_source/custom_classes/CMakeLists.txt diff --git a/advanced_source/torch_script_custom_classes/custom_class_project/CMakeLists.txt b/advanced_source/custom_classes/custom_class_project/CMakeLists.txt similarity index 100% rename from advanced_source/torch_script_custom_classes/custom_class_project/CMakeLists.txt rename to advanced_source/custom_classes/custom_class_project/CMakeLists.txt diff --git a/advanced_source/torch_script_custom_classes/custom_class_project/class.cpp b/advanced_source/custom_classes/custom_class_project/class.cpp similarity index 100% rename from advanced_source/torch_script_custom_classes/custom_class_project/class.cpp rename to advanced_source/custom_classes/custom_class_project/class.cpp diff --git a/advanced_source/torch_script_custom_classes/custom_class_project/custom_test.py b/advanced_source/custom_classes/custom_class_project/custom_test.py similarity index 98% rename from advanced_source/torch_script_custom_classes/custom_class_project/custom_test.py rename to advanced_source/custom_classes/custom_class_project/custom_test.py index e8c38638f6c..1deda445310 100644 --- a/advanced_source/torch_script_custom_classes/custom_class_project/custom_test.py +++ b/advanced_source/custom_classes/custom_class_project/custom_test.py @@ -22,7 +22,7 @@ # Test custom operator s.push("pushed") torch.ops.my_classes.manipulate_instance(s) # acting as s.pop() -assert s.top() == "bar" +assert s.top() == "bar" # Returning and passing instances of custom classes works as you'd expect s2 = s.clone() @@ -51,4 +51,3 @@ def do_stacks(s: MyStackClass): # We can pass a custom class instance assert top == "wow" for expected in ["wow", "mom", "hi"]: assert stack.pop() == expected - diff --git a/advanced_source/torch_script_custom_classes/custom_class_project/export_attr.py b/advanced_source/custom_classes/custom_class_project/export_attr.py similarity index 100% rename from advanced_source/torch_script_custom_classes/custom_class_project/export_attr.py rename to advanced_source/custom_classes/custom_class_project/export_attr.py diff --git a/advanced_source/torch_script_custom_classes/custom_class_project/save.py b/advanced_source/custom_classes/custom_class_project/save.py similarity index 100% rename from advanced_source/torch_script_custom_classes/custom_class_project/save.py rename to advanced_source/custom_classes/custom_class_project/save.py diff --git a/advanced_source/torch_script_custom_classes/infer.cpp b/advanced_source/custom_classes/infer.cpp similarity index 100% rename from 
advanced_source/torch_script_custom_classes/infer.cpp rename to advanced_source/custom_classes/infer.cpp diff --git a/advanced_source/torch_script_custom_classes/run.sh b/advanced_source/custom_classes/run.sh similarity index 100% rename from advanced_source/torch_script_custom_classes/run.sh rename to advanced_source/custom_classes/run.sh diff --git a/advanced_source/torch_script_custom_classes/run2.sh b/advanced_source/custom_classes/run2.sh similarity index 100% rename from advanced_source/torch_script_custom_classes/run2.sh rename to advanced_source/custom_classes/run2.sh diff --git a/advanced_source/custom_ops_landing_page.rst b/advanced_source/custom_ops_landing_page.rst index ebb238ef63e..f05eee43060 100644 --- a/advanced_source/custom_ops_landing_page.rst +++ b/advanced_source/custom_ops_landing_page.rst @@ -1,7 +1,7 @@ .. _custom-ops-landing-page: -PyTorch Custom Operators Landing Page -===================================== +PyTorch Custom Operators +=========================== PyTorch offers a large library of operators that work on Tensors (e.g. ``torch.add``, ``torch.sum``, etc). However, you may wish to bring a new custom operation to PyTorch @@ -10,8 +10,7 @@ In order to do so, you must register the custom operation with PyTorch via the P `torch.library docs `_ or C++ ``TORCH_LIBRARY`` APIs. -TL;DR ------ + Authoring a custom operator from Python ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -19,17 +18,24 @@ Authoring a custom operator from Python Please see :ref:`python-custom-ops-tutorial`. You may wish to author a custom operator from Python (as opposed to C++) if: + - you have a Python function you want PyTorch to treat as an opaque callable, especially with -respect to ``torch.compile`` and ``torch.export``. + respect to ``torch.compile`` and ``torch.export``. - you have some Python bindings to C++/CUDA kernels and want those to compose with PyTorch -subsystems (like ``torch.compile`` or ``torch.autograd``) + subsystems (like ``torch.compile`` or ``torch.autograd``) +- you are using Python (and not a C++-only environment like AOTInductor). Integrating custom C++ and/or CUDA code with PyTorch ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Please see :ref:`cpp-custom-ops-tutorial`. +.. note:: + + ``SYCL`` serves as the backend programming language for Intel GPUs. Integrate custom Sycl code refer to :ref:`cpp-custom-ops-tutorial-sycl`. + You may wish to author a custom operator from C++ (as opposed to Python) if: + - you have custom C++ and/or CUDA code. - you plan to use this code with ``AOTInductor`` to do Python-less inference. diff --git a/advanced_source/ddp_pipeline.rst b/advanced_source/ddp_pipeline.rst new file mode 100644 index 00000000000..bf9e4d28f33 --- /dev/null +++ b/advanced_source/ddp_pipeline.rst @@ -0,0 +1,10 @@ +Training Transformer models using Distributed Data Parallel and Pipeline Parallelism +==================================================================================== + +This tutorial has been deprecated. + +Redirecting to the latest parallelism APIs in 3 seconds... + +.. raw:: html + + diff --git a/advanced_source/dispatcher.rst b/advanced_source/dispatcher.rst index 0b5fd3c8aff..4b03803c15b 100644 --- a/advanced_source/dispatcher.rst +++ b/advanced_source/dispatcher.rst @@ -1,6 +1,11 @@ Registering a Dispatched Operator in C++ ======================================== +.. warning:: + + This tutorial is deprecated as of PyTorch 2.4. 
Please see :ref:`custom-ops-landing-page` + for the newest up-to-date guides on extending PyTorch with Custom Operators. + The dispatcher is an internal component of PyTorch which is responsible for figuring out what code should actually get run when you call a function like ``torch::add``. This can be nontrivial, because PyTorch operations need diff --git a/advanced_source/dynamic_quantization_tutorial.py b/advanced_source/dynamic_quantization_tutorial.py deleted file mode 100644 index 9cc07a1d956..00000000000 --- a/advanced_source/dynamic_quantization_tutorial.py +++ /dev/null @@ -1,299 +0,0 @@ -""" -(beta) Dynamic Quantization on an LSTM Word Language Model -================================================================== - -**Author**: `James Reed `_ - -**Edited by**: `Seth Weidman `_ - -Introduction ------------- - -Quantization involves converting the weights and activations of your model from float -to int, which can result in smaller model size and faster inference with only a small -hit to accuracy. - -In this tutorial, we will apply the easiest form of quantization - -`dynamic quantization `_ - -to an LSTM-based next word-prediction model, closely following the -`word language model `_ -from the PyTorch examples. -""" - -# imports -import os -from io import open -import time - -import torch -import torch.nn as nn -import torch.nn.functional as F - -###################################################################### -# 1. Define the model -# ------------------- -# -# Here we define the LSTM model architecture, following the -# `model `_ -# from the word language model example. - -class LSTMModel(nn.Module): - """Container module with an encoder, a recurrent module, and a decoder.""" - - def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5): - super(LSTMModel, self).__init__() - self.drop = nn.Dropout(dropout) - self.encoder = nn.Embedding(ntoken, ninp) - self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) - self.decoder = nn.Linear(nhid, ntoken) - - self.init_weights() - - self.nhid = nhid - self.nlayers = nlayers - - def init_weights(self): - initrange = 0.1 - self.encoder.weight.data.uniform_(-initrange, initrange) - self.decoder.bias.data.zero_() - self.decoder.weight.data.uniform_(-initrange, initrange) - - def forward(self, input, hidden): - emb = self.drop(self.encoder(input)) - output, hidden = self.rnn(emb, hidden) - output = self.drop(output) - decoded = self.decoder(output) - return decoded, hidden - - def init_hidden(self, bsz): - weight = next(self.parameters()) - return (weight.new_zeros(self.nlayers, bsz, self.nhid), - weight.new_zeros(self.nlayers, bsz, self.nhid)) - -###################################################################### -# 2. Load in the text data -# ------------------------ -# -# Next, we load the -# `Wikitext-2 dataset `_ into a `Corpus`, -# again following the -# `preprocessing `_ -# from the word language model example. 
- -class Dictionary(object): - def __init__(self): - self.word2idx = {} - self.idx2word = [] - - def add_word(self, word): - if word not in self.word2idx: - self.idx2word.append(word) - self.word2idx[word] = len(self.idx2word) - 1 - return self.word2idx[word] - - def __len__(self): - return len(self.idx2word) - - -class Corpus(object): - def __init__(self, path): - self.dictionary = Dictionary() - self.train = self.tokenize(os.path.join(path, 'train.txt')) - self.valid = self.tokenize(os.path.join(path, 'valid.txt')) - self.test = self.tokenize(os.path.join(path, 'test.txt')) - - def tokenize(self, path): - """Tokenizes a text file.""" - assert os.path.exists(path) - # Add words to the dictionary - with open(path, 'r', encoding="utf8") as f: - for line in f: - words = line.split() + [''] - for word in words: - self.dictionary.add_word(word) - - # Tokenize file content - with open(path, 'r', encoding="utf8") as f: - idss = [] - for line in f: - words = line.split() + [''] - ids = [] - for word in words: - ids.append(self.dictionary.word2idx[word]) - idss.append(torch.tensor(ids).type(torch.int64)) - ids = torch.cat(idss) - - return ids - -model_data_filepath = 'data/' - -corpus = Corpus(model_data_filepath + 'wikitext-2') - -###################################################################### -# 3. Load the pretrained model -# ----------------------------- -# -# This is a tutorial on dynamic quantization, a quantization technique -# that is applied after a model has been trained. Therefore, we'll simply load some -# pretrained weights into this model architecture; these weights were obtained -# by training for five epochs using the default settings in the word language model -# example. - -ntokens = len(corpus.dictionary) - -model = LSTMModel( - ntoken = ntokens, - ninp = 512, - nhid = 256, - nlayers = 5, -) - -model.load_state_dict( - torch.load( - model_data_filepath + 'word_language_model_quantize.pth', - map_location=torch.device('cpu') - ) - ) - -model.eval() -print(model) - -###################################################################### -# Now let's generate some text to ensure that the pretrained model is working -# properly - similarly to before, we follow -# `here `_ - -input_ = torch.randint(ntokens, (1, 1), dtype=torch.long) -hidden = model.init_hidden(1) -temperature = 1.0 -num_words = 1000 - -with open(model_data_filepath + 'out.txt', 'w') as outf: - with torch.no_grad(): # no tracking history - for i in range(num_words): - output, hidden = model(input_, hidden) - word_weights = output.squeeze().div(temperature).exp().cpu() - word_idx = torch.multinomial(word_weights, 1)[0] - input_.fill_(word_idx) - - word = corpus.dictionary.idx2word[word_idx] - - outf.write(str(word.encode('utf-8')) + ('\n' if i % 20 == 19 else ' ')) - - if i % 100 == 0: - print('| Generated {}/{} words'.format(i, 1000)) - -with open(model_data_filepath + 'out.txt', 'r') as outf: - all_output = outf.read() - print(all_output) - -###################################################################### -# It's no GPT-2, but it looks like the model has started to learn the structure of -# language! -# -# We're almost ready to demonstrate dynamic quantization. We just need to define a few more -# helper functions: - -bptt = 25 -criterion = nn.CrossEntropyLoss() -eval_batch_size = 1 - -# create test data set -def batchify(data, bsz): - # Work out how cleanly we can divide the dataset into ``bsz`` parts. - nbatch = data.size(0) // bsz - # Trim off any extra elements that wouldn't cleanly fit (remainders). 
- data = data.narrow(0, 0, nbatch * bsz) - # Evenly divide the data across the ``bsz`` batches. - return data.view(bsz, -1).t().contiguous() - -test_data = batchify(corpus.test, eval_batch_size) - -# Evaluation functions -def get_batch(source, i): - seq_len = min(bptt, len(source) - 1 - i) - data = source[i:i+seq_len] - target = source[i+1:i+1+seq_len].reshape(-1) - return data, target - -def repackage_hidden(h): - """Wraps hidden states in new Tensors, to detach them from their history.""" - - if isinstance(h, torch.Tensor): - return h.detach() - else: - return tuple(repackage_hidden(v) for v in h) - -def evaluate(model_, data_source): - # Turn on evaluation mode which disables dropout. - model_.eval() - total_loss = 0. - hidden = model_.init_hidden(eval_batch_size) - with torch.no_grad(): - for i in range(0, data_source.size(0) - 1, bptt): - data, targets = get_batch(data_source, i) - output, hidden = model_(data, hidden) - hidden = repackage_hidden(hidden) - output_flat = output.view(-1, ntokens) - total_loss += len(data) * criterion(output_flat, targets).item() - return total_loss / (len(data_source) - 1) - -###################################################################### -# 4. Test dynamic quantization -# ---------------------------- -# -# Finally, we can call ``torch.quantization.quantize_dynamic`` on the model! -# Specifically, -# -# - We specify that we want the ``nn.LSTM`` and ``nn.Linear`` modules in our -# model to be quantized -# - We specify that we want weights to be converted to ``int8`` values - -import torch.quantization - -quantized_model = torch.quantization.quantize_dynamic( - model, {nn.LSTM, nn.Linear}, dtype=torch.qint8 -) -print(quantized_model) - -###################################################################### -# The model looks the same; how has this benefited us? First, we see a -# significant reduction in model size: - -def print_size_of_model(model): - torch.save(model.state_dict(), "temp.p") - print('Size (MB):', os.path.getsize("temp.p")/1e6) - os.remove('temp.p') - -print_size_of_model(model) -print_size_of_model(quantized_model) - -###################################################################### -# Second, we see faster inference time, with no difference in evaluation loss: -# -# Note: we set the number of threads to one for single threaded comparison, since quantized -# models run single threaded. - -torch.set_num_threads(1) - -def time_model_evaluation(model, test_data): - s = time.time() - loss = evaluate(model, test_data) - elapsed = time.time() - s - print('''loss: {0:.3f}\nelapsed time (seconds): {1:.1f}'''.format(loss, elapsed)) - -time_model_evaluation(model, test_data) -time_model_evaluation(quantized_model, test_data) - -###################################################################### -# Running this locally on a MacBook Pro, without quantization, inference takes about 200 seconds, -# and with quantization it takes just about 100 seconds. -# -# Conclusion -# ---------- -# -# Dynamic quantization can be an easy way to reduce model size while only -# having a limited effect on accuracy. -# -# Thanks for reading! As always, we welcome any feedback, so please create an issue -# `here `_ if you have any. diff --git a/advanced_source/extend_dispatcher.rst b/advanced_source/extend_dispatcher.rst index f3ae1e7e559..12f15355f5f 100644 --- a/advanced_source/extend_dispatcher.rst +++ b/advanced_source/extend_dispatcher.rst @@ -17,7 +17,7 @@ to `register a dispatched operator in C++ `_ and how to write a What's a new backend? 
--------------------- -Adding a new backend to PyTorch requires a lot of developement and maintainence from backend extenders. +Adding a new backend to PyTorch requires a lot of development and maintenance from backend extenders. Before adding a new backend, let's first consider a few common use cases and recommended solutions for them: * If you have new algorithms for an existing PyTorch operator, send a PR to PyTorch. @@ -30,7 +30,7 @@ Before adding a new backend, let's first consider a few common use cases and rec In this tutorial we'll mainly focus on adding a new out-of-tree device below. Adding out-of-tree support for a different tensor layout might share many common steps with devices, but we haven't seen an example of -such integrations yet so it might require addtional work from PyTorch to support it. +such integrations yet so it might require additional work from PyTorch to support it. Get a dispatch key for your backend ----------------------------------- @@ -67,12 +67,12 @@ To create a Tensor on ``PrivateUse1`` backend, you need to set dispatch key in ` Note that ``TensorImpl`` class above assumes your Tensor is backed by a storage like CPU/CUDA. We also provide ``OpaqueTensorImpl`` for backends without a storage. And you might need to tweak/override certain methods to fit your customized hardware. -One example in pytorch repo is `Vulkan TensorImpl `_. +One example in pytorch repo is `Vulkan TensorImpl `_. .. note:: Once the prototype is done and you plan to do regular releases for your backend extension, please feel free to - submit a PR to ``pytorch/pytorch`` to reserve a dedicated dispath key for your backend. + submit a PR to ``pytorch/pytorch`` to reserve a dedicated dispatch key for your backend. Get the full list of PyTorch operators @@ -361,7 +361,7 @@ actively working on might improve the experience in the future: * Improve test coverage of generic testing framework. * Improve ``Math`` kernel coverage and more comprehensive tests to make sure ``Math`` - kernel bahavior matches other backends like ``CPU/CUDA``. + kernel behavior matches other backends like ``CPU/CUDA``. * Refactor ``RegistrationDeclarations.h`` to carry the minimal information and reuse PyTorch's codegen as much as possible. * Support a backend fallback kernel to automatic convert inputs to CPU and convert the diff --git a/advanced_source/generic_join.rst b/advanced_source/generic_join.rst index 30259650278..0fb0d5528d2 100644 --- a/advanced_source/generic_join.rst +++ b/advanced_source/generic_join.rst @@ -369,7 +369,7 @@ of inputs across all ranks. def join_hook(self, **kwargs) -> JoinHook: r""" Return a join hook that shadows the all-reduce in :meth:`__call__`. - + This join hook supports the following keyword arguments: sync_max_count (bool, optional): whether to synchronize the maximum count across all ranks once all ranks join; default is ``False``. @@ -446,5 +446,5 @@ Some key points to highlight: .. _Getting Started with Distributed Data Parallel - Basic Use Case: https://pytorch.org/tutorials/intermediate/ddp_tutorial.html#basic-use-case .. _Shard Optimizer States with ZeroRedundancyOptimizer: https://pytorch.org/tutorials/recipes/zero_redundancy_optimizer.html .. _DistributedDataParallel: https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html -.. _join(): https://pytorch.org/docs/stable/_modules/torch/nn/parallel/distributed.html#DistributedDataParallel.join +.. 
_join(): https://docs.pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel.join .. _ZeroRedundancyOptimizer: https://pytorch.org/docs/stable/distributed.optim.html diff --git a/advanced_source/pendulum.py b/advanced_source/pendulum.py index 38524cfff40..3084fe8312b 100644 --- a/advanced_source/pendulum.py +++ b/advanced_source/pendulum.py @@ -33,9 +33,9 @@ In the process, we will touch three crucial components of TorchRL: -* `environments `__ -* `transforms `__ -* `models (policy and value function) `__ +* `environments `__ +* `transforms `__ +* `models (policy and value function) `__ """ @@ -384,7 +384,7 @@ def _reset(self, tensordict): # convenient shortcuts to the content of the output and input spec containers. # # TorchRL offers multiple :class:`~torchrl.data.TensorSpec` -# `subclasses `_ to +# `subclasses `_ to # encode the environment's input and output characteristics. # # Specs shape @@ -604,7 +604,7 @@ def __init__(self, td_params=None, seed=None, device="cpu"): env, # ``Unsqueeze`` the observations that we will concatenate UnsqueezeTransform( - unsqueeze_dim=-1, + dim=-1, in_keys=["th", "thdot"], in_keys_inv=["th", "thdot"], ), diff --git a/advanced_source/python_custom_ops.py b/advanced_source/python_custom_ops.py index 9111e1f43f4..1f20125f785 100644 --- a/advanced_source/python_custom_ops.py +++ b/advanced_source/python_custom_ops.py @@ -3,7 +3,7 @@ """ .. _python-custom-ops-tutorial: -Python Custom Operators +Custom Python Operators ======================= .. grid:: 2 @@ -30,6 +30,12 @@ into the function). - Adding training support to an arbitrary Python function +Use :func:`torch.library.custom_op` to create Python custom operators. +Use the C++ ``TORCH_LIBRARY`` APIs to create C++ custom operators (these +work in Python-less environments). +See the `Custom Operators Landing Page `_ +for more details. + Please note that if your operation can be expressed as a composition of existing PyTorch operators, then there is usually no need to use the custom operator API -- everything (for example ``torch.compile``, training support) should @@ -66,7 +72,7 @@ def display(img): ###################################################################### # ``crop`` is not handled effectively out-of-the-box by # ``torch.compile``: ``torch.compile`` induces a -# `"graph break" `_ +# `"graph break" `_ # on functions it is unable to handle and graph breaks are bad for performance. # The following code demonstrates this by raising an error # (``torch.compile`` with ``fullgraph=True`` raises an error if a @@ -85,9 +91,9 @@ def f(img): # # 1. wrap the function into a PyTorch custom operator. # 2. add a "``FakeTensor`` kernel" (aka "meta kernel") to the operator. -# Given the metadata (e.g. shapes) -# of the input Tensors, this function says how to compute the metadata -# of the output Tensor(s). +# Given some ``FakeTensors`` inputs (dummy Tensors that don't have storage), +# this function should return dummy Tensors of your choice with the correct +# Tensor metadata (shape/strides/``dtype``/device). from typing import Sequence @@ -106,7 +112,10 @@ def crop(pic: torch.Tensor, box: Sequence[int]) -> torch.Tensor: def _(pic, box): channels = pic.shape[0] x0, y0, x1, y1 = box - return pic.new_empty(channels, y1 - y0, x1 - x0) + result = pic.new_empty(y1 - y0, x1 - x0, channels).permute(2, 0, 1) + # The result should have the same metadata (shape/strides/``dtype``/device) + # as running the ``crop`` function above. 
+ return result ###################################################################### # After this, ``crop`` now works without graph breaks: @@ -130,6 +139,11 @@ def f(img): # ``autograd.Function`` with PyTorch operator registration APIs can lead to (and # has led to) silent incorrectness when composed with ``torch.compile``. # +# If you don't need training support, there is no need to use +# ``torch.library.register_autograd``. +# If you end up training with a ``custom_op`` that doesn't have an autograd +# registration, we'll raise an error message. +# # The gradient formula for ``crop`` is essentially ``PIL.paste`` (we'll leave the # derivation as an exercise to the reader). Let's first wrap ``paste`` into a # custom operator: @@ -203,7 +217,7 @@ def setup_context(ctx, inputs, output): ###################################################################### # Mutable Python Custom operators # ------------------------------- -# You can also wrap a Python function that mutates its inputs into a custom +# You can also wrap a Python function that mutates its inputs into a custom # operator. # Functions that mutate inputs are common because that is how many low-level # kernels are written; for example, a kernel that computes ``sin`` may take in @@ -260,5 +274,5 @@ def f(x): # For more detailed information, see: # # - `the torch.library documentation `_ -# - `the Custom Operators Manual `_ +# - `the Custom Operators Manual `_ # diff --git a/advanced_source/semi_structured_sparse.py b/advanced_source/semi_structured_sparse.py index 38c2c6878b3..e4bca79b9af 100644 --- a/advanced_source/semi_structured_sparse.py +++ b/advanced_source/semi_structured_sparse.py @@ -43,6 +43,8 @@ # - A NVIDIA GPU with semi-structured sparsity support (Compute # Capability 8.0+). # +# .. note:: This tutorial is tested on an NVIDIA A100 80GB GPU. You may not see similar speedups on newer GPU architectures, For the latest information on semi-structured sparsity support, please refer to the README `here +# # This tutorial is designed for beginners to semi-structured sparsity and # sparsity in general. 
For users with existing 2:4 sparse models, # accelerating ``nn.Linear`` layers for inference with @@ -52,7 +54,6 @@ import torch from torch.sparse import to_sparse_semi_structured, SparseSemiStructuredTensor from torch.utils.benchmark import Timer -SparseSemiStructuredTensor._FORCE_CUTLASS = True # mask Linear weight to be 2:4 sparse mask = torch.Tensor([0, 0, 1, 1]).tile((3072, 2560)).cuda().bool() @@ -207,9 +208,10 @@ import transformers # force CUTLASS use if ``cuSPARSELt`` is not available -SparseSemiStructuredTensor._FORCE_CUTLASS = True torch.manual_seed(100) +# Set default device to "cuda:0" +torch.set_default_device(torch.device("cuda:0" if torch.cuda.is_available() else "cpu")) ###################################################################### # We’ll also need to define some helper functions that are specific to the diff --git a/advanced_source/static_quantization_tutorial.rst b/advanced_source/static_quantization_tutorial.rst deleted file mode 100644 index 3b818aa03aa..00000000000 --- a/advanced_source/static_quantization_tutorial.rst +++ /dev/null @@ -1,635 +0,0 @@ -(beta) Static Quantization with Eager Mode in PyTorch -========================================================= -**Author**: `Raghuraman Krishnamoorthi `_ -**Edited by**: `Seth Weidman `_, `Jerry Zhang `_ - -This tutorial shows how to do post-training static quantization, as well as illustrating -two more advanced techniques - per-channel quantization and quantization-aware training - -to further improve the model's accuracy. Note that quantization is currently only supported -for CPUs, so we will not be utilizing GPUs / CUDA in this tutorial. -By the end of this tutorial, you will see how quantization in PyTorch can result in -significant decreases in model size while increasing speed. Furthermore, you'll see how -to easily apply some advanced quantization techniques shown -`here `_ so that your quantized models take much less -of an accuracy hit than they would otherwise. -Warning: we use a lot of boilerplate code from other PyTorch repos to, for example, -define the ``MobileNetV2`` model architecture, define data loaders, and so on. We of course -encourage you to read it; but if you want to get to the quantization features, feel free -to skip to the "4. Post-training static quantization" section. -We'll start by doing the necessary imports: - -.. code:: python - - import os - import sys - import time - import numpy as np - - import torch - import torch.nn as nn - from torch.utils.data import DataLoader - - import torchvision - from torchvision import datasets - import torchvision.transforms as transforms - - # Set up warnings - import warnings - warnings.filterwarnings( - action='ignore', - category=DeprecationWarning, - module=r'.*' - ) - warnings.filterwarnings( - action='default', - module=r'torch.ao.quantization' - ) - - # Specify random seed for repeatable results - torch.manual_seed(191009) - -1. Model architecture ---------------------- - -We first define the MobileNetV2 model architecture, with several notable modifications -to enable quantization: - -- Replacing addition with ``nn.quantized.FloatFunctional`` -- Insert ``QuantStub`` and ``DeQuantStub`` at the beginning and end of the network. -- Replace ReLU6 with ReLU - -Note: this code is taken from -`here `_. - -.. code:: python - - from torch.ao.quantization import QuantStub, DeQuantStub - - def _make_divisible(v, divisor, min_value=None): - """ - This function is taken from the original tf repo. 
- It ensures that all layers have a channel number that is divisible by 8 - It can be seen here: - https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py - :param v: - :param divisor: - :param min_value: - :return: - """ - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. - if new_v < 0.9 * v: - new_v += divisor - return new_v - - - class ConvBNReLU(nn.Sequential): - def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1): - padding = (kernel_size - 1) // 2 - super(ConvBNReLU, self).__init__( - nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False), - nn.BatchNorm2d(out_planes, momentum=0.1), - # Replace with ReLU - nn.ReLU(inplace=False) - ) - - - class InvertedResidual(nn.Module): - def __init__(self, inp, oup, stride, expand_ratio): - super(InvertedResidual, self).__init__() - self.stride = stride - assert stride in [1, 2] - - hidden_dim = int(round(inp * expand_ratio)) - self.use_res_connect = self.stride == 1 and inp == oup - - layers = [] - if expand_ratio != 1: - # pw - layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1)) - layers.extend([ - # dw - ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), - # pw-linear - nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), - nn.BatchNorm2d(oup, momentum=0.1), - ]) - self.conv = nn.Sequential(*layers) - # Replace torch.add with floatfunctional - self.skip_add = nn.quantized.FloatFunctional() - - def forward(self, x): - if self.use_res_connect: - return self.skip_add.add(x, self.conv(x)) - else: - return self.conv(x) - - - class MobileNetV2(nn.Module): - def __init__(self, num_classes=1000, width_mult=1.0, inverted_residual_setting=None, round_nearest=8): - """ - MobileNet V2 main class - Args: - num_classes (int): Number of classes - width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount - inverted_residual_setting: Network structure - round_nearest (int): Round the number of channels in each layer to be a multiple of this number - Set to 1 to turn off rounding - """ - super(MobileNetV2, self).__init__() - block = InvertedResidual - input_channel = 32 - last_channel = 1280 - - if inverted_residual_setting is None: - inverted_residual_setting = [ - # t, c, n, s - [1, 16, 1, 1], - [6, 24, 2, 2], - [6, 32, 3, 2], - [6, 64, 4, 2], - [6, 96, 3, 1], - [6, 160, 3, 2], - [6, 320, 1, 1], - ] - - # only check the first element, assuming user knows t,c,n,s are required - if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: - raise ValueError("inverted_residual_setting should be non-empty " - "or a 4-element list, got {}".format(inverted_residual_setting)) - - # building first layer - input_channel = _make_divisible(input_channel * width_mult, round_nearest) - self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) - features = [ConvBNReLU(3, input_channel, stride=2)] - # building inverted residual blocks - for t, c, n, s in inverted_residual_setting: - output_channel = _make_divisible(c * width_mult, round_nearest) - for i in range(n): - stride = s if i == 0 else 1 - features.append(block(input_channel, output_channel, stride, expand_ratio=t)) - input_channel = output_channel - # building last several layers - features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1)) - # make it nn.Sequential - 
self.features = nn.Sequential(*features) - self.quant = QuantStub() - self.dequant = DeQuantStub() - # building classifier - self.classifier = nn.Sequential( - nn.Dropout(0.2), - nn.Linear(self.last_channel, num_classes), - ) - - # weight initialization - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out') - if m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, nn.BatchNorm2d): - nn.init.ones_(m.weight) - nn.init.zeros_(m.bias) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - nn.init.zeros_(m.bias) - - def forward(self, x): - x = self.quant(x) - x = self.features(x) - x = x.mean([2, 3]) - x = self.classifier(x) - x = self.dequant(x) - return x - - # Fuse Conv+BN and Conv+BN+Relu modules prior to quantization - # This operation does not change the numerics - def fuse_model(self, is_qat=False): - fuse_modules = torch.ao.quantization.fuse_modules_qat if is_qat else torch.ao.quantization.fuse_modules - for m in self.modules(): - if type(m) == ConvBNReLU: - fuse_modules(m, ['0', '1', '2'], inplace=True) - if type(m) == InvertedResidual: - for idx in range(len(m.conv)): - if type(m.conv[idx]) == nn.Conv2d: - fuse_modules(m.conv, [str(idx), str(idx + 1)], inplace=True) - -2. Helper functions -------------------- - -We next define several helper functions to help with model evaluation. These mostly come from -`here `_. - -.. code:: python - - class AverageMeter(object): - """Computes and stores the average and current value""" - def __init__(self, name, fmt=':f'): - self.name = name - self.fmt = fmt - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' - return fmtstr.format(**self.__dict__) - - - def accuracy(output, target, topk=(1,)): - """Computes the accuracy over the k top predictions for the specified values of k""" - with torch.no_grad(): - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - - def evaluate(model, criterion, data_loader, neval_batches): - model.eval() - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') - cnt = 0 - with torch.no_grad(): - for image, target in data_loader: - output = model(image) - loss = criterion(output, target) - cnt += 1 - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - print('.', end = '') - top1.update(acc1[0], image.size(0)) - top5.update(acc5[0], image.size(0)) - if cnt >= neval_batches: - return top1, top5 - - return top1, top5 - - def load_model(model_file): - model = MobileNetV2() - state_dict = torch.load(model_file) - model.load_state_dict(state_dict) - model.to('cpu') - return model - - def print_size_of_model(model): - torch.save(model.state_dict(), "temp.p") - print('Size (MB):', os.path.getsize("temp.p")/1e6) - os.remove('temp.p') - -3. Define dataset and data loaders ----------------------------------- - -As our last major setup step, we define our dataloaders for our training and testing set. 
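
If you want to sanity-check the model and the helper functions above before
setting up the full dataset, one option is to run them against a small
synthetic dataset. The snippet below is only a sketch: it reuses the
``MobileNetV2`` and ``evaluate`` definitions from the previous sections and
feeds them ``torchvision.datasets.FakeData``, so the reported accuracy is
meaningless -- the point is just to confirm that the evaluation loop runs end
to end.

.. code:: python

    # 50 random 224x224 RGB images spread over 1000 classes
    fake_dataset = torchvision.datasets.FakeData(
        size=50,
        image_size=(3, 224, 224),
        num_classes=1000,
        transform=transforms.ToTensor(),
    )
    fake_loader = torch.utils.data.DataLoader(fake_dataset, batch_size=10)

    smoke_test_model = MobileNetV2()   # defined in section 1
    smoke_test_model.eval()
    smoke_test_criterion = nn.CrossEntropyLoss()

    top1, top5 = evaluate(smoke_test_model, smoke_test_criterion,
                          fake_loader, neval_batches=5)
    print(top1, top5)
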
- -ImageNet Data -^^^^^^^^^^^^^ - -To run the code in this tutorial using the entire ImageNet dataset, first download imagenet by following the instructions at here `ImageNet Data `_. Unzip the downloaded file into the 'data_path' folder. - -With the data downloaded, we show functions below that define dataloaders we'll use to read -in this data. These functions mostly come from -`here `_. - - -.. code:: python - - def prepare_data_loaders(data_path): - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - dataset = torchvision.datasets.ImageNet( - data_path, split="train", transform=transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - dataset_test = torchvision.datasets.ImageNet( - data_path, split="val", transform=transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])) - - train_sampler = torch.utils.data.RandomSampler(dataset) - test_sampler = torch.utils.data.SequentialSampler(dataset_test) - - data_loader = torch.utils.data.DataLoader( - dataset, batch_size=train_batch_size, - sampler=train_sampler) - - data_loader_test = torch.utils.data.DataLoader( - dataset_test, batch_size=eval_batch_size, - sampler=test_sampler) - - return data_loader, data_loader_test - - -Next, we'll load in the pre-trained MobileNetV2 model. We provide the URL to download the model -`here `_. - -.. code:: python - - data_path = '~/.data/imagenet' - saved_model_dir = 'data/' - float_model_file = 'mobilenet_pretrained_float.pth' - scripted_float_model_file = 'mobilenet_quantization_scripted.pth' - scripted_quantized_model_file = 'mobilenet_quantization_scripted_quantized.pth' - - train_batch_size = 30 - eval_batch_size = 50 - - data_loader, data_loader_test = prepare_data_loaders(data_path) - criterion = nn.CrossEntropyLoss() - float_model = load_model(saved_model_dir + float_model_file).to('cpu') - - # Next, we'll "fuse modules"; this can both make the model faster by saving on memory access - # while also improving numerical accuracy. While this can be used with any model, this is - # especially common with quantized models. - - print('\n Inverted Residual Block: Before fusion \n\n', float_model.features[1].conv) - float_model.eval() - - # Fuses modules - float_model.fuse_model() - - # Note fusion of Conv+BN+Relu and Conv+Relu - print('\n Inverted Residual Block: After fusion\n\n',float_model.features[1].conv) - - -Finally to get a "baseline" accuracy, let's see the accuracy of our un-quantized model -with fused modules - -.. code:: python - - num_eval_batches = 1000 - - print("Size of baseline model") - print_size_of_model(float_model) - - top1, top5 = evaluate(float_model, criterion, data_loader_test, neval_batches=num_eval_batches) - print('Evaluation accuracy on %d images, %2.2f'%(num_eval_batches * eval_batch_size, top1.avg)) - torch.jit.save(torch.jit.script(float_model), saved_model_dir + scripted_float_model_file) - - -On the entire model, we get an accuracy of 71.9% on the eval dataset of 50,000 images. - -This will be our baseline to compare to. Next, let's try different quantization methods - -4. 
Post-training static quantization
-------------------------------------
-
-Post-training static quantization involves not just converting the weights from float to int,
-as in dynamic quantization, but also performing the additional step of first feeding batches
-of data through the network and computing the resulting distributions of the different activations
-(specifically, this is done by inserting `observer` modules at different points that record this
-data). These distributions are then used to determine how specifically the different activations
-should be quantized at inference time (a simple technique would be to divide the entire range
-of activations into 256 levels, but we support more sophisticated methods as well). Importantly,
-this additional step allows us to pass quantized values between operations instead of converting these
-values to floats - and then back to ints - between every operation, resulting in a significant speed-up.
-
-.. code:: python
-
-    num_calibration_batches = 32
-
-    myModel = load_model(saved_model_dir + float_model_file).to('cpu')
-    myModel.eval()
-
-    # Fuse Conv, BN, and ReLU
-    myModel.fuse_model()
-
-    # Specify quantization configuration
-    # Start with simple min/max range estimation and per-tensor quantization of weights
-    myModel.qconfig = torch.ao.quantization.default_qconfig
-    print(myModel.qconfig)
-    torch.ao.quantization.prepare(myModel, inplace=True)
-
-    # Calibrate first
-    print('Post Training Quantization Prepare: Inserting Observers')
-    print('\n Inverted Residual Block: After observer insertion \n\n', myModel.features[1].conv)
-
-    # Calibrate with the training set
-    evaluate(myModel, criterion, data_loader, neval_batches=num_calibration_batches)
-    print('Post Training Quantization: Calibration done')
-
-    # Convert to quantized model
-    torch.ao.quantization.convert(myModel, inplace=True)
-    # You may see a user warning about needing to calibrate the model. This warning can be safely ignored.
-    # This warning occurs because not all modules are run during each model run, so some
-    # modules may not be calibrated.
-    print('Post Training Quantization: Convert done')
-    print('\n Inverted Residual Block: After fusion and quantization, note fused modules: \n\n',myModel.features[1].conv)
-
-    print("Size of model after quantization")
-    print_size_of_model(myModel)
-
-    top1, top5 = evaluate(myModel, criterion, data_loader_test, neval_batches=num_eval_batches)
-    print('Evaluation accuracy on %d images, %2.2f'%(num_eval_batches * eval_batch_size, top1.avg))
-
-For this quantized model, we see an accuracy of 56.7% on the eval dataset. This is because we used
-a simple min/max observer to determine quantization parameters. Nevertheless, we did reduce the size
-of our model down to just under 3.6 MB, almost a 4x decrease.
-
-In addition, we can significantly improve on the accuracy simply by using a different
-quantization configuration. We repeat the same exercise with the recommended configuration for
-quantizing for x86 architectures. This configuration does the following:
-
-- Quantizes weights on a per-channel basis
-- Uses a histogram observer that collects a histogram of activations and then picks
-  quantization parameters in an optimal manner.
-
-.. code:: python
-
-    per_channel_quantized_model = load_model(saved_model_dir + float_model_file)
-    per_channel_quantized_model.eval()
-    per_channel_quantized_model.fuse_model()
-    # The old 'fbgemm' is still available but 'x86' is the recommended default.
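    # (If you were targeting ARM / mobile CPUs instead, the analogous choice would be
    #  ``torch.ao.quantization.get_default_qconfig('qnnpack')``; the rest of this
    #  tutorial assumes an x86 CPU.)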
- per_channel_quantized_model.qconfig = torch.ao.quantization.get_default_qconfig('x86') - print(per_channel_quantized_model.qconfig) - - torch.ao.quantization.prepare(per_channel_quantized_model, inplace=True) - evaluate(per_channel_quantized_model,criterion, data_loader, num_calibration_batches) - torch.ao.quantization.convert(per_channel_quantized_model, inplace=True) - top1, top5 = evaluate(per_channel_quantized_model, criterion, data_loader_test, neval_batches=num_eval_batches) - print('Evaluation accuracy on %d images, %2.2f'%(num_eval_batches * eval_batch_size, top1.avg)) - torch.jit.save(torch.jit.script(per_channel_quantized_model), saved_model_dir + scripted_quantized_model_file) - - -Changing just this quantization configuration method resulted in an increase -of the accuracy to over 67.3%! Still, this is 4% worse than the baseline of 71.9% achieved above. -So lets try quantization aware training. - -5. Quantization-aware training ------------------------------- - -Quantization-aware training (QAT) is the quantization method that typically results in the highest accuracy. -With QAT, all weights and activations are “fake quantized” during both the forward and backward passes of -training: that is, float values are rounded to mimic int8 values, but all computations are still done with -floating point numbers. Thus, all the weight adjustments during training are made while “aware” of the fact -that the model will ultimately be quantized; after quantizing, therefore, this method will usually yield -higher accuracy than either dynamic quantization or post-training static quantization. - -The overall workflow for actually performing QAT is very similar to before: - -- We can use the same model as before: there is no additional preparation needed for quantization-aware - training. -- We need to use a ``qconfig`` specifying what kind of fake-quantization is to be inserted after weights - and activations, instead of specifying observers - -We first define a training function: - -.. code:: python - - def train_one_epoch(model, criterion, optimizer, data_loader, device, ntrain_batches): - model.train() - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') - avgloss = AverageMeter('Loss', '1.5f') - - cnt = 0 - for image, target in data_loader: - start_time = time.time() - print('.', end = '') - cnt += 1 - image, target = image.to(device), target.to(device) - output = model(image) - loss = criterion(output, target) - optimizer.zero_grad() - loss.backward() - optimizer.step() - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - top1.update(acc1[0], image.size(0)) - top5.update(acc5[0], image.size(0)) - avgloss.update(loss, image.size(0)) - if cnt >= ntrain_batches: - print('Loss', avgloss.avg) - - print('Training: * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' - .format(top1=top1, top5=top5)) - return - - print('Full imagenet train set: * Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f}' - .format(top1=top1, top5=top5)) - return - - -We fuse modules as before - -.. code:: python - - qat_model = load_model(saved_model_dir + float_model_file) - qat_model.fuse_model(is_qat=True) - - optimizer = torch.optim.SGD(qat_model.parameters(), lr = 0.0001) - # The old 'fbgemm' is still available but 'x86' is the recommended default. - qat_model.qconfig = torch.ao.quantization.get_default_qat_qconfig('x86') - -Finally, ``prepare_qat`` performs the "fake quantization", preparing the model for quantization-aware training - -.. 
code:: python - - torch.ao.quantization.prepare_qat(qat_model, inplace=True) - print('Inverted Residual Block: After preparation for QAT, note fake-quantization modules \n',qat_model.features[1].conv) - -Training a quantized model with high accuracy requires accurate modeling of numerics at -inference. For quantization aware training, therefore, we modify the training loop by: - -- Switch batch norm to use running mean and variance towards the end of training to better - match inference numerics. -- We also freeze the quantizer parameters (scale and zero-point) and fine tune the weights. - -.. code:: python - - num_train_batches = 20 - - # QAT takes time and one needs to train over a few epochs. - # Train and check accuracy after each epoch - for nepoch in range(8): - train_one_epoch(qat_model, criterion, optimizer, data_loader, torch.device('cpu'), num_train_batches) - if nepoch > 3: - # Freeze quantizer parameters - qat_model.apply(torch.ao.quantization.disable_observer) - if nepoch > 2: - # Freeze batch norm mean and variance estimates - qat_model.apply(torch.nn.intrinsic.qat.freeze_bn_stats) - - # Check the accuracy after each epoch - quantized_model = torch.ao.quantization.convert(qat_model.eval(), inplace=False) - quantized_model.eval() - top1, top5 = evaluate(quantized_model,criterion, data_loader_test, neval_batches=num_eval_batches) - print('Epoch %d :Evaluation accuracy on %d images, %2.2f'%(nepoch, num_eval_batches * eval_batch_size, top1.avg)) - -Quantization-aware training yields an accuracy of over 71.5% on the entire imagenet dataset, which is close to the floating point accuracy of 71.9%. - -More on quantization-aware training: - -- QAT is a super-set of post training quant techniques that allows for more debugging. - For example, we can analyze if the accuracy of the model is limited by weight or activation - quantization. -- We can also simulate the accuracy of a quantized model in floating point since - we are using fake-quantization to model the numerics of actual quantized arithmetic. -- We can mimic post training quantization easily too. - -Speedup from quantization -^^^^^^^^^^^^^^^^^^^^^^^^^ - -Finally, let's confirm something we alluded to above: do our quantized models actually perform inference -faster? Let's test: - -.. code:: python - - def run_benchmark(model_file, img_loader): - elapsed = 0 - model = torch.jit.load(model_file) - model.eval() - num_batches = 5 - # Run the scripted model on a few batches of images - for i, (images, target) in enumerate(img_loader): - if i < num_batches: - start = time.time() - output = model(images) - end = time.time() - elapsed = elapsed + (end-start) - else: - break - num_images = images.size()[0] * num_batches - - print('Elapsed time: %3.0f ms' % (elapsed/num_images*1000)) - return elapsed - - run_benchmark(saved_model_dir + scripted_float_model_file, data_loader_test) - - run_benchmark(saved_model_dir + scripted_quantized_model_file, data_loader_test) - -Running this locally on a MacBook pro yielded 61 ms for the regular model, and -just 20 ms for the quantized model, illustrating the typical 2-4x speedup -we see for quantized models compared to floating point ones. - -Conclusion ----------- - -In this tutorial, we showed two quantization methods - post-training static quantization, -and quantization-aware training - describing what they do "under the hood" and how to use -them in PyTorch. - -Thanks for reading! As always, we welcome any feedback, so please create an issue -`here `_ if you have any. 
diff --git a/advanced_source/super_resolution_with_onnxruntime.py b/advanced_source/super_resolution_with_onnxruntime.py deleted file mode 100644 index 264678ee17a..00000000000 --- a/advanced_source/super_resolution_with_onnxruntime.py +++ /dev/null @@ -1,356 +0,0 @@ -""" -(optional) Exporting a Model from PyTorch to ONNX and Running it using ONNX Runtime -=================================================================================== - -.. note:: - As of PyTorch 2.1, there are two versions of ONNX Exporter. - - * ``torch.onnx.dynamo_export`` is the newest (still in beta) exporter based on the TorchDynamo technology released with PyTorch 2.0. - * ``torch.onnx.export`` is based on TorchScript backend and has been available since PyTorch 1.2.0. - -In this tutorial, we describe how to convert a model defined -in PyTorch into the ONNX format using the TorchScript ``torch.onnx.export`` ONNX exporter. - -The exported model will be executed with ONNX Runtime. -ONNX Runtime is a performance-focused engine for ONNX models, -which inferences efficiently across multiple platforms and hardware -(Windows, Linux, and Mac and on both CPUs and GPUs). -ONNX Runtime has proved to considerably increase performance over -multiple models as explained `here -`__ - -For this tutorial, you will need to install `ONNX `__ -and `ONNX Runtime `__. -You can get binary builds of ONNX and ONNX Runtime with - -.. code-block:: bash - - %%bash - pip install onnx onnxruntime - -ONNX Runtime recommends using the latest stable runtime for PyTorch. - -""" - -# Some standard imports -import numpy as np - -from torch import nn -import torch.utils.model_zoo as model_zoo -import torch.onnx - - -###################################################################### -# Super-resolution is a way of increasing the resolution of images, videos -# and is widely used in image processing or video editing. For this -# tutorial, we will use a small super-resolution model. -# -# First, let's create a ``SuperResolution`` model in PyTorch. -# This model uses the efficient sub-pixel convolution layer described in -# `"Real-Time Single Image and Video Super-Resolution Using an Efficient -# Sub-Pixel Convolutional Neural Network" - Shi et al `__ -# for increasing the resolution of an image by an upscale factor. -# The model expects the Y component of the ``YCbCr`` of an image as an input, and -# outputs the upscaled Y component in super resolution. 
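# (Concretely: with ``upscale_factor = r``, an input of shape ``[N, 1, H, W]``
# produces an output of shape ``[N, 1, H*r, W*r]`` after the final ``PixelShuffle``
# layer.)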
-# -# `The -# model `__ -# comes directly from PyTorch's examples without modification: -# - -# Super Resolution model definition in PyTorch -import torch.nn as nn -import torch.nn.init as init - - -class SuperResolutionNet(nn.Module): - def __init__(self, upscale_factor, inplace=False): - super(SuperResolutionNet, self).__init__() - - self.relu = nn.ReLU(inplace=inplace) - self.conv1 = nn.Conv2d(1, 64, (5, 5), (1, 1), (2, 2)) - self.conv2 = nn.Conv2d(64, 64, (3, 3), (1, 1), (1, 1)) - self.conv3 = nn.Conv2d(64, 32, (3, 3), (1, 1), (1, 1)) - self.conv4 = nn.Conv2d(32, upscale_factor ** 2, (3, 3), (1, 1), (1, 1)) - self.pixel_shuffle = nn.PixelShuffle(upscale_factor) - - self._initialize_weights() - - def forward(self, x): - x = self.relu(self.conv1(x)) - x = self.relu(self.conv2(x)) - x = self.relu(self.conv3(x)) - x = self.pixel_shuffle(self.conv4(x)) - return x - - def _initialize_weights(self): - init.orthogonal_(self.conv1.weight, init.calculate_gain('relu')) - init.orthogonal_(self.conv2.weight, init.calculate_gain('relu')) - init.orthogonal_(self.conv3.weight, init.calculate_gain('relu')) - init.orthogonal_(self.conv4.weight) - -# Create the super-resolution model by using the above model definition. -torch_model = SuperResolutionNet(upscale_factor=3) - - -###################################################################### -# Ordinarily, you would now train this model; however, for this tutorial, -# we will instead download some pretrained weights. Note that this model -# was not trained fully for good accuracy and is used here for -# demonstration purposes only. -# -# It is important to call ``torch_model.eval()`` or ``torch_model.train(False)`` -# before exporting the model, to turn the model to inference mode. -# This is required since operators like dropout or batchnorm behave -# differently in inference and training mode. -# - -# Load pretrained model weights -model_url = 'https://s3.amazonaws.com/pytorch/test_data/export/superres_epoch100-44c6958e.pth' -batch_size = 64 # just a random number - -# Initialize model with the pretrained weights -map_location = lambda storage, loc: storage -if torch.cuda.is_available(): - map_location = None -torch_model.load_state_dict(model_zoo.load_url(model_url, map_location=map_location)) - -# set the model to inference mode -torch_model.eval() - - -###################################################################### -# Exporting a model in PyTorch works via tracing or scripting. This -# tutorial will use as an example a model exported by tracing. -# To export a model, we call the ``torch.onnx.export()`` function. -# This will execute the model, recording a trace of what operators -# are used to compute the outputs. -# Because ``export`` runs the model, we need to provide an input -# tensor ``x``. The values in this can be random as long as it is the -# right type and size. -# Note that the input size will be fixed in the exported ONNX graph for -# all the input's dimensions, unless specified as a dynamic axes. -# In this example we export the model with an input of batch_size 1, -# but then specify the first dimension as dynamic in the ``dynamic_axes`` -# parameter in ``torch.onnx.export()``. -# The exported model will thus accept inputs of size [batch_size, 1, 224, 224] -# where batch_size can be variable. -# -# To learn more details about PyTorch's export interface, check out the -# `torch.onnx documentation `__. 
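# (Purely as an illustration, not used below: since this network is fully
# convolutional, the spatial axes could in principle also be declared dynamic,
# e.g. ``dynamic_axes={'input': {0: 'batch_size', 2: 'height', 3: 'width'},
# 'output': {0: 'batch_size', 2: 'height', 3: 'width'}}``.)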
-# - -# Input to the model -x = torch.randn(batch_size, 1, 224, 224, requires_grad=True) -torch_out = torch_model(x) - -# Export the model -torch.onnx.export(torch_model, # model being run - x, # model input (or a tuple for multiple inputs) - "super_resolution.onnx", # where to save the model (can be a file or file-like object) - export_params=True, # store the trained parameter weights inside the model file - opset_version=10, # the ONNX version to export the model to - do_constant_folding=True, # whether to execute constant folding for optimization - input_names = ['input'], # the model's input names - output_names = ['output'], # the model's output names - dynamic_axes={'input' : {0 : 'batch_size'}, # variable length axes - 'output' : {0 : 'batch_size'}}) - -###################################################################### -# We also computed ``torch_out``, the output after of the model, -# which we will use to verify that the model we exported computes -# the same values when run in ONNX Runtime. -# -# But before verifying the model's output with ONNX Runtime, we will check -# the ONNX model with ONNX API. -# First, ``onnx.load("super_resolution.onnx")`` will load the saved model and -# will output a ``onnx.ModelProto`` structure (a top-level file/container format for bundling a ML model. -# For more information `onnx.proto documentation `__.). -# Then, ``onnx.checker.check_model(onnx_model)`` will verify the model's structure -# and confirm that the model has a valid schema. -# The validity of the ONNX graph is verified by checking the model's -# version, the graph's structure, as well as the nodes and their inputs -# and outputs. -# - -import onnx - -onnx_model = onnx.load("super_resolution.onnx") -onnx.checker.check_model(onnx_model) - - -###################################################################### -# Now let's compute the output using ONNX Runtime's Python APIs. -# This part can normally be done in a separate process or on another -# machine, but we will continue in the same process so that we can -# verify that ONNX Runtime and PyTorch are computing the same value -# for the network. -# -# In order to run the model with ONNX Runtime, we need to create an -# inference session for the model with the chosen configuration -# parameters (here we use the default config). -# Once the session is created, we evaluate the model using the run() API. -# The output of this call is a list containing the outputs of the model -# computed by ONNX Runtime. -# - -import onnxruntime - -ort_session = onnxruntime.InferenceSession("super_resolution.onnx", providers=["CPUExecutionProvider"]) - -def to_numpy(tensor): - return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() - -# compute ONNX Runtime output prediction -ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(x)} -ort_outs = ort_session.run(None, ort_inputs) - -# compare ONNX Runtime and PyTorch results -np.testing.assert_allclose(to_numpy(torch_out), ort_outs[0], rtol=1e-03, atol=1e-05) - -print("Exported model has been tested with ONNXRuntime, and the result looks good!") - - -###################################################################### -# We should see that the output of PyTorch and ONNX Runtime runs match -# numerically with the given precision (``rtol=1e-03`` and ``atol=1e-05``). -# As a side-note, if they do not match then there is an issue in the -# ONNX exporter, so please contact us in that case. 
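# As an optional extra check (not part of the original flow), ``onnx.helper`` can
# print a human-readable summary of the exported graph, which is handy when
# debugging export problems:

print(onnx.helper.printable_graph(onnx_model.graph))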
-# - -###################################################################### -# Timing Comparison Between Models -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# - -###################################################################### -# Since ONNX models optimize for inference speed, running the same -# data on an ONNX model instead of a native pytorch model should result in an -# improvement of up to 2x. Improvement is more pronounced with higher batch sizes. - - -import time - -x = torch.randn(batch_size, 1, 224, 224, requires_grad=True) - -start = time.time() -torch_out = torch_model(x) -end = time.time() -print(f"Inference of Pytorch model used {end - start} seconds") - -ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(x)} -start = time.time() -ort_outs = ort_session.run(None, ort_inputs) -end = time.time() -print(f"Inference of ONNX model used {end - start} seconds") - - -###################################################################### -# Running the model on an image using ONNX Runtime -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# - - -###################################################################### -# So far we have exported a model from PyTorch and shown how to load it -# and run it in ONNX Runtime with a dummy tensor as an input. - -###################################################################### -# For this tutorial, we will use a famous cat image used widely which -# looks like below -# -# .. figure:: /_static/img/cat_224x224.jpg -# :alt: cat -# - -###################################################################### -# First, let's load the image, preprocess it using standard PIL -# python library. Note that this preprocessing is the standard practice of -# processing data for training/testing neural networks. -# -# We first resize the image to fit the size of the model's input (224x224). -# Then we split the image into its Y, Cb, and Cr components. -# These components represent a grayscale image (Y), and -# the blue-difference (Cb) and red-difference (Cr) chroma components. -# The Y component being more sensitive to the human eye, we are -# interested in this component which we will be transforming. -# After extracting the Y component, we convert it to a tensor which -# will be the input of our model. -# - -from PIL import Image -import torchvision.transforms as transforms - -img = Image.open("./_static/img/cat.jpg") - -resize = transforms.Resize([224, 224]) -img = resize(img) - -img_ycbcr = img.convert('YCbCr') -img_y, img_cb, img_cr = img_ycbcr.split() - -to_tensor = transforms.ToTensor() -img_y = to_tensor(img_y) -img_y.unsqueeze_(0) - - -###################################################################### -# Now, as a next step, let's take the tensor representing the -# grayscale resized cat image and run the super-resolution model in -# ONNX Runtime as explained previously. -# - -ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(img_y)} -ort_outs = ort_session.run(None, ort_inputs) -img_out_y = ort_outs[0] - - -###################################################################### -# At this point, the output of the model is a tensor. -# Now, we'll process the output of the model to construct back the -# final output image from the output tensor, and save the image. -# The post-processing steps have been adopted from PyTorch -# implementation of super-resolution model -# `here `__. 
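# (Only the Y channel goes through the network; the Cb and Cr chroma channels are
# simply resized with bicubic interpolation and merged back in below, since the
# human eye is much less sensitive to chroma resolution than to luma.)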
-# - -img_out_y = Image.fromarray(np.uint8((img_out_y[0] * 255.0).clip(0, 255)[0]), mode='L') - -# get the output image follow post-processing step from PyTorch implementation -final_img = Image.merge( - "YCbCr", [ - img_out_y, - img_cb.resize(img_out_y.size, Image.BICUBIC), - img_cr.resize(img_out_y.size, Image.BICUBIC), - ]).convert("RGB") - -# Save the image, we will compare this with the output image from mobile device -final_img.save("./_static/img/cat_superres_with_ort.jpg") - -# Save resized original image (without super-resolution) -img = transforms.Resize([img_out_y.size[0], img_out_y.size[1]])(img) -img.save("cat_resized.jpg") - -###################################################################### -# Here is the comparison between the two images: -# -# .. figure:: /_static/img/cat_resized.jpg -# -# Low-resolution image -# -# .. figure:: /_static/img/cat_superres_with_ort.jpg -# -# Image after super-resolution -# -# -# ONNX Runtime being a cross platform engine, you can run it across -# multiple platforms and on both CPUs and GPUs. -# -# ONNX Runtime can also be deployed to the cloud for model inferencing -# using Azure Machine Learning Services. More information `here `__. -# -# More information about ONNX Runtime's performance `here `__. -# -# -# For more information about ONNX Runtime `here `__. -# diff --git a/advanced_source/torch-script-parallelism.rst b/advanced_source/torch-script-parallelism.rst index 5a2fd86e1f6..56c4bcbaae7 100644 --- a/advanced_source/torch-script-parallelism.rst +++ b/advanced_source/torch-script-parallelism.rst @@ -1,279 +1,3 @@ -Dynamic Parallelism in TorchScript -================================== - -In this tutorial, we introduce the syntax for doing *dynamic inter-op parallelism* -in TorchScript. This parallelism has the following properties: - -* dynamic - The number of parallel tasks created and their workload can depend on the control flow of the program. -* inter-op - The parallelism is concerned with running TorchScript program fragments in parallel. This is distinct from *intra-op parallelism*, which is concerned with splitting up individual operators and running subsets of the operator's work in parallel. -Basic Syntax ------------- - -The two important APIs for dynamic parallelism are: - -* ``torch.jit.fork(fn : Callable[..., T], *args, **kwargs) -> torch.jit.Future[T]`` -* ``torch.jit.wait(fut : torch.jit.Future[T]) -> T`` - -A good way to demonstrate how these work is by way of an example: - -.. code-block:: python - - import torch - - def foo(x): - return torch.neg(x) - - @torch.jit.script - def example(x): - # Call `foo` using parallelism: - # First, we "fork" off a task. This task will run `foo` with argument `x` - future = torch.jit.fork(foo, x) - - # Call `foo` normally - x_normal = foo(x) - - # Second, we "wait" on the task. Since the task may be running in - # parallel, we have to "wait" for its result to become available. - # Notice that by having lines of code between the "fork()" and "wait()" - # call for a given Future, we can overlap computations so that they - # run in parallel. - x_parallel = torch.jit.wait(future) - - return x_normal, x_parallel - - print(example(torch.ones(1))) # (-1., -1.) - - -``fork()`` takes the callable ``fn`` and arguments to that callable ``args`` -and ``kwargs`` and creates an asynchronous task for the execution of ``fn``. -``fn`` can be a function, method, or Module instance. ``fork()`` returns a -reference to the value of the result of this execution, called a ``Future``. 
-Because ``fork`` returns immediately after creating the async task, ``fn`` may -not have been executed by the time the line of code after the ``fork()`` call -is executed. Thus, ``wait()`` is used to wait for the async task to complete -and return the value. - -These constructs can be used to overlap the execution of statements within a -function (shown in the worked example section) or be composed with other language -constructs like loops: - -.. code-block:: python - - import torch - from typing import List - - def foo(x): - return torch.neg(x) - - @torch.jit.script - def example(x): - futures : List[torch.jit.Future[torch.Tensor]] = [] - for _ in range(100): - futures.append(torch.jit.fork(foo, x)) - - results = [] - for future in futures: - results.append(torch.jit.wait(future)) - - return torch.sum(torch.stack(results)) - - print(example(torch.ones([]))) - -.. note:: - - When we initialized an empty list of Futures, we needed to add an explicit - type annotation to ``futures``. In TorchScript, empty containers default - to assuming they contain Tensor values, so we annotate the list constructor - # as being of type ``List[torch.jit.Future[torch.Tensor]]`` - -This example uses ``fork()`` to launch 100 instances of the function ``foo``, -waits on the 100 tasks to complete, then sums the results, returning ``-100.0``. - -Applied Example: Ensemble of Bidirectional LSTMs ------------------------------------------------- - -Let's try to apply parallelism to a more realistic example and see what sort -of performance we can get out of it. First, let's define the baseline model: an -ensemble of bidirectional LSTM layers. - -.. code-block:: python - - import torch, time - - # In RNN parlance, the dimensions we care about are: - # # of time-steps (T) - # Batch size (B) - # Hidden size/number of "channels" (C) - T, B, C = 50, 50, 1024 - - # A module that defines a single "bidirectional LSTM". This is simply two - # LSTMs applied to the same sequence, but one in reverse - class BidirectionalRecurrentLSTM(torch.nn.Module): - def __init__(self): - super().__init__() - self.cell_f = torch.nn.LSTM(input_size=C, hidden_size=C) - self.cell_b = torch.nn.LSTM(input_size=C, hidden_size=C) - - def forward(self, x : torch.Tensor) -> torch.Tensor: - # Forward layer - output_f, _ = self.cell_f(x) - - # Backward layer. Flip input in the time dimension (dim 0), apply the - # layer, then flip the outputs in the time dimension - x_rev = torch.flip(x, dims=[0]) - output_b, _ = self.cell_b(torch.flip(x, dims=[0])) - output_b_rev = torch.flip(output_b, dims=[0]) - - return torch.cat((output_f, output_b_rev), dim=2) - - - # An "ensemble" of `BidirectionalRecurrentLSTM` modules. The modules in the - # ensemble are run one-by-one on the same input then their results are - # stacked and summed together, returning the combined result. 
- class LSTMEnsemble(torch.nn.Module): - def __init__(self, n_models): - super().__init__() - self.n_models = n_models - self.models = torch.nn.ModuleList([ - BidirectionalRecurrentLSTM() for _ in range(self.n_models)]) - - def forward(self, x : torch.Tensor) -> torch.Tensor: - results = [] - for model in self.models: - results.append(model(x)) - return torch.stack(results).sum(dim=0) - - # For a head-to-head comparison to what we're going to do with fork/wait, let's - # instantiate the model and compile it with TorchScript - ens = torch.jit.script(LSTMEnsemble(n_models=4)) - - # Normally you would pull this input out of an embedding table, but for the - # purpose of this demo let's just use random data. - x = torch.rand(T, B, C) - - # Let's run the model once to warm up things like the memory allocator - ens(x) - - x = torch.rand(T, B, C) - - # Let's see how fast it runs! - s = time.time() - ens(x) - print('Inference took', time.time() - s, ' seconds') - -On my machine, this network runs in ``2.05`` seconds. We can do a lot better! - -Parallelizing Forward and Backward Layers ------------------------------------------ - -A very simple thing we can do is parallelize the forward and backward layers -within ``BidirectionalRecurrentLSTM``. For this, the structure of the computation -is static, so we don't actually even need any loops. Let's rewrite the ``forward`` -method of ``BidirectionalRecurrentLSTM`` like so: - -.. code-block:: python - - def forward(self, x : torch.Tensor) -> torch.Tensor: - # Forward layer - fork() so this can run in parallel to the backward - # layer - future_f = torch.jit.fork(self.cell_f, x) - - # Backward layer. Flip input in the time dimension (dim 0), apply the - # layer, then flip the outputs in the time dimension - x_rev = torch.flip(x, dims=[0]) - output_b, _ = self.cell_b(torch.flip(x, dims=[0])) - output_b_rev = torch.flip(output_b, dims=[0]) - - # Retrieve the output from the forward layer. Note this needs to happen - # *after* the stuff we want to parallelize with - output_f, _ = torch.jit.wait(future_f) - - return torch.cat((output_f, output_b_rev), dim=2) - -In this example, ``forward()`` delegates execution of ``cell_f`` to another thread, -while it continues to execute ``cell_b``. This causes the execution of both the -cells to be overlapped with each other. - -Running the script again with this simple modification yields a runtime of -``1.71`` seconds for an improvement of ``17%``! - -Aside: Visualizing Parallelism ------------------------------- - -We're not done optimizing our model but it's worth introducing the tooling we -have for visualizing performance. One important tool is the `PyTorch profiler `_. - -Let's use the profiler along with the Chrome trace export functionality to -visualize the performance of our parallelized model: - -.. code-block:: python - - with torch.autograd.profiler.profile() as prof: - ens(x) - prof.export_chrome_trace('parallel.json') - -This snippet of code will write out a file named ``parallel.json``. If you -navigate Google Chrome to ``chrome://tracing``, click the ``Load`` button, and -load in that JSON file, you should see a timeline like the following: - -.. image:: https://i.imgur.com/rm5hdG9.png - -The horizontal axis of the timeline represents time and the vertical axis -represents threads of execution. As we can see, we are running two ``lstm`` -instances at a time. This is the result of our hard work parallelizing the -bidirectional layers! 
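If you prefer, the same trace can also be produced with the newer ``torch.profiler``
API; the snippet below is an equivalent sketch (the legacy
``torch.autograd.profiler`` call above works just as well for this tutorial):

.. code-block:: python

    from torch.profiler import profile, ProfilerActivity

    with profile(activities=[ProfilerActivity.CPU]) as prof:
        ens(x)
    prof.export_chrome_trace('parallel.json')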
- -Parallelizing Models in the Ensemble ------------------------------------- - -You may have noticed that there is a further parallelization opportunity in our -code: we can also run the models contained in ``LSTMEnsemble`` in parallel with -each other. The way to do that is simple enough, this is how we should change -the ``forward`` method of ``LSTMEnsemble``: - -.. code-block:: python - - def forward(self, x : torch.Tensor) -> torch.Tensor: - # Launch tasks for each model - futures : List[torch.jit.Future[torch.Tensor]] = [] - for model in self.models: - futures.append(torch.jit.fork(model, x)) - - # Collect the results from the launched tasks - results : List[torch.Tensor] = [] - for future in futures: - results.append(torch.jit.wait(future)) - - return torch.stack(results).sum(dim=0) - -Or, if you value brevity, we can use list comprehensions: - -.. code-block:: python - - def forward(self, x : torch.Tensor) -> torch.Tensor: - futures = [torch.jit.fork(model, x) for model in self.models] - results = [torch.jit.wait(fut) for fut in futures] - return torch.stack(results).sum(dim=0) - -Like described in the intro, we've used loops to fork off tasks for each of the -models in our ensemble. We've then used another loop to wait for all of the -tasks to be completed. This provides even more overlap of computation. - -With this small update, the script runs in ``1.4`` seconds, for a total speedup -of ``32%``! Pretty good for two lines of code. - -We can also use the Chrome tracer again to see where's going on: - -.. image:: https://i.imgur.com/kA0gyQm.png - -We can now see that all ``LSTM`` instances are being run fully in parallel. - -Conclusion ----------- - -In this tutorial, we learned about ``fork()`` and ``wait()``, the basic APIs -for doing dynamic, inter-op parallelism in TorchScript. We saw a few typical -usage patterns for using these functions to parallelize the execution of -functions, methods, or ``Modules`` in TorchScript code. Finally, we worked through -an example of optimizing a model using this technique and explored the performance -measurement and visualization tooling available in PyTorch. +.. warning:: + TorchScript is deprecated, please use + `torch.export `__ instead. \ No newline at end of file diff --git a/advanced_source/torch_script_custom_ops.rst b/advanced_source/torch_script_custom_ops.rst index 55497d5defa..01bc497d38e 100644 --- a/advanced_source/torch_script_custom_ops.rst +++ b/advanced_source/torch_script_custom_ops.rst @@ -1,1033 +1,6 @@ -Extending TorchScript with Custom C++ Operators -=============================================== +.. + TODO(gmagogsfm): Replace/delete this document by 2.9 release. https://github.com/pytorch/tutorials/issues/3456 -The PyTorch 1.0 release introduced a new programming model to PyTorch called -`TorchScript `_. TorchScript is a -subset of the Python programming language which can be parsed, compiled and -optimized by the TorchScript compiler. Further, compiled TorchScript models have -the option of being serialized into an on-disk file format, which you can -subsequently load and run from pure C++ (as well as Python) for inference. - -TorchScript supports a large subset of operations provided by the ``torch`` -package, allowing you to express many kinds of complex models purely as a series -of tensor operations from PyTorch's "standard library". Nevertheless, there may -be times where you find yourself in need of extending TorchScript with a custom -C++ or CUDA function. 
While we recommend that you only resort to this option if -your idea cannot be expressed (efficiently enough) as a simple Python function, -we do provide a very friendly and simple interface for defining custom C++ and -CUDA kernels using `ATen `_, PyTorch's high -performance C++ tensor library. Once bound into TorchScript, you can embed these -custom kernels (or "ops") into your TorchScript model and execute them both in -Python and in their serialized form directly in C++. - -The following paragraphs give an example of writing a TorchScript custom op to -call into `OpenCV `_, a computer vision library written -in C++. We will discuss how to work with tensors in C++, how to efficiently -convert them to third party tensor formats (in this case, OpenCV ``Mat``), how -to register your operator with the TorchScript runtime and finally how to -compile the operator and use it in Python and C++. - -Implementing the Custom Operator in C++ ---------------------------------------- - -For this tutorial, we'll be exposing the `warpPerspective -`_ -function, which applies a perspective transformation to an image, from OpenCV to -TorchScript as a custom operator. The first step is to write the implementation -of our custom operator in C++. Let's call the file for this implementation -``op.cpp`` and make it look like this: - -.. literalinclude:: ../advanced_source/torch_script_custom_ops/op.cpp - :language: cpp - :start-after: BEGIN warp_perspective - :end-before: END warp_perspective - -The code for this operator is quite short. At the top of the file, we include -the OpenCV header file, ``opencv2/opencv.hpp``, alongside the ``torch/script.h`` -header which exposes all the necessary goodies from PyTorch's C++ API that we -need to write custom TorchScript operators. Our function ``warp_perspective`` -takes two arguments: an input ``image`` and the ``warp`` transformation matrix -we wish to apply to the image. The type of these inputs is ``torch::Tensor``, -PyTorch's tensor type in C++ (which is also the underlying type of all tensors -in Python). The return type of our ``warp_perspective`` function will also be a -``torch::Tensor``. - -.. tip:: - - See `this note `_ for - more information about ATen, the library that provides the ``Tensor`` class to - PyTorch. Further, `this tutorial - `_ describes how to - allocate and initialize new tensor objects in C++ (not required for this - operator). - -.. attention:: - - The TorchScript compiler understands a fixed number of types. Only these types - can be used as arguments to your custom operator. Currently these types are: - ``torch::Tensor``, ``torch::Scalar``, ``double``, ``int64_t`` and - ``std::vector`` s of these types. Note that *only* ``double`` and *not* - ``float``, and *only* ``int64_t`` and *not* other integral types such as - ``int``, ``short`` or ``long`` are supported. - -Inside of our function, the first thing we need to do is convert our PyTorch -tensors to OpenCV matrices, as OpenCV's ``warpPerspective`` expects ``cv::Mat`` -objects as inputs. Fortunately, there is a way to do this **without copying -any** data. In the first few lines, - -.. literalinclude:: ../advanced_source/torch_script_custom_ops/op.cpp - :language: cpp - :start-after: BEGIN image_mat - :end-before: END image_mat - -we are calling `this constructor -`_ -of the OpenCV ``Mat`` class to convert our tensor to a ``Mat`` object. 
We pass -it the number of rows and columns of the original ``image`` tensor, the datatype -(which we'll fix as ``float32`` for this example), and finally a raw pointer to -the underlying data -- a ``float*``. What is special about this constructor of -the ``Mat`` class is that it does not copy the input data. Instead, it will -simply reference this memory for all operations performed on the ``Mat``. If an -in-place operation is performed on the ``image_mat``, this will be reflected in -the original ``image`` tensor (and vice-versa). This allows us to call -subsequent OpenCV routines with the library's native matrix type, even though -we're actually storing the data in a PyTorch tensor. We repeat this procedure to -convert the ``warp`` PyTorch tensor to the ``warp_mat`` OpenCV matrix: - -.. literalinclude:: ../advanced_source/torch_script_custom_ops/op.cpp - :language: cpp - :start-after: BEGIN warp_mat - :end-before: END warp_mat - -Next, we are ready to call the OpenCV function we were so eager to use in -TorchScript: ``warpPerspective``. For this, we pass the OpenCV function the -``image_mat`` and ``warp_mat`` matrices, as well as an empty output matrix -called ``output_mat``. We also specify the size ``dsize`` we want the output -matrix (image) to be. It is hardcoded to ``8 x 8`` for this example: - -.. literalinclude:: ../advanced_source/torch_script_custom_ops/op.cpp - :language: cpp - :start-after: BEGIN output_mat - :end-before: END output_mat - -The final step in our custom operator implementation is to convert the -``output_mat`` back into a PyTorch tensor, so that we can further use it in -PyTorch. This is strikingly similar to what we did earlier to convert in the -other direction. In this case, PyTorch provides a ``torch::from_blob`` method. A -*blob* in this case is intended to mean some opaque, flat pointer to memory that -we want to interpret as a PyTorch tensor. The call to ``torch::from_blob`` looks -like this: - -.. literalinclude:: ../advanced_source/torch_script_custom_ops/op.cpp - :language: cpp - :start-after: BEGIN output_tensor - :end-before: END output_tensor - -We use the ``.ptr()`` method on the OpenCV ``Mat`` class to get a raw -pointer to the underlying data (just like ``.data_ptr()`` for the PyTorch -tensor earlier). We also specify the output shape of the tensor, which we -hardcoded as ``8 x 8``. The output of ``torch::from_blob`` is then a -``torch::Tensor``, pointing to the memory owned by the OpenCV matrix. - -Before returning this tensor from our operator implementation, we must call -``.clone()`` on the tensor to perform a memory copy of the underlying data. The -reason for this is that ``torch::from_blob`` returns a tensor that does not own -its data. At that point, the data is still owned by the OpenCV matrix. However, -this OpenCV matrix will go out of scope and be deallocated at the end of the -function. If we returned the ``output`` tensor as-is, it would point to invalid -memory by the time we use it outside the function. Calling ``.clone()`` returns -a new tensor with a copy of the original data that the new tensor owns itself. -It is thus safe to return to the outside world. - -Registering the Custom Operator with TorchScript ------------------------------------------------- - -Now that have implemented our custom operator in C++, we need to *register* it -with the TorchScript runtime and compiler. This will allow the TorchScript -compiler to resolve references to our custom operator in TorchScript code. 
-If you have ever used the pybind11 library, our syntax for registration
-resembles the pybind11 syntax very closely. To register a single function,
-we write:
-
-.. literalinclude:: ../advanced_source/torch_script_custom_ops/op.cpp
-   :language: cpp
-   :start-after: BEGIN registry
-   :end-before: END registry
-
-somewhere at the top level of our ``op.cpp`` file. The ``TORCH_LIBRARY`` macro
-creates a function that will be called when your program starts. The name
-of your library (``my_ops``) is given as the first argument (it should not
-be in quotes). The second argument (``m``) defines a variable of type
-``torch::Library`` which is the main interface to register your operators.
-The method ``Library::def`` actually creates an operator named ``warp_perspective``,
-exposing it to both Python and TorchScript. You can define as many operators
-as you like by making multiple calls to ``def``.
-
-Behind the scenes, the ``def`` function is actually doing quite a bit of work:
-it is using template metaprogramming to inspect the type signature of your
-function and translate it into an operator schema which specifies the operator's
-type within TorchScript's type system.
-
-Building the Custom Operator
-----------------------------
-
-Now that we have implemented our custom operator in C++ and written its
-registration code, it is time to build the operator into a (shared) library that
-we can load into Python for research and experimentation, or into C++ for
-inference in a no-Python environment. There exist multiple ways to build our
-operator, using either pure CMake, or Python alternatives like ``setuptools``.
-For brevity, the paragraphs below only discuss the CMake approach. The appendix
-of this tutorial dives into other alternatives.
-
-Environment setup
-*****************
-
-We need an installation of PyTorch and OpenCV. The easiest and most platform
-independent way to get both is via Conda::
-
-   conda install -c pytorch pytorch
-   conda install opencv
-
-Building with CMake
-*******************
-
-To build our custom operator into a shared library using the `CMake
-`_ build system, we need to write a short ``CMakeLists.txt``
-file and place it with our previous ``op.cpp`` file. For this, let's agree on
-a directory structure that looks like this::
-
-   warp-perspective/
-     op.cpp
-     CMakeLists.txt
-
-The contents of our ``CMakeLists.txt`` file should then be the following:
-
-.. literalinclude:: ../advanced_source/torch_script_custom_ops/CMakeLists.txt
-   :language: cpp
-
-To now build our operator, we can run the following commands from our
-``warp_perspective`` folder:
-
-.. code-block:: shell
-
-   $ mkdir build
-   $ cd build
-   $ cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" ..
- -- The C compiler identification is GNU 5.4.0 - -- The CXX compiler identification is GNU 5.4.0 - -- Check for working C compiler: /usr/bin/cc - -- Check for working C compiler: /usr/bin/cc -- works - -- Detecting C compiler ABI info - -- Detecting C compiler ABI info - done - -- Detecting C compile features - -- Detecting C compile features - done - -- Check for working CXX compiler: /usr/bin/c++ - -- Check for working CXX compiler: /usr/bin/c++ -- works - -- Detecting CXX compiler ABI info - -- Detecting CXX compiler ABI info - done - -- Detecting CXX compile features - -- Detecting CXX compile features - done - -- Looking for pthread.h - -- Looking for pthread.h - found - -- Looking for pthread_create - -- Looking for pthread_create - not found - -- Looking for pthread_create in pthreads - -- Looking for pthread_create in pthreads - not found - -- Looking for pthread_create in pthread - -- Looking for pthread_create in pthread - found - -- Found Threads: TRUE - -- Found torch: /libtorch/lib/libtorch.so - -- Configuring done - -- Generating done - -- Build files have been written to: /warp_perspective/build - $ make -j - Scanning dependencies of target warp_perspective - [ 50%] Building CXX object CMakeFiles/warp_perspective.dir/op.cpp.o - [100%] Linking CXX shared library libwarp_perspective.so - [100%] Built target warp_perspective - -which will place a ``libwarp_perspective.so`` shared library file in the -``build`` folder. In the ``cmake`` command above, we use the helper -variable ``torch.utils.cmake_prefix_path`` to conveniently tell us where -the cmake files for our PyTorch install are. - -We will explore how to use and call our operator in detail further below, but to -get an early sensation of success, we can try running the following code in -Python: - -.. literalinclude:: ../advanced_source/torch_script_custom_ops/smoke_test.py - :language: python - -If all goes well, this should print something like:: - - - -which is the Python function we will later use to invoke our custom operator. - -Using the TorchScript Custom Operator in Python ------------------------------------------------ - -Once our custom operator is built into a shared library we are ready to use -this operator in our TorchScript models in Python. There are two parts to this: -first loading the operator into Python, and second using the operator in -TorchScript code. - -You already saw how to import your operator into Python: -``torch.ops.load_library()``. This function takes the path to a shared library -containing custom operators, and loads it into the current process. Loading the -shared library will also execute the ``TORCH_LIBRARY`` block. This will register -our custom operator with the TorchScript compiler and allow us to use that -operator in TorchScript code. - -You can refer to your loaded operator as ``torch.ops..``, -where ```` is the namespace part of your operator name, and -```` the function name of your operator. For the operator we wrote -above, the namespace was ``my_ops`` and the function name ``warp_perspective``, -which means our operator is available as ``torch.ops.my_ops.warp_perspective``. -While this function can be used in scripted or traced TorchScript modules, we -can also just use it in vanilla eager PyTorch and pass it regular PyTorch -tensors: - -.. literalinclude:: ../advanced_source/torch_script_custom_ops/test.py - :language: python - :prepend: import torch - :start-after: BEGIN preamble - :end-before: END preamble - -producing: - -.. 
code-block:: python - - tensor([[0.0000, 0.3218, 0.4611, ..., 0.4636, 0.4636, 0.4636], - [0.3746, 0.0978, 0.5005, ..., 0.4636, 0.4636, 0.4636], - [0.3245, 0.0169, 0.0000, ..., 0.4458, 0.4458, 0.4458], - ..., - [0.1862, 0.1862, 0.1692, ..., 0.0000, 0.0000, 0.0000], - [0.1862, 0.1862, 0.1692, ..., 0.0000, 0.0000, 0.0000], - [0.1862, 0.1862, 0.1692, ..., 0.0000, 0.0000, 0.0000]]) - - -.. note:: - - What happens behind the scenes is that the first time you access - ``torch.ops.namespace.function`` in Python, the TorchScript compiler (in C++ - land) will see if a function ``namespace::function`` has been registered, and - if so, return a Python handle to this function that we can subsequently use to - call into our C++ operator implementation from Python. This is one noteworthy - difference between TorchScript custom operators and C++ extensions: C++ - extensions are bound manually using pybind11, while TorchScript custom ops are - bound on the fly by PyTorch itself. Pybind11 gives you more flexibility with - regards to what types and classes you can bind into Python and is thus - recommended for purely eager code, but it is not supported for TorchScript - ops. - -From here on, you can use your custom operator in scripted or traced code just -as you would other functions from the ``torch`` package. In fact, "standard -library" functions like ``torch.matmul`` go through largely the same -registration path as custom operators, which makes custom operators really -first-class citizens when it comes to how and where they can be used in -TorchScript. (One difference, however, is that standard library functions -have custom written Python argument parsing logic that differs from -``torch.ops`` argument parsing.) - -Using the Custom Operator with Tracing -************************************** - -Let's start by embedding our operator in a traced function. Recall that for -tracing, we start with some vanilla Pytorch code: - -.. literalinclude:: ../advanced_source/torch_script_custom_ops/test.py - :language: python - :start-after: BEGIN compute - :end-before: END compute - -and then call ``torch.jit.trace`` on it. We further pass ``torch.jit.trace`` -some example inputs, which it will forward to our implementation to record the -sequence of operations that occur as the inputs flow through it. The result of -this is effectively a "frozen" version of the eager PyTorch program, which the -TorchScript compiler can further analyze, optimize and serialize: - -.. literalinclude:: ../advanced_source/torch_script_custom_ops/test.py - :language: python - :start-after: BEGIN trace - :end-before: END trace - -Producing:: - - graph(%x : Float(4:8, 8:1), - %y : Float(8:5, 5:1), - %z : Float(4:5, 5:1)): - %3 : Float(4:5, 5:1) = aten::matmul(%x, %y) # test.py:10:0 - %4 : Float(4:5, 5:1) = aten::relu(%z) # test.py:10:0 - %5 : int = prim::Constant[value=1]() # test.py:10:0 - %6 : Float(4:5, 5:1) = aten::add(%3, %4, %5) # test.py:10:0 - return (%6) - -Now, the exciting revelation is that we can simply drop our custom operator into -our PyTorch trace as if it were ``torch.relu`` or any other ``torch`` function: - -.. literalinclude:: ../advanced_source/torch_script_custom_ops/test.py - :language: python - :start-after: BEGIN compute2 - :end-before: END compute2 - -and then trace it as before: - -.. 
literalinclude:: ../advanced_source/torch_script_custom_ops/test.py - :language: python - :start-after: BEGIN trace2 - :end-before: END trace2 - -Producing:: - - graph(%x.1 : Float(4:8, 8:1), - %y : Float(8:5, 5:1), - %z : Float(8:5, 5:1)): - %3 : int = prim::Constant[value=3]() # test.py:25:0 - %4 : int = prim::Constant[value=6]() # test.py:25:0 - %5 : int = prim::Constant[value=0]() # test.py:25:0 - %6 : Device = prim::Constant[value="cpu"]() # test.py:25:0 - %7 : bool = prim::Constant[value=0]() # test.py:25:0 - %8 : Float(3:3, 3:1) = aten::eye(%3, %4, %5, %6, %7) # test.py:25:0 - %x : Float(8:8, 8:1) = my_ops::warp_perspective(%x.1, %8) # test.py:25:0 - %10 : Float(8:5, 5:1) = aten::matmul(%x, %y) # test.py:26:0 - %11 : Float(8:5, 5:1) = aten::relu(%z) # test.py:26:0 - %12 : int = prim::Constant[value=1]() # test.py:26:0 - %13 : Float(8:5, 5:1) = aten::add(%10, %11, %12) # test.py:26:0 - return (%13) - -Integrating TorchScript custom ops into traced PyTorch code is as easy as this! - -Using the Custom Operator with Script -************************************* - -Besides tracing, another way to arrive at a TorchScript representation of a -PyTorch program is to directly write your code *in* TorchScript. TorchScript is -largely a subset of the Python language, with some restrictions that make it -easier for the TorchScript compiler to reason about programs. You turn your -regular PyTorch code into TorchScript by annotating it with -``@torch.jit.script`` for free functions and ``@torch.jit.script_method`` for -methods in a class (which must also derive from ``torch.jit.ScriptModule``). See -`here `_ for more details on -TorchScript annotations. - -One particular reason to use TorchScript instead of tracing is that tracing is -unable to capture control flow in PyTorch code. As such, let us consider this -function which does use control flow: - -.. code-block:: python - - def compute(x, y): - if bool(x[0][0] == 42): - z = 5 - else: - z = 10 - return x.matmul(y) + z - -To convert this function from vanilla PyTorch to TorchScript, we annotate it -with ``@torch.jit.script``: - -.. code-block:: python - - @torch.jit.script - def compute(x, y): - if bool(x[0][0] == 42): - z = 5 - else: - z = 10 - return x.matmul(y) + z - -This will just-in-time compile the ``compute`` function into a graph -representation, which we can inspect in the ``compute.graph`` property: - -.. code-block:: python - - >>> compute.graph - graph(%x : Dynamic - %y : Dynamic) { - %14 : int = prim::Constant[value=1]() - %2 : int = prim::Constant[value=0]() - %7 : int = prim::Constant[value=42]() - %z.1 : int = prim::Constant[value=5]() - %z.2 : int = prim::Constant[value=10]() - %4 : Dynamic = aten::select(%x, %2, %2) - %6 : Dynamic = aten::select(%4, %2, %2) - %8 : Dynamic = aten::eq(%6, %7) - %9 : bool = prim::TensorToBool(%8) - %z : int = prim::If(%9) - block0() { - -> (%z.1) - } - block1() { - -> (%z.2) - } - %13 : Dynamic = aten::matmul(%x, %y) - %15 : Dynamic = aten::add(%13, %z, %14) - return (%15); - } - -And now, just like before, we can use our custom operator like any other -function inside of our script code: - -.. 
code-block:: python - - torch.ops.load_library("libwarp_perspective.so") - - @torch.jit.script - def compute(x, y): - if bool(x[0] == 42): - z = 5 - else: - z = 10 - x = torch.ops.my_ops.warp_perspective(x, torch.eye(3)) - return x.matmul(y) + z - -When the TorchScript compiler sees the reference to -``torch.ops.my_ops.warp_perspective``, it will find the implementation we -registered via the ``TORCH_LIBRARY`` function in C++, and compile it into its -graph representation: - -.. code-block:: python - - >>> compute.graph - graph(%x.1 : Dynamic - %y : Dynamic) { - %20 : int = prim::Constant[value=1]() - %16 : int[] = prim::Constant[value=[0, -1]]() - %14 : int = prim::Constant[value=6]() - %2 : int = prim::Constant[value=0]() - %7 : int = prim::Constant[value=42]() - %z.1 : int = prim::Constant[value=5]() - %z.2 : int = prim::Constant[value=10]() - %13 : int = prim::Constant[value=3]() - %4 : Dynamic = aten::select(%x.1, %2, %2) - %6 : Dynamic = aten::select(%4, %2, %2) - %8 : Dynamic = aten::eq(%6, %7) - %9 : bool = prim::TensorToBool(%8) - %z : int = prim::If(%9) - block0() { - -> (%z.1) - } - block1() { - -> (%z.2) - } - %17 : Dynamic = aten::eye(%13, %14, %2, %16) - %x : Dynamic = my_ops::warp_perspective(%x.1, %17) - %19 : Dynamic = aten::matmul(%x, %y) - %21 : Dynamic = aten::add(%19, %z, %20) - return (%21); - } - -Notice in particular the reference to ``my_ops::warp_perspective`` at the end of -the graph. - -.. attention:: - - The TorchScript graph representation is still subject to change. Do not rely - on it looking like this. - -And that's really it when it comes to using our custom operator in Python. In -short, you import the library containing your operator(s) using -``torch.ops.load_library``, and call your custom op like any other ``torch`` -operator from your traced or scripted TorchScript code. - -Using the TorchScript Custom Operator in C++ --------------------------------------------- - -One useful feature of TorchScript is the ability to serialize a model into an -on-disk file. This file can be sent over the wire, stored in a file system or, -more importantly, be dynamically deserialized and executed without needing to -keep the original source code around. This is possible in Python, but also in -C++. For this, PyTorch provides `a pure C++ API `_ -for deserializing as well as executing TorchScript models. If you haven't yet, -please read `the tutorial on loading and running serialized TorchScript models -in C++ `_, on which the -next few paragraphs will build. - -In short, custom operators can be executed just like regular ``torch`` operators -even when deserialized from a file and run in C++. The only requirement for this -is to link the custom operator shared library we built earlier with the C++ -application in which we execute the model. In Python, this worked simply calling -``torch.ops.load_library``. In C++, you need to link the shared library with -your main application in whatever build system you are using. The following -example will showcase this using CMake. - -.. note:: - - Technically, you can also dynamically load the shared library into your C++ - application at runtime in much the same way we did it in Python. On Linux, - `you can do this with dlopen - `_. There exist - equivalents on other platforms. - -Building on the C++ execution tutorial linked above, let's start with a minimal -C++ application in one file, ``main.cpp`` in a different folder from our -custom operator, that loads and executes a serialized TorchScript model: - -.. 
code-block:: cpp - - #include // One-stop header. - - #include - #include - - - int main(int argc, const char* argv[]) { - if (argc != 2) { - std::cerr << "usage: example-app \n"; - return -1; - } - - // Deserialize the ScriptModule from a file using torch::jit::load(). - torch::jit::script::Module module = torch::jit::load(argv[1]); - - std::vector inputs; - inputs.push_back(torch::randn({4, 8})); - inputs.push_back(torch::randn({8, 5})); - - torch::Tensor output = module.forward(std::move(inputs)).toTensor(); - - std::cout << output << std::endl; - } - -Along with a small ``CMakeLists.txt`` file: - -.. code-block:: cmake - - cmake_minimum_required(VERSION 3.1 FATAL_ERROR) - project(example_app) - - find_package(Torch REQUIRED) - - add_executable(example_app main.cpp) - target_link_libraries(example_app "${TORCH_LIBRARIES}") - target_compile_features(example_app PRIVATE cxx_range_for) - -At this point, we should be able to build the application: - -.. code-block:: shell - - $ mkdir build - $ cd build - $ cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" .. - -- The C compiler identification is GNU 5.4.0 - -- The CXX compiler identification is GNU 5.4.0 - -- Check for working C compiler: /usr/bin/cc - -- Check for working C compiler: /usr/bin/cc -- works - -- Detecting C compiler ABI info - -- Detecting C compiler ABI info - done - -- Detecting C compile features - -- Detecting C compile features - done - -- Check for working CXX compiler: /usr/bin/c++ - -- Check for working CXX compiler: /usr/bin/c++ -- works - -- Detecting CXX compiler ABI info - -- Detecting CXX compiler ABI info - done - -- Detecting CXX compile features - -- Detecting CXX compile features - done - -- Looking for pthread.h - -- Looking for pthread.h - found - -- Looking for pthread_create - -- Looking for pthread_create - not found - -- Looking for pthread_create in pthreads - -- Looking for pthread_create in pthreads - not found - -- Looking for pthread_create in pthread - -- Looking for pthread_create in pthread - found - -- Found Threads: TRUE - -- Found torch: /libtorch/lib/libtorch.so - -- Configuring done - -- Generating done - -- Build files have been written to: /example_app/build - $ make -j - Scanning dependencies of target example_app - [ 50%] Building CXX object CMakeFiles/example_app.dir/main.cpp.o - [100%] Linking CXX executable example_app - [100%] Built target example_app - -And run it without passing a model just yet: - -.. code-block:: shell - - $ ./example_app - usage: example_app - -Next, let's serialize the script function we wrote earlier that uses our custom -operator: - -.. code-block:: python - - torch.ops.load_library("libwarp_perspective.so") - - @torch.jit.script - def compute(x, y): - if bool(x[0][0] == 42): - z = 5 - else: - z = 10 - x = torch.ops.my_ops.warp_perspective(x, torch.eye(3)) - return x.matmul(y) + z - - compute.save("example.pt") - -The last line will serialize the script function into a file called -"example.pt". If we then pass this serialized model to our C++ application, we -can run it straight away: - -.. code-block:: shell - - $ ./example_app example.pt - terminate called after throwing an instance of 'torch::jit::script::ErrorReport' - what(): - Schema not found for node. File a bug report. - Node: %16 : Dynamic = my_ops::warp_perspective(%0, %19) - -Or maybe not. Maybe not just yet. Of course! We haven't linked the custom -operator library with our application yet. 
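(An editorial aside, not part of the original tutorial: the same dependency shows up on the Python side. Below is a small sketch, assuming the ``libwarp_perspective.so`` and ``example.pt`` produced above, of what happens if a fresh interpreter tries to deserialize the function before and after ``torch.ops.load_library``; the exact error text may differ across PyTorch versions.)

.. code-block:: python

    # Hypothetical sanity check (not from the original tutorial): in a fresh
    # Python process the serialized function cannot be deserialized until the
    # library defining my_ops::warp_perspective has been loaded.
    import torch

    try:
        torch.jit.load("example.pt")
    except RuntimeError as e:
        print("Loading failed before torch.ops.load_library:", e)

    torch.ops.load_library("libwarp_perspective.so")
    loaded = torch.jit.load("example.pt")   # now the operator schema resolves
    print(loaded)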
Let's do this right now, and to do it -properly let's update our file organization slightly, to look like this:: - - example_app/ - CMakeLists.txt - main.cpp - warp_perspective/ - CMakeLists.txt - op.cpp - -This will allow us to add the ``warp_perspective`` library CMake target as a -subdirectory of our application target. The top level ``CMakeLists.txt`` in the -``example_app`` folder should look like this: - -.. code-block:: cmake - - cmake_minimum_required(VERSION 3.1 FATAL_ERROR) - project(example_app) - - find_package(Torch REQUIRED) - - add_subdirectory(warp_perspective) - - add_executable(example_app main.cpp) - target_link_libraries(example_app "${TORCH_LIBRARIES}") - target_link_libraries(example_app -Wl,--no-as-needed warp_perspective) - target_compile_features(example_app PRIVATE cxx_range_for) - -This basic CMake configuration looks much like before, except that we add the -``warp_perspective`` CMake build as a subdirectory. Once its CMake code runs, we -link our ``example_app`` application with the ``warp_perspective`` shared -library. - -.. attention:: - - There is one crucial detail embedded in the above example: The - ``-Wl,--no-as-needed`` prefix to the ``warp_perspective`` link line. This is - required because we will not actually be calling any function from the - ``warp_perspective`` shared library in our application code. We only need the - ``TORCH_LIBRARY`` function to run. Inconveniently, this - confuses the linker and makes it think it can just skip linking against the - library altogether. On Linux, the ``-Wl,--no-as-needed`` flag forces the link - to happen (NB: this flag is specific to Linux!). There are other workarounds - for this. The simplest is to define *some function* in the operator library - that you need to call from the main application. This could be as simple as a - function ``void init();`` declared in some header, which is then defined as - ``void init() { }`` in the operator library. Calling this ``init()`` function - in the main application will give the linker the impression that this is a - library worth linking against. Unfortunately, this is outside of our control, - and we would rather let you know the reason and the simple workaround for this - than handing you some opaque macro to plop in your code. - -Now, since we find the ``Torch`` package at the top level now, the -``CMakeLists.txt`` file in the ``warp_perspective`` subdirectory can be -shortened a bit. It should look like this: - -.. code-block:: cmake - - find_package(OpenCV REQUIRED) - add_library(warp_perspective SHARED op.cpp) - target_compile_features(warp_perspective PRIVATE cxx_range_for) - target_link_libraries(warp_perspective PRIVATE "${TORCH_LIBRARIES}") - target_link_libraries(warp_perspective PRIVATE opencv_core opencv_photo) - -Let's re-build our example app, which will also link with the custom operator -library. In the top level ``example_app`` directory: - -.. code-block:: shell - - $ mkdir build - $ cd build - $ cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" .. 
- -- The C compiler identification is GNU 5.4.0 - -- The CXX compiler identification is GNU 5.4.0 - -- Check for working C compiler: /usr/bin/cc - -- Check for working C compiler: /usr/bin/cc -- works - -- Detecting C compiler ABI info - -- Detecting C compiler ABI info - done - -- Detecting C compile features - -- Detecting C compile features - done - -- Check for working CXX compiler: /usr/bin/c++ - -- Check for working CXX compiler: /usr/bin/c++ -- works - -- Detecting CXX compiler ABI info - -- Detecting CXX compiler ABI info - done - -- Detecting CXX compile features - -- Detecting CXX compile features - done - -- Looking for pthread.h - -- Looking for pthread.h - found - -- Looking for pthread_create - -- Looking for pthread_create - not found - -- Looking for pthread_create in pthreads - -- Looking for pthread_create in pthreads - not found - -- Looking for pthread_create in pthread - -- Looking for pthread_create in pthread - found - -- Found Threads: TRUE - -- Found torch: /libtorch/lib/libtorch.so - -- Configuring done - -- Generating done - -- Build files have been written to: /warp_perspective/example_app/build - $ make -j - Scanning dependencies of target warp_perspective - [ 25%] Building CXX object warp_perspective/CMakeFiles/warp_perspective.dir/op.cpp.o - [ 50%] Linking CXX shared library libwarp_perspective.so - [ 50%] Built target warp_perspective - Scanning dependencies of target example_app - [ 75%] Building CXX object CMakeFiles/example_app.dir/main.cpp.o - [100%] Linking CXX executable example_app - [100%] Built target example_app - -If we now run the ``example_app`` binary and hand it our serialized model, we -should arrive at a happy ending: - -.. code-block:: shell - - $ ./example_app example.pt - 11.4125 5.8262 9.5345 8.6111 12.3997 - 7.4683 13.5969 9.0850 11.0698 9.4008 - 7.4597 15.0926 12.5727 8.9319 9.0666 - 9.4834 11.1747 9.0162 10.9521 8.6269 - 10.0000 10.0000 10.0000 10.0000 10.0000 - 10.0000 10.0000 10.0000 10.0000 10.0000 - 10.0000 10.0000 10.0000 10.0000 10.0000 - 10.0000 10.0000 10.0000 10.0000 10.0000 - [ Variable[CPUFloatType]{8,5} ] - -Success! You are now ready to inference away. - -Conclusion ----------- - -This tutorial walked you throw how to implement a custom TorchScript operator in -C++, how to build it into a shared library, how to use it in Python to define -TorchScript models and lastly how to load it into a C++ application for -inference workloads. You are now ready to extend your TorchScript models with -C++ operators that interface with third party C++ libraries, write custom high -performance CUDA kernels, or implement any other use case that requires the -lines between Python, TorchScript and C++ to blend smoothly. - -As always, if you run into any problems or have questions, you can use our -`forum `_ or `GitHub issues -`_ to get in touch. Also, our -`frequently asked questions (FAQ) page -`_ may have helpful information. - -Appendix A: More Ways of Building Custom Operators --------------------------------------------------- - -The section "Building the Custom Operator" explained how to build a custom -operator into a shared library using CMake. This appendix outlines two further -approaches for compilation. Both of them use Python as the "driver" or -"interface" to the compilation process. 
Also, both re-use the `existing -infrastructure `_ PyTorch -provides for `*C++ extensions* -`_, which are the -vanilla (eager) PyTorch equivalent of TorchScript custom operators that rely on -`pybind11 `_ for "explicit" binding of -functions from C++ into Python. - -The first approach uses C++ extensions' `convenient just-in-time (JIT) -compilation interface -`_ -to compile your code in the background of your PyTorch script the first time you -run it. The second approach relies on the venerable ``setuptools`` package and -involves writing a separate ``setup.py`` file. This allows more advanced -configuration as well as integration with other ``setuptools``-based projects. -We will explore both approaches in detail below. - -Building with JIT compilation -***************************** - -The JIT compilation feature provided by the PyTorch C++ extension toolkit allows -embedding the compilation of your custom operator directly into your Python -code, e.g. at the top of your training script. - -.. note:: - - "JIT compilation" here has nothing to do with the JIT compilation taking place - in the TorchScript compiler to optimize your program. It simply means that - your custom operator C++ code will be compiled in a folder under your system's - `/tmp` directory the first time you import it, as if you had compiled it - yourself beforehand. - -This JIT compilation feature comes in two flavors. In the first, you still keep -your operator implementation in a separate file (``op.cpp``), and then use -``torch.utils.cpp_extension.load()`` to compile your extension. Usually, this -function will return the Python module exposing your C++ extension. However, -since we are not compiling our custom operator into its own Python module, we -only want to compile a plain shared library . Fortunately, -``torch.utils.cpp_extension.load()`` has an argument ``is_python_module`` which -we can set to ``False`` to indicate that we are only interested in building a -shared library and not a Python module. ``torch.utils.cpp_extension.load()`` -will then compile and also load the shared library into the current process, -just like ``torch.ops.load_library`` did before: - -.. code-block:: python - - import torch.utils.cpp_extension - - torch.utils.cpp_extension.load( - name="warp_perspective", - sources=["op.cpp"], - extra_ldflags=["-lopencv_core", "-lopencv_imgproc"], - is_python_module=False, - verbose=True - ) - - print(torch.ops.my_ops.warp_perspective) - -This should approximately print: - -.. code-block:: python - - - -The second flavor of JIT compilation allows you to pass the source code for your -custom TorchScript operator as a string. For this, use -``torch.utils.cpp_extension.load_inline``: - -.. 
code-block:: python - - import torch - import torch.utils.cpp_extension - - op_source = """ - #include - #include - - torch::Tensor warp_perspective(torch::Tensor image, torch::Tensor warp) { - cv::Mat image_mat(/*rows=*/image.size(0), - /*cols=*/image.size(1), - /*type=*/CV_32FC1, - /*data=*/image.data()); - cv::Mat warp_mat(/*rows=*/warp.size(0), - /*cols=*/warp.size(1), - /*type=*/CV_32FC1, - /*data=*/warp.data()); - - cv::Mat output_mat; - cv::warpPerspective(image_mat, output_mat, warp_mat, /*dsize=*/{64, 64}); - - torch::Tensor output = - torch::from_blob(output_mat.ptr(), /*sizes=*/{64, 64}); - return output.clone(); - } - - TORCH_LIBRARY(my_ops, m) { - m.def("warp_perspective", &warp_perspective); - } - """ - - torch.utils.cpp_extension.load_inline( - name="warp_perspective", - cpp_sources=op_source, - extra_ldflags=["-lopencv_core", "-lopencv_imgproc"], - is_python_module=False, - verbose=True, - ) - - print(torch.ops.my_ops.warp_perspective) - -Naturally, it is best practice to only use -``torch.utils.cpp_extension.load_inline`` if your source code is reasonably -short. - -Note that if you're using this in a Jupyter Notebook, you should not execute -the cell with the registration multiple times because each execution registers -a new library and re-registers the custom operator. If you need to re-execute it, -please restart the Python kernel of your notebook beforehand. - -Building with Setuptools -************************ - -The second approach to building our custom operator exclusively from Python is -to use ``setuptools``. This has the advantage that ``setuptools`` has a quite -powerful and extensive interface for building Python modules written in C++. -However, since ``setuptools`` is really intended for building Python modules and -not plain shared libraries (which do not have the necessary entry points Python -expects from a module), this route can be slightly quirky. That said, all you -need is a ``setup.py`` file in place of the ``CMakeLists.txt`` which looks like -this: - -.. code-block:: python - - from setuptools import setup - from torch.utils.cpp_extension import BuildExtension, CppExtension - - setup( - name="warp_perspective", - ext_modules=[ - CppExtension( - "warp_perspective", - ["example_app/warp_perspective/op.cpp"], - libraries=["opencv_core", "opencv_imgproc"], - ) - ], - cmdclass={"build_ext": BuildExtension.with_options(no_python_abi_suffix=True)}, - ) - - -Notice that we enabled the ``no_python_abi_suffix`` option in the -``BuildExtension`` at the bottom. This instructs ``setuptools`` to omit any -Python-3 specific ABI suffixes in the name of the produced shared library. -Otherwise, on Python 3.7 for example, the library may be called -``warp_perspective.cpython-37m-x86_64-linux-gnu.so`` where -``cpython-37m-x86_64-linux-gnu`` is the ABI tag, but we really just want it to -be called ``warp_perspective.so`` - -If we now run ``python setup.py build develop`` in a terminal from within the -folder in which ``setup.py`` is situated, we should see something like: - -.. 
code-block:: shell - - $ python setup.py build develop - running build - running build_ext - building 'warp_perspective' extension - creating build - creating build/temp.linux-x86_64-3.7 - gcc -pthread -B /root/local/miniconda/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/root/local/miniconda/lib/python3.7/site-packages/torch/lib/include -I/root/local/miniconda/lib/python3.7/site-packages/torch/lib/include/torch/csrc/api/include -I/root/local/miniconda/lib/python3.7/site-packages/torch/lib/include/TH -I/root/local/miniconda/lib/python3.7/site-packages/torch/lib/include/THC -I/root/local/miniconda/include/python3.7m -c op.cpp -o build/temp.linux-x86_64-3.7/op.o -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=warp_perspective -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++11 - cc1plus: warning: command line option ‘-Wstrict-prototypes’ is valid for C/ObjC but not for C++ - creating build/lib.linux-x86_64-3.7 - g++ -pthread -shared -B /root/local/miniconda/compiler_compat -L/root/local/miniconda/lib -Wl,-rpath=/root/local/miniconda/lib -Wl,--no-as-needed -Wl,--sysroot=/ build/temp.linux-x86_64-3.7/op.o -lopencv_core -lopencv_imgproc -o build/lib.linux-x86_64-3.7/warp_perspective.so - running develop - running egg_info - creating warp_perspective.egg-info - writing warp_perspective.egg-info/PKG-INFO - writing dependency_links to warp_perspective.egg-info/dependency_links.txt - writing top-level names to warp_perspective.egg-info/top_level.txt - writing manifest file 'warp_perspective.egg-info/SOURCES.txt' - reading manifest file 'warp_perspective.egg-info/SOURCES.txt' - writing manifest file 'warp_perspective.egg-info/SOURCES.txt' - running build_ext - copying build/lib.linux-x86_64-3.7/warp_perspective.so -> - Creating /root/local/miniconda/lib/python3.7/site-packages/warp-perspective.egg-link (link to .) - Adding warp-perspective 0.0.0 to easy-install.pth file - - Installed /warp_perspective - Processing dependencies for warp-perspective==0.0.0 - Finished processing dependencies for warp-perspective==0.0.0 - -This will produce a shared library called ``warp_perspective.so``, which we can -pass to ``torch.ops.load_library`` as we did earlier to make our operator -visible to TorchScript: - -.. code-block:: python - - >>> import torch - >>> torch.ops.load_library("warp_perspective.so") - >>> print(torch.ops.my_ops.warp_perspective) - +.. warning:: + TorchScript is deprecated, please use + `torch.export `__ instead. 
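(Editorial note, not part of the patch: for readers migrating away from TorchScript, here is a minimal sketch of the capture/serialize/load round trip with ``torch.export``, assuming a PyTorch release where that API is available; the module name and file name are illustrative only.)

.. code-block:: python

    # Minimal torch.export sketch (illustrative addition, not part of the diff):
    # capture a module, serialize it, and run the reloaded program.
    import torch

    class Compute(torch.nn.Module):
        def forward(self, x, y):
            return torch.relu(x.matmul(y))

    example_args = (torch.randn(4, 8), torch.randn(8, 5))
    ep = torch.export.export(Compute(), example_args)  # returns an ExportedProgram
    print(ep)                                           # inspect the captured graph

    torch.export.save(ep, "compute.pt2")
    reloaded = torch.export.load("compute.pt2")
    print(reloaded.module()(*example_args))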
\ No newline at end of file diff --git a/advanced_source/torch_script_custom_ops/CMakeLists.txt b/advanced_source/torch_script_custom_ops/CMakeLists.txt deleted file mode 100644 index e116153b941..00000000000 --- a/advanced_source/torch_script_custom_ops/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -cmake_minimum_required(VERSION 3.1 FATAL_ERROR) -project(warp_perspective) - -find_package(Torch REQUIRED) -find_package(OpenCV REQUIRED) - -# Define our library target -add_library(warp_perspective SHARED op.cpp) -# Enable C++14 -target_compile_features(warp_perspective PRIVATE cxx_std_14) -# Link against LibTorch -target_link_libraries(warp_perspective "${TORCH_LIBRARIES}") -# Link against OpenCV -target_link_libraries(warp_perspective opencv_core opencv_imgproc) diff --git a/advanced_source/torch_script_custom_ops/op.cpp b/advanced_source/torch_script_custom_ops/op.cpp deleted file mode 100644 index ff2eb049c4c..00000000000 --- a/advanced_source/torch_script_custom_ops/op.cpp +++ /dev/null @@ -1,36 +0,0 @@ -#include -#include - -// BEGIN warp_perspective -torch::Tensor warp_perspective(torch::Tensor image, torch::Tensor warp) { - // BEGIN image_mat - cv::Mat image_mat(/*rows=*/image.size(0), - /*cols=*/image.size(1), - /*type=*/CV_32FC1, - /*data=*/image.data_ptr()); - // END image_mat - - // BEGIN warp_mat - cv::Mat warp_mat(/*rows=*/warp.size(0), - /*cols=*/warp.size(1), - /*type=*/CV_32FC1, - /*data=*/warp.data_ptr()); - // END warp_mat - - // BEGIN output_mat - cv::Mat output_mat; - cv::warpPerspective(image_mat, output_mat, warp_mat, /*dsize=*/{8, 8}); - // END output_mat - - // BEGIN output_tensor - torch::Tensor output = torch::from_blob(output_mat.ptr(), /*sizes=*/{8, 8}); - return output.clone(); - // END output_tensor -} -// END warp_perspective - -// BEGIN registry -TORCH_LIBRARY(my_ops, m) { - m.def("warp_perspective", warp_perspective); -} -// END registry diff --git a/advanced_source/torch_script_custom_ops/smoke_test.py b/advanced_source/torch_script_custom_ops/smoke_test.py deleted file mode 100644 index fa629ddcafb..00000000000 --- a/advanced_source/torch_script_custom_ops/smoke_test.py +++ /dev/null @@ -1,3 +0,0 @@ -import torch -torch.ops.load_library("build/libwarp_perspective.so") -print(torch.ops.my_ops.warp_perspective) diff --git a/advanced_source/torch_script_custom_ops/test.py b/advanced_source/torch_script_custom_ops/test.py deleted file mode 100644 index 26f96ef4599..00000000000 --- a/advanced_source/torch_script_custom_ops/test.py +++ /dev/null @@ -1,34 +0,0 @@ -import torch - - -print("BEGIN preamble") -torch.ops.load_library("build/libwarp_perspective.so") -print(torch.ops.my_ops.warp_perspective(torch.randn(32, 32), torch.rand(3, 3))) -print("END preamble") - - -# BEGIN compute -def compute(x, y, z): - return x.matmul(y) + torch.relu(z) -# END compute - - -print("BEGIN trace") -inputs = [torch.randn(4, 8), torch.randn(8, 5), torch.randn(4, 5)] -trace = torch.jit.trace(compute, inputs) -print(trace.graph) -print("END trace") - - -# BEGIN compute2 -def compute(x, y, z): - x = torch.ops.my_ops.warp_perspective(x, torch.eye(3)) - return x.matmul(y) + torch.relu(z) -# END compute2 - - -print("BEGIN trace2") -inputs = [torch.randn(4, 8), torch.randn(8, 5), torch.randn(8, 5)] -trace = torch.jit.trace(compute, inputs) -print(trace.graph) -print("END trace2") diff --git a/beginner_source/Intro_to_TorchScript_tutorial.py b/beginner_source/Intro_to_TorchScript_tutorial.py deleted file mode 100644 index 21ee32ff384..00000000000 --- 
a/beginner_source/Intro_to_TorchScript_tutorial.py +++ /dev/null @@ -1,398 +0,0 @@ -""" -Introduction to TorchScript -=========================== - -**Authors:** James Reed (jamesreed@fb.com), Michael Suo (suo@fb.com), rev2 - -This tutorial is an introduction to TorchScript, an intermediate -representation of a PyTorch model (subclass of ``nn.Module``) that -can then be run in a high-performance environment such as C++. - -In this tutorial we will cover: - -1. The basics of model authoring in PyTorch, including: - -- Modules -- Defining ``forward`` functions -- Composing modules into a hierarchy of modules - -2. Specific methods for converting PyTorch modules to TorchScript, our - high-performance deployment runtime - -- Tracing an existing module -- Using scripting to directly compile a module -- How to compose both approaches -- Saving and loading TorchScript modules - -We hope that after you complete this tutorial, you will proceed to go through -`the follow-on tutorial `_ -which will walk you through an example of actually calling a TorchScript -model from C++. - -""" - -import torch # This is all you need to use both PyTorch and TorchScript! -print(torch.__version__) -torch.manual_seed(191009) # set the seed for reproducibility - - -###################################################################### -# Basics of PyTorch Model Authoring -# --------------------------------- -# -# Let’s start out by defining a simple ``Module``. A ``Module`` is the -# basic unit of composition in PyTorch. It contains: -# -# 1. A constructor, which prepares the module for invocation -# 2. A set of ``Parameters`` and sub-\ ``Modules``. These are initialized -# by the constructor and can be used by the module during invocation. -# 3. A ``forward`` function. This is the code that is run when the module -# is invoked. -# -# Let’s examine a small example: -# - -class MyCell(torch.nn.Module): - def __init__(self): - super(MyCell, self).__init__() - - def forward(self, x, h): - new_h = torch.tanh(x + h) - return new_h, new_h - -my_cell = MyCell() -x = torch.rand(3, 4) -h = torch.rand(3, 4) -print(my_cell(x, h)) - - -###################################################################### -# So we’ve: -# -# 1. Created a class that subclasses ``torch.nn.Module``. -# 2. Defined a constructor. The constructor doesn’t do much, just calls -# the constructor for ``super``. -# 3. Defined a ``forward`` function, which takes two inputs and returns -# two outputs. The actual contents of the ``forward`` function are not -# really important, but it’s sort of a fake `RNN -# cell `__–that -# is–it’s a function that is applied on a loop. -# -# We instantiated the module, and made ``x`` and ``h``, which are just 3x4 -# matrices of random values. Then we invoked the cell with -# ``my_cell(x, h)``. This in turn calls our ``forward`` function. -# -# Let’s do something a little more interesting: -# - -class MyCell(torch.nn.Module): - def __init__(self): - super(MyCell, self).__init__() - self.linear = torch.nn.Linear(4, 4) - - def forward(self, x, h): - new_h = torch.tanh(self.linear(x) + h) - return new_h, new_h - -my_cell = MyCell() -print(my_cell) -print(my_cell(x, h)) - - -###################################################################### -# We’ve redefined our module ``MyCell``, but this time we’ve added a -# ``self.linear`` attribute, and we invoke ``self.linear`` in the forward -# function. -# -# What exactly is happening here? ``torch.nn.Linear`` is a ``Module`` from -# the PyTorch standard library. 
Just like ``MyCell``, it can be invoked -# using the call syntax. We are building a hierarchy of ``Module``\ s. -# -# ``print`` on a ``Module`` will give a visual representation of the -# ``Module``\ ’s subclass hierarchy. In our example, we can see our -# ``Linear`` subclass and its parameters. -# -# By composing ``Module``\ s in this way, we can succinctly and readably -# author models with reusable components. -# -# You may have noticed ``grad_fn`` on the outputs. This is a detail of -# PyTorch’s method of automatic differentiation, called -# `autograd `__. -# In short, this system allows us to compute derivatives through -# potentially complex programs. The design allows for a massive amount of -# flexibility in model authoring. -# -# Now let’s examine said flexibility: -# - -class MyDecisionGate(torch.nn.Module): - def forward(self, x): - if x.sum() > 0: - return x - else: - return -x - -class MyCell(torch.nn.Module): - def __init__(self): - super(MyCell, self).__init__() - self.dg = MyDecisionGate() - self.linear = torch.nn.Linear(4, 4) - - def forward(self, x, h): - new_h = torch.tanh(self.dg(self.linear(x)) + h) - return new_h, new_h - -my_cell = MyCell() -print(my_cell) -print(my_cell(x, h)) - - -###################################################################### -# We’ve once again redefined our ``MyCell`` class, but here we’ve defined -# ``MyDecisionGate``. This module utilizes **control flow**. Control flow -# consists of things like loops and ``if``-statements. -# -# Many frameworks take the approach of computing symbolic derivatives -# given a full program representation. However, in PyTorch, we use a -# gradient tape. We record operations as they occur, and replay them -# backwards in computing derivatives. In this way, the framework does not -# have to explicitly define derivatives for all constructs in the -# language. -# -# .. figure:: https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/dynamic_graph.gif -# :alt: How autograd works -# -# How autograd works -# - - -###################################################################### -# Basics of TorchScript -# --------------------- -# -# Now let’s take our running example and see how we can apply TorchScript. -# -# In short, TorchScript provides tools to capture the definition of your -# model, even in light of the flexible and dynamic nature of PyTorch. -# Let’s begin by examining what we call **tracing**. -# -# Tracing ``Modules`` -# ~~~~~~~~~~~~~~~~~~~ -# - -class MyCell(torch.nn.Module): - def __init__(self): - super(MyCell, self).__init__() - self.linear = torch.nn.Linear(4, 4) - - def forward(self, x, h): - new_h = torch.tanh(self.linear(x) + h) - return new_h, new_h - -my_cell = MyCell() -x, h = torch.rand(3, 4), torch.rand(3, 4) -traced_cell = torch.jit.trace(my_cell, (x, h)) -print(traced_cell) -traced_cell(x, h) - - -###################################################################### -# We’ve rewinded a bit and taken the second version of our ``MyCell`` -# class. As before, we’ve instantiated it, but this time, we’ve called -# ``torch.jit.trace``, passed in the ``Module``, and passed in *example -# inputs* the network might see. -# -# What exactly has this done? 
It has invoked the ``Module``, recorded the -# operations that occurred when the ``Module`` was run, and created an -# instance of ``torch.jit.ScriptModule`` (of which ``TracedModule`` is an -# instance) -# -# TorchScript records its definitions in an Intermediate Representation -# (or IR), commonly referred to in Deep learning as a *graph*. We can -# examine the graph with the ``.graph`` property: -# - -print(traced_cell.graph) - - -###################################################################### -# However, this is a very low-level representation and most of the -# information contained in the graph is not useful for end users. Instead, -# we can use the ``.code`` property to give a Python-syntax interpretation -# of the code: -# - -print(traced_cell.code) - - -###################################################################### -# So **why** did we do all this? There are several reasons: -# -# 1. TorchScript code can be invoked in its own interpreter, which is -# basically a restricted Python interpreter. This interpreter does not -# acquire the Global Interpreter Lock, and so many requests can be -# processed on the same instance simultaneously. -# 2. This format allows us to save the whole model to disk and load it -# into another environment, such as in a server written in a language -# other than Python -# 3. TorchScript gives us a representation in which we can do compiler -# optimizations on the code to provide more efficient execution -# 4. TorchScript allows us to interface with many backend/device runtimes -# that require a broader view of the program than individual operators. -# -# We can see that invoking ``traced_cell`` produces the same results as -# the Python module: -# - -print(my_cell(x, h)) -print(traced_cell(x, h)) - - -###################################################################### -# Using Scripting to Convert Modules -# ---------------------------------- -# -# There’s a reason we used version two of our module, and not the one with -# the control-flow-laden submodule. Let’s examine that now: -# - -class MyDecisionGate(torch.nn.Module): - def forward(self, x): - if x.sum() > 0: - return x - else: - return -x - -class MyCell(torch.nn.Module): - def __init__(self, dg): - super(MyCell, self).__init__() - self.dg = dg - self.linear = torch.nn.Linear(4, 4) - - def forward(self, x, h): - new_h = torch.tanh(self.dg(self.linear(x)) + h) - return new_h, new_h - -my_cell = MyCell(MyDecisionGate()) -traced_cell = torch.jit.trace(my_cell, (x, h)) - -print(traced_cell.dg.code) -print(traced_cell.code) - - -###################################################################### -# Looking at the ``.code`` output, we can see that the ``if-else`` branch -# is nowhere to be found! Why? Tracing does exactly what we said it would: -# run the code, record the operations *that happen* and construct a -# ``ScriptModule`` that does exactly that. Unfortunately, things like control -# flow are erased. -# -# How can we faithfully represent this module in TorchScript? We provide a -# **script compiler**, which does direct analysis of your Python source -# code to transform it into TorchScript. Let’s convert ``MyDecisionGate`` -# using the script compiler: -# - -scripted_gate = torch.jit.script(MyDecisionGate()) - -my_cell = MyCell(scripted_gate) -scripted_cell = torch.jit.script(my_cell) - -print(scripted_gate.code) -print(scripted_cell.code) - - -###################################################################### -# Hooray! 
We’ve now faithfully captured the behavior of our program in -# TorchScript. Let’s now try running the program: -# - -# New inputs -x, h = torch.rand(3, 4), torch.rand(3, 4) -print(scripted_cell(x, h)) - - -###################################################################### -# Mixing Scripting and Tracing -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Some situations call for using tracing rather than scripting (e.g. a -# module has many architectural decisions that are made based on constant -# Python values that we would like to not appear in TorchScript). In this -# case, scripting can be composed with tracing: ``torch.jit.script`` will -# inline the code for a traced module, and tracing will inline the code -# for a scripted module. -# -# An example of the first case: -# - -class MyRNNLoop(torch.nn.Module): - def __init__(self): - super(MyRNNLoop, self).__init__() - self.cell = torch.jit.trace(MyCell(scripted_gate), (x, h)) - - def forward(self, xs): - h, y = torch.zeros(3, 4), torch.zeros(3, 4) - for i in range(xs.size(0)): - y, h = self.cell(xs[i], h) - return y, h - -rnn_loop = torch.jit.script(MyRNNLoop()) -print(rnn_loop.code) - - - -###################################################################### -# And an example of the second case: -# - -class WrapRNN(torch.nn.Module): - def __init__(self): - super(WrapRNN, self).__init__() - self.loop = torch.jit.script(MyRNNLoop()) - - def forward(self, xs): - y, h = self.loop(xs) - return torch.relu(y) - -traced = torch.jit.trace(WrapRNN(), (torch.rand(10, 3, 4))) -print(traced.code) - - -###################################################################### -# This way, scripting and tracing can be used when the situation calls for -# each of them and used together. -# -# Saving and Loading models -# ------------------------- -# -# We provide APIs to save and load TorchScript modules to/from disk in an -# archive format. This format includes code, parameters, attributes, and -# debug information, meaning that the archive is a freestanding -# representation of the model that can be loaded in an entirely separate -# process. Let’s save and load our wrapped RNN module: -# - -traced.save('wrapped_rnn.pt') - -loaded = torch.jit.load('wrapped_rnn.pt') - -print(loaded) -print(loaded.code) - - -###################################################################### -# As you can see, serialization preserves the module hierarchy and the -# code we’ve been examining throughout. The model can also be loaded, for -# example, `into -# C++ `__ for -# python-free execution. -# -# Further Reading -# ~~~~~~~~~~~~~~~ -# -# We’ve completed our tutorial! 
For a more involved demonstration, check -# out the NeurIPS demo for converting machine translation models using -# TorchScript: -# https://colab.research.google.com/drive/1HiICg6jRkBnr5hvK2-VnMi88Vi9pUzEJ -# diff --git a/beginner_source/PyTorch Cheat.md b/beginner_source/PyTorch Cheat.md deleted file mode 100644 index 4f7af63038c..00000000000 --- a/beginner_source/PyTorch Cheat.md +++ /dev/null @@ -1,191 +0,0 @@ ---- -Title: PyTorch Cheat Sheet -PyTorch version: 1.0Pre -Date updated: 7/30/18 - ---- - -# Imports ---------------- -### General - -``` -import torch # root package -from torch.utils.data import Dataset, DataLoader # dataset representation and loading -``` - -### Neural Network API - -``` -import torch.autograd as autograd # computation graph -from torch.autograd import Variable # variable node in computation graph -import torch.nn as nn # neural networks -import torch.nn.functional as F # layers, activations and more -import torch.optim as optim # optimizers e.g. gradient descent, ADAM, etc. -from torch.jit import script, trace # hybrid frontend decorator and tracing jit -``` -See [autograd](https://pytorch.org/docs/stable/autograd.html), [nn](https://pytorch.org/docs/stable/nn.html), [functional](https://pytorch.org/docs/stable/nn.html#torch-nn-functional) and [optim](https://pytorch.org/docs/stable/optim.html) - -### Torchscript and JIT - -``` -torch.jit.trace() # takes your module or function and an example data input, and traces the computational steps that the data encounters as it progresses through the model -@script # decorator used to indicate data-dependent control flow within the code being traced -``` -See [Torchscript](https://pytorch.org/docs/stable/jit.html) - -### ONNX - -``` -torch.onnx.export(model, dummy data, xxxx.proto) # exports an ONNX formatted model using a trained model, dummy data and the desired file name -model = onnx.load("alexnet.proto") # load an ONNX model -onnx.checker.check_model(model) # check that the model IR is well formed -onnx.helper.printable_graph(model.graph) # print a human readable representation of the graph -``` -See [onnx](https://pytorch.org/docs/stable/onnx.html) - -### Vision - -``` -from torchvision import datasets, models, transforms # vision datasets, architectures & transforms -import torchvision.transforms as transforms # composable transforms -``` -See [torchvision](https://pytorch.org/vision/stable/index.html) - -### Distributed Training - -``` -import torch.distributed as dist # distributed communication -from multiprocessing import Process # memory sharing processes -``` -See [distributed](https://pytorch.org/docs/stable/distributed.html) and [multiprocessing](https://pytorch.org/docs/stable/multiprocessing.html) - - -# Tensors --------------------- - -### Creation - -``` -torch.randn(*size) # tensor with independent N(0,1) entries -torch.[ones|zeros](*size) # tensor with all 1's [or 0's] -torch.Tensor(L) # create tensor from [nested] list or ndarray L -x.clone() # clone of x -with torch.no_grad(): # code wrap that stops autograd from tracking tensor history -requires_grad=True # arg, when set to True, tracks computation history for future derivative calculations -``` -See [tensor](https://pytorch.org/docs/stable/tensors.html) - -### Dimensionality - -``` -x.size() # return tuple-like object of dimensions -torch.cat(tensor_seq, dim=0) # concatenates tensors along dim -x.view(a,b,...) # reshapes x into size (a,b,...) 
-x.view(-1,a) # reshapes x into size (b,a) for some b -x.transpose(a,b) # swaps dimensions a and b -x.permute(*dims) # permutes dimensions -x.unsqueeze(dim) # tensor with added axis -x.unsqueeze(dim=2) # (a,b,c) tensor -> (a,b,1,c) tensor -``` -See [tensor](https://pytorch.org/docs/stable/tensors.html) - -### Algebra - -``` -A.mm(B) # matrix multiplication -A.mv(x) # matrix-vector multiplication -x.t() # matrix transpose -``` -See [math operations](https://pytorch.org/docs/stable/torch.html?highlight=mm#math-operations) - -### GPU Usage - -``` -torch.cuda.is_available() # check for cuda -x.cuda() # move x's data from CPU to GPU and return new object -x.cpu() # move x's data from GPU to CPU and return new object - -if not args.disable_cuda and torch.cuda.is_available(): # device agnostic code and modularity - args.device = torch.device('cuda') # -else: # - args.device = torch.device('cpu') # - -net.to(device) # recursively convert their parameters and buffers to device specific tensors -mytensor.to(device) # copy your tensors to a device (gpu, cpu) -``` -See [cuda](https://pytorch.org/docs/stable/cuda.html) - - -# Deep Learning -``` -nn.Linear(m,n) # fully connected layer from m to n units -nn.ConvXd(m,n,s) # X dimensional conv layer from m to n channels where X⍷{1,2,3} and the kernel size is s -nn.MaxPoolXd(s) # X dimension pooling layer (notation as above) -nn.BatchNorm # batch norm layer -nn.RNN/LSTM/GRU # recurrent layers -nn.Dropout(p=0.5, inplace=False) # dropout layer for any dimensional input -nn.Dropout2d(p=0.5, inplace=False) # 2-dimensional channel-wise dropout -nn.Embedding(num_embeddings, embedding_dim) # (tensor-wise) mapping from indices to embedding vectors -``` -See [nn](https://pytorch.org/docs/stable/nn.html) - -### Loss Functions - -``` -nn.X # where X is BCELoss, CrossEntropyLoss, L1Loss, MSELoss, NLLLoss, SoftMarginLoss, MultiLabelSoftMarginLoss, CosineEmbeddingLoss, KLDivLoss, MarginRankingLoss, HingeEmbeddingLoss or CosineEmbeddingLoss -``` -See [loss functions](https://pytorch.org/docs/stable/nn.html#loss-functions) - -### Activation Functions - -``` -nn.X # where X is ReLU, ReLU6, ELU, SELU, PReLU, LeakyReLU, Threshold, HardTanh, Sigmoid, Tanh, LogSigmoid, Softplus, SoftShrink, Softsign, TanhShrink, Softmin, Softmax, Softmax2d or LogSoftmax -``` -See [activation functions](https://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity) - -### Optimizers - -``` -opt = optim.x(model.parameters(), ...) # create optimizer -opt.step() # update weights -optim.X # where X is SGD, Adadelta, Adagrad, Adam, SparseAdam, Adamax, ASGD, LBFGS, RMSProp or Rprop -``` -See [optimizers](https://pytorch.org/docs/stable/optim.html) - -### Learning rate scheduling - -``` -scheduler = optim.X(optimizer,...) # create lr scheduler -scheduler.step() # update lr at start of epoch -optim.lr_scheduler.X # where X is LambdaLR, StepLR, MultiStepLR, ExponentialLR or ReduceLROnPLateau -``` -See [learning rate scheduler](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) - - -# Data Utilities - -### Datasets - -``` -Dataset # abstract class representing dataset -TensorDataset # labelled dataset in the form of tensors -ConcatDataset # concatenation of Datasets -``` -See [datasets](https://pytorch.org/docs/stable/data.html?highlight=dataset#torch.utils.data.Dataset) - -### Dataloaders and DataSamplers - -``` -DataLoader(dataset, batch_size=1, ...) # loads data batches agnostic of structure of individual data points -sampler.Sampler(dataset,...) 
# abstract class dealing with ways to sample from dataset -sampler.XSampler # where X is Sequential, Random, Subset, WeightedRandom or Distributed -``` -See [dataloader](https://pytorch.org/docs/stable/data.html?highlight=dataloader#torch.utils.data.DataLoader) - - -## Also see - -* [Deep Learning with PyTorch: A 60 Minute Blitz](https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html) _(pytorch.org)_ -* [PyTorch Forums](https://discuss.pytorch.org/) _(discuss.pytorch.org)_ -* [PyTorch for Numpy users](https://github.com/wkentaro/pytorch-for-numpy-users) _(github.com/wkentaro/pytorch-for-numpy-users)_ diff --git a/beginner_source/README.txt b/beginner_source/README.txt index 5017c80b86b..14f3b3047e9 100644 --- a/beginner_source/README.txt +++ b/beginner_source/README.txt @@ -20,7 +20,3 @@ Beginner Tutorials 5. nlp/* and deep_learning_nlp_tutorial.rst Deep Learning for NLP with Pytorch https://pytorch.org/tutorials/beginner/deep_learning_nlp_tutorial.html - -6. transformer_translation.py - Language Translation with Transformers - https://pytorch.org/tutorials/beginner/translation_transformer.html diff --git a/beginner_source/audio_io_tutorial.rst b/beginner_source/audio_io_tutorial.rst deleted file mode 100644 index 3263ad93a98..00000000000 --- a/beginner_source/audio_io_tutorial.rst +++ /dev/null @@ -1,10 +0,0 @@ -Audio I/O -========= - -This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/audio_io_tutorial.html - -It will redirect in 3 seconds. - -.. raw:: html - - diff --git a/beginner_source/basics/README.txt b/beginner_source/basics/README.txt index 23d6282fd45..d247c7253ea 100644 --- a/beginner_source/basics/README.txt +++ b/beginner_source/basics/README.txt @@ -13,7 +13,7 @@ Learn the Basics Tensors https://pytorch.org/tutorials/beginner/basics/tensor_tutorial.html -4. dataquickstart_tutorial.py +4. data_tutorial.py Datasets & DataLoaders https://pytorch.org/tutorials/beginner/basics/data_tutorial.html diff --git a/beginner_source/basics/buildmodel_tutorial.py b/beginner_source/basics/buildmodel_tutorial.py index 987bc7c44a2..1806e80feb5 100644 --- a/beginner_source/basics/buildmodel_tutorial.py +++ b/beginner_source/basics/buildmodel_tutorial.py @@ -32,17 +32,10 @@ ############################################# # Get Device for Training # ----------------------- -# We want to be able to train our model on a hardware accelerator like the GPU or MPS, -# if available. Let's check to see if `torch.cuda `_ -# or `torch.backends.mps `_ are available, otherwise we use the CPU. - -device = ( - "cuda" - if torch.cuda.is_available() - else "mps" - if torch.backends.mps.is_available() - else "cpu" -) +# We want to be able to train our model on an `accelerator `__ +# such as CUDA, MPS, MTIA, or XPU. If the current accelerator is available, we will use it. Otherwise, we use the CPU. 
+ +device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu" print(f"Using {device} device") ############################################## diff --git a/beginner_source/basics/data_tutorial.py b/beginner_source/basics/data_tutorial.py index 561e9723fde..2c46b33122a 100644 --- a/beginner_source/basics/data_tutorial.py +++ b/beginner_source/basics/data_tutorial.py @@ -120,7 +120,7 @@ import os import pandas as pd -from torchvision.io import read_image +from torchvision.io import decode_image class CustomImageDataset(Dataset): def __init__(self, annotations_file, img_dir, transform=None, target_transform=None): @@ -134,7 +134,7 @@ def __len__(self): def __getitem__(self, idx): img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0]) - image = read_image(img_path) + image = decode_image(img_path) label = self.img_labels.iloc[idx, 1] if self.transform: image = self.transform(image) @@ -184,13 +184,13 @@ def __len__(self): # ^^^^^^^^^^^^^^^^^^^^ # # The __getitem__ function loads and returns a sample from the dataset at the given index ``idx``. -# Based on the index, it identifies the image's location on disk, converts that to a tensor using ``read_image``, retrieves the +# Based on the index, it identifies the image's location on disk, converts that to a tensor using ``decode_image``, retrieves the # corresponding label from the csv data in ``self.img_labels``, calls the transform functions on them (if applicable), and returns the # tensor image and corresponding label in a tuple. def __getitem__(self, idx): img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0]) - image = read_image(img_path) + image = decode_image(img_path) label = self.img_labels.iloc[idx, 1] if self.transform: image = self.transform(image) diff --git a/beginner_source/basics/intro.py b/beginner_source/basics/intro.py index bc0d3d72a2e..30ff5e17ff6 100644 --- a/beginner_source/basics/intro.py +++ b/beginner_source/basics/intro.py @@ -13,9 +13,9 @@ =================== Authors: -`Suraj Subramanian `_, +`Suraj Subramanian `_, `Seth Juarez `_, -`Cassie Breviu `_, +`Cassie Breviu `_, `Dmitry Soshnikov `_, `Ari Bornstein `_ @@ -43,12 +43,22 @@ If you're familiar with other deep learning frameworks, check out the `0. Quickstart `_ first to quickly familiarize yourself with PyTorch's API. -If you're new to deep learning frameworks, head right into the first section of our step-by-step guide: `1. Tensors `_. +If you're new to deep learning frameworks, head right into the first section of our step-by-step guide: `1. Tensors `_. .. include:: /beginner_source/basics/qs_toc.txt .. 
toctree:: + :maxdepth: 2 :hidden: + quickstart_tutorial + tensorqs_tutorial + data_tutorial + transforms_tutorial + buildmodel_tutorial + autogradqs_tutorial + optimization_tutorial + saveloadrun_tutorial + """ diff --git a/beginner_source/basics/optimization_tutorial.py b/beginner_source/basics/optimization_tutorial.py index c6c327f8511..82bfaa8f07c 100644 --- a/beginner_source/basics/optimization_tutorial.py +++ b/beginner_source/basics/optimization_tutorial.py @@ -76,7 +76,7 @@ def forward(self, x): # (`read more `__ about hyperparameter tuning) # # We define the following hyperparameters for training: -# - **Number of Epochs** - the number times to iterate over the dataset +# - **Number of Epochs** - the number of times to iterate over the dataset # - **Batch Size** - the number of data samples propagated through the network before the parameters are updated # - **Learning Rate** - how much to update models parameters at each batch/epoch. Smaller values yield slow learning speed, while large values may result in unpredictable behavior during training. # diff --git a/beginner_source/basics/quickstart_tutorial.py b/beginner_source/basics/quickstart_tutorial.py index 07a1be517d1..5cce8dcfe9a 100644 --- a/beginner_source/basics/quickstart_tutorial.py +++ b/beginner_source/basics/quickstart_tutorial.py @@ -84,16 +84,10 @@ # To define a neural network in PyTorch, we create a class that inherits # from `nn.Module `_. We define the layers of the network # in the ``__init__`` function and specify how data will pass through the network in the ``forward`` function. To accelerate -# operations in the neural network, we move it to the GPU or MPS if available. - -# Get cpu, gpu or mps device for training. -device = ( - "cuda" - if torch.cuda.is_available() - else "mps" - if torch.backends.mps.is_available() - else "cpu" -) +# operations in the neural network, we move it to the `accelerator `__ +# such as CUDA, MPS, MTIA, or XPU. If the current accelerator is available, we will use it. Otherwise, we use the CPU. + +device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu" print(f"Using {device} device") # Define model @@ -216,7 +210,7 @@ def test(dataloader, model, loss_fn): # the state dictionary into it. model = NeuralNetwork().to(device) -model.load_state_dict(torch.load("model.pth")) +model.load_state_dict(torch.load("model.pth", weights_only=True)) ############################################################# # This model can now be used to make predictions. diff --git a/beginner_source/basics/saveloadrun_tutorial.py b/beginner_source/basics/saveloadrun_tutorial.py index 16a9f037417..e80d32a6eaa 100644 --- a/beginner_source/basics/saveloadrun_tutorial.py +++ b/beginner_source/basics/saveloadrun_tutorial.py @@ -32,9 +32,14 @@ ########################## # To load model weights, you need to create an instance of the same model first, and then load the parameters # using ``load_state_dict()`` method. +# +# In the code below, we set ``weights_only=True`` to limit the +# functions executed during unpickling to only those necessary for +# loading weights. Using ``weights_only=True`` is considered +# a best practice when loading weights. model = models.vgg16() # we do not specify ``weights``, i.e. 
create untrained model -model.load_state_dict(torch.load('model_weights.pth')) +model.load_state_dict(torch.load('model_weights.pth', weights_only=True)) model.eval() ########################### @@ -50,9 +55,14 @@ torch.save(model, 'model.pth') ######################## -# We can then load the model like this: +# We can then load the model as demonstrated below. +# +# As described in `Saving and loading torch.nn.Modules `_, +# saving ``state_dict`` is considered the best practice. However, +# below we use ``weights_only=False`` because this involves loading the +# model, which is a legacy use case for ``torch.save``. -model = torch.load('model.pth') +model = torch.load('model.pth', weights_only=False) ######################## # .. note:: This approach uses Python `pickle `_ module when serializing the model, thus it relies on the actual class definition to be available when loading the model. diff --git a/beginner_source/basics/tensorqs_tutorial.py b/beginner_source/basics/tensorqs_tutorial.py index 70a966d9f89..30e05cb10d0 100644 --- a/beginner_source/basics/tensorqs_tutorial.py +++ b/beginner_source/basics/tensorqs_tutorial.py @@ -99,20 +99,20 @@ # Operations on Tensors # ~~~~~~~~~~~~~~~~~~~~~~~ # -# Over 100 tensor operations, including arithmetic, linear algebra, matrix manipulation (transposing, +# Over 1200 tensor operations, including arithmetic, linear algebra, matrix manipulation (transposing, # indexing, slicing), sampling and more are # comprehensively described `here `__. # -# Each of these operations can be run on the GPU (at typically higher speeds than on a -# CPU). If you’re using Colab, allocate a GPU by going to Runtime > Change runtime type > GPU. +# Each of these operations can be run on the CPU and `Accelerator `__ +# such as CUDA, MPS, MTIA, or XPU. If you’re using Colab, allocate an accelerator by going to Runtime > Change runtime type > GPU. # -# By default, tensors are created on the CPU. We need to explicitly move tensors to the GPU using -# ``.to`` method (after checking for GPU availability). Keep in mind that copying large tensors +# By default, tensors are created on the CPU. We need to explicitly move tensors to the accelerator using +# ``.to`` method (after checking for accelerator availability). Keep in mind that copying large tensors # across devices can be expensive in terms of time and memory! -# We move our tensor to the GPU if available -if torch.cuda.is_available(): - tensor = tensor.to("cuda") +# We move our tensor to the current accelerator if available +if torch.accelerator.is_available(): + tensor = tensor.to(torch.accelerator.current_accelerator()) ###################################################################### diff --git a/beginner_source/bettertransformer_tutorial.rst b/beginner_source/bettertransformer_tutorial.rst index 60ffa52ea83..76aebd839a0 100644 --- a/beginner_source/bettertransformer_tutorial.rst +++ b/beginner_source/bettertransformer_tutorial.rst @@ -1,251 +1,10 @@ Fast Transformer Inference with Better Transformer -=============================================================== +================================================== -**Author**: `Michael Gschwind `__ +This tutorial has been deprecated. -This tutorial introduces Better Transformer (BT) as part of the PyTorch 1.12 release. -In this tutorial, we show how to use Better Transformer for production -inference with torchtext. Better Transformer is a production ready fastpath to -accelerate deployment of Transformer models with high performance on CPU and GPU. 
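(An illustrative sketch that is not from the deleted tutorial: the fastpath preconditions, eval mode, no gradient recording, and a ``batch_first`` encoder with nested-tensor support, shown with core PyTorch modules only; the layer sizes and batch shape here are arbitrary.)

.. code-block:: python

    # Sketch of the Better Transformer fastpath preconditions with core modules
    # (illustrative; hyperparameters are arbitrary).
    import torch
    import torch.nn as nn

    layer = nn.TransformerEncoderLayer(d_model=64, nhead=4, batch_first=True)
    encoder = nn.TransformerEncoder(layer, num_layers=2, enable_nested_tensor=True)
    encoder.eval()                                   # inference mode is required

    src = torch.randn(8, 10, 64)                     # (batch, sequence, feature)
    padding_mask = torch.zeros(8, 10, dtype=torch.bool)

    with torch.no_grad():                            # no gradient tape
        out = encoder(src, src_key_padding_mask=padding_mask)
    print(out.shape)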
-The fastpath feature works transparently for models based either directly on -PyTorch core ``nn.module`` or with torchtext. - -Models which can be accelerated by Better Transformer fastpath execution are those -using the following PyTorch core ``torch.nn.module`` classes ``TransformerEncoder``, -``TransformerEncoderLayer``, and ``MultiHeadAttention``. In addition, torchtext has -been updated to use the core library modules to benefit from fastpath acceleration. -(Additional modules may be enabled with fastpath execution in the future.) - -Better Transformer offers two types of acceleration: - -* Native multihead attention (MHA) implementation for CPU and GPU to improve overall execution efficiency. -* Exploiting sparsity in NLP inference. Because of variable input lengths, input - tokens may contain a large number of padding tokens for which processing may be - skipped, delivering significant speedups. - -Fastpath execution is subject to some criteria. Most importantly, the model -must be executed in inference mode and operate on input tensors that do not collect -gradient tape information (e.g., running with torch.no_grad). - -To follow this example in Google Colab, `click here -`__. - -Better Transformer Features in This Tutorial --------------------------------------------- - -* Load pretrained models (created before PyTorch version 1.12 without Better Transformer) -* Run and benchmark inference on CPU with and without BT fastpath (native MHA only) -* Run and benchmark inference on (configurable) DEVICE with and without BT fastpath (native MHA only) -* Enable sparsity support -* Run and benchmark inference on (configurable) DEVICE with and without BT fastpath (native MHA + sparsity) - -Additional Information ------------------------ -Additional information about Better Transformer may be found in the PyTorch.Org blog -`A Better Transformer for Fast Transformer Inference -`__. - - - -1. Setup - -1.1 Load pretrained models - -We download the XLM-R model from the predefined torchtext models by following the instructions in -`torchtext.models `__. We also set the DEVICE to execute -on-accelerator tests. (Enable GPU execution for your environment as appropriate.) - -.. code-block:: python - - import torch - import torch.nn as nn - - print(f"torch version: {torch.__version__}") - - DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") - - print(f"torch cuda available: {torch.cuda.is_available()}") - - import torch, torchtext - from torchtext.models import RobertaClassificationHead - from torchtext.functional import to_tensor - xlmr_large = torchtext.models.XLMR_LARGE_ENCODER - classifier_head = torchtext.models.RobertaClassificationHead(num_classes=2, input_dim = 1024) - model = xlmr_large.get_model(head=classifier_head) - transform = xlmr_large.transform() - -1.2 Dataset Setup - -We set up two types of inputs: a small input batch and a big input batch with sparsity. - -.. code-block:: python - - small_input_batch = [ - "Hello world", - "How are you!" - ] - big_input_batch = [ - "Hello world", - "How are you!", - """`Well, Prince, so Genoa and Lucca are now just family estates of the - Buonapartes. But I warn you, if you don't tell me that this means war, - if you still try to defend the infamies and horrors perpetrated by - that Antichrist- I really believe he is Antichrist- I will have - nothing more to do with you and you are no longer my friend, no longer - my 'faithful slave,' as you call yourself! But how do you do? 
I see - I have frightened you- sit down and tell me all the news.` - - It was in July, 1805, and the speaker was the well-known Anna - Pavlovna Scherer, maid of honor and favorite of the Empress Marya - Fedorovna. With these words she greeted Prince Vasili Kuragin, a man - of high rank and importance, who was the first to arrive at her - reception. Anna Pavlovna had had a cough for some days. She was, as - she said, suffering from la grippe; grippe being then a new word in - St. Petersburg, used only by the elite.""" - ] - -Next, we select either the small or large input batch, preprocess the inputs and test the model. - -.. code-block:: python - - input_batch=big_input_batch - - model_input = to_tensor(transform(input_batch), padding_value=1) - output = model(model_input) - output.shape - -Finally, we set the benchmark iteration count: - -.. code-block:: python - - ITERATIONS=10 - -2. Execution - -2.1 Run and benchmark inference on CPU with and without BT fastpath (native MHA only) - -We run the model on CPU, and collect profile information: - -* The first run uses traditional ("slow path") execution. -* The second run enables BT fastpath execution by putting the model in inference mode using `model.eval()` and disables gradient collection with `torch.no_grad()`. - -You can see an improvement (whose magnitude will depend on the CPU model) when the model is executing on CPU. Notice that the fastpath profile shows most of the execution time -in the native `TransformerEncoderLayer` implementation `aten::_transformer_encoder_layer_fwd`. - -.. code-block:: python - - print("slow path:") - print("==========") - with torch.autograd.profiler.profile(use_cuda=False) as prof: - for i in range(ITERATIONS): - output = model(model_input) - print(prof) - - model.eval() - - print("fast path:") - print("==========") - with torch.autograd.profiler.profile(use_cuda=False) as prof: - with torch.no_grad(): - for i in range(ITERATIONS): - output = model(model_input) - print(prof) - - -2.2 Run and benchmark inference on (configurable) DEVICE with and without BT fastpath (native MHA only) - -We check the BT sparsity setting: - -.. code-block:: python - - model.encoder.transformer.layers.enable_nested_tensor - - -We disable the BT sparsity: - -.. code-block:: python - - model.encoder.transformer.layers.enable_nested_tensor=False - - -We run the model on DEVICE, and collect profile information for native MHA execution on DEVICE: - -* The first run uses traditional ("slow path") execution. -* The second run enables BT fastpath execution by putting the model in inference mode using `model.eval()` - and disables gradient collection with `torch.no_grad()`. - -When executing on a GPU, you should see a significant speedup, in particular for the small input batch setting: - -.. code-block:: python - - model.to(DEVICE) - model_input = model_input.to(DEVICE) - - print("slow path:") - print("==========") - with torch.autograd.profiler.profile(use_cuda=True) as prof: - for i in range(ITERATIONS): - output = model(model_input) - print(prof) - - model.eval() - - print("fast path:") - print("==========") - with torch.autograd.profiler.profile(use_cuda=True) as prof: - with torch.no_grad(): - for i in range(ITERATIONS): - output = model(model_input) - print(prof) - - -2.3 Run and benchmark inference on (configurable) DEVICE with and without BT fastpath (native MHA + sparsity) - -We enable sparsity support: - -.. 
code-block:: python - - model.encoder.transformer.layers.enable_nested_tensor = True - -We run the model on DEVICE, and collect profile information for native MHA and sparsity support execution on DEVICE: - -* The first run uses traditional ("slow path") execution. -* The second run enables BT fastpath execution by putting the model in inference mode using `model.eval()` and disables gradient collection with `torch.no_grad()`. - -When executing on a GPU, you should see a significant speedup, in particular for the large input batch setting which includes sparsity: - -.. code-block:: python - - model.to(DEVICE) - model_input = model_input.to(DEVICE) - - print("slow path:") - print("==========") - with torch.autograd.profiler.profile(use_cuda=True) as prof: - for i in range(ITERATIONS): - output = model(model_input) - print(prof) - - model.eval() - - print("fast path:") - print("==========") - with torch.autograd.profiler.profile(use_cuda=True) as prof: - with torch.no_grad(): - for i in range(ITERATIONS): - output = model(model_input) - print(prof) - - -Summary -------- - -In this tutorial, we have introduced fast transformer inference with -Better Transformer fastpath execution in torchtext using PyTorch core -Better Transformer support for Transformer Encoder models. We have -demonstrated the use of Better Transformer with models trained prior to -the availability of BT fastpath execution. We have demonstrated and -benchmarked the use of both BT fastpath execution modes, native MHA execution -and BT sparsity acceleration. +Redirecting in 3 seconds... +.. raw:: html + diff --git a/beginner_source/blitz/autograd_tutorial.py b/beginner_source/blitz/autograd_tutorial.py index 2c0d5909685..d4d0f96816e 100644 --- a/beginner_source/blitz/autograd_tutorial.py +++ b/beginner_source/blitz/autograd_tutorial.py @@ -67,7 +67,7 @@ loss.backward() # backward pass ############################################################ -# Next, we load an optimizer, in this case SGD with a learning rate of 0.01 and `momentum `__ of 0.9. +# Next, we load an optimizer, in this case SGD with a learning rate of 0.01 and `momentum `__ of 0.9. # We register all the parameters of the model in the optimizer. # @@ -191,7 +191,7 @@ # .. math:: # # -# J^{T}\cdot \vec{v}=\left(\begin{array}{ccc} +# J^{T}\cdot \vec{v} = \left(\begin{array}{ccc} # \frac{\partial y_{1}}{\partial x_{1}} & \cdots & \frac{\partial y_{m}}{\partial x_{1}}\\ # \vdots & \ddots & \vdots\\ # \frac{\partial y_{1}}{\partial x_{n}} & \cdots & \frac{\partial y_{m}}{\partial x_{n}} @@ -199,7 +199,7 @@ # \frac{\partial l}{\partial y_{1}}\\ # \vdots\\ # \frac{\partial l}{\partial y_{m}} -# \end{array}\right)=\left(\begin{array}{c} +# \end{array}\right) = \left(\begin{array}{c} # \frac{\partial l}{\partial x_{1}}\\ # \vdots\\ # \frac{\partial l}{\partial x_{n}} @@ -207,7 +207,6 @@ # # This characteristic of vector-Jacobian product is what we use in the above example; # ``external_grad`` represents :math:`\vec{v}`. -# diff --git a/beginner_source/blitz/cifar10_tutorial.py b/beginner_source/blitz/cifar10_tutorial.py index 8e3f3252921..8f19f5964c6 100644 --- a/beginner_source/blitz/cifar10_tutorial.py +++ b/beginner_source/blitz/cifar10_tutorial.py @@ -65,7 +65,8 @@ ######################################################################## # .. 
note:: -# If running on Windows and you get a BrokenPipeError, try setting +# If you are running this tutorial on Windows or macOS and encounter a +# BrokenPipeError or RuntimeError related to multiprocessing, try setting # the num_worker of torch.utils.data.DataLoader() to 0. transform = transforms.Compose( @@ -221,7 +222,7 @@ def forward(self, x): # wasn't necessary here, we only did it to illustrate how to do so): net = Net() -net.load_state_dict(torch.load(PATH)) +net.load_state_dict(torch.load(PATH, weights_only=True)) ######################################################################## # Okay, now let us see what the neural network thinks these examples above are: @@ -252,7 +253,7 @@ def forward(self, x): # calculate outputs by running images through the network outputs = net(images) # the class with the highest energy is what we choose as prediction - _, predicted = torch.max(outputs.data, 1) + _, predicted = torch.max(outputs, 1) total += labels.size(0) correct += (predicted == labels).sum().item() diff --git a/beginner_source/chatbot_tutorial.py b/beginner_source/chatbot_tutorial.py index 44310cc3620..520c934d965 100644 --- a/beginner_source/chatbot_tutorial.py +++ b/beginner_source/chatbot_tutorial.py @@ -84,8 +84,7 @@ # Preparations # ------------ # -# To start, Download the data ZIP file -# `here `__ +# To get started, `download `__ the Movie-Dialogs Corpus zip file # and put in a ``data/`` directory under the current directory. # @@ -109,8 +108,10 @@ import json -USE_CUDA = torch.cuda.is_available() -device = torch.device("cuda" if USE_CUDA else "cpu") +# If the current `accelerator `__ is available, +# we will use it. Otherwise, we use the CPU. +device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu" +print(f"Using {device} device") ###################################################################### @@ -1129,7 +1130,7 @@ def forward(self, input_seq, input_length, max_length): # Forward input through encoder model encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length) # Prepare encoder's final hidden layer to be first hidden input to the decoder - decoder_hidden = encoder_hidden[:decoder.n_layers] + decoder_hidden = encoder_hidden[:self.decoder.n_layers] # Initialize decoder input with SOS_token decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token # Initialize tensors to append decoded words to @@ -1319,16 +1320,16 @@ def evaluateInput(encoder, decoder, searcher, voc): encoder_optimizer.load_state_dict(encoder_optimizer_sd) decoder_optimizer.load_state_dict(decoder_optimizer_sd) -# If you have CUDA, configure CUDA to call +# If you have an accelerator, configure it to call for state in encoder_optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): - state[k] = v.cuda() + state[k] = v.to(device) for state in decoder_optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): - state[k] = v.cuda() + state[k] = v.to(device) # Run training iterations print("Starting Training!") diff --git a/beginner_source/colab.rst b/beginner_source/colab.rst index 812255704e7..e5106a2c81a 100644 --- a/beginner_source/colab.rst +++ b/beginner_source/colab.rst @@ -10,7 +10,7 @@ run PyTorch tutorials in Google Colab.
PyTorch Version in Google Colab ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -When you are running a tutorial that requires a version of PyTorch that has +When you are running a tutorial that requires a version of PyTorch that has just been released, that version might not be yet available in Google Colab. To check that you have the required ``torch`` and compatible domain libraries installed, run ``!pip list``. diff --git a/beginner_source/data_loading_tutorial.py b/beginner_source/data_loading_tutorial.py index ab9de0d7d73..8f21bb7bff6 100644 --- a/beginner_source/data_loading_tutorial.py +++ b/beginner_source/data_loading_tutorial.py @@ -445,7 +445,7 @@ def show_landmarks_batch(sample_batched): # from torchvision import transforms, datasets # # data_transform = transforms.Compose([ -# transforms.RandomSizedCrop(224), +# transforms.RandomResizedCrop(224), # transforms.RandomHorizontalFlip(), # transforms.ToTensor(), # transforms.Normalize(mean=[0.485, 0.456, 0.406], diff --git a/beginner_source/ddp_series_fault_tolerance.rst b/beginner_source/ddp_series_fault_tolerance.rst index 2bb0d528d1b..27fe7e273e7 100644 --- a/beginner_source/ddp_series_fault_tolerance.rst +++ b/beginner_source/ddp_series_fault_tolerance.rst @@ -9,7 +9,7 @@ Fault-tolerant Distributed Training with ``torchrun`` ===================================================== -Authors: `Suraj Subramanian `__ +Authors: `Suraj Subramanian `__ .. grid:: 2 diff --git a/beginner_source/ddp_series_intro.rst b/beginner_source/ddp_series_intro.rst index 527a3cc1ce0..9aee5d8a5df 100644 --- a/beginner_source/ddp_series_intro.rst +++ b/beginner_source/ddp_series_intro.rst @@ -7,7 +7,7 @@ Distributed Data Parallel in PyTorch - Video Tutorials ====================================================== -Authors: `Suraj Subramanian `__ +Authors: `Suraj Subramanian `__ Follow along with the video below or on `youtube `__. diff --git a/beginner_source/ddp_series_multigpu.rst b/beginner_source/ddp_series_multigpu.rst index f8335ba8cf4..ef6549d4de0 100644 --- a/beginner_source/ddp_series_multigpu.rst +++ b/beginner_source/ddp_series_multigpu.rst @@ -9,7 +9,7 @@ Multi GPU training with DDP =========================== -Authors: `Suraj Subramanian `__ +Authors: `Suraj Subramanian `__ .. grid:: 2 @@ -19,13 +19,13 @@ Authors: `Suraj Subramanian `__ - How to migrate a single-GPU training script to multi-GPU via DDP - Setting up the distributed process group - Saving and loading models in a distributed setup - + .. grid:: 1 .. grid-item:: :octicon:`code-square;1.0em;` View the code used in this tutorial on `GitHub `__ - + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites :class-card: card-prerequisites @@ -45,11 +45,11 @@ In the `previous tutorial `__, we got a high-level overv In this tutorial, we start with a single-GPU training script and migrate that to running it on 4 GPUs on a single node. Along the way, we will talk through important concepts in distributed training while implementing them in our code. -.. note:: +.. note:: If your model contains any ``BatchNorm`` layers, it needs to be converted to ``SyncBatchNorm`` to sync the running stats of ``BatchNorm`` layers across replicas. - Use the helper function + Use the helper function `torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) `__ to convert all ``BatchNorm`` layers in the model to ``SyncBatchNorm``. @@ -58,27 +58,27 @@ Diff for `single_gpu.py `__, which sets the default GPU for each process.
This is important to prevent hangs or excessive memory utilization on `GPU:0` @@ -90,66 +90,66 @@ Constructing the process group - Read more about `choosing a DDP backend `__ -.. code-block:: diff +.. code-block:: python - + def ddp_setup(rank: int, world_size: int): - + """ - + Args: - + rank: Unique identifier of each process - + world_size: Total number of processes - + """ - + os.environ["MASTER_ADDR"] = "localhost" - + os.environ["MASTER_PORT"] = "12355" - + torch.cuda.set_device(rank) - + init_process_group(backend="nccl", rank=rank, world_size=world_size) + def ddp_setup(rank: int, world_size: int): + """ + Args: + rank: Unique identifier of each process + world_size: Total number of processes + """ + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + torch.cuda.set_device(rank) + init_process_group(backend="nccl", rank=rank, world_size=world_size) Constructing the DDP model -~~~~~~~~~~~~~~~~~~~~~~~~~~ +-------------------------- -.. code-block:: diff +.. code-block:: python - - self.model = model.to(gpu_id) - + self.model = DDP(model, device_ids=[gpu_id]) + self.model = DDP(model, device_ids=[gpu_id]) Distributing input data -~~~~~~~~~~~~~~~~~~~~~~~ +----------------------- - `DistributedSampler `__ chunks the input data across all distributed processes. +- The `DataLoader `__ combines a dataset and a + sampler, and provides an iterable over the given dataset. - Each process will receive an input batch of 32 samples; the effective batch size is ``32 * nprocs``, or 128 when using 4 GPUs. -.. code-block:: diff +.. code-block:: python train_data = torch.utils.data.DataLoader( dataset=train_dataset, batch_size=32, - - shuffle=True, - + shuffle=False, - + sampler=DistributedSampler(train_dataset), + shuffle=False, # We don't shuffle + sampler=DistributedSampler(train_dataset), # Use the Distributed Sampler here. ) -- Calling the ``set_epoch()`` method on the ``DistributedSampler`` at the beginning of each epoch is necessary to make shuffling work +- Calling the ``set_epoch()`` method on the ``DistributedSampler`` at the beginning of each epoch is necessary to make shuffling work properly across multiple epochs. Otherwise, the same ordering will be used in each epoch. -.. code-block:: diff +.. code-block:: python def _run_epoch(self, epoch): b_sz = len(next(iter(self.train_data))[0]) - + self.train_data.sampler.set_epoch(epoch) + self.train_data.sampler.set_epoch(epoch) # call this additional line at every epoch for source, targets in self.train_data: ... self._run_batch(source, targets) Saving model checkpoints -~~~~~~~~~~~~~~~~~~~~~~~~ -- We only need to save model checkpoints from one process. Without this +------------------------ +- We only need to save model checkpoints from one process. Without this condition, each process would save its copy of the identical mode. Read more on saving and loading models with - DDP `here `__ + DDP `here `__ .. code-block:: diff @@ -164,18 +164,18 @@ Saving model checkpoints .. warning:: `Collective calls `__ are functions that run on all the distributed processes, and they are used to gather certain states or values to a specific process. Collective calls require all ranks to run the collective code. - In this example, `_save_checkpoint` should not have any collective calls because it is only run on the ``rank:0`` process. + In this example, `_save_checkpoint` should not have any collective calls because it is only run on the ``rank:0`` process. 
If you need to make any collective calls, it should be before the ``if self.gpu_id == 0`` check. Running the distributed training job -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------ - Include new arguments ``rank`` (replacing ``device``) and ``world_size``. - ``rank`` is auto-allocated by DDP when calling `mp.spawn `__. -- ``world_size`` is the number of processes across the training job. For GPU training, +- ``world_size`` is the number of processes across the training job. For GPU training, this corresponds to the number of GPUs in use, and each process works on a dedicated GPU. .. code-block:: diff @@ -189,7 +189,7 @@ Running the distributed training job + trainer = Trainer(model, train_data, optimizer, rank, save_every) trainer.train(total_epochs) + destroy_process_group() - + if __name__ == "__main__": import sys total_epochs = int(sys.argv[1]) @@ -199,6 +199,24 @@ Running the distributed training job + world_size = torch.cuda.device_count() + mp.spawn(main, args=(world_size, total_epochs, save_every,), nprocs=world_size) +Here's what the code looks like: + +.. code-block:: python + def main(rank, world_size, total_epochs, save_every): + ddp_setup(rank, world_size) + dataset, model, optimizer = load_train_objs() + train_data = prepare_dataloader(dataset, batch_size=32) + trainer = Trainer(model, train_data, optimizer, rank, save_every) + trainer.train(total_epochs) + destroy_process_group() + + if __name__ == "__main__": + import sys + total_epochs = int(sys.argv[1]) + save_every = int(sys.argv[2]) + world_size = torch.cuda.device_count() + mp.spawn(main, args=(world_size, total_epochs, save_every,), nprocs=world_size) + Further Reading @@ -206,6 +224,6 @@ Further Reading - `Fault Tolerant distributed training `__ (next tutorial in this series) - `Intro to DDP `__ (previous tutorial in this series) -- `Getting Started with DDP `__ +- `Getting Started with DDP `__ - `Process Group - initialization `__ + Initialization `__ diff --git a/beginner_source/ddp_series_theory.rst b/beginner_source/ddp_series_theory.rst index 8957ab6ec4b..ade98d9f01c 100644 --- a/beginner_source/ddp_series_theory.rst +++ b/beginner_source/ddp_series_theory.rst @@ -7,7 +7,7 @@ What is Distributed Data Parallel (DDP) ======================================= -Authors: `Suraj Subramanian `__ +Authors: `Suraj Subramanian `__ .. grid:: 2 diff --git a/beginner_source/deeplabv3_on_android.rst b/beginner_source/deeplabv3_on_android.rst index f2fe0e48f15..7ec83477373 100644 --- a/beginner_source/deeplabv3_on_android.rst +++ b/beginner_source/deeplabv3_on_android.rst @@ -1,230 +1,10 @@ Image Segmentation DeepLabV3 on Android ================================================= -**Author**: `Jeff Tang `_ +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -**Reviewed by**: `Jeremiah Chung `_ +Redirecting in 3 seconds... -Introduction ------------- +.. raw:: html -Semantic image segmentation is a computer vision task that uses semantic labels to mark specific regions of an input image. The PyTorch semantic image segmentation `DeepLabV3 model `_ can be used to label image regions with `20 semantic classes `_ including, for example, bicycle, bus, car, dog, and person. Image segmentation models can be very useful in applications such as autonomous driving and scene understanding. 
- -In this tutorial, we will provide a step-by-step guide on how to prepare and run the PyTorch DeepLabV3 model on Android, taking you from the beginning of having a model you may want to use on Android to the end of having a complete Android app using the model. We will also cover practical and general tips on how to check if your next favorable pretrained PyTorch models can run on Android, and how to avoid pitfalls. - -.. note:: Before going through this tutorial, you should check out `PyTorch Mobile for Android `_ and give the PyTorch Android `Hello World `_ example app a quick try. This tutorial will go beyond the image classification model, usually the first kind of model deployed on mobile. The complete code for this tutorial is available `here `_. - -Learning Objectives -------------------- - -In this tutorial, you will learn how to: - -1. Convert the DeepLabV3 model for Android deployment. - -2. Get the output of the model for the example input image in Python and compare it to the output from the Android app. - -3. Build a new Android app or reuse an Android example app to load the converted model. - -4. Prepare the input into the format that the model expects and process the model output. - -5. Complete the UI, refactor, build and run the app to see image segmentation in action. - -Prerequisites ---------------- - -* PyTorch 1.6 or 1.7 - -* torchvision 0.7 or 0.8 - -* Android Studio 3.5.1 or above with NDK installed - -Steps ---------- - -1. Convert the DeepLabV3 model for Android deployment -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The first step to deploying a model on Android is to convert the model into the `TorchScript `_ format. - -.. note:: - Not all PyTorch models can be converted to TorchScript at this time because a model definition may use language features that are not in TorchScript, which is a subset of Python. See the `Script and Optimize Recipe <../recipes/script_optimized.html>`_ for more details. - -Simply run the script below to generate the scripted model `deeplabv3_scripted.pt`: - -:: - - import torch - - # use deeplabv3_resnet50 instead of resnet101 to reduce the model size - model = torch.hub.load('pytorch/vision:v0.7.0', 'deeplabv3_resnet50', pretrained=True) - model.eval() - - scriptedm = torch.jit.script(model) - torch.jit.save(scriptedm, "deeplabv3_scripted.pt") - -The size of the generated `deeplabv3_scripted.pt` model file should be around 168MB. Ideally, a model should also be quantized for significant size reduction and faster inference before being deployed on an Android app. To have a general understanding of quantization, see the `Quantization Recipe <../recipes/quantization.html>`_ and the resource links there. We will cover in detail how to correctly apply a quantization workflow called Post Training `Static Quantization `_ to the DeepLabV3 model in a future tutorial or recipe. - -2. Get example input and output of the model in Python -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Now that we have a scripted PyTorch model, let's test with some example inputs to make sure the model works correctly on Android. First, let's write a Python script that uses the model to make inferences and examine inputs and outputs. For this example of the DeepLabV3 model, we can reuse the code in Step 1 and in the `DeepLabV3 model hub site `_. 
Add the following code snippet to the code above: - -:: - - from PIL import Image - from torchvision import transforms - input_image = Image.open("deeplab.jpg") - preprocess = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ]) - - input_tensor = preprocess(input_image) - input_batch = input_tensor.unsqueeze(0) - with torch.no_grad(): - output = model(input_batch)['out'][0] - - print(input_batch.shape) - print(output.shape) - -Download `deeplab.jpg` from `here `_, then run the script above and you will see the shapes of the input and output of the model: - -:: - - torch.Size([1, 3, 400, 400]) - torch.Size([21, 400, 400]) - -So if you provide the same image input `deeplab.jpg` of size 400x400 to the model on Android, the output of the model should have the size [21, 400, 400]. You should also print out at least the beginning parts of the actual data of the input and output, to be used in Step 4 below to compare with the actual input and output of the model when running in the Android app. - -3. Build a new Android app or reuse an example app and load the model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -First, follow Step 3 of the `Model Preparation for Android recipe <../recipes/model_preparation_android.html#add-the-model-and-pytorch-library-on-android>`_ to use our model in an Android Studio project with PyTorch Mobile enabled. Because both DeepLabV3 used in this tutorial and MobileNet v2 used in the PyTorch Hello World Android example are computer vision models, you can also get the `Hello World example repo `_ to make it easier to modify the code that loads the model and processes the input and output. The main goal in this step and Step 4 is to make sure the model `deeplabv3_scripted.pt` generated in Step 1 can indeed work correctly on Android. - -Now let's add `deeplabv3_scripted.pt` and `deeplab.jpg` used in Step 2 to the Android Studio project and modify the `onCreate` method in the `MainActivity` to resemble: - -.. code-block:: java - - Module module = null; - try { - module = Module.load(assetFilePath(this, "deeplabv3_scripted.pt")); - } catch (IOException e) { - Log.e("ImageSegmentation", "Error loading model!", e); - finish(); - } - -Then set a breakpoint at the line `finish()` and build and run the app. If the app doesn't stop at the breakpoint, it means that the scripted model in Step 1 has been successfully loaded on Android. - -4. Process the model input and output for model inference -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -After the model loads in the previous step, let's verify that it works with expected inputs and can generate expected outputs. As the model input for the DeepLabV3 model is an image the same as that of the MobileNet v2 in the Hello World example, we will reuse some of the code in the `MainActivity.java `_ file from Hello World for input processing. Replace the code snippet between `line 50 `_ and 73 in `MainActivity.java` with the following code: - -.. 
code-block:: java - - final Tensor inputTensor = TensorImageUtils.bitmapToFloat32Tensor(bitmap, - TensorImageUtils.TORCHVISION_NORM_MEAN_RGB, - TensorImageUtils.TORCHVISION_NORM_STD_RGB); - final float[] inputs = inputTensor.getDataAsFloatArray(); - - Map outTensors = - module.forward(IValue.from(inputTensor)).toDictStringKey(); - - // the key "out" of the output tensor contains the semantic masks - // see https://pytorch.org/hub/pytorch_vision_deeplabv3_resnet101 - final Tensor outputTensor = outTensors.get("out").toTensor(); - final float[] outputs = outputTensor.getDataAsFloatArray(); - - int width = bitmap.getWidth(); - int height = bitmap.getHeight(); - -.. note:: - The model output is a dictionary for the DeepLabV3 model so we use `toDictStringKey` to correctly extract the result. For other models, the model output may also be a single tensor or a tuple of tensors, among other things. - -With the code changes shown above, you can set breakpoints after `final float[] inputs` and `final float[] outputs`, which populate the input tensor and output tensor data to float arrays for easy debugging. Run the app and when it stops at the breakpoints, compare the numbers in `inputs` and `outputs` with the model input and output data you see in Step 2 to see if they match. For the same inputs to the models running on Android and Python, you should get the same outputs. - -.. warning:: - You may see different model outputs with the same image input when running on an Android emulator due to some Android emulator's floating point implementation issue. So it is best to test the app on a real Android device. - -All we have done so far is to confirm that the model of our interest can be scripted and run correctly in our Android app as in Python. The steps we walked through so far for using a model in an iOS app consumes the bulk, if not most, of our app development time, similar to how data preprocessing is the heaviest lift for a typical machine learning project. - -5. Complete the UI, refactor, build and run the app -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Now we are ready to complete the app and the UI to actually see the processed result as a new image. The output processing code should be like this, added to the end of the code snippet in Step 4: - -.. code-block:: java - - int[] intValues = new int[width * height]; - // go through each element in the output of size [WIDTH, HEIGHT] and - // set different color for different classnum - for (int j = 0; j < width; j++) { - for (int k = 0; k < height; k++) { - // maxi: the index of the 21 CLASSNUM with the max probability - int maxi = 0, maxj = 0, maxk = 0; - double maxnum = -100000.0; - for (int i=0; i < CLASSNUM; i++) { - if (outputs[i*(width*height) + j*width + k] > maxnum) { - maxnum = outputs[i*(width*height) + j*width + k]; - maxi = i; maxj = j; maxk= k; - } - } - // color coding for person (red), dog (green), sheep (blue) - // black color for background and other classes - if (maxi == PERSON) - intValues[maxj*width + maxk] = 0xFFFF0000; // red - else if (maxi == DOG) - intValues[maxj*width + maxk] = 0xFF00FF00; // green - else if (maxi == SHEEP) - intValues[maxj*width + maxk] = 0xFF0000FF; // blue - else - intValues[maxj*width + maxk] = 0xFF000000; // black - } - } - -The constants used in the code above are defined in the beginning of the class `MainActivity`: - -.. 
code-block:: java - - private static final int CLASSNUM = 21; - private static final int DOG = 12; - private static final int PERSON = 15; - private static final int SHEEP = 17; - - -The implementation here is based on the understanding of the DeepLabV3 model which outputs a tensor of size [21, width, height] for an input image of width*height. Each element in the width*height output array is a value between 0 and 20 (for a total of 21 semantic labels described in Introduction) and the value is used to set a specific color. Color coding of the segmentation here is based on the class with the highest probability, and you can extend the color coding for all classes in your own dataset. - -After the output processing, you will also need to call the code below to render the RGB `intValues` array to a bitmap instance `outputBitmap` before displaying it on an `ImageView`: - -.. code-block:: java - - Bitmap bmpSegmentation = Bitmap.createScaledBitmap(bitmap, width, height, true); - Bitmap outputBitmap = bmpSegmentation.copy(bmpSegmentation.getConfig(), true); - outputBitmap.setPixels(intValues, 0, outputBitmap.getWidth(), 0, 0, - outputBitmap.getWidth(), outputBitmap.getHeight()); - imageView.setImageBitmap(outputBitmap); - -The UI for this app is also similar to that for Hello World, except that you do not need the `TextView` to show the image classification result. You can also add two buttons `Segment` and `Restart` as shown in the code repository to run the model inference and to show back the original image after the segmentation result is shown. - -Now when you run the app on an Android emulator or preferably an actual device, you will see screens like the following: - -.. image:: /_static/img/deeplabv3_android.png - :width: 300 px -.. image:: /_static/img/deeplabv3_android2.png - :width: 300 px - - -Recap --------- - -In this tutorial, we described what it takes to convert a pretrained PyTorch DeepLabV3 model for Android and how to make sure the model can run successfully on Android. Our focus was to help you understand the process of confirming that a model can indeed run on Android. The complete code repository is available `here `_. - -More advanced topics such as quantization and using models via transfer learning or of your own on Android will be covered soon in future demo apps and tutorials. - - -Learn More ------------- - -1. `PyTorch Mobile site `_ -2. `DeepLabV3 model `_ -3. `DeepLabV3 paper `_ + diff --git a/beginner_source/deeplabv3_on_ios.rst b/beginner_source/deeplabv3_on_ios.rst index 5a88c703bd8..66c052419fc 100644 --- a/beginner_source/deeplabv3_on_ios.rst +++ b/beginner_source/deeplabv3_on_ios.rst @@ -1,248 +1,10 @@ Image Segmentation DeepLabV3 on iOS ============================================== -**Author**: `Jeff Tang `_ +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -**Reviewed by**: `Jeremiah Chung `_ +Redirecting in 3 seconds... -Introduction ------------- +.. raw:: html -Semantic image segmentation is a computer vision task that uses semantic labels to mark specific regions of an input image. The PyTorch semantic image segmentation `DeepLabV3 model `_ can be used to label image regions with `20 semantic classes `_ including, for example, bicycle, bus, car, dog, and person. Image segmentation models can be very useful in applications such as autonomous driving and scene understanding. 
- -In this tutorial, we will provide a step-by-step guide on how to prepare and run the PyTorch DeepLabV3 model on iOS, taking you from the beginning of having a model you may want to use on iOS to the end of having a complete iOS app using the model. We will also cover practical and general tips on how to check if your next favorite pretrained PyTorch models can run on iOS, and how to avoid pitfalls. - -.. note:: Before going through this tutorial, you should check out `PyTorch Mobile for iOS `_ and give the PyTorch iOS `HelloWorld `_ example app a quick try. This tutorial will go beyond the image classification model, usually the first kind of model deployed on mobile. The complete code for this tutorial is available `here `_. - -Learning Objectives -------------------- - -In this tutorial, you will learn how to: - -1. Convert the DeepLabV3 model for iOS deployment. - -2. Get the output of the model for the example input image in Python and compare it to the output from the iOS app. - -3. Build a new iOS app or reuse an iOS example app to load the converted model. - -4. Prepare the input into the format that the model expects and process the model output. - -5. Complete the UI, refactor, build and run the app to see image segmentation in action. - -Prerequisites ---------------- - -* PyTorch 1.6 or 1.7 - -* torchvision 0.7 or 0.8 - -* Xcode 11 or 12 - -Steps ---------- - - -1. Convert the DeepLabV3 model for iOS deployment -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The first step to deploying a model on iOS is to convert the model into the `TorchScript `_ format. - -.. note:: - Not all PyTorch models can be converted to TorchScript at this time because a model definition may use language features that are not in TorchScript, which is a subset of Python. See the `Script and Optimize Recipe <../recipes/script_optimized.html>`_ for more details. - -Simply run the script below to generate the scripted model `deeplabv3_scripted.pt`: - -:: - - import torch - - # use deeplabv3_resnet50 instead of deeplabv3_resnet101 to reduce the model size - model = torch.hub.load('pytorch/vision:v0.8.0', 'deeplabv3_resnet50', pretrained=True) - model.eval() - - scriptedm = torch.jit.script(model) - torch.jit.save(scriptedm, "deeplabv3_scripted.pt") - -The size of the generated `deeplabv3_scripted.pt` model file should be around 168MB. Ideally, a model should also be quantized for significant size reduction and faster inference before being deployed on an iOS app. To have a general understanding of quantization, see the `Quantization Recipe <../recipes/quantization.html>`_ and the resource links there. We will cover in detail how to correctly apply a quantization workflow called Post Training `Static Quantization `_ to the DeepLabV3 model in a future tutorial or recipe. - -2. Get example input and output of the model in Python -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Now that we have a scripted PyTorch model, let's test with some example inputs to make sure the model works correctly on iOS. First, let's write a Python script that uses the model to make inferences and examine inputs and outputs. For this example of the DeepLabV3 model, we can reuse the code in Step 1 and in the `DeepLabV3 model hub site `_. 
Add the following code snippet to the code above: - -:: - - from PIL import Image - from torchvision import transforms - input_image = Image.open("deeplab.jpg") - preprocess = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ]) - - input_tensor = preprocess(input_image) - input_batch = input_tensor.unsqueeze(0) - with torch.no_grad(): - output = model(input_batch)['out'][0] - - print(input_batch.shape) - print(output.shape) - -Download `deeplab.jpg` from `here `_ and run the script above to see the shapes of the input and output of the model: - -:: - - torch.Size([1, 3, 400, 400]) - torch.Size([21, 400, 400]) - -So if you provide the same image input `deeplab.jpg` of size 400x400 to the model on iOS, the output of the model should have the size [21, 400, 400]. You should also print out at least the beginning parts of the actual data of the input and output, to be used in Step 4 below to compare with the actual input and output of the model when running in the iOS app. - -3. Build a new iOS app or reuse an example app and load the model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -First, follow Step 3 of the `Model Preparation for iOS recipe <../recipes/model_preparation_ios.html#add-the-model-and-pytorch-library-on-ios>`_ to use our model in an Xcode project with PyTorch Mobile enabled. Because both the DeepLabV3 model used in this tutorial and the MobileNet v2 model used in the PyTorch Hello World iOS example are computer vision models, you may choose to start with the `HelloWorld example repo `_ as a template to reuse the code that loads the model and processes the input and output. - -Now let's add `deeplabv3_scripted.pt` and `deeplab.jpg` used in Step 2 to the Xcode project and modify `ViewController.swift` to resemble: - -.. code-block:: swift - - class ViewController: UIViewController { - var image = UIImage(named: "deeplab.jpg")! - - override func viewDidLoad() { - super.viewDidLoad() - } - - private lazy var module: TorchModule = { - if let filePath = Bundle.main.path(forResource: "deeplabv3_scripted", - ofType: "pt"), - let module = TorchModule(fileAtPath: filePath) { - return module - } else { - fatalError("Can't load the model file!") - } - }() - } - -Then set a breakpoint at the line `return module` and build and run the app. The app should stop at the breakpoint, meaning that the scripted model in Step 1 has been successfully loaded on iOS. - -4. Process the model input and output for model inference -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -After the model loads in the previous step, let's verify that it works with expected inputs and can generate expected outputs. As the model input for the DeepLabV3 model is an image, the same as that of the MobileNet v2 in the Hello World example, we will reuse some of the code in the `TorchModule.mm `_ file from Hello World for input processing. Replace the `predictImage` method implementation in `TorchModule.mm` with the following code: - -.. code-block:: objective-c - - - (unsigned char*)predictImage:(void*)imageBuffer { - // 1. the example deeplab.jpg size is size 400x400 and there are 21 semantic classes - const int WIDTH = 400; - const int HEIGHT = 400; - const int CLASSNUM = 21; - - at::Tensor tensor = torch::from_blob(imageBuffer, {1, 3, WIDTH, HEIGHT}, at::kFloat); - torch::autograd::AutoGradMode guard(false); - at::AutoNonVariableTypeMode non_var_type_mode(true); - - // 2. 
convert the input tensor to an NSMutableArray for debugging - float* floatInput = tensor.data_ptr(); - if (!floatInput) { - return nil; - } - NSMutableArray* inputs = [[NSMutableArray alloc] init]; - for (int i = 0; i < 3 * WIDTH * HEIGHT; i++) { - [inputs addObject:@(floatInput[i])]; - } - - // 3. the output of the model is a dictionary of string and tensor, as - // specified at https://pytorch.org/hub/pytorch_vision_deeplabv3_resnet101 - auto outputDict = _impl.forward({tensor}).toGenericDict(); - - // 4. convert the output to another NSMutableArray for easy debugging - auto outputTensor = outputDict.at("out").toTensor(); - float* floatBuffer = outputTensor.data_ptr(); - if (!floatBuffer) { - return nil; - } - NSMutableArray* results = [[NSMutableArray alloc] init]; - for (int i = 0; i < CLASSNUM * WIDTH * HEIGHT; i++) { - [results addObject:@(floatBuffer[i])]; - } - - return nil; - } - -.. note:: - The model output is a dictionary for the DeepLabV3 model so we use `toGenericDict` to correctly extract the result. For other models, the model output may also be a single tensor or a tuple of tensors, among other things. - -With the code changes shown above, you can set breakpoints after the two for loops that populate `inputs` and `results` and compare them with the model input and output data you saw in Step 2 to see if they match. For the same inputs to the models running on iOS and Python, you should get the same outputs. - -All we have done so far is to confirm that the model of our interest can be scripted and run correctly in our iOS app as in Python. The steps we walked through so far for using a model in an iOS app consumes the bulk, if not most, of our app development time, similar to how data preprocessing is the heaviest lift for a typical machine learning project. - -5. Complete the UI, refactor, build and run the app -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Now we are ready to complete the app and the UI to actually see the processed result as a new image. The output processing code should be like this, added to the end of the code snippet in Step 4 in `TorchModule.mm` - remember to first remove the line `return nil;` temporarily put there to make the code build and run: - -.. 
code-block:: objective-c - - // see the 20 semantic classes link in Introduction - const int DOG = 12; - const int PERSON = 15; - const int SHEEP = 17; - - NSMutableData* data = [NSMutableData dataWithLength: - sizeof(unsigned char) * 3 * WIDTH * HEIGHT]; - unsigned char* buffer = (unsigned char*)[data mutableBytes]; - // go through each element in the output of size [WIDTH, HEIGHT] and - // set different color for different classnum - for (int j = 0; j < WIDTH; j++) { - for (int k = 0; k < HEIGHT; k++) { - // maxi: the index of the 21 CLASSNUM with the max probability - int maxi = 0, maxj = 0, maxk = 0; - float maxnum = -100000.0; - for (int i = 0; i < CLASSNUM; i++) { - if ([results[i * (WIDTH * HEIGHT) + j * WIDTH + k] floatValue] > maxnum) { - maxnum = [results[i * (WIDTH * HEIGHT) + j * WIDTH + k] floatValue]; - maxi = i; maxj = j; maxk = k; - } - } - int n = 3 * (maxj * width + maxk); - // color coding for person (red), dog (green), sheep (blue) - // black color for background and other classes - buffer[n] = 0; buffer[n+1] = 0; buffer[n+2] = 0; - if (maxi == PERSON) buffer[n] = 255; - else if (maxi == DOG) buffer[n+1] = 255; - else if (maxi == SHEEP) buffer[n+2] = 255; - } - } - return buffer; - -The implementation here is based on the understanding of the DeepLabV3 model which outputs a tensor of size [21, width, height] for an input image of width*height. Each element in the width*height output array is a value between 0 and 20 (for a total of 21 semantic labels described in Introduction) and the value is used to set a specific color. Color coding of the segmentation here is based on the class with the highest probability, and you can extend the color coding for all classes in your own dataset. - -After the output processing, you will also need to call a helper function to convert the RGB `buffer` to an `UIImage` instance to be shown on `UIImageView`. You can refer to the example code `convertRGBBufferToUIImage` defined in `UIImageHelper.mm` in the code repository. - -The UI for this app is also similar to that for Hello World, except that you do not need the `UITextView` to show the image classification result. You can also add two buttons `Segment` and `Restart` as shown in the code repository to run the model inference and to show back the original image after the segmentation result is shown. - -The last step before we can run the app is to connect all the pieces together. Modify the `ViewController.swift` file to use the `predictImage`, which is refactored and changed to `segmentImage` in the repository, and helper functions you built as shown in the example code in the repository in `ViewController.swift`. Connect the buttons to the actions and you should be good to go. - -Now when you run the app on an iOS simulator or an actual iOS device, you will see the following screens: - -.. image:: /_static/img/deeplabv3_ios.png - :width: 300 px -.. image:: /_static/img/deeplabv3_ios2.png - :width: 300 px - - -Recap --------- - -In this tutorial, we described what it takes to convert a pretrained PyTorch DeepLabV3 model for iOS and how to make sure the model can run successfully on iOS. Our focus was to help you understand the process of confirming that a model can indeed run on iOS. The complete code repository is available `here `_. - -More advanced topics such as quantization and using models via transfer learning or of your own on iOS will be covered soon in future demo apps and tutorials. - -Learn More ------------- - -1. `PyTorch Mobile site `_ -2. `DeepLabV3 model `_ -3. 
`DeepLabV3 paper `_ + diff --git a/beginner_source/deploy_seq2seq_hybrid_frontend_tutorial.py b/beginner_source/deploy_seq2seq_hybrid_frontend_tutorial.py deleted file mode 100644 index 508fa5a057a..00000000000 --- a/beginner_source/deploy_seq2seq_hybrid_frontend_tutorial.py +++ /dev/null @@ -1,875 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Deploying a Seq2Seq Model with TorchScript -================================================== -**Author:** `Matthew Inkawhich `_ -""" - - -###################################################################### -# This tutorial will walk through the process of transitioning a -# sequence-to-sequence model to TorchScript using the TorchScript -# API. The model that we will convert is the chatbot model from the -# `Chatbot tutorial `__. -# You can either treat this tutorial as a “Part 2” to the Chatbot tutorial -# and deploy your own pretrained model, or you can start with this -# document and use a pretrained model that we host. In the latter case, -# you can reference the original Chatbot tutorial for details -# regarding data preprocessing, model theory and definition, and model -# training. -# -# What is TorchScript? -# ---------------------------- -# -# During the research and development phase of a deep learning-based -# project, it is advantageous to interact with an **eager**, imperative -# interface like PyTorch’s. This gives users the ability to write -# familiar, idiomatic Python, allowing for the use of Python data -# structures, control flow operations, print statements, and debugging -# utilities. Although the eager interface is a beneficial tool for -# research and experimentation applications, when it comes time to deploy -# the model in a production environment, having a **graph**-based model -# representation is very beneficial. A deferred graph representation -# allows for optimizations such as out-of-order execution, and the ability -# to target highly optimized hardware architectures. Also, a graph-based -# representation enables framework-agnostic model exportation. PyTorch -# provides mechanisms for incrementally converting eager-mode code into -# TorchScript, a statically analyzable and optimizable subset of Python -# that Torch uses to represent deep learning programs independently from -# the Python runtime. -# -# The API for converting eager-mode PyTorch programs into TorchScript is -# found in the ``torch.jit`` module. This module has two core modalities for -# converting an eager-mode model to a TorchScript graph representation: -# **tracing** and **scripting**. The ``torch.jit.trace`` function takes a -# module or function and a set of example inputs. It then runs the example -# input through the function or module while tracing the computational -# steps that are encountered, and outputs a graph-based function that -# performs the traced operations. **Tracing** is great for straightforward -# modules and functions that do not involve data-dependent control flow, -# such as standard convolutional neural networks. However, if a function -# with data-dependent if statements and loops is traced, only the -# operations called along the execution route taken by the example input -# will be recorded. In other words, the control flow itself is not -# captured. To convert modules and functions containing data-dependent -# control flow, a **scripting** mechanism is provided. The -# ``torch.jit.script`` function/decorator takes a module or function and -# does not requires example inputs. 
Scripting then explicitly converts -# the module or function code to TorchScript, including all control flows. -# One caveat with using scripting is that it only supports a subset of -# Python, so you might need to rewrite the code to make it compatible -# with the TorchScript syntax. -# -# For all details relating to the supported features, see the `TorchScript -# language reference `__. -# To provide the maximum flexibility, you can also mix tracing and scripting -# modes together to represent your whole program, and these techniques can -# be applied incrementally. -# -# .. figure:: /_static/img/chatbot/pytorch_workflow.png -# :align: center -# :alt: workflow -# - - - -###################################################################### -# Acknowledgments -# ---------------- -# -# This tutorial was inspired by the following sources: -# -# 1) Yuan-Kuei Wu's pytorch-chatbot implementation: -# https://github.com/ywk991112/pytorch-chatbot -# -# 2) Sean Robertson's practical-pytorch seq2seq-translation example: -# https://github.com/spro/practical-pytorch/tree/master/seq2seq-translation -# -# 3) FloydHub's Cornell Movie Corpus preprocessing code: -# https://github.com/floydhub/textutil-preprocess-cornell-movie-corpus -# - - -###################################################################### -# Prepare Environment -# ------------------- -# -# First, we will import the required modules and set some constants. If -# you are planning on using your own model, be sure that the -# ``MAX_LENGTH`` constant is set correctly. As a reminder, this constant -# defines the maximum allowed sentence length during training and the -# maximum length output that the model is capable of producing. -# - -import torch -import torch.nn as nn -import torch.nn.functional as F -import re -import os -import unicodedata -import numpy as np - -device = torch.device("cpu") - - -MAX_LENGTH = 10 # Maximum sentence length - -# Default word tokens -PAD_token = 0 # Used for padding short sentences -SOS_token = 1 # Start-of-sentence token -EOS_token = 2 # End-of-sentence token - - -###################################################################### -# Model Overview -# -------------- -# -# As mentioned, the model that we are using is a -# `sequence-to-sequence `__ (seq2seq) -# model. This type of model is used in cases when our input is a -# variable-length sequence, and our output is also a variable length -# sequence that is not necessarily a one-to-one mapping of the input. A -# seq2seq model is comprised of two recurrent neural networks (RNNs) that -# work cooperatively: an **encoder** and a **decoder**. -# -# .. figure:: /_static/img/chatbot/seq2seq_ts.png -# :align: center -# :alt: model -# -# -# Image source: -# https://jeddy92.github.io/JEddy92.github.io/ts_seq2seq_intro/ -# -# Encoder -# ~~~~~~~ -# -# The encoder RNN iterates through the input sentence one token -# (e.g. word) at a time, at each time step outputting an “output” vector -# and a “hidden state” vector. The hidden state vector is then passed to -# the next time step, while the output vector is recorded. The encoder -# transforms the context it saw at each point in the sequence into a set -# of points in a high-dimensional space, which the decoder will use to -# generate a meaningful output for the given task. -# -# Decoder -# ~~~~~~~ -# -# The decoder RNN generates the response sentence in a token-by-token -# fashion. It uses the encoder’s context vectors, and internal hidden -# states to generate the next word in the sequence. 
It continues -# generating words until it outputs an *EOS_token*, representing the end -# of the sentence. We use an `attention -# mechanism `__ in our decoder to help it -# to “pay attention” to certain parts of the input when generating the -# output. For our model, we implement `Luong et -# al. `__\ ’s “Global attention” module, -# and use it as a submodule in our decode model. -# - - -###################################################################### -# Data Handling -# ------------- -# -# Although our models conceptually deal with sequences of tokens, in -# reality, they deal with numbers like all machine learning models do. In -# this case, every word in the model’s vocabulary, which was established -# before training, is mapped to an integer index. We use a ``Voc`` object -# to contain the mappings from word to index, as well as the total number -# of words in the vocabulary. We will load the object later before we run -# the model. -# -# Also, in order for us to be able to run evaluations, we must provide a -# tool for processing our string inputs. The ``normalizeString`` function -# converts all characters in a string to lowercase and removes all -# non-letter characters. The ``indexesFromSentence`` function takes a -# sentence of words and returns the corresponding sequence of word -# indexes. -# - -class Voc: - def __init__(self, name): - self.name = name - self.trimmed = False - self.word2index = {} - self.word2count = {} - self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"} - self.num_words = 3 # Count SOS, EOS, PAD - - def addSentence(self, sentence): - for word in sentence.split(' '): - self.addWord(word) - - def addWord(self, word): - if word not in self.word2index: - self.word2index[word] = self.num_words - self.word2count[word] = 1 - self.index2word[self.num_words] = word - self.num_words += 1 - else: - self.word2count[word] += 1 - - # Remove words below a certain count threshold - def trim(self, min_count): - if self.trimmed: - return - self.trimmed = True - keep_words = [] - for k, v in self.word2count.items(): - if v >= min_count: - keep_words.append(k) - - print('keep_words {} / {} = {:.4f}'.format( - len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index) - )) - # Reinitialize dictionaries - self.word2index = {} - self.word2count = {} - self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"} - self.num_words = 3 # Count default tokens - for word in keep_words: - self.addWord(word) - - -# Lowercase and remove non-letter characters -def normalizeString(s): - s = s.lower() - s = re.sub(r"([.!?])", r" \1", s) - s = re.sub(r"[^a-zA-Z.!?]+", r" ", s) - return s - - -# Takes string sentence, returns sentence of word indexes -def indexesFromSentence(voc, sentence): - return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token] - - -###################################################################### -# Define Encoder -# -------------- -# -# We implement our encoder’s RNN with the ``torch.nn.GRU`` module which we -# feed a batch of sentences (vectors of word embeddings) and it internally -# iterates through the sentences one token at a time calculating the -# hidden states. We initialize this module to be bidirectional, meaning -# that we have two independent GRUs: one that iterates through the -# sequences in chronological order, and another that iterates in reverse -# order. We ultimately return the sum of these two GRUs’ outputs. 
Since -# our model was trained using batching, our ``EncoderRNN`` model’s -# ``forward`` function expects a padded input batch. To batch -# variable-length sentences, we allow a maximum of *MAX_LENGTH* tokens in -# a sentence, and all sentences in the batch that have less than -# *MAX_LENGTH* tokens are padded at the end with our dedicated *PAD_token* -# tokens. To use padded batches with a PyTorch RNN module, we must wrap -# the forward pass call with ``torch.nn.utils.rnn.pack_padded_sequence`` -# and ``torch.nn.utils.rnn.pad_packed_sequence`` data transformations. -# Note that the ``forward`` function also takes an ``input_lengths`` list, -# which contains the length of each sentence in the batch. This input is -# used by the ``torch.nn.utils.rnn.pack_padded_sequence`` function when -# padding. -# -# TorchScript Notes: -# ~~~~~~~~~~~~~~~~~~~~~~ -# -# Since the encoder’s ``forward`` function does not contain any -# data-dependent control flow, we will use **tracing** to convert it to -# script mode. When tracing a module, we can leave the module definition -# as-is. We will initialize all models towards the end of this document -# before we run evaluations. -# - -class EncoderRNN(nn.Module): - def __init__(self, hidden_size, embedding, n_layers=1, dropout=0): - super(EncoderRNN, self).__init__() - self.n_layers = n_layers - self.hidden_size = hidden_size - self.embedding = embedding - - # Initialize GRU; the ``input_size`` and ``hidden_size`` parameters are both set to 'hidden_size' - # because our input size is a word embedding with number of features == hidden_size - self.gru = nn.GRU(hidden_size, hidden_size, n_layers, - dropout=(0 if n_layers == 1 else dropout), bidirectional=True) - - def forward(self, input_seq, input_lengths, hidden=None): - # type: (Tensor, Tensor, Optional[Tensor]) -> Tuple[Tensor, Tensor] - # Convert word indexes to embeddings - embedded = self.embedding(input_seq) - # Pack padded batch of sequences for RNN module - packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths) - # Forward pass through GRU - outputs, hidden = self.gru(packed, hidden) - # Unpack padding - outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs) - # Sum bidirectional GRU outputs - outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:] - # Return output and final hidden state - return outputs, hidden - - -###################################################################### -# Define Decoder’s Attention Module -# --------------------------------- -# -# Next, we’ll define our attention module (``Attn``). Note that this -# module will be used as a submodule in our decoder model. Luong et -# al. consider various “score functions”, which take the current decoder -# RNN output and the entire encoder output, and return attention -# “energies”. This attention energies tensor is the same size as the -# encoder output, and the two are ultimately multiplied, resulting in a -# weighted tensor whose largest values represent the most important parts -# of the query sentence at a particular time-step of decoding. 
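######################################################################
# As a quick orientation on shapes (this sketch is not part of the original
# tutorial), the decoder's current GRU output has shape ``(1, batch, hidden)``
# and the encoder outputs have shape ``(max_length, batch, hidden)``. The
# ``dot`` score broadcasts the two, sums over the hidden dimension, and a
# softmax over the sequence dimension yields attention weights of shape
# ``(batch, 1, max_length)``:
#

seq_len, batch, hidden = MAX_LENGTH, 1, 8                # illustrative sizes only
toy_rnn_output = torch.rand(1, batch, hidden)            # current decoder GRU output
toy_encoder_outputs = torch.rand(seq_len, batch, hidden)
toy_energies = torch.sum(toy_rnn_output * toy_encoder_outputs, dim=2)  # (seq_len, batch)
toy_weights = F.softmax(toy_energies.t(), dim=1).unsqueeze(1)          # (batch, 1, seq_len)
print(toy_weights.shape)         # torch.Size([1, 1, 10])
print(toy_weights.sum().item())  # sums to 1.0 -- a probability distribution over the input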
-# - -# Luong attention layer -class Attn(nn.Module): - def __init__(self, method, hidden_size): - super(Attn, self).__init__() - self.method = method - if self.method not in ['dot', 'general', 'concat']: - raise ValueError(self.method, "is not an appropriate attention method.") - self.hidden_size = hidden_size - if self.method == 'general': - self.attn = nn.Linear(self.hidden_size, hidden_size) - elif self.method == 'concat': - self.attn = nn.Linear(self.hidden_size * 2, hidden_size) - self.v = nn.Parameter(torch.FloatTensor(hidden_size)) - - def dot_score(self, hidden, encoder_output): - return torch.sum(hidden * encoder_output, dim=2) - - def general_score(self, hidden, encoder_output): - energy = self.attn(encoder_output) - return torch.sum(hidden * energy, dim=2) - - def concat_score(self, hidden, encoder_output): - energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh() - return torch.sum(self.v * energy, dim=2) - - def forward(self, hidden, encoder_outputs): - # Calculate the attention weights (energies) based on the given method - if self.method == 'general': - attn_energies = self.general_score(hidden, encoder_outputs) - elif self.method == 'concat': - attn_energies = self.concat_score(hidden, encoder_outputs) - elif self.method == 'dot': - attn_energies = self.dot_score(hidden, encoder_outputs) - - # Transpose max_length and batch_size dimensions - attn_energies = attn_energies.t() - - # Return the softmax normalized probability scores (with added dimension) - return F.softmax(attn_energies, dim=1).unsqueeze(1) - - -###################################################################### -# Define Decoder -# -------------- -# -# Similarly to the ``EncoderRNN``, we use the ``torch.nn.GRU`` module for -# our decoder’s RNN. This time, however, we use a unidirectional GRU. It -# is important to note that unlike the encoder, we will feed the decoder -# RNN one word at a time. We start by getting the embedding of the current -# word and applying a -# `dropout `__. -# Next, we forward the embedding and the last hidden state to the GRU and -# obtain a current GRU output and hidden state. We then use our ``Attn`` -# module as a layer to obtain the attention weights, which we multiply by -# the encoder’s output to obtain our attended encoder output. We use this -# attended encoder output as our ``context`` tensor, which represents a -# weighted sum indicating what parts of the encoder’s output to pay -# attention to. From here, we use a linear layer and softmax normalization -# to select the next word in the output sequence. - -# TorchScript Notes: -# ~~~~~~~~~~~~~~~~~~~~~~ -# -# Similarly to the ``EncoderRNN``, this module does not contain any -# data-dependent control flow. Therefore, we can once again use -# **tracing** to convert this model to TorchScript after it -# is initialized and its parameters are loaded. 
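######################################################################
# To make the trace-versus-script distinction concrete, here is a minimal,
# self-contained illustration (it is not part of the chatbot model): tracing
# records only the branch taken for the example input, while scripting
# compiles the control flow itself.
#

class FlipIfNegative(nn.Module):
    def forward(self, x):
        if x.sum() > 0:   # data-dependent control flow
            return x
        return -x

toy_module = FlipIfNegative()
toy_traced = torch.jit.trace(toy_module, torch.ones(3))   # emits a TracerWarning
toy_scripted = torch.jit.script(toy_module)
print(toy_traced(-torch.ones(3)))    # replays the traced branch: tensor([-1., -1., -1.])
print(toy_scripted(-torch.ones(3)))  # re-evaluates the branch:   tensor([1., 1., 1.])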
-# - -class LuongAttnDecoderRNN(nn.Module): - def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1): - super(LuongAttnDecoderRNN, self).__init__() - - # Keep for reference - self.attn_model = attn_model - self.hidden_size = hidden_size - self.output_size = output_size - self.n_layers = n_layers - self.dropout = dropout - - # Define layers - self.embedding = embedding - self.embedding_dropout = nn.Dropout(dropout) - self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout)) - self.concat = nn.Linear(hidden_size * 2, hidden_size) - self.out = nn.Linear(hidden_size, output_size) - - self.attn = Attn(attn_model, hidden_size) - - def forward(self, input_step, last_hidden, encoder_outputs): - # Note: we run this one step (word) at a time - # Get embedding of current input word - embedded = self.embedding(input_step) - embedded = self.embedding_dropout(embedded) - # Forward through unidirectional GRU - rnn_output, hidden = self.gru(embedded, last_hidden) - # Calculate attention weights from the current GRU output - attn_weights = self.attn(rnn_output, encoder_outputs) - # Multiply attention weights to encoder outputs to get new "weighted sum" context vector - context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) - # Concatenate weighted context vector and GRU output using Luong eq. 5 - rnn_output = rnn_output.squeeze(0) - context = context.squeeze(1) - concat_input = torch.cat((rnn_output, context), 1) - concat_output = torch.tanh(self.concat(concat_input)) - # Predict next word using Luong eq. 6 - output = self.out(concat_output) - output = F.softmax(output, dim=1) - # Return output and final hidden state - return output, hidden - - -###################################################################### -# Define Evaluation -# ----------------- -# -# Greedy Search Decoder -# ~~~~~~~~~~~~~~~~~~~~~ -# -# As in the chatbot tutorial, we use a ``GreedySearchDecoder`` module to -# facilitate the actual decoding process. This module has the trained -# encoder and decoder models as attributes, and drives the process of -# encoding an input sentence (a vector of word indexes), and iteratively -# decoding an output response sequence one word (word index) at a time. -# -# Encoding the input sequence is straightforward: simply forward the -# entire sequence tensor and its corresponding lengths vector to the -# ``encoder``. It is important to note that this module only deals with -# one input sequence at a time, **NOT** batches of sequences. Therefore, -# when the constant **1** is used for declaring tensor sizes, this -# corresponds to a batch size of 1. To decode a given decoder output, we -# must iteratively run forward passes through our decoder model, which -# outputs softmax scores corresponding to the probability of each word -# being the correct next word in the decoded sequence. We initialize the -# ``decoder_input`` to a tensor containing an *SOS_token*. After each pass -# through the ``decoder``, we *greedily* append the word with the highest -# softmax probability to the ``decoded_words`` list. We also use this word -# as the ``decoder_input`` for the next iteration. The decoding process -# terminates either if the ``decoded_words`` list has reached a length of -# *MAX_LENGTH* or if the predicted word is the *EOS_token*. 
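######################################################################
# The core of each greedy step is the ``torch.max`` call: taken over the
# vocabulary dimension, it returns both the best softmax score and the index
# of the chosen word, which then becomes the next ``decoder_input``. A tiny
# sketch on a made-up distribution (illustrative only):
#

toy_decoder_output = F.softmax(torch.rand(1, 7), dim=1)   # batch of 1, vocabulary of 7
best_score, best_token = torch.max(toy_decoder_output, dim=1)
print(best_token.item(), best_score.item())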
-# -# TorchScript Notes: -# ~~~~~~~~~~~~~~~~~~~~~~ -# -# The ``forward`` method of this module involves iterating over the range -# of :math:`[0, max\_length)` when decoding an output sequence one word at -# a time. Because of this, we should use **scripting** to convert this -# module to TorchScript. Unlike with our encoder and decoder models, -# which we can trace, we must make some necessary changes to the -# ``GreedySearchDecoder`` module in order to initialize an object without -# error. In other words, we must ensure that our module adheres to the -# rules of the TorchScript mechanism, and does not utilize any language -# features outside of the subset of Python that TorchScript includes. -# -# To get an idea of some manipulations that may be required, we will go -# over the diffs between the ``GreedySearchDecoder`` implementation from -# the chatbot tutorial and the implementation that we use in the cell -# below. Note that the lines highlighted in red are lines removed from the -# original implementation and the lines highlighted in green are new. -# -# .. figure:: /_static/img/chatbot/diff.png -# :align: center -# :alt: diff -# -# Changes: -# ^^^^^^^^ -# -# - Added ``decoder_n_layers`` to the constructor arguments -# -# - This change stems from the fact that the encoder and decoder -# models that we pass to this module will be a child of -# ``TracedModule`` (not ``Module``). Therefore, we cannot access the -# decoder’s number of layers with ``decoder.n_layers``. Instead, we -# plan for this, and pass this value in during module construction. -# -# -# - Store away new attributes as constants -# -# - In the original implementation, we were free to use variables from -# the surrounding (global) scope in our ``GreedySearchDecoder``\ ’s -# ``forward`` method. However, now that we are using scripting, we -# do not have this freedom, as the assumption with scripting is that -# we cannot necessarily hold on to Python objects, especially when -# exporting. An easy solution to this is to store these values from -# the global scope as attributes to the module in the constructor, -# and add them to a special list called ``__constants__`` so that -# they can be used as literal values when constructing the graph in -# the ``forward`` method. An example of this usage is on NEW line -# 19, where instead of using the ``device`` and ``SOS_token`` global -# values, we use our constant attributes ``self._device`` and -# ``self._SOS_token``. -# -# -# - Enforce types of ``forward`` method arguments -# -# - By default, all parameters to a TorchScript function are assumed -# to be Tensor. If we need to pass an argument of a different type, -# we can use function type annotations as introduced in `PEP -# 3107 `__. In addition, -# it is possible to declare arguments of different types using -# Mypy-style type annotations (see -# `doc `__). -# -# -# - Change initialization of ``decoder_input`` -# -# - In the original implementation, we initialized our -# ``decoder_input`` tensor with ``torch.LongTensor([[SOS_token]])``. -# When scripting, we are not allowed to initialize tensors in a -# literal fashion like this. Instead, we can initialize our tensor -# with an explicit torch function such as ``torch.ones``. In this -# case, we can easily replicate the scalar ``decoder_input`` tensor -# by multiplying 1 by our SOS_token value stored in the constant -# ``self._SOS_token``. 
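######################################################################
# The last change above is easy to verify in eager mode: the script-friendly
# initialization produces exactly the same tensor as the original literal
# form (a quick check, not part of the original tutorial):
#

literal_style = torch.LongTensor([[SOS_token]]).to(device)
script_style = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
print(torch.equal(literal_style, script_style))  # True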
-# - -class GreedySearchDecoder(nn.Module): - def __init__(self, encoder, decoder, decoder_n_layers): - super(GreedySearchDecoder, self).__init__() - self.encoder = encoder - self.decoder = decoder - self._device = device - self._SOS_token = SOS_token - self._decoder_n_layers = decoder_n_layers - - __constants__ = ['_device', '_SOS_token', '_decoder_n_layers'] - - def forward(self, input_seq : torch.Tensor, input_length : torch.Tensor, max_length : int): - # Forward input through encoder model - encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length) - # Prepare encoder's final hidden layer to be first hidden input to the decoder - decoder_hidden = encoder_hidden[:self._decoder_n_layers] - # Initialize decoder input with SOS_token - decoder_input = torch.ones(1, 1, device=self._device, dtype=torch.long) * self._SOS_token - # Initialize tensors to append decoded words to - all_tokens = torch.zeros([0], device=self._device, dtype=torch.long) - all_scores = torch.zeros([0], device=self._device) - # Iteratively decode one word token at a time - for _ in range(max_length): - # Forward pass through decoder - decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs) - # Obtain most likely word token and its softmax score - decoder_scores, decoder_input = torch.max(decoder_output, dim=1) - # Record token and score - all_tokens = torch.cat((all_tokens, decoder_input), dim=0) - all_scores = torch.cat((all_scores, decoder_scores), dim=0) - # Prepare current token to be next decoder input (add a dimension) - decoder_input = torch.unsqueeze(decoder_input, 0) - # Return collections of word tokens and scores - return all_tokens, all_scores - - - -###################################################################### -# Evaluating an Input -# ~~~~~~~~~~~~~~~~~~~ -# -# Next, we define some functions for evaluating an input. The ``evaluate`` -# function takes a normalized string sentence, processes it to a tensor of -# its corresponding word indexes (with batch size of 1), and passes this -# tensor to a ``GreedySearchDecoder`` instance called ``searcher`` to -# handle the encoding/decoding process. The searcher returns the output -# word index vector and a scores tensor corresponding to the softmax -# scores for each decoded word token. The final step is to convert each -# word index back to its string representation using ``voc.index2word``. -# -# We also define two functions for evaluating an input sentence. The -# ``evaluateInput`` function prompts a user for an input, and evaluates -# it. It will continue to ask for another input until the user enters ‘q’ -# or ‘quit’. -# -# The ``evaluateExample`` function simply takes a string input sentence as -# an argument, normalizes it, evaluates it, and prints the response. 
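######################################################################
# Before looking at ``evaluate``, here is what the preprocessing described
# above does to a raw sentence (a small sketch, not part of the original
# tutorial). ``indexesFromSentence`` would then map each remaining word to its
# vocabulary index and append the *EOS_token*, giving an input tensor of shape
# ``(sentence_length, 1)`` for a batch of one sentence.
#

print(normalizeString("Hello, how are you?"))   # hello how are you ?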
-# - -def evaluate(searcher, voc, sentence, max_length=MAX_LENGTH): - ### Format input sentence as a batch - # words -> indexes - indexes_batch = [indexesFromSentence(voc, sentence)] - # Create lengths tensor - lengths = torch.tensor([len(indexes) for indexes in indexes_batch]) - # Transpose dimensions of batch to match models' expectations - input_batch = torch.LongTensor(indexes_batch).transpose(0, 1) - # Use appropriate device - input_batch = input_batch.to(device) - lengths = lengths.to(device) - # Decode sentence with searcher - tokens, scores = searcher(input_batch, lengths, max_length) - # indexes -> words - decoded_words = [voc.index2word[token.item()] for token in tokens] - return decoded_words - - -# Evaluate inputs from user input (``stdin``) -def evaluateInput(searcher, voc): - input_sentence = '' - while(1): - try: - # Get input sentence - input_sentence = input('> ') - # Check if it is quit case - if input_sentence == 'q' or input_sentence == 'quit': break - # Normalize sentence - input_sentence = normalizeString(input_sentence) - # Evaluate sentence - output_words = evaluate(searcher, voc, input_sentence) - # Format and print response sentence - output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')] - print('Bot:', ' '.join(output_words)) - - except KeyError: - print("Error: Encountered unknown word.") - -# Normalize input sentence and call ``evaluate()`` -def evaluateExample(sentence, searcher, voc): - print("> " + sentence) - # Normalize sentence - input_sentence = normalizeString(sentence) - # Evaluate sentence - output_words = evaluate(searcher, voc, input_sentence) - output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')] - print('Bot:', ' '.join(output_words)) - - -###################################################################### -# Load Pretrained Parameters -# -------------------------- -# -# No, let's load our model! -# -# Use hosted model -# ~~~~~~~~~~~~~~~~ -# -# To load the hosted model: -# -# 1) Download the model `here `__. -# -# 2) Set the ``loadFilename`` variable to the path to the downloaded -# checkpoint file. -# -# 3) Leave the ``checkpoint = torch.load(loadFilename)`` line uncommented, -# as the hosted model was trained on CPU. -# -# Use your own model -# ~~~~~~~~~~~~~~~~~~ -# -# To load your own pretrained model: -# -# 1) Set the ``loadFilename`` variable to the path to the checkpoint file -# that you wish to load. Note that if you followed the convention for -# saving the model from the chatbot tutorial, this may involve changing -# the ``model_name``, ``encoder_n_layers``, ``decoder_n_layers``, -# ``hidden_size``, and ``checkpoint_iter`` (as these values are used in -# the model path). -# -# 2) If you trained the model on a CPU, make sure that you are opening the -# checkpoint with the ``checkpoint = torch.load(loadFilename)`` line. -# If you trained the model on a GPU and are running this tutorial on a -# CPU, uncomment the -# ``checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))`` -# line. -# -# TorchScript Notes: -# ~~~~~~~~~~~~~~~~~~~~~~ -# -# Notice that we initialize and load parameters into our encoder and -# decoder models as usual. If you are using tracing mode(``torch.jit.trace``) -# for some part of your models, you must call ``.to(device)`` to set the device -# options of the models and ``.eval()`` to set the dropout layers to test mode -# **before** tracing the models. `TracedModule` objects do not inherit the -# ``to`` or ``eval`` methods. 
Since in this tutorial we are only using -# scripting instead of tracing, we only need to do this before we do -# evaluation (which is the same as we normally do in eager mode). -# - -save_dir = os.path.join("data", "save") -corpus_name = "cornell movie-dialogs corpus" - -# Configure models -model_name = 'cb_model' -attn_model = 'dot' -#attn_model = 'general'`` -#attn_model = 'concat' -hidden_size = 500 -encoder_n_layers = 2 -decoder_n_layers = 2 -dropout = 0.1 -batch_size = 64 - -# If you're loading your own model -# Set checkpoint to load from -checkpoint_iter = 4000 - -############################################################# -# Sample code to load from a checkpoint: -# -# .. code-block:: python -# -# loadFilename = os.path.join(save_dir, model_name, corpus_name, -# '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size), -# '{}_checkpoint.tar'.format(checkpoint_iter)) - -# If you're loading the hosted model -loadFilename = 'data/4000_checkpoint.tar' - -# Load model -# Force CPU device options (to match tensors in this tutorial) -checkpoint = torch.load(loadFilename, map_location=torch.device('cpu')) -encoder_sd = checkpoint['en'] -decoder_sd = checkpoint['de'] -encoder_optimizer_sd = checkpoint['en_opt'] -decoder_optimizer_sd = checkpoint['de_opt'] -embedding_sd = checkpoint['embedding'] -voc = Voc(corpus_name) -voc.__dict__ = checkpoint['voc_dict'] - - -print('Building encoder and decoder ...') -# Initialize word embeddings -embedding = nn.Embedding(voc.num_words, hidden_size) -embedding.load_state_dict(embedding_sd) -# Initialize encoder & decoder models -encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout) -decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout) -# Load trained model parameters -encoder.load_state_dict(encoder_sd) -decoder.load_state_dict(decoder_sd) -# Use appropriate device -encoder = encoder.to(device) -decoder = decoder.to(device) -# Set dropout layers to ``eval`` mode -encoder.eval() -decoder.eval() -print('Models built and ready to go!') - - -###################################################################### -# Convert Model to TorchScript -# ----------------------------- -# -# Encoder -# ~~~~~~~ -# -# As previously mentioned, to convert the encoder model to TorchScript, -# we use **scripting**. The encoder model takes an input sequence and -# a corresponding lengths tensor. Therefore, we create an example input -# sequence tensor ``test_seq``, which is of appropriate size (MAX_LENGTH, -# 1), contains numbers in the appropriate range -# :math:`[0, voc.num\_words)`, and is of the appropriate type (int64). We -# also create a ``test_seq_length`` scalar which realistically contains -# the value corresponding to how many words are in the ``test_seq``. The -# next step is to use the ``torch.jit.trace`` function to trace the model. -# Notice that the first argument we pass is the module that we want to -# trace, and the second is a tuple of arguments to the module’s -# ``forward`` method. -# -# Decoder -# ~~~~~~~ -# -# We perform the same process for tracing the decoder as we did for the -# encoder. Notice that we call forward on a set of random inputs to the -# traced_encoder to get the output that we need for the decoder. This is -# not required, as we could also simply manufacture a tensor of the -# correct shape, type, and value range. 
This method is possible because in -# our case we do not have any constraints on the values of the tensors -# because we do not have any operations that could fault on out-of-range -# inputs. -# -# GreedySearchDecoder -# ~~~~~~~~~~~~~~~~~~~ -# -# Recall that we scripted our searcher module due to the presence of -# data-dependent control flow. In the case of scripting, we do necessary -# language changes to make sure the implementation complies with -# TorchScript. We initialize the scripted searcher the same way that we -# would initialize an unscripted variant. -# - -### Compile the whole greedy search model to TorchScript model -# Create artificial inputs -test_seq = torch.LongTensor(MAX_LENGTH, 1).random_(0, voc.num_words).to(device) -test_seq_length = torch.LongTensor([test_seq.size()[0]]).to(device) -# Trace the model -traced_encoder = torch.jit.trace(encoder, (test_seq, test_seq_length)) - -### Convert decoder model -# Create and generate artificial inputs -test_encoder_outputs, test_encoder_hidden = traced_encoder(test_seq, test_seq_length) -test_decoder_hidden = test_encoder_hidden[:decoder.n_layers] -test_decoder_input = torch.LongTensor(1, 1).random_(0, voc.num_words) -# Trace the model -traced_decoder = torch.jit.trace(decoder, (test_decoder_input, test_decoder_hidden, test_encoder_outputs)) - -### Initialize searcher module by wrapping ``torch.jit.script`` call -scripted_searcher = torch.jit.script(GreedySearchDecoder(traced_encoder, traced_decoder, decoder.n_layers)) - - - - -###################################################################### -# Print Graphs -# ------------ -# -# Now that our models are in TorchScript form, we can print the graphs of -# each to ensure that we captured the computational graph appropriately. -# Since TorchScript allow us to recursively compile the whole model -# hierarchy and inline the ``encoder`` and ``decoder`` graph into a single -# graph, we just need to print the `scripted_searcher` graph - -print('scripted_searcher graph:\n', scripted_searcher.graph) - - -###################################################################### -# Run Evaluation -# -------------- -# -# Finally, we will run evaluation of the chatbot model using the TorchScript -# models. If converted correctly, the models will behave exactly as they -# would in their eager-mode representation. -# -# By default, we evaluate a few common query sentences. If you want to -# chat with the bot yourself, uncomment the ``evaluateInput`` line and -# give it a spin. -# - - -# Use appropriate device -scripted_searcher.to(device) -# Set dropout layers to ``eval`` mode -scripted_searcher.eval() - -# Evaluate examples -sentences = ["hello", "what's up?", "who are you?", "where am I?", "where are you from?"] -for s in sentences: - evaluateExample(s, scripted_searcher, voc) - -# Evaluate your input by running -# ``evaluateInput(traced_encoder, traced_decoder, scripted_searcher, voc)`` - - -###################################################################### -# Save Model -# ---------- -# -# Now that we have successfully converted our model to TorchScript, we -# will serialize it for use in a non-Python deployment environment. To do -# this, we can simply save our ``scripted_searcher`` module, as this is -# the user-facing interface for running inference against the chatbot -# model. When saving a Script module, use script_module.save(PATH) instead -# of torch.save(model, PATH). 
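######################################################################
# Once saved (see the ``save`` call just below), the serialized module can be
# reloaded without any of the Python class definitions in this file. As a
# sketch of what a separate inference script might do (the file name matches
# the one used below; the call arguments are placeholders for an input batch,
# its lengths, and the maximum output length):
#
# .. code-block:: python
#
#    loaded_searcher = torch.jit.load("scripted_chatbot.pth")
#    loaded_searcher.eval()
#    # ``loaded_searcher`` is invoked exactly like ``scripted_searcher`` above:
#    # tokens, scores = loaded_searcher(input_batch, lengths, MAX_LENGTH)
#
# The same file can also be loaded from C++ with ``torch::jit::load``.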
-# - -scripted_searcher.save("scripted_chatbot.pth") diff --git a/beginner_source/dist_overview.rst b/beginner_source/dist_overview.rst index 502961e20c3..9088434bf2f 100644 --- a/beginner_source/dist_overview.rst +++ b/beginner_source/dist_overview.rst @@ -1,6 +1,6 @@ PyTorch Distributed Overview ============================ -**Author**: `Will Constable `_ +**Author**: `Will Constable `_, `Wei Feng `_ .. note:: |edit| View and edit this tutorial in `github `__. @@ -26,7 +26,7 @@ Parallelism APIs These Parallelism Modules offer high-level functionality and compose with existing models: - `Distributed Data-Parallel (DDP) `__ -- `Fully Sharded Data-Parallel Training (FSDP) `__ +- `Fully Sharded Data-Parallel Training (FSDP2) `__ - `Tensor Parallel (TP) `__ - `Pipeline Parallel (PP) `__ @@ -35,7 +35,7 @@ Sharding primitives ``DTensor`` and ``DeviceMesh`` are primitives used to build parallelism in terms of sharded or replicated tensors on N-dimensional process groups. -- `DTensor `__ represents a tensor that is sharded and/or replicated, and communicates automatically to reshard tensors as needed by operations. +- `DTensor `__ represents a tensor that is sharded and/or replicated, and communicates automatically to reshard tensors as needed by operations. - `DeviceMesh `__ abstracts the accelerator device communicators into a multi-dimensional array, which manages the underlying ``ProcessGroup`` instances for collective communications in multi-dimensional parallelisms. Try out our `Device Mesh Recipe `__ to learn more. Communications APIs @@ -70,15 +70,15 @@ When deciding what parallelism techniques to choose for your model, use these co #. Use `DistributedDataParallel (DDP) `__, if your model fits in a single GPU but you want to easily scale up training using multiple GPUs. - * Use `torchrun `__, to launch multiple pytorch processes if you are you using more than one node. + * Use `torchrun `__, to launch multiple pytorch processes if you are using more than one node. * See also: `Getting Started with Distributed Data Parallel <../intermediate/ddp_tutorial.html>`__ -#. Use `FullyShardedDataParallel (FSDP) `__ when your model cannot fit on one GPU. +#. Use `FullyShardedDataParallel (FSDP2) `__ when your model cannot fit on one GPU. - * See also: `Getting Started with FSDP `__ + * See also: `Getting Started with FSDP2 `__ -#. Use `Tensor Parallel (TP) `__ and/or `Pipeline Parallel (PP) `__ if you reach scaling limitations with FSDP. +#. Use `Tensor Parallel (TP) `__ and/or `Pipeline Parallel (PP) `__ if you reach scaling limitations with FSDP2. * Try our `Tensor Parallelism Tutorial `__ diff --git a/beginner_source/examples_autograd/polynomial_autograd.py b/beginner_source/examples_autograd/polynomial_autograd.py index 9c992d2ca4d..d33ca8bcb90 100755 --- a/beginner_source/examples_autograd/polynomial_autograd.py +++ b/beginner_source/examples_autograd/polynomial_autograd.py @@ -1,5 +1,4 @@ -# -*- coding: utf-8 -*- -""" +r""" PyTorch: Tensors and autograd ------------------------------- @@ -17,15 +16,19 @@ import torch import math +# We want to be able to train our model on an `accelerator `__ +# such as CUDA, MPS, MTIA, or XPU. If the current accelerator is available, we will use it. Otherwise, we use the CPU. 
+ dtype = torch.float -device = "cuda" if torch.cuda.is_available() else "cpu" +device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu" +print(f"Using {device} device") torch.set_default_device(device) # Create Tensors to hold input and outputs. # By default, requires_grad=False, which indicates that we do not need to # compute gradients with respect to these Tensors during the backward pass. -x = torch.linspace(-math.pi, math.pi, 2000, dtype=dtype) -y = torch.sin(x) +x = torch.linspace(-1, 1, 2000, dtype=dtype) +y = torch.exp(x) # A Taylor expansion would be 1 + x + (1/2) x**2 + (1/3!) x**3 + ... # Create random Tensors for weights. For a third order polynomial, we need # 4 weights: y = a + b x + c x^2 + d x^3 @@ -36,8 +39,9 @@ c = torch.randn((), dtype=dtype, requires_grad=True) d = torch.randn((), dtype=dtype, requires_grad=True) -learning_rate = 1e-6 -for t in range(2000): +initial_loss = 1. +learning_rate = 1e-5 +for t in range(5000): # Forward pass: compute predicted y using operations on Tensors. y_pred = a + b * x + c * x ** 2 + d * x ** 3 @@ -45,8 +49,13 @@ # Now loss is a Tensor of shape (1,) # loss.item() gets the scalar value held in the loss. loss = (y_pred - y).pow(2).sum() + + # Calculare initial loss, so we can report loss relative to it + if t==0: + initial_loss=loss.item() + if t % 100 == 99: - print(t, loss.item()) + print(f'Iteration t = {t:4d} loss(t)/loss(0) = {round(loss.item()/initial_loss, 6):10.6f} a = {a.item():10.6f} b = {b.item():10.6f} c = {c.item():10.6f} d = {d.item():10.6f}') # Use autograd to compute the backward pass. This call will compute the # gradient of loss with respect to all Tensors with requires_grad=True. diff --git a/beginner_source/examples_autograd/polynomial_custom_function.py b/beginner_source/examples_autograd/polynomial_custom_function.py index 130775ea985..39057c8fd7a 100755 --- a/beginner_source/examples_autograd/polynomial_custom_function.py +++ b/beginner_source/examples_autograd/polynomial_custom_function.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """ PyTorch: Defining New autograd Functions ---------------------------------------- @@ -34,8 +33,11 @@ def forward(ctx, input): """ In the forward pass we receive a Tensor containing the input and return a Tensor containing the output. ctx is a context object that can be used - to stash information for backward computation. You can cache arbitrary - objects for use in the backward pass using the ctx.save_for_backward method. + to stash information for backward computation. You can cache tensors for + use in the backward pass using the ``ctx.save_for_backward`` method. Other + objects can be stored directly as attributes on the ctx object, such as + ``ctx.my_object = my_object``. Check out `Extending torch.autograd `_ + for further details. """ ctx.save_for_backward(input) return 0.5 * (5 * input ** 3 - 3 * input) diff --git a/beginner_source/examples_nn/polynomial_nn.py b/beginner_source/examples_nn/polynomial_nn.py index ad6ba22f193..70e281ed365 100755 --- a/beginner_source/examples_nn/polynomial_nn.py +++ b/beginner_source/examples_nn/polynomial_nn.py @@ -4,7 +4,7 @@ ----------- A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi` -to :math:`pi` by minimizing squared Euclidean distance. +to :math:`\pi` by minimizing squared Euclidean distance. This implementation uses the nn package from PyTorch to build the network. 
PyTorch autograd makes it easy to define computational graphs and take gradients, diff --git a/beginner_source/examples_nn/polynomial_optim.py b/beginner_source/examples_nn/polynomial_optim.py index 434fb6624b3..c0d4896c8f2 100755 --- a/beginner_source/examples_nn/polynomial_optim.py +++ b/beginner_source/examples_nn/polynomial_optim.py @@ -4,7 +4,7 @@ -------------- A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi` -to :math:`pi` by minimizing squared Euclidean distance. +to :math:`\pi` by minimizing squared Euclidean distance. This implementation uses the nn package from PyTorch to build the network. diff --git a/beginner_source/examples_tensor/polynomial_numpy.py b/beginner_source/examples_tensor/polynomial_numpy.py index a1a378e50ed..059ec286ee4 100755 --- a/beginner_source/examples_tensor/polynomial_numpy.py +++ b/beginner_source/examples_tensor/polynomial_numpy.py @@ -4,7 +4,7 @@ -------------- A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi` -to :math:`pi` by minimizing squared Euclidean distance. +to :math:`\pi` by minimizing squared Euclidean distance. This implementation uses numpy to manually compute the forward pass, loss, and backward pass. diff --git a/beginner_source/examples_tensor/polynomial_tensor.py b/beginner_source/examples_tensor/polynomial_tensor.py index 1e35b0f24bd..260cf8d2849 100755 --- a/beginner_source/examples_tensor/polynomial_tensor.py +++ b/beginner_source/examples_tensor/polynomial_tensor.py @@ -4,7 +4,7 @@ ---------------- A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi` -to :math:`pi` by minimizing squared Euclidean distance. +to :math:`\pi` by minimizing squared Euclidean distance. This implementation uses PyTorch tensors to manually compute the forward pass, loss, and backward pass. diff --git a/beginner_source/fgsm_tutorial.py b/beginner_source/fgsm_tutorial.py index 007ad3fd956..a991fe85627 100644 --- a/beginner_source/fgsm_tutorial.py +++ b/beginner_source/fgsm_tutorial.py @@ -125,14 +125,9 @@ # `pytorch/examples/mnist `__. # For simplicity, download the pretrained model `here `__. # -# - ``use_cuda`` - boolean flag to use CUDA if desired and available. -# Note, a GPU with CUDA is not critical for this tutorial as a CPU will -# not take much time. -# epsilons = [0, .05, .1, .15, .2, .25, .3] pretrained_model = "data/lenet_mnist_model.pth" -use_cuda=True # Set random seed for reproducibility torch.manual_seed(42) @@ -184,15 +179,16 @@ def forward(self, x): ])), batch_size=1, shuffle=True) -# Define what device we are using -print("CUDA Available: ",torch.cuda.is_available()) -device = torch.device("cuda" if use_cuda and torch.cuda.is_available() else "cpu") +# We want to be able to train our model on an `accelerator `__ +# such as CUDA, MPS, MTIA, or XPU. If the current accelerator is available, we will use it. Otherwise, we use the CPU. +device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu" +print(f"Using {device} device") # Initialize the network model = Net().to(device) # Load the pretrained model -model.load_state_dict(torch.load(pretrained_model, map_location=device)) +model.load_state_dict(torch.load(pretrained_model, map_location=device, weights_only=True)) # Set the model in evaluation mode. 
In this case this is for the Dropout layers model.eval() diff --git a/beginner_source/flava_finetuning_tutorial.py b/beginner_source/flava_finetuning_tutorial.py deleted file mode 100644 index 12e20f475f8..00000000000 --- a/beginner_source/flava_finetuning_tutorial.py +++ /dev/null @@ -1,190 +0,0 @@ -# -*- coding: utf-8 -*- -""" -TorchMultimodal Tutorial: Finetuning FLAVA -============================================ -""" - -###################################################################### -# Multimodal AI has recently become very popular owing to its ubiquitous -# nature, from use cases like image captioning and visual search to more -# recent applications like image generation from text. **TorchMultimodal -# is a library powered by Pytorch consisting of building blocks and end to -# end examples, aiming to enable and accelerate research in -# multimodality**. -# -# In this tutorial, we will demonstrate how to use a **pretrained SoTA -# model called** `FLAVA `__ **from -# TorchMultimodal library to finetune on a multimodal task i.e. visual -# question answering** (VQA). The model consists of two unimodal transformer -# based encoders for text and image and a multimodal encoder to combine -# the two embeddings. It is pretrained using contrastive, image text matching and -# text, image and multimodal masking losses. - - -###################################################################### -# Installation -# ----------------- -# We will use TextVQA dataset and ``bert tokenizer`` from Hugging Face for this -# tutorial. So you need to install datasets and transformers in addition to TorchMultimodal. -# -# .. note:: -# -# When running this tutorial in Google Colab, install the required packages by -# creating a new cell and running the following commands: -# -# .. code-block:: -# -# !pip install torchmultimodal-nightly -# !pip install datasets -# !pip install transformers -# - -###################################################################### -# Steps -# ----- -# -# 1. Download the Hugging Face dataset to a directory on your computer by running the following command: -# -# .. code-block:: -# -# wget http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz -# tar xf vocab.tar.gz -# -# .. note:: -# If you are running this tutorial in Google Colab, run these commands -# in a new cell and prepend these commands with an exclamation mark (!) -# -# -# 2. For this tutorial, we treat VQA as a classification task where -# the inputs are images and question (text) and the output is an answer class. -# So we need to download the vocab file with answer classes and create the answer to -# label mapping. -# -# We also load the `textvqa -# dataset `__ containing 34602 training samples -# (images,questions and answers) from Hugging Face -# -# We see there are 3997 answer classes including a class representing -# unknown answers. 
-# - -with open("data/vocabs/answers_textvqa_more_than_1.txt") as f: - vocab = f.readlines() - -answer_to_idx = {} -for idx, entry in enumerate(vocab): - answer_to_idx[entry.strip("\n")] = idx -print(len(vocab)) -print(vocab[:5]) - -from datasets import load_dataset -dataset = load_dataset("textvqa") - -###################################################################### -# Lets display a sample entry from the dataset: -# - -import matplotlib.pyplot as plt -import numpy as np -idx = 5 -print("Question: ", dataset["train"][idx]["question"]) -print("Answers: " ,dataset["train"][idx]["answers"]) -im = np.asarray(dataset["train"][idx]["image"].resize((500,500))) -plt.imshow(im) -plt.show() - - -###################################################################### -# 3. Next, we write the transform function to convert the image and text into -# Tensors consumable by our model - For images, we use the transforms from -# torchvision to convert to Tensor and resize to uniform sizes - For text, -# we tokenize (and pad) them using the ``BertTokenizer`` from Hugging Face - -# For answers (i.e. labels), we take the most frequently occurring answer -# as the label to train with: -# - -import torch -from torchvision import transforms -from collections import defaultdict -from transformers import BertTokenizer -from functools import partial - -def transform(tokenizer, input): - batch = {} - image_transform = transforms.Compose([transforms.ToTensor(), transforms.Resize([224,224])]) - image = image_transform(input["image"][0].convert("RGB")) - batch["image"] = [image] - - tokenized=tokenizer(input["question"],return_tensors='pt',padding="max_length",max_length=512) - batch.update(tokenized) - - - ans_to_count = defaultdict(int) - for ans in input["answers"][0]: - ans_to_count[ans] += 1 - max_value = max(ans_to_count, key=ans_to_count.get) - ans_idx = answer_to_idx.get(max_value,0) - batch["answers"] = torch.as_tensor([ans_idx]) - return batch - -tokenizer=BertTokenizer.from_pretrained("bert-base-uncased",padding="max_length",max_length=512) -transform=partial(transform,tokenizer) -dataset.set_transform(transform) - - -###################################################################### -# 4. Finally, we import the ``flava_model_for_classification`` from -# ``torchmultimodal``. It loads the pretrained FLAVA checkpoint by default and -# includes a classification head. -# -# The model forward function passes the image through the visual encoder -# and the question through the text encoder. The image and question -# embeddings are then passed through the multimodal encoder. The final -# embedding corresponding to the CLS token is passed through a MLP head -# which finally gives the probability distribution over each possible -# answers. -# - -from torchmultimodal.models.flava.model import flava_model_for_classification -model = flava_model_for_classification(num_classes=len(vocab)) - - -###################################################################### -# 5. 
We put together the dataset and model in a toy training loop to -# demonstrate how to train the model for 3 iterations: -# - -from torch import nn -BATCH_SIZE = 2 -MAX_STEPS = 3 -from torch.utils.data import DataLoader - -train_dataloader = DataLoader(dataset["train"], batch_size= BATCH_SIZE) -optimizer = torch.optim.AdamW(model.parameters()) - - -epochs = 1 -for _ in range(epochs): - for idx, batch in enumerate(train_dataloader): - optimizer.zero_grad() - out = model(text = batch["input_ids"], image = batch["image"], labels = batch["answers"]) - loss = out.loss - loss.backward() - optimizer.step() - print(f"Loss at step {idx} = {loss}") - if idx >= MAX_STEPS-1: - break - - -###################################################################### -# Conclusion -# ------------------- -# -# This tutorial introduced the basics around how to finetune on a -# multimodal task using FLAVA from TorchMultimodal. Please also check out -# other examples from the library like -# `MDETR `__ -# which is a multimodal model for object detection and -# `Omnivore `__ -# which is multitask model spanning image, video and 3d classification. -# diff --git a/beginner_source/former_torchies/README.txt b/beginner_source/former_torchies/README.txt deleted file mode 100644 index 5bb8c93f00c..00000000000 --- a/beginner_source/former_torchies/README.txt +++ /dev/null @@ -1,18 +0,0 @@ - PyTorch for former Torch users - ------------------------------ - -1. tensor_tutorial_old.py - Tensors - https://pytorch.org/tutorials/beginner/former_torchies/tensor_tutorial_old.html - -2. autograd_tutorial_old.py - Autograd - https://pytorch.org/tutorials/beginner/former_torchies/autograd_tutorial_old.html - -3. nnft_tutorial.py - nn package - https://pytorch.org/tutorials/beginner/former_torchies/nnft_tutorial.html - -4. parallelism_tutorial.py - Multi-GPU examples - https://pytorch.org/tutorials/beginner/former_torchies/parallelism_tutorial.html diff --git a/beginner_source/former_torchies/autograd_tutorial_old.py b/beginner_source/former_torchies/autograd_tutorial_old.py deleted file mode 100644 index 4030831b8ef..00000000000 --- a/beginner_source/former_torchies/autograd_tutorial_old.py +++ /dev/null @@ -1,130 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Autograd -======== - -Autograd is now a core torch package for automatic differentiation. -It uses a tape based system for automatic differentiation. - -In the forward phase, the autograd tape will remember all the operations -it executed, and in the backward phase, it will replay the operations. - -Tensors that track history --------------------------- - -In autograd, if any input ``Tensor`` of an operation has ``requires_grad=True``, -the computation will be tracked. After computing the backward pass, a gradient -w.r.t. this tensor is accumulated into ``.grad`` attribute. - -There’s one more class which is very important for autograd -implementation - a ``Function``. ``Tensor`` and ``Function`` are -interconnected and build up an acyclic graph, that encodes a complete -history of computation. Each variable has a ``.grad_fn`` attribute that -references a function that has created a function (except for Tensors -created by the user - these have ``None`` as ``.grad_fn``). - -If you want to compute the derivatives, you can call ``.backward()`` on -a ``Tensor``. If ``Tensor`` is a scalar (i.e. 
it holds a one element -tensor), you don’t need to specify any arguments to ``backward()``, -however if it has more elements, you need to specify a ``grad_output`` -argument that is a tensor of matching shape. -""" - -import torch - -############################################################### -# Create a tensor and set requires_grad=True to track computation with it -x = torch.ones(2, 2, requires_grad=True) -print(x) - -############################################################### -# -print(x.data) - -############################################################### -# -print(x.grad) - -############################################################### -# - -print(x.grad_fn) # we've created x ourselves - -############################################################### -# Do an operation of x: - -y = x + 2 -print(y) - -############################################################### -# y was created as a result of an operation, -# so it has a grad_fn -print(y.grad_fn) - -############################################################### -# More operations on y: - -z = y * y * 3 -out = z.mean() - -print(z, out) - -################################################################ -# ``.requires_grad_( ... )`` changes an existing Tensor's ``requires_grad`` -# flag in-place. The input flag defaults to ``True`` if not given. -a = torch.randn(2, 2) -a = ((a * 3) / (a - 1)) -print(a.requires_grad) -a.requires_grad_(True) -print(a.requires_grad) -b = (a * a).sum() -print(b.grad_fn) - -############################################################### -# Gradients -# --------- -# -# let's backprop now and print gradients d(out)/dx - -out.backward() -print(x.grad) - - -############################################################### -# By default, gradient computation flushes all the internal buffers -# contained in the graph, so if you even want to do the backward on some -# part of the graph twice, you need to pass in ``retain_variables = True`` -# during the first pass. - -x = torch.ones(2, 2, requires_grad=True) -y = x + 2 -y.backward(torch.ones(2, 2), retain_graph=True) -# the retain_variables flag will prevent the internal buffers from being freed -print(x.grad) - -############################################################### -# -z = y * y -print(z) - -############################################################### -# -# just backprop random gradients - -gradient = torch.randn(2, 2) - -# this would fail if we didn't specify -# that we want to retain variables -y.backward(gradient) - -print(x.grad) - -############################################################### -# You can also stop autograd from tracking history on Tensors -# with requires_grad=True by wrapping the code block in -# ``with torch.no_grad():`` -print(x.requires_grad) -print((x ** 2).requires_grad) - -with torch.no_grad(): - print((x ** 2).requires_grad) diff --git a/beginner_source/former_torchies/autograd_tutorial_old.rst b/beginner_source/former_torchies/autograd_tutorial_old.rst new file mode 100644 index 00000000000..8c887e00c8a --- /dev/null +++ b/beginner_source/former_torchies/autograd_tutorial_old.rst @@ -0,0 +1,8 @@ +Autograd +============== + +This tutorial is out of date. You'll be redirected to the new tutorial in 3 seconds: https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html + +.. 
raw:: html + + diff --git a/beginner_source/former_torchies/nnft_tutorial.py b/beginner_source/former_torchies/nnft_tutorial.py deleted file mode 100644 index 316bf03a985..00000000000 --- a/beginner_source/former_torchies/nnft_tutorial.py +++ /dev/null @@ -1,266 +0,0 @@ -# -*- coding: utf-8 -*- -""" -nn package -========== - -We’ve redesigned the nn package, so that it’s fully integrated with -autograd. Let's review the changes. - -**Replace containers with autograd:** - - You no longer have to use Containers like ``ConcatTable``, or modules like - ``CAddTable``, or use and debug with nngraph. We will seamlessly use - autograd to define our neural networks. For example, - - * ``output = nn.CAddTable():forward({input1, input2})`` simply becomes - ``output = input1 + input2`` - * ``output = nn.MulConstant(0.5):forward(input)`` simply becomes - ``output = input * 0.5`` - -**State is no longer held in the module, but in the network graph:** - - Using recurrent networks should be simpler because of this reason. If - you want to create a recurrent network, simply use the same Linear layer - multiple times, without having to think about sharing weights. - - .. figure:: /_static/img/torch-nn-vs-pytorch-nn.png - :alt: torch-nn-vs-pytorch-nn - - torch-nn-vs-pytorch-nn - -**Simplified debugging:** - - Debugging is intuitive using Python’s pdb debugger, and **the debugger - and stack traces stop at exactly where an error occurred.** What you see - is what you get. - -Example 1: ConvNet ------------------- - -Let’s see how to create a small ConvNet. - -All of your networks are derived from the base class ``nn.Module``: - -- In the constructor, you declare all the layers you want to use. -- In the forward function, you define how your model is going to be - run, from input to output -""" - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class MNISTConvNet(nn.Module): - - def __init__(self): - # this is the place where you instantiate all your modules - # you can later access them using the same names you've given them in - # here - super(MNISTConvNet, self).__init__() - self.conv1 = nn.Conv2d(1, 10, 5) - self.pool1 = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(10, 20, 5) - self.pool2 = nn.MaxPool2d(2, 2) - self.fc1 = nn.Linear(320, 50) - self.fc2 = nn.Linear(50, 10) - - # it's the forward function that defines the network structure - # we're accepting only a single input in here, but if you want, - # feel free to use more - def forward(self, input): - x = self.pool1(F.relu(self.conv1(input))) - x = self.pool2(F.relu(self.conv2(x))) - - # in your model definition you can go full crazy and use arbitrary - # python code to define your model structure - # all these are perfectly legal, and will be handled correctly - # by autograd: - # if x.gt(0) > x.numel() / 2: - # ... - # - # you can even do a loop and reuse the same module inside it - # modules no longer hold ephemeral state, so you can use them - # multiple times during your forward pass - # while x.norm(2) < 10: - # x = self.conv1(x) - - x = x.view(x.size(0), -1) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - return x - -############################################################### -# Let's use the defined ConvNet now. -# You create an instance of the class first. - - -net = MNISTConvNet() -print(net) - -######################################################################## -# .. 
note:: -# -# ``torch.nn`` only supports mini-batches The entire ``torch.nn`` -# package only supports inputs that are a mini-batch of samples, and not -# a single sample. -# -# For example, ``nn.Conv2d`` will take in a 4D Tensor of -# ``nSamples x nChannels x Height x Width``. -# -# If you have a single sample, just use ``input.unsqueeze(0)`` to add -# a fake batch dimension. -# -# Create a mini-batch containing a single sample of random data and send the -# sample through the ConvNet. - -input = torch.randn(1, 1, 28, 28) -out = net(input) -print(out.size()) - -######################################################################## -# Define a dummy target label and compute error using a loss function. - -target = torch.tensor([3], dtype=torch.long) -loss_fn = nn.CrossEntropyLoss() # LogSoftmax + ClassNLL Loss -err = loss_fn(out, target) -err.backward() - -print(err) - -######################################################################## -# The output of the ConvNet ``out`` is a ``Tensor``. We compute the loss -# using that, and that results in ``err`` which is also a ``Tensor``. -# Calling ``.backward`` on ``err`` hence will propagate gradients all the -# way through the ConvNet to it’s weights -# -# Let's access individual layer weights and gradients: - -print(net.conv1.weight.grad.size()) - -######################################################################## -print(net.conv1.weight.data.norm()) # norm of the weight -print(net.conv1.weight.grad.data.norm()) # norm of the gradients - -######################################################################## -# Forward and Backward Function Hooks -# ----------------------------------- -# -# We’ve inspected the weights and the gradients. But how about inspecting -# / modifying the output and grad\_output of a layer? -# -# We introduce **hooks** for this purpose. -# -# You can register a function on a ``Module`` or a ``Tensor``. -# The hook can be a forward hook or a backward hook. -# The forward hook will be executed when a forward call is executed. -# The backward hook will be executed in the backward phase. -# Let’s look at an example. -# -# We register a forward hook on conv2 and print some information - - -def printnorm(self, input, output): - # input is a tuple of packed inputs - # output is a Tensor. 
output.data is the Tensor we are interested - print('Inside ' + self.__class__.__name__ + ' forward') - print('') - print('input: ', type(input)) - print('input[0]: ', type(input[0])) - print('output: ', type(output)) - print('') - print('input size:', input[0].size()) - print('output size:', output.data.size()) - print('output norm:', output.data.norm()) - - -net.conv2.register_forward_hook(printnorm) - -out = net(input) - -######################################################################## -# -# We register a backward hook on conv2 and print some information - - -def printgradnorm(self, grad_input, grad_output): - print('Inside ' + self.__class__.__name__ + ' backward') - print('Inside class:' + self.__class__.__name__) - print('') - print('grad_input: ', type(grad_input)) - print('grad_input[0]: ', type(grad_input[0])) - print('grad_output: ', type(grad_output)) - print('grad_output[0]: ', type(grad_output[0])) - print('') - print('grad_input size:', grad_input[0].size()) - print('grad_output size:', grad_output[0].size()) - print('grad_input norm:', grad_input[0].norm()) - - -net.conv2.register_backward_hook(printgradnorm) - -out = net(input) -err = loss_fn(out, target) -err.backward() - -######################################################################## -# A full and working MNIST example is located here -# https://github.com/pytorch/examples/tree/master/mnist -# -# Example 2: Recurrent Net -# ------------------------ -# -# Next, let’s look at building recurrent nets with PyTorch. -# -# Since the state of the network is held in the graph and not in the -# layers, you can simply create an nn.Linear and reuse it over and over -# again for the recurrence. - - -class RNN(nn.Module): - - # you can also accept arguments in your model constructor - def __init__(self, data_size, hidden_size, output_size): - super(RNN, self).__init__() - - self.hidden_size = hidden_size - input_size = data_size + hidden_size - - self.i2h = nn.Linear(input_size, hidden_size) - self.h2o = nn.Linear(hidden_size, output_size) - - def forward(self, data, last_hidden): - input = torch.cat((data, last_hidden), 1) - hidden = self.i2h(input) - output = self.h2o(hidden) - return hidden, output - - -rnn = RNN(50, 20, 10) - -######################################################################## -# -# A more complete Language Modeling example using LSTMs and Penn Tree-bank -# is located -# `here `_ -# -# PyTorch by default has seamless CuDNN integration for ConvNets and -# Recurrent Nets - -loss_fn = nn.MSELoss() - -batch_size = 10 -TIMESTEPS = 5 - -# Create some fake data -batch = torch.randn(batch_size, 50) -hidden = torch.zeros(batch_size, 20) -target = torch.zeros(batch_size, 10) - -loss = 0 -for t in range(TIMESTEPS): - # yes! you can reuse the same network several times, - # sum up the losses, and call backward! - hidden, output = rnn(batch, hidden) - loss += loss_fn(output, target) -loss.backward() diff --git a/beginner_source/former_torchies/nnft_tutorial.rst b/beginner_source/former_torchies/nnft_tutorial.rst new file mode 100644 index 00000000000..db378a7162b --- /dev/null +++ b/beginner_source/former_torchies/nnft_tutorial.rst @@ -0,0 +1,8 @@ +nn Package +=============== + +This tutorial is out of date. You'll be redirected to the new tutorial in 3 seconds: https://pytorch.org/tutorials/beginner/nn_tutorial.html + +.. 
raw:: html + + diff --git a/beginner_source/former_torchies/parallelism_tutorial.py b/beginner_source/former_torchies/parallelism_tutorial.py deleted file mode 100644 index a11d844e1bd..00000000000 --- a/beginner_source/former_torchies/parallelism_tutorial.py +++ /dev/null @@ -1,145 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Multi-GPU Examples -================== - -Data Parallelism is when we split the mini-batch of samples into -multiple smaller mini-batches and run the computation for each of the -smaller mini-batches in parallel. - -Data Parallelism is implemented using ``torch.nn.DataParallel``. -One can wrap a Module in ``DataParallel`` and it will be parallelized -over multiple GPUs in the batch dimension. - - -DataParallel -------------- -""" -import torch -import torch.nn as nn - - -class DataParallelModel(nn.Module): - - def __init__(self): - super().__init__() - self.block1 = nn.Linear(10, 20) - - # wrap block2 in DataParallel - self.block2 = nn.Linear(20, 20) - self.block2 = nn.DataParallel(self.block2) - - self.block3 = nn.Linear(20, 20) - - def forward(self, x): - x = self.block1(x) - x = self.block2(x) - x = self.block3(x) - return x - -######################################################################## -# The code does not need to be changed in CPU-mode. -# -# The documentation for DataParallel can be found -# `here `_. -# -# **Attributes of the wrapped module** -# -# After wrapping a Module with ``DataParallel``, the attributes of the module -# (e.g. custom methods) became inaccessible. This is because ``DataParallel`` -# defines a few new members, and allowing other attributes might lead to -# clashes in their names. For those who still want to access the attributes, -# a workaround is to use a subclass of ``DataParallel`` as below. - -class MyDataParallel(nn.DataParallel): - def __getattr__(self, name): - try: - return super().__getattr__(name) - except AttributeError: - return getattr(self.module, name) - -######################################################################## -# **Primitives on which DataParallel is implemented upon:** -# -# -# In general, pytorch’s `nn.parallel` primitives can be used independently. -# We have implemented simple MPI-like primitives: -# -# - replicate: replicate a Module on multiple devices -# - scatter: distribute the input in the first-dimension -# - gather: gather and concatenate the input in the first-dimension -# - parallel\_apply: apply a set of already-distributed inputs to a set of -# already-distributed models. 
-# -# To give a better clarity, here function ``data_parallel`` composed using -# these collectives - - -def data_parallel(module, input, device_ids, output_device=None): - if not device_ids: - return module(input) - - if output_device is None: - output_device = device_ids[0] - - replicas = nn.parallel.replicate(module, device_ids) - inputs = nn.parallel.scatter(input, device_ids) - replicas = replicas[:len(inputs)] - outputs = nn.parallel.parallel_apply(replicas, inputs) - return nn.parallel.gather(outputs, output_device) - -######################################################################## -# Part of the model on CPU and part on the GPU -# -------------------------------------------- -# -# Let’s look at a small example of implementing a network where part of it -# is on the CPU and part on the GPU - -device = torch.device("cuda:0") - -class DistributedModel(nn.Module): - - def __init__(self): - super().__init__( - embedding=nn.Embedding(1000, 10), - rnn=nn.Linear(10, 10).to(device), - ) - - def forward(self, x): - # Compute embedding on CPU - x = self.embedding(x) - - # Transfer to GPU - x = x.to(device) - - # Compute RNN on GPU - x = self.rnn(x) - return x - -######################################################################## -# -# This was a small introduction to PyTorch for former Torch users. -# There’s a lot more to learn. -# -# Look at our more comprehensive introductory tutorial which introduces -# the ``optim`` package, data loaders etc.: :doc:`/beginner/deep_learning_60min_blitz`. -# -# Also look at -# -# - :doc:`Train neural nets to play video games ` -# - `Train a state-of-the-art ResNet network on imagenet`_ -# - `Train a face generator using Generative Adversarial Networks`_ -# - `Train a word-level language model using Recurrent LSTM networks`_ -# - `More examples`_ -# - `More tutorials`_ -# - `Discuss PyTorch on the Forums`_ -# - `Chat with other users on Slack`_ -# -# .. _`Deep Learning with PyTorch: a 60-minute blitz`: https://github.com/pytorch/tutorials/blob/main/Deep%20Learning%20with%20PyTorch.ipynb -# .. _Train a state-of-the-art ResNet network on imagenet: https://github.com/pytorch/examples/tree/master/imagenet -# .. _Train a face generator using Generative Adversarial Networks: https://github.com/pytorch/examples/tree/master/dcgan -# .. _Train a word-level language model using Recurrent LSTM networks: https://github.com/pytorch/examples/tree/master/word_language_model -# .. _More examples: https://github.com/pytorch/examples -# .. _More tutorials: https://github.com/pytorch/tutorials -# .. _Discuss PyTorch on the Forums: https://discuss.pytorch.org/ -# .. _Chat with other users on Slack: https://pytorch.slack.com/messages/beginner/ diff --git a/beginner_source/former_torchies/parallelism_tutorial.rst b/beginner_source/former_torchies/parallelism_tutorial.rst new file mode 100644 index 00000000000..04bb1d69e57 --- /dev/null +++ b/beginner_source/former_torchies/parallelism_tutorial.rst @@ -0,0 +1,8 @@ +Multi-GPU Examples +============== + +This tutorial is out of date. You'll be redirected to the new tutorial in 3 seconds: https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html + +.. 
raw:: html + + diff --git a/beginner_source/former_torchies/tensor_tutorial_old.py b/beginner_source/former_torchies/tensor_tutorial_old.py deleted file mode 100644 index 10a9d81fadb..00000000000 --- a/beginner_source/former_torchies/tensor_tutorial_old.py +++ /dev/null @@ -1,143 +0,0 @@ -""" -Tensors -======= - -Tensors behave almost exactly the same way in PyTorch as they do in -Torch. - -Create a tensor of size (5 x 7) with uninitialized memory: - -""" - -import torch -a = torch.empty(5, 7, dtype=torch.float) - -############################################################### -# Initialize a double tensor randomized with a normal distribution with mean=0, -# var=1: - -a = torch.randn(5, 7, dtype=torch.double) -print(a) -print(a.size()) - -############################################################### -# .. note:: -# ``torch.Size`` is in fact a tuple, so it supports the same operations -# -# Inplace / Out-of-place -# ---------------------- -# -# The first difference is that ALL operations on the tensor that operate -# in-place on it will have an ``_`` postfix. For example, ``add`` is the -# out-of-place version, and ``add_`` is the in-place version. - -a.fill_(3.5) -# a has now been filled with the value 3.5 - -b = a.add(4.0) -# a is still filled with 3.5 -# new tensor b is returned with values 3.5 + 4.0 = 7.5 - -print(a, b) - -############################################################### -# Some operations like ``narrow`` do not have in-place versions, and -# hence, ``.narrow_`` does not exist. Similarly, some operations like -# ``fill_`` do not have an out-of-place version, so ``.fill`` does not -# exist. -# -# Zero Indexing -# ------------- -# -# Another difference is that Tensors are zero-indexed. (In lua, tensors are -# one-indexed) - -b = a[0, 3] # select 1st row, 4th column from a - -############################################################### -# Tensors can be also indexed with Python's slicing - -b = a[:, 3:5] # selects all rows, 4th column and 5th column from a - -############################################################### -# No camel casing -# --------------- -# -# The next small difference is that all functions are now NOT camelCase -# anymore. For example ``indexAdd`` is now called ``index_add_`` - - -x = torch.ones(5, 5) -print(x) - -############################################################### -# - -z = torch.empty(5, 2) -z[:, 0] = 10 -z[:, 1] = 100 -print(z) - -############################################################### -# -x.index_add_(1, torch.tensor([4, 0], dtype=torch.long), z) -print(x) - -############################################################### -# Numpy Bridge -# ------------ -# -# Converting a torch Tensor to a numpy array and vice versa is a breeze. -# The torch Tensor and numpy array will share their underlying memory -# locations, and changing one will change the other. 
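The memory-sharing behavior described above can surprise readers who expect an independent copy. The short sketch below is an editorial illustration (not part of the deleted tutorial or of this patch) of how to break the sharing explicitly; the tensor values are only examples.

.. code-block:: python

    import numpy as np
    import torch

    a = torch.ones(5)
    b = a.numpy()                  # b shares memory with a
    c = a.numpy().copy()           # c is an independent NumPy copy
    d = torch.tensor(np.ones(5))   # torch.tensor() always copies its input

    a.add_(1)
    print(b)   # reflects the in-place update: [2. 2. 2. 2. 2.]
    print(c)   # unchanged: [1. 1. 1. 1. 1.]
    print(d)   # unchanged: tensor([1., 1., ...], dtype=torch.float64)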
-# -# Converting torch Tensor to numpy Array -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -a = torch.ones(5) -print(a) - -############################################################### -# - -b = a.numpy() -print(b) - -############################################################### -# -a.add_(1) -print(a) -print(b) # see how the numpy array changed in value - - -############################################################### -# Converting numpy Array to torch Tensor -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -import numpy as np -a = np.ones(5) -b = torch.from_numpy(a) -np.add(a, 1, out=a) -print(a) -print(b) # see how changing the np array changed the torch Tensor automatically - -############################################################### -# All the Tensors on the CPU except a CharTensor support converting to -# NumPy and back. -# -# CUDA Tensors -# ------------ -# -# CUDA Tensors are nice and easy in pytorch, and transfering a CUDA tensor -# from the CPU to GPU will retain its underlying type. - -# let us run this cell only if CUDA is available -if torch.cuda.is_available(): - - # creates a LongTensor and transfers it - # to GPU as torch.cuda.LongTensor - a = torch.full((10,), 3, device=torch.device("cuda")) - print(type(a)) - b = a.to(torch.device("cpu")) - # transfers it to CPU, back to - # being a torch.LongTensor diff --git a/beginner_source/former_torchies/tensor_tutorial_old.rst b/beginner_source/former_torchies/tensor_tutorial_old.rst new file mode 100644 index 00000000000..939a6855c27 --- /dev/null +++ b/beginner_source/former_torchies/tensor_tutorial_old.rst @@ -0,0 +1,8 @@ +Tensors +============== + +This tutorial is out of date. You'll be redirected to the new tutorial in 3 seconds: https://pytorch.org/tutorials/beginner/basics/tensorqs_tutorial.html + +.. raw:: html + + diff --git a/beginner_source/former_torchies_tutorial.rst b/beginner_source/former_torchies_tutorial.rst index e6ae59b7082..5071a62e73c 100644 --- a/beginner_source/former_torchies_tutorial.rst +++ b/beginner_source/former_torchies_tutorial.rst @@ -1,37 +1,9 @@ PyTorch for Former Torch Users ------------------------------- -**Author**: `Soumith Chintala `_ +============== +This tutorial is out of date. Please check out the PyTorch tutorials here: https://pytorch.org/tutorials/ -In this tutorial, you will learn the following: - -1. Using torch Tensors, and important difference against (Lua)Torch -2. Using the autograd package -3. Building neural networks - - - Building a ConvNet - - Building a Recurrent Net - -4. Use multiple GPUs - - -.. toctree:: - :hidden: - - /beginner/former_torchies/tensor_tutorial_old - /beginner/former_torchies/autograd_tutorial_old - /beginner/former_torchies/nnft_tutorial - /beginner/former_torchies/parallelism_tutorial - -.. galleryitem:: /beginner/former_torchies/tensor_tutorial_old.py - :figure: /_static/img/tensor_illustration_flat.png - -.. galleryitem:: /beginner/former_torchies/autograd_tutorial_old.py - -.. galleryitem:: /beginner/former_torchies/nnft_tutorial.py - :figure: /_static/img/torch-nn-vs-pytorch-nn.png - -.. galleryitem:: /beginner/former_torchies/parallelism_tutorial.py +You will be redirected in 3 seconds. .. raw:: html -
+ diff --git a/beginner_source/hybrid_frontend/README.txt b/beginner_source/hybrid_frontend/README.txt deleted file mode 100644 index 6ba5067ef0a..00000000000 --- a/beginner_source/hybrid_frontend/README.txt +++ /dev/null @@ -1,10 +0,0 @@ - Hybrid Frontend Tutorials - ------------------------- - -1. learning_hybrid_frontend_through_example_tutorial.py - Learning Hybrid Frontend Through Example - https://pytorch.org/tutorials/beginner/hybrid_frontend/learning_hybrid_frontend_through_example_tutorial.html - -2. introduction_to_hybrid_frontend_tutorial.py - Introduction to Hybrid Frontend - https://pytorch.org/tutorials/beginner/hybrid_frontend/introduction_to_hybrid_frontend_tutorial.html diff --git a/beginner_source/hybrid_frontend/learning_hybrid_frontend_through_example_tutorial.py b/beginner_source/hybrid_frontend/learning_hybrid_frontend_through_example_tutorial.py deleted file mode 100644 index b767ba31d71..00000000000 --- a/beginner_source/hybrid_frontend/learning_hybrid_frontend_through_example_tutorial.py +++ /dev/null @@ -1,270 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Learning Hybrid Frontend Syntax Through Example -=============================================== -**Author:** `Nathan Inkawhich `_ - -This document is meant to highlight the syntax of the Hybrid Frontend -through a non-code intensive example. The Hybrid Frontend is one of the -new shiny features of Pytorch 1.0 and provides an avenue for developers -to transition their models from **eager** to **graph** mode. PyTorch -users are very familiar with eager mode as it provides the ease-of-use -and flexibility that we all enjoy as researchers. Caffe2 users are more -aquainted with graph mode which has the benefits of speed, optimization -opportunities, and functionality in C++ runtime environments. The hybrid -frontend bridges the gap between the the two modes by allowing -researchers to develop and refine their models in eager mode (i.e. -PyTorch), then gradually transition the proven model to graph mode for -production, when speed and resouce consumption become critical. - -Hybrid Frontend Information ---------------------------- - -The process for transitioning a model to graph mode is as follows. -First, the developer constructs, trains, and tests the model in eager -mode. Then they incrementally **trace** and **script** each -function/module of the model with the Just-In-Time (JIT) compiler, at -each step verifying that the output is correct. Finally, when each of -the components of the top-level model have been traced and scripted, the -model itself is traced. At which point the model has been transitioned -to graph mode, and has a complete python-free representation. With this -representation, the model runtime can take advantage of high-performance -Caffe2 operators and graph based optimizations. - -Before we continue, it is important to understand the idea of tracing -and scripting, and why they are separate. The goal of **trace** and -**script** is the same, and that is to create a graph representation of -the operations taking place in a given function. The discrepency comes -from the flexibility of eager mode that allows for **data-dependent -control flows** within the model architecture. When a function does NOT -have a data-dependent control flow, it may be *traced* with -``torch.jit.trace``. However, when the function *has* a data-dependent -control flow it must be *scripted* with ``torch.jit.script``. 
We will -leave the details of the interworkings of the hybrid frontend for -another document, but the code example below will show the syntax of how -to trace and script different pure python functions and torch Modules. -Hopefully, you will find that using the hybrid frontend is non-intrusive -as it mostly involves adding decorators to the existing function and -class definitions. - -Motivating Example ------------------- - -In this example we will implement a strange math function that may be -logically broken up into four parts that do, and do not contain -data-dependent control flows. The purpose here is to show a non-code -intensive example where the use of the JIT is highlighted. This example -is a stand-in representation of a useful model, whose implementation has -been divided into various pure python functions and modules. - -The function we seek to implement, :math:`Y(x)`, is defined for -:math:`x \epsilon \mathbb{N}` as - -.. math:: - - z(x) = \Biggl \lfloor \\frac{\sqrt{\prod_{i=1}^{|2 x|}i}}{5} \Biggr \\rfloor - -.. math:: - - Y(x) = \\begin{cases} - \\frac{z(x)}{2} & \\text{if } z(x)\%2 == 0, \\\\ - z(x) & \\text{otherwise} - \end{cases} - -.. math:: - - \\begin{array}{| r | r |} \hline - x &1 &2 &3 &4 &5 &6 &7 \\\\ \hline - Y(x) &0 &0 &-5 &20 &190 &-4377 &-59051 \\\\ \hline - \end{array} - -As mentioned, the computation is split into four parts. Part one is the -simple tensor calculation of :math:`|2x|`, which can be traced. Part two -is the iterative product calculation that represents a data dependent -control flow to be scripted (the number of loop iteration depends on the -input at runtime). Part three is a trace-able -:math:`\lfloor \sqrt{a/5} \\rfloor` calculation. Finally, part 4 handles -the output cases depending on the value of :math:`z(x)` and must be -scripted due to the data dependency. Now, let's see how this looks in -code. - -Part 1 - Tracing a pure python function -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -We can implement part one as a pure python function as below. Notice, to -trace this function we call ``torch.jit.trace`` and pass in the function -to be traced. Since the trace requires a dummy input of the expected -runtime type and shape, we also include the ``torch.rand`` to generate a -single valued torch tensor. - -""" - -import torch - -def fn(x): - return torch.abs(2*x) - -# This is how you define a traced function -# Pass in both the function to be traced and an example input to ``torch.jit.trace`` -traced_fn = torch.jit.trace(fn, torch.rand(())) - -###################################################################### -# Part 2 - Scripting a pure python function -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We can also implement part 2 as a pure python function where we -# iteratively compute the product. Since the number of iterations depends -# on the value of the input, we have a data dependent control flow, so the -# function must be scripted. We can script python functions simply with -# the ``@torch.jit.script`` decorator. -# - -# This is how you define a script function -# Apply this decorator directly to the function -@torch.jit.script -def script_fn(x): - z = torch.ones([1], dtype=torch.int64) - for i in range(int(x)): - z = z * (i + 1) - return z - - -###################################################################### -# Part 3 - Tracing a nn.Module -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Next, we will implement part 3 of the computation within the forward -# function of a ``torch.nn.Module``. 
This module may be traced, but rather -# than adding a decorator here, we will handle the tracing where the -# Module is constructed. Thus, the class definition is not changed at all. -# - -# This is a normal module that can be traced. -class TracedModule(torch.nn.Module): - def forward(self, x): - x = x.type(torch.float32) - return torch.floor(torch.sqrt(x) / 5.) - - -###################################################################### -# Part 4 - Scripting a nn.Module -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# In the final part of the computation we have a ``torch.nn.Module`` that -# must be scripted. To accomodate this, we inherit from -# ``torch.jit.ScriptModule`` and add the ``@torch.jit.script_method`` -# decorator to the forward function. -# - -# This is how you define a scripted module. -# The module should inherit from ScriptModule and the forward should have the -# script_method decorator applied to it. -class ScriptModule(torch.jit.ScriptModule): - @torch.jit.script_method - def forward(self, x): - r = -x - if int(torch.fmod(x, 2.0)) == 0.0: - r = x / 2.0 - return r - - -###################################################################### -# Top-Level Module -# ~~~~~~~~~~~~~~~~ -# -# Now we will put together the pieces of the computation via a top level -# module called ``Net``. In the constructor, we will instantiate the -# ``TracedModule`` and ``ScriptModule`` as attributes. This must be done -# because we ultimately want to trace/script the top level module, and -# having the traced/scripted modules as attributes allows the Net to -# inherit the required submodules' parameters. Notice, this is where we -# actually trace the ``TracedModule`` by calling ``torch.jit.trace()`` and -# providing the necessary dummy input. Also notice that the -# ``ScriptModule`` is constructed as normal because we handled the -# scripting in the class definition. -# -# Here we can also print the graphs created for each individual part of -# the computation. The printed graphs allows us to see how the JIT -# ultimately interpreted the functions as graph computations. -# -# Finally, we define the ``forward`` function for the Net module where we -# run the input data ``x`` through the four parts of the computation. -# There is no strange syntax here and we call the traced and scripted -# modules and functions as expected. -# - -# This is a demonstration net that calls all of the different types of -# methods and functions -class Net(torch.nn.Module): - def __init__(self): - super(Net, self).__init__() - # Modules must be attributes on the Module because if you want to trace - # or script this Module, we must be able to inherit the submodules' - # params. - self.traced_module = torch.jit.trace(TracedModule(), torch.rand(())) - self.script_module = ScriptModule() - - print('traced_fn graph', traced_fn.graph) - print('script_fn graph', script_fn.graph) - print('TracedModule graph', self.traced_module.__getattr__('forward').graph) - print('ScriptModule graph', self.script_module.__getattr__('forward').graph) - - def forward(self, x): - # Call a traced function - x = traced_fn(x) - - # Call a script function - x = script_fn(x) - - # Call a traced submodule - x = self.traced_module(x) - - # Call a scripted submodule - x = self.script_module(x) - - return x - - -###################################################################### -# Running the Model -# ~~~~~~~~~~~~~~~~~ -# -# All that's left to do is construct the Net and compute the output -# through the forward function. 
Here, we use :math:`x=5` as the test input -# value and expect :math:`Y(x)=190.` Also, check out the graphs that were -# printed during the construction of the Net. -# - -# Instantiate this net and run it -n = Net() -print(n(torch.tensor([5]))) # 190. - - -###################################################################### -# Tracing the Top-Level Model -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# The last part of the example is to trace the top-level module, ``Net``. -# As mentioned previously, since the traced/scripted modules are -# attributes of Net, we are able to trace ``Net`` as it inherits the -# parameters of the traced/scripted submodules. Note, the syntax for -# tracing Net is identical to the syntax for tracing ``TracedModule``. -# Also, check out the graph that is created. -# - -n_traced = torch.jit.trace(n, torch.tensor([5])) -print(n_traced(torch.tensor([5]))) -print('n_traced graph', n_traced.graph) - - -###################################################################### -# Hopefully, this document can serve as an introduction to the hybrid -# frontend as well as a syntax reference guide for more experienced users. -# Also, there are a few things to keep in mind when using the hybrid -# frontend. There is a constraint that traced/scripted methods must be -# written in a restricted subset of python, as features like generators, -# defs, and Python data structures are not supported. As a workaround, the -# scripting model *is* designed to work with both traced and non-traced -# code which means you can call non-traced code from traced functions. -# However, such a model may not be exported to ONNX. -# diff --git a/beginner_source/hybrid_frontend_tutorial.rst b/beginner_source/hybrid_frontend_tutorial.rst deleted file mode 100644 index 89209b0affc..00000000000 --- a/beginner_source/hybrid_frontend_tutorial.rst +++ /dev/null @@ -1,21 +0,0 @@ -Hybrid Frontend Tutorials -------------------------- -**Authors**: `Nathan Inkawhich `_ and `Matthew Inkawhich `_ - -In this set of tutorials, you will learn the following: - -1. What the hybrid frontend is and the suggested workflow -2. Basic syntax -3. How to transition an eager model to graph mode - - -.. toctree:: - :hidden: - - /beginner/hybrid_frontend/learning_hybrid_frontend_through_example_tutorial - -.. galleryitem:: /beginner/hybrid_frontend/learning_hybrid_frontend_through_example_tutorial.py - -.. raw:: html - -
diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index 2680d1a23c3..dd3fe65699e 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -10,7 +10,7 @@ Fortunately, there are tools that help with finding the best combination of parameters. `Ray Tune `_ is an industry standard tool for distributed hyperparameter tuning. Ray Tune includes the latest hyperparameter search -algorithms, integrates with TensorBoard and other analysis libraries, and natively +algorithms, integrates with various analysis libraries, and natively supports distributed training through `Ray's distributed machine learning engine `_. @@ -184,7 +184,7 @@ def forward(self, x): # inputs, labels = inputs.to(device), labels.to(device) # # The code now supports training on CPUs, on a single GPU, and on multiple GPUs. Notably, Ray -# also supports `fractional GPUs `_ +# also supports `fractional GPUs `_ # so we can share GPUs among trials, as long as the model still fits on the GPU memory. We'll come back # to that later. # diff --git a/beginner_source/introyt.rst b/beginner_source/introyt.rst index 841cc3cd28d..9b2a630c245 100644 --- a/beginner_source/introyt.rst +++ b/beginner_source/introyt.rst @@ -1,29 +1,10 @@ -`Introduction `_ || -`Tensors `_ || -`Autograd `_ || -`Building Models `_ || -`TensorBoard Support `_ || -`Training Models `_ || -`Model Understanding `_ - Introduction to PyTorch - YouTube Series ======================================== -Authors: -`Brad Heintz `_ - -This tutorial follows along with the `PyTorch Beginner Series `_ on YouTube. - -`This tutorial assumes a basic familiarity with Python and Deep Learning concepts.` - -Running the Tutorial Code -------------------------- -You can run this tutorial in a couple of ways: +This page has been moved. -- **In the cloud**: This is the easiest way to get started! Each section has a Colab link at the top, which opens a notebook with the code in a fully-hosted environment. Pro tip: Use Colab with a GPU runtime to speed up operations *Runtime > Change runtime type > GPU* -- **Locally**: This option requires you to setup PyTorch and torchvision first on your local machine (`installation instructions `_). Download the notebook or copy the code into your favorite IDE. +Redirecting now... -.. include:: /beginner_source/introyt/tocyt.txt +.. raw:: html -.. toctree:: - :hidden: + diff --git a/beginner_source/introyt/README.txt b/beginner_source/introyt/README.txt index ebe8f2e9c21..b90d269cfab 100644 --- a/beginner_source/introyt/README.txt +++ b/beginner_source/introyt/README.txt @@ -1,7 +1,7 @@ Introduction to PyTorch on YouTube ---------------------------------- -1. introyt.rst +1. introyt.py Introduction to PyTorch - Youtube Series https://pytorch.org/tutorials/beginner/introyt/introyt.html diff --git a/beginner_source/introyt/introyt1_tutorial.py b/beginner_source/introyt/introyt1_tutorial.py index 74675070708..c01befb40cc 100644 --- a/beginner_source/introyt/introyt1_tutorial.py +++ b/beginner_source/introyt/introyt1_tutorial.py @@ -303,22 +303,21 @@ def num_flat_features(self, x): # The values passed to the transform are the means (first tuple) and the # standard deviations (second tuple) of the rgb values of the images in # the dataset. 
You can calculate these values yourself by running these -# few lines of code: -# ``` -# from torch.utils.data import ConcatDataset -# transform = transforms.Compose([transforms.ToTensor()]) -# trainset = torchvision.datasets.CIFAR10(root='./data', train=True, +# few lines of code:: +# +# from torch.utils.data import ConcatDataset +# transform = transforms.Compose([transforms.ToTensor()]) +# trainset = torchvision.datasets.CIFAR10(root='./data', train=True, # download=True, transform=transform) # -# #stack all train images together into a tensor of shape -# #(50000, 3, 32, 32) -# x = torch.stack([sample[0] for sample in ConcatDataset([trainset])]) +# # stack all train images together into a tensor of shape +# # (50000, 3, 32, 32) +# x = torch.stack([sample[0] for sample in ConcatDataset([trainset])]) # -# #get the mean of each channel -# mean = torch.mean(x, dim=(0,2,3)) #tensor([0.4914, 0.4822, 0.4465]) -# std = torch.std(x, dim=(0,2,3)) #tensor([0.2470, 0.2435, 0.2616]) -# -# ``` +# # get the mean of each channel +# mean = torch.mean(x, dim=(0,2,3)) # tensor([0.4914, 0.4822, 0.4465]) +# std = torch.std(x, dim=(0,2,3)) # tensor([0.2470, 0.2435, 0.2616]) +# # # There are many more transforms available, including cropping, centering, # rotation, and reflection. diff --git a/beginner_source/introyt/introyt_index.py b/beginner_source/introyt/introyt_index.py new file mode 100644 index 00000000000..9ef60574dd9 --- /dev/null +++ b/beginner_source/introyt/introyt_index.py @@ -0,0 +1,38 @@ +""" +`Introduction `_ || +`Tensors `_ || +`Autograd `_ || +`Building Models `_ || +`TensorBoard Support `_ || +`Training Models `_ || +`Model Understanding `_ + +Introduction to PyTorch - YouTube Series +======================================== + +Authors: +`Brad Heintz `_ + +This tutorial follows along with the `PyTorch Beginner Series `_ on YouTube. + +`This tutorial assumes a basic familiarity with Python and Deep Learning concepts.` + +Running the Tutorial Code +------------------------- +You can run this tutorial in a couple of ways: + +- **On the cloud**: This is the easiest way to get started! Each section has a Colab link at the top, which opens a notebook with the code in a fully-hosted environment. Pro tip: Use Colab with a GPU runtime to speed up operations *Runtime > Change runtime type > GPU* +- **Locally**: This option requires you to set up PyTorch and torchvision on your local machine (`installation instructions `_). Download the notebook or copy the code into your favorite IDE. + +.. toctree:: + :maxdepth: 2 + :hidden: + + introyt1_tutorial + tensors_deeper_tutorial + autogradyt_tutorial + modelsyt_tutorial + tensorboardyt_tutorial + trainingyt + captumyt +""" diff --git a/beginner_source/introyt/modelsyt_tutorial.py b/beginner_source/introyt/modelsyt_tutorial.py index b97accc7755..61c27d5c543 100644 --- a/beginner_source/introyt/modelsyt_tutorial.py +++ b/beginner_source/introyt/modelsyt_tutorial.py @@ -311,9 +311,7 @@ def forward(self, sentence): # ``TransformerDecoder``) and subcomponents (``TransformerEncoderLayer``, # ``TransformerDecoderLayer``). For details, check out the # `documentation `__ -# on transformer classes, and the relevant -# `tutorial `__ -# on pytorch.org. +# on transformer classes. 
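Because this hunk drops the link to the standalone transformer tutorial, a brief editorial sketch of composing the encoder classes named above may help readers; the model dimensions and layer count below are illustrative assumptions, not values taken from the tutorial.

.. code-block:: python

    import torch
    import torch.nn as nn

    # Stack two encoder layers into a TransformerEncoder.
    # d_model and nhead are illustrative; choose values that match your data.
    encoder_layer = nn.TransformerEncoderLayer(d_model=64, nhead=4, batch_first=True)
    encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)

    src = torch.rand(8, 10, 64)   # (batch, sequence, embedding) with batch_first=True
    out = encoder(src)
    print(out.shape)              # torch.Size([8, 10, 64])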
# # Other Layers and Functions # -------------------------- diff --git a/beginner_source/introyt/tensors_deeper_tutorial.py b/beginner_source/introyt/tensors_deeper_tutorial.py index b5f9dc0bc9e..4d118ad4030 100644 --- a/beginner_source/introyt/tensors_deeper_tutorial.py +++ b/beginner_source/introyt/tensors_deeper_tutorial.py @@ -44,7 +44,7 @@ ########################################################################## -# Let’s unpack what we just did: +# Let’s upack what we just did: # # - We created a tensor using one of the numerous factory methods # attached to the ``torch`` module. @@ -448,17 +448,19 @@ m2 = torch.tensor([[3., 0.], [0., 3.]]) # three times identity matrix print('\nVectors & Matrices:') -print(torch.cross(v2, v1)) # negative of z unit vector (v1 x v2 == -v2 x v1) +print(torch.linalg.cross(v2, v1)) # negative of z unit vector (v1 x v2 == -v2 x v1) print(m1) -m3 = torch.matmul(m1, m2) +m3 = torch.linalg.matmul(m1, m2) print(m3) # 3 times m1 -print(torch.svd(m3)) # singular value decomposition +print(torch.linalg.svd(m3)) # singular value decomposition ################################################################################## # This is a small sample of operations. For more details and the full inventory of # math functions, have a look at the # `documentation `__. +# For more details and the full inventory of linear algebra operations, have a +# look at this `documentation `__. # # Altering Tensors in Place # ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -630,34 +632,33 @@ # does this *without* changing ``a`` - you can see that when we print # ``a`` again at the end, it retains its ``requires_grad=True`` property. # -# Moving to GPU +# Moving to `Accelerator `__ # ------------- # -# One of the major advantages of PyTorch is its robust acceleration on -# CUDA-compatible Nvidia GPUs. (“CUDA” stands for *Compute Unified Device -# Architecture*, which is Nvidia’s platform for parallel computing.) So -# far, everything we’ve done has been on CPU. How do we move to the faster +# One of the major advantages of PyTorch is its robust acceleration on an +# `accelerator `__ +# such as CUDA, MPS, MTIA, or XPU. +# So far, everything we’ve done has been on CPU. How do we move to the faster # hardware? # -# First, we should check whether a GPU is available, with the +# First, we should check whether an accelerator is available, with the # ``is_available()`` method. # # .. note:: -# If you do not have a CUDA-compatible GPU and CUDA drivers -# installed, the executable cells in this section will not execute any -# GPU-related code. +# If you do not have an accelerator, the executable cells in this section will not execute any +# accelerator-related code. # -if torch.cuda.is_available(): - print('We have a GPU!') +if torch.accelerator.is_available(): + print('We have an accelerator!') else: print('Sorry, CPU only.') ########################################################################## -# Once we’ve determined that one or more GPUs is available, we need to put -# our data someplace where the GPU can see it. Your CPU does computation -# on data in your computer’s RAM. Your GPU has dedicated memory attached +# Once we’ve determined that one or more accelerators is available, we need to put +# our data someplace where the accelerator can see it. Your CPU does computation +# on data in your computer’s RAM. Your accelerator has dedicated memory attached # to it. 
Whenever you want to perform a computation on a device, you must # move *all* the data needed for that computation to memory accessible by # that device. (Colloquially, “moving the data to memory accessible by the @@ -667,8 +668,8 @@ # may do it at creation time: # -if torch.cuda.is_available(): - gpu_rand = torch.rand(2, 2, device='cuda') +if torch.accelerator.is_available(): + gpu_rand = torch.rand(2, 2, device=torch.accelerator.current_accelerator()) print(gpu_rand) else: print('Sorry, CPU only.') @@ -676,25 +677,22 @@ ########################################################################## # By default, new tensors are created on the CPU, so we have to specify -# when we want to create our tensor on the GPU with the optional +# when we want to create our tensor on the accelerator with the optional # ``device`` argument. You can see when we print the new tensor, PyTorch # informs us which device it’s on (if it’s not on CPU). # -# You can query the number of GPUs with ``torch.cuda.device_count()``. If -# you have more than one GPU, you can specify them by index: +# You can query the number of accelerators with ``torch.accelerator.device_count()``. If +# you have more than one accelerator, you can specify them by index, take CUDA for example: # ``device='cuda:0'``, ``device='cuda:1'``, etc. # # As a coding practice, specifying our devices everywhere with string # constants is pretty fragile. In an ideal world, your code would perform -# robustly whether you’re on CPU or GPU hardware. You can do this by +# robustly whether you’re on CPU or accelerator hardware. You can do this by # creating a device handle that can be passed to your tensors instead of a # string: # -if torch.cuda.is_available(): - my_device = torch.device('cuda') -else: - my_device = torch.device('cpu') +my_device = torch.accelerator.current_accelerator() if torch.accelerator.is_available() else torch.device('cpu') print('Device: {}'.format(my_device)) x = torch.rand(2, 2, device=my_device) @@ -716,12 +714,12 @@ # It is important to know that in order to do computation involving two or # more tensors, *all of the tensors must be on the same device*. The # following code will throw a runtime error, regardless of whether you -# have a GPU device available: +# have an accelerator device available, take CUDA for example: # # .. 
code-block:: python # # x = torch.rand(2, 2) -# y = torch.rand(2, 2, device='gpu') +# y = torch.rand(2, 2, device='cuda') # z = x + y # exception will be thrown # diff --git a/beginner_source/knowledge_distillation_tutorial.py b/beginner_source/knowledge_distillation_tutorial.py index 4601352ff03..19d1553e7a0 100644 --- a/beginner_source/knowledge_distillation_tutorial.py +++ b/beginner_source/knowledge_distillation_tutorial.py @@ -37,8 +37,10 @@ import torchvision.transforms as transforms import torchvision.datasets as datasets -# Check if GPU is available, and if not, use the CPU -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# Check if the current `accelerator `__ +# is available, and if not, use the CPU +device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu" +print(f"Using {device} device") ###################################################################### # Loading CIFAR-10 @@ -352,7 +354,7 @@ def train_knowledge_distillation(teacher, student, train_loader, epochs, learnin # Cosine loss minimization run # ---------------------------- # Feel free to play around with the temperature parameter that controls the softness of the softmax function and the loss coefficients. -# In neural networks, it is easy to include to include additional loss functions to the main objectives to achieve goals like better generalization. +# In neural networks, it is easy to include additional loss functions to the main objectives to achieve goals like better generalization. # Let's try including an objective for the student, but now let's focus on their hidden states rather than their output layers. # Our goal is to convey information from the teacher's representation to the student by including a naive loss function, # whose minimization implies that the flattened vectors that are subsequently passed to the classifiers have become more *similar* as the loss decreases. diff --git a/beginner_source/nn_tutorial.py b/beginner_source/nn_tutorial.py index b45200fd495..e04815bd27e 100644 --- a/beginner_source/nn_tutorial.py +++ b/beginner_source/nn_tutorial.py @@ -31,7 +31,7 @@ # MNIST data setup # ---------------- # -# We will use the classic `MNIST `_ dataset, +# We will use the classic `MNIST `_ dataset, # which consists of black-and-white images of hand-drawn digits (between 0 and 9). # # We will use `pathlib `_ @@ -132,7 +132,7 @@ # we'll write `log_softmax` and use it. Remember: although PyTorch # provides lots of prewritten loss functions, activation functions, and # so forth, you can easily write your own using plain python. PyTorch will -# even create fast GPU or vectorized CPU code for your function +# even create fast accelerator or vectorized CPU code for your function # automatically. def log_softmax(x): @@ -827,28 +827,25 @@ def __iter__(self): fit(epochs, model, loss_func, opt, train_dl, valid_dl) ############################################################################### -# Using your GPU +# Using your `Accelerator `__ # --------------- # -# If you're lucky enough to have access to a CUDA-capable GPU (you can +# If you're lucky enough to have access to an accelerator such as CUDA (you can # rent one for about $0.50/hour from most cloud providers) you can -# use it to speed up your code. First check that your GPU is working in +# use it to speed up your code. First check that your accelerator is working in # Pytorch: -print(torch.cuda.is_available()) +# If the current accelerator is available, we will use it. 
Otherwise, we use the CPU. +device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu" +print(f"Using {device} device") -############################################################################### -# And then create a device object for it: - -dev = torch.device( - "cuda") if torch.cuda.is_available() else torch.device("cpu") ############################################################################### -# Let's update ``preprocess`` to move batches to the GPU: +# Let's update ``preprocess`` to move batches to the accelerator: def preprocess(x, y): - return x.view(-1, 1, 28, 28).to(dev), y.to(dev) + return x.view(-1, 1, 28, 28).to(device), y.to(device) train_dl, valid_dl = get_data(train_ds, valid_ds, bs) @@ -856,9 +853,9 @@ def preprocess(x, y): valid_dl = WrappedDataLoader(valid_dl, preprocess) ############################################################################### -# Finally, we can move our model to the GPU. +# Finally, we can move our model to the accelerator. -model.to(dev) +model.to(device) opt = optim.SGD(model.parameters(), lr=lr, momentum=0.9) ############################################################################### diff --git a/beginner_source/onnx/README.txt b/beginner_source/onnx/README.txt index 61485869c99..96004a239ea 100644 --- a/beginner_source/onnx/README.txt +++ b/beginner_source/onnx/README.txt @@ -3,12 +3,16 @@ ONNX 1. intro_onnx.py Introduction to ONNX - https://pytorch.org/tutorials/onnx/intro_onnx.html + https://pytorch.org/tutorials/beginner/onnx/intro_onnx.html 2. export_simple_model_to_onnx_tutorial.py Exporting a PyTorch model to ONNX https://pytorch.org/tutorials/beginner/onnx/export_simple_model_to_onnx_tutorial.html 3. onnx_registry_tutorial.py - Extending the ONNX Registry + Extending the ONNX exporter operator support https://pytorch.org/tutorials/beginner/onnx/onnx_registry_tutorial.html + +4. export_control_flow_model_to_onnx_tutorial.py + Export a model with control flow to ONNX + https://pytorch.org/tutorials/beginner/onnx/export_control_flow_model_to_onnx_tutorial.html \ No newline at end of file diff --git a/beginner_source/onnx/export_control_flow_model_to_onnx_tutorial.py b/beginner_source/onnx/export_control_flow_model_to_onnx_tutorial.py new file mode 100644 index 00000000000..c8057727132 --- /dev/null +++ b/beginner_source/onnx/export_control_flow_model_to_onnx_tutorial.py @@ -0,0 +1,172 @@ +# -*- coding: utf-8 -*- +""" +`Introduction to ONNX `_ || +`Exporting a PyTorch model to ONNX `_ || +`Extending the ONNX exporter operator support `_ || +**`Export a model with control flow to ONNX** + +Export a model with control flow to ONNX +======================================== + +**Author**: `Xavier Dupré `_ +""" + + +############################################################################### +# Overview +# -------- +# +# This tutorial demonstrates how to handle control flow logic while exporting +# a PyTorch model to ONNX. It highlights the challenges of exporting +# conditional statements directly and provides solutions to circumvent them. +# +# Conditional logic cannot be exported into ONNX unless they refactored +# to use :func:`torch.cond`. Let's start with a simple model +# implementing a test. +# +# What you will learn: +# +# - How to refactor the model to use :func:`torch.cond` for exporting. +# - How to export a model with control flow logic to ONNX. +# - How to optimize the exported model using the ONNX optimizer. 
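As a quick complement to the overview above, and before the tutorial's own model-level refactor further below, the following sketch is an editorial illustration of the ``torch.cond`` primitive in isolation; the branch functions and input values are assumptions chosen for the example.

.. code-block:: python

    import torch

    def double(x):
        return x * 2

    def negate(x):
        return -x

    x = torch.ones(3)
    # torch.cond takes a boolean scalar tensor predicate, two callables with
    # matching output structure, and a tuple of operands.
    print(torch.cond(x.sum() > 0, double, negate, (x,)))    # tensor([2., 2., 2.])
    print(torch.cond(-x.sum() > 0, double, negate, (x,)))   # tensor([-1., -1., -1.])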
+# +# Prerequisites +# ~~~~~~~~~~~~~ +# +# * ``torch >= 2.6`` + + +import torch + +############################################################################### +# Define the Models +# ----------------- +# +# Two models are defined: +# +# ``ForwardWithControlFlowTest``: A model with a forward method containing an +# if-else conditional. +# +# ``ModelWithControlFlowTest``: A model that incorporates ``ForwardWithControlFlowTest`` +# as part of a simple MLP. The models are tested with +# a random input tensor to confirm they execute as expected. + +class ForwardWithControlFlowTest(torch.nn.Module): + def forward(self, x): + if x.sum(): + return x * 2 + return -x + + +class ModelWithControlFlowTest(torch.nn.Module): + def __init__(self): + super().__init__() + self.mlp = torch.nn.Sequential( + torch.nn.Linear(3, 2), + torch.nn.Linear(2, 1), + ForwardWithControlFlowTest(), + ) + + def forward(self, x): + out = self.mlp(x) + return out + + +model = ModelWithControlFlowTest() + + +############################################################################### +# Exporting the Model: First Attempt +# ---------------------------------- +# +# Exporting this model using torch.export.export fails because the control +# flow logic in the forward pass creates a graph break that the exporter cannot +# handle. This behavior is expected, as conditional logic not written using +# :func:`torch.cond` is unsupported. +# +# A try-except block is used to capture the expected failure during the export +# process. If the export unexpectedly succeeds, an ``AssertionError`` is raised. + +x = torch.randn(3) +model(x) + +try: + torch.export.export(model, (x,), strict=False) + raise AssertionError("This export should failed unless PyTorch now supports this model.") +except Exception as e: + print(e) + + +############################################################################### +# Suggested Patch: Refactoring with :func:`torch.cond` +# -------------------------------------------- +# +# To make the control flow exportable, the tutorial demonstrates replacing the +# forward method in ``ForwardWithControlFlowTest`` with a refactored version that +# uses :func:`torch.cond``. +# +# Details of the Refactoring: +# +# Two helper functions (identity2 and neg) represent the branches of the conditional logic: +# * :func:`torch.cond`` is used to specify the condition and the two branches along with the input arguments. +# * The updated forward method is then dynamically assigned to the ``ForwardWithControlFlowTest`` instance within the model. A list of submodules is printed to confirm the replacement. + +def new_forward(x): + def identity2(x): + return x * 2 + + def neg(x): + return -x + + return torch.cond(x.sum() > 0, identity2, neg, (x,)) + + +print("the list of submodules") +for name, mod in model.named_modules(): + print(name, type(mod)) + if isinstance(mod, ForwardWithControlFlowTest): + mod.forward = new_forward + +############################################################################### +# Let's see what the FX graph looks like. + +print(torch.export.export(model, (x,), strict=False)) + +############################################################################### +# Let's export again. + +onnx_program = torch.onnx.export(model, (x,), dynamo=True) +print(onnx_program.model) + + +############################################################################### +# We can optimize the model and get rid of the model local functions created to capture the control flow branches. 
+ +onnx_program.optimize() +print(onnx_program.model) + +############################################################################### +# Conclusion +# ---------- +# +# This tutorial demonstrates the challenges of exporting models with conditional +# logic to ONNX and presents a practical solution using :func:`torch.cond`. +# While the default exporters may fail or produce imperfect graphs, refactoring the +# model's logic ensures compatibility and generates a faithful ONNX representation. +# +# By understanding these techniques, we can overcome common pitfalls when +# working with control flow in PyTorch models and ensure smooth integration with ONNX workflows. +# +# Further reading +# --------------- +# +# The list below refers to tutorials that ranges from basic examples to advanced scenarios, +# not necessarily in the order they are listed. +# Feel free to jump directly to specific topics of your interest or +# sit tight and have fun going through all of them to learn all there is about the ONNX exporter. +# +# .. include:: /beginner_source/onnx/onnx_toc.txt +# +# .. toctree:: +# :hidden: +# diff --git a/beginner_source/onnx/export_simple_model_to_onnx_tutorial.py b/beginner_source/onnx/export_simple_model_to_onnx_tutorial.py index 760c40ab43c..8948cbaa2c1 100644 --- a/beginner_source/onnx/export_simple_model_to_onnx_tutorial.py +++ b/beginner_source/onnx/export_simple_model_to_onnx_tutorial.py @@ -2,18 +2,18 @@ """ `Introduction to ONNX `_ || **Exporting a PyTorch model to ONNX** || -`Extending the ONNX Registry `_ +`Extending the ONNX exporter operator support `_ || +`Export a model with control flow to ONNX `_ Export a PyTorch model to ONNX ============================== -**Author**: `Thiago Crepaldi `_ +**Author**: `Ti-Tai Wang `_, `Justin Chu `_, `Thiago Crepaldi `_. .. note:: - As of PyTorch 2.1, there are two versions of ONNX Exporter. - - * ``torch.onnx.dynamo_export`` is the newest (still in beta) exporter based on the TorchDynamo technology released with PyTorch 2.0 - * ``torch.onnx.export`` is based on TorchScript backend and has been available since PyTorch 1.2.0 + Starting with PyTorch 2.5, there are two ONNX Exporter options available. + * ``torch.onnx.export(..., dynamo=True)`` is the recommended exporter that leverages ``torch.export`` and Torch FX for graph capture. + * ``torch.onnx.export`` is the legacy approach that relies on the deprecated TorchScript and is no longer recommended for use. """ @@ -21,7 +21,7 @@ # In the `60 Minute Blitz `_, # we had the opportunity to learn about PyTorch at a high level and train a small neural network to classify images. # In this tutorial, we are going to expand this to describe how to convert a model defined in PyTorch into the -# ONNX format using TorchDynamo and the ``torch.onnx.dynamo_export`` ONNX exporter. +# ONNX format using the ``torch.onnx.export(..., dynamo=True)`` ONNX exporter. # # While PyTorch is great for iterating on the development of models, the model can be deployed to production # using different formats, including `ONNX `_ (Open Neural Network Exchange)! @@ -47,8 +47,7 @@ # # .. code-block:: bash # -# pip install onnx -# pip install onnxscript +# pip install --upgrade onnx onnxscript # # 2. 
Author a simple image classifier model # ----------------------------------------- @@ -62,17 +61,16 @@ import torch.nn.functional as F -class MyModel(nn.Module): - +class ImageClassifierModel(nn.Module): def __init__(self): - super(MyModel, self).__init__() + super().__init__() self.conv1 = nn.Conv2d(1, 6, 5) self.conv2 = nn.Conv2d(6, 16, 5) self.fc1 = nn.Linear(16 * 5 * 5, 120) self.fc2 = nn.Linear(120, 84) self.fc3 = nn.Linear(84, 10) - def forward(self, x): + def forward(self, x: torch.Tensor): x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) x = F.max_pool2d(F.relu(self.conv2(x)), 2) x = torch.flatten(x, 1) @@ -81,6 +79,7 @@ def forward(self, x): x = self.fc3(x) return x + ###################################################################### # 3. Export the model to ONNX format # ---------------------------------- @@ -88,9 +87,10 @@ def forward(self, x): # Now that we have our model defined, we need to instantiate it and create a random 32x32 input. # Next, we can export the model to ONNX format. -torch_model = MyModel() -torch_input = torch.randn(1, 1, 32, 32) -onnx_program = torch.onnx.dynamo_export(torch_model, torch_input) +torch_model = ImageClassifierModel() +# Create example inputs for exporting the model. The inputs should be a tuple of tensors. +example_inputs = (torch.randn(1, 1, 32, 32),) +onnx_program = torch.onnx.export(torch_model, example_inputs, dynamo=True) ###################################################################### # As we can see, we didn't need any code change to the model. @@ -102,13 +102,14 @@ def forward(self, x): # Although having the exported model loaded in memory is useful in many applications, # we can save it to disk with the following code: -onnx_program.save("my_image_classifier.onnx") +onnx_program.save("image_classifier_model.onnx") ###################################################################### # You can load the ONNX file back into memory and check if it is well formed with the following code: import onnx -onnx_model = onnx.load("my_image_classifier.onnx") + +onnx_model = onnx.load("image_classifier_model.onnx") onnx.checker.check_model(onnx_model) ###################################################################### @@ -124,10 +125,10 @@ def forward(self, x): # :align: center # # -# Once Netron is open, we can drag and drop our ``my_image_classifier.onnx`` file into the browser or select it after +# Once Netron is open, we can drag and drop our ``image_classifier_model.onnx`` file into the browser or select it after # clicking the **Open model** button. # -# .. image:: ../../_static/img/onnx/image_clossifier_onnx_modelon_netron_web_ui.png +# .. 
image:: ../../_static/img/onnx/image_classifier_onnx_model_on_netron_web_ui.png # :width: 50% # # @@ -155,18 +156,18 @@ def forward(self, x): import onnxruntime -onnx_input = onnx_program.adapt_torch_inputs_to_onnx(torch_input) -print(f"Input length: {len(onnx_input)}") -print(f"Sample input: {onnx_input}") - -ort_session = onnxruntime.InferenceSession("./my_image_classifier.onnx", providers=['CPUExecutionProvider']) +onnx_inputs = [tensor.numpy(force=True) for tensor in example_inputs] +print(f"Input length: {len(onnx_inputs)}") +print(f"Sample input: {onnx_inputs}") -def to_numpy(tensor): - return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() +ort_session = onnxruntime.InferenceSession( + "./image_classifier_model.onnx", providers=["CPUExecutionProvider"] +) -onnxruntime_input = {k.name: to_numpy(v) for k, v in zip(ort_session.get_inputs(), onnx_input)} +onnxruntime_input = {input_arg.name: input_value for input_arg, input_value in zip(ort_session.get_inputs(), onnx_inputs)} -onnxruntime_outputs = ort_session.run(None, onnxruntime_input) +# ONNX Runtime returns a list of outputs +onnxruntime_outputs = ort_session.run(None, onnxruntime_input)[0] #################################################################### # 7. Compare the PyTorch results with the ones from the ONNX Runtime @@ -178,8 +179,7 @@ def to_numpy(tensor): # For that, we need to execute the PyTorch model with the same input and compare the results with ONNX Runtime's. # Before comparing the results, we need to convert the PyTorch's output to match ONNX's format. -torch_outputs = torch_model(torch_input) -torch_outputs = onnx_program.adapt_torch_outputs_to_onnx(torch_outputs) +torch_outputs = torch_model(*example_inputs) assert len(torch_outputs) == len(onnxruntime_outputs) for torch_output, onnxruntime_output in zip(torch_outputs, onnxruntime_outputs): @@ -209,4 +209,4 @@ def to_numpy(tensor): # # .. toctree:: # :hidden: -# \ No newline at end of file +# diff --git a/beginner_source/onnx/intro_onnx.py b/beginner_source/onnx/intro_onnx.py index b5cbafc1c64..ecb0be97bf2 100644 --- a/beginner_source/onnx/intro_onnx.py +++ b/beginner_source/onnx/intro_onnx.py @@ -1,13 +1,14 @@ """ **Introduction to ONNX** || `Exporting a PyTorch model to ONNX `_ || -`Extending the ONNX Registry `_ +`Extending the ONNX exporter operator support `_ || +`Export a model with control flow to ONNX `_ Introduction to ONNX ==================== Authors: -`Thiago Crepaldi `_, +`Ti-Tai Wang `_, `Thiago Crepaldi `_. `Open Neural Network eXchange (ONNX) `_ is an open standard format for representing machine learning models. The ``torch.onnx`` module provides APIs to @@ -18,34 +19,28 @@ `runtimes that support ONNX `_, including Microsoft's `ONNX Runtime `_. -.. note:: - Currently, there are two flavors of ONNX exporter APIs, - but this tutorial will focus on the ``torch.onnx.dynamo_export``. - -The TorchDynamo engine is leveraged to hook into Python's frame evaluation API and dynamically rewrite its -bytecode into an `FX graph `_. -The resulting FX Graph is polished before it is finally translated into an -`ONNX graph `_. - -The main advantage of this approach is that the `FX graph `_ is captured using -bytecode analysis that preserves the dynamic nature of the model instead of using traditional static tracing techniques. +When setting ``dynamo=True``, the exporter will use `torch.export `_ to capture an ``ExportedProgram``, +before translating the graph into ONNX representations. 
This approach is the new and recommended way to export models to ONNX. +It works with PyTorch 2.0 features more robustly, has better support for newer ONNX operator sets, and consumes less resources +to make exporting larger models possible. Dependencies ------------ -PyTorch 2.1.0 or newer is required. +PyTorch 2.5.0 or newer is required. The ONNX exporter depends on extra Python packages: - `ONNX `_ standard library - `ONNX Script `_ library that enables developers to author ONNX operators, - functions and models using a subset of Python in an expressive, and yet simple fashion. + functions and models using a subset of Python in an expressive, and yet simple fashion + - `ONNX Runtime `_ accelerated machine learning library. They can be installed through `pip `_: .. code-block:: bash - pip install --upgrade onnx onnxscript + pip install --upgrade onnx onnxscript onnxruntime To validate the installation, run the following commands: @@ -57,8 +52,6 @@ import onnxscript print(onnxscript.__version__) - from onnxscript import opset18 # opset 18 is the latest (and only) supported version for now - import onnxruntime print(onnxruntime.__version__) @@ -77,4 +70,4 @@ .. toctree:: :hidden: -""" +""" \ No newline at end of file diff --git a/beginner_source/onnx/onnx_registry_tutorial.py b/beginner_source/onnx/onnx_registry_tutorial.py index dfb54d60974..e82bc6257de 100644 --- a/beginner_source/onnx/onnx_registry_tutorial.py +++ b/beginner_source/onnx/onnx_registry_tutorial.py @@ -1,14 +1,14 @@ # -*- coding: utf-8 -*- - """ `Introduction to ONNX `_ || `Exporting a PyTorch model to ONNX `_ || -**Extending the ONNX Registry** +**Extending the ONNX exporter operator support** || +`Export a model with control flow to ONNX `_ -Extending the ONNX Registry -=========================== +Extending the ONNX Exporter Operator Support +============================================ -**Authors:** Ti-Tai Wang (titaiwang@microsoft.com) +**Authors:** `Ti-Tai Wang `_, `Justin Chu `_ """ @@ -16,439 +16,242 @@ # Overview # -------- # -# This tutorial is an introduction to ONNX registry, which empowers users to implement new ONNX operators -# or even replace existing operators with a new implementation. +# This tutorial describes how you can create ONNX implementation for unsupported PyTorch operators +# or replace existing implementation with your own. +# +# We will cover three scenarios that require extending the ONNX exporter's operator support: +# +# * Overriding the implementation of an existing PyTorch operator +# * Using custom ONNX operators +# * Supporting a custom PyTorch operator # -# During the model export to ONNX, the PyTorch model is lowered to an intermediate -# representation composed of `ATen operators `_. -# While ATen operators are maintained by PyTorch core team, it is the responsibility of the ONNX exporter team -# to independently implement each of these operators to ONNX through `ONNX Script `_. -# The users can also replace the behavior implemented by the ONNX exporter team with their own implementation -# to fix bugs or improve performance for a specific ONNX runtime. +# What you will learn: # -# The ONNX Registry manages the mapping between PyTorch operators and the ONNX operators counterparts and provides -# APIs to extend the registry. +# - How to override or add support for PyTorch operators in ONNX. +# - How to integrate custom ONNX operators for specialized runtimes. +# - How to implement and translate custom PyTorch operators to ONNX. 
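#
# As a rough preview of the workflow covered in the sections below, the pieces fit together
# like this (``ReluModel`` and ``custom_relu`` are placeholder names used only for illustration;
# the full, working examples follow):
#
# .. code-block:: python
#
#    import torch
#    from onnxscript import opset18 as op
#
#    class ReluModel(torch.nn.Module):
#        def forward(self, x):
#            return torch.relu(x)
#
#    # ONNX Script implementation used to translate torch.ops.aten.relu.default
#    def custom_relu(self):
#        return op.Relu(self)
#
#    onnx_program = torch.onnx.export(
#        ReluModel(),
#        (torch.randn(2, 3),),
#        dynamo=True,
#        custom_translation_table={torch.ops.aten.relu.default: custom_relu},
#    )
#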
# -# In this tutorial, we will cover three scenarios that require extending the ONNX registry with custom operators: +# Prerequisites +# ~~~~~~~~~~~~~ # -# * Unsupported ATen operators -# * Custom operators with existing ONNX Runtime support -# * Custom operators without ONNX Runtime support +# Before starting this tutorial, make sure you have completed the following prerequisites: +# +# * ``torch >= 2.6`` +# * The target PyTorch operator +# * Completed the +# `ONNX Script tutorial `_ +# before proceeding +# * The implementation of the operator using `ONNX Script `__ # -# Unsupported ATen operators -# -------------------------- +# Overriding the implementation of an existing PyTorch operator +# ------------------------------------------------------------- # -# Although the ONNX exporter team does their best efforts to support all ATen operators, some of them +# Although the ONNX exporter team does their best efforts to support all PyTorch operators, some of them # might not be supported yet. In this section, we will demonstrate how you can add -# unsupported ATen operators to the ONNX Registry. +# unsupported PyTorch operators to the ONNX Registry. # # .. note:: -# The steps to implement unsupported ATen operators are the same to replace the implementation of an existing -# ATen operator with a custom implementation. -# Because we don't actually have an unsupported ATen operator to use in this tutorial, we are going to leverage -# this and replace the implementation of ``aten::add.Tensor`` with a custom implementation the same way we would -# if the operator was not present in the ONNX Registry. +# The steps to implement unsupported PyTorch operators are the same as those for replacing the implementation of an existing +# PyTorch operator with a custom one. +# Because we don't actually have an unsupported PyTorch operator to use in this tutorial, we are going to leverage +# this and replace the implementation of ``torch.ops.aten.add.Tensor`` with a custom implementation the same way we would +# if the operator was not implemented by the ONNX exporter. # # When a model cannot be exported to ONNX due to an unsupported operator, the ONNX exporter will show an error message # similar to: # # .. code-block:: python # -# RuntimeErrorWithDiagnostic: Unsupported FX nodes: {'call_function': ['aten.add.Tensor']}. -# -# The error message indicates that the fully qualified name of unsupported ATen operator is ``aten::add.Tensor``. -# The fully qualified name of an operator is composed of the namespace, operator name, and overload following -# the format ``namespace::operator_name.overload``. -# -# To add support for an unsupported ATen operator or to replace the implementation for an existing one, we need: -# -# * The fully qualified name of the ATen operator (e.g. ``aten::add.Tensor``). -# This information is always present in the error message as show above. -# * The implementation of the operator using `ONNX Script `__. -# ONNX Script is a prerequisite for this tutorial. Please make sure you have read the -# `ONNX Script tutorial `_ -# before proceeding. -# -# Because ``aten::add.Tensor`` is already supported by the ONNX Registry, we will demonstrate how to replace it with a -# custom implementation, but keep in mind that the same steps apply to support new unsupported ATen operators. -# -# This is possible because the :class:`OnnxRegistry` allows users to override an operator registration. 
-# We will override the registration of ``aten::add.Tensor`` with our custom implementation and verify it exists. +# No decompositions registered for [...] # +# The error message indicates that the unsupported PyTorch operator is ``torch.ops.aten.add.Tensor``. +# The operator is of type ````, and this operator is what we will use as the +# target to register our custom implementation. import torch -import onnxruntime import onnxscript -from onnxscript import opset18 # opset 18 is the latest (and only) supported version for now -class Model(torch.nn.Module): - def forward(self, input_x, input_y): - return torch.ops.aten.add(input_x, input_y) # generates a aten::add.Tensor node +# Opset 18 is the standard supported version as of PyTorch 2.6 +from onnxscript import opset18 as op -input_add_x = torch.randn(3, 4) -input_add_y = torch.randn(3, 4) -aten_add_model = Model() +# Create a model that uses the operator torch.ops.aten.add.Tensor +class Model(torch.nn.Module): + def forward(self, input_x, input_y): + return torch.ops.aten.add.Tensor(input_x, input_y) -# Now we create a ONNX Script function that implements ``aten::add.Tensor``. -# The function name (e.g. ``custom_aten_add``) is displayed in the ONNX graph, so we recommend to use intuitive names. -custom_aten = onnxscript.values.Opset(domain="custom.aten", version=1) -# NOTE: The function signature must match the signature of the unsupported ATen operator. +# NOTE: The function signature (including parameter names) must match the signature of the unsupported PyTorch operator. # https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml -# NOTE: All attributes must be annotated with type hints. -@onnxscript.script(custom_aten) -def custom_aten_add(input_x, input_y, alpha: float = 1.0): - alpha = opset18.CastLike(alpha, input_y) - input_y = opset18.Mul(input_y, alpha) - return opset18.Add(input_x, input_y) - - -# Now we have everything we need to support unsupported ATen operators. -# Let's register the ``custom_aten_add`` function to ONNX registry, and export the model to ONNX again. -onnx_registry = torch.onnx.OnnxRegistry() -onnx_registry.register_op( - namespace="aten", op_name="add", overload="Tensor", function=custom_aten_add - ) -print(f"aten::add.Tensor is supported by ONNX registry: \ - {onnx_registry.is_registered_op(namespace='aten', op_name='add', overload='Tensor')}" - ) -export_options = torch.onnx.ExportOptions(onnx_registry=onnx_registry) -onnx_program = torch.onnx.dynamo_export( - aten_add_model, input_add_x, input_add_y, export_options=export_options - ) +# All attributes must be annotated with type hints. +def custom_aten_add(self, other, alpha: float = 1.0): + if alpha != 1.0: + alpha = op.CastLike(alpha, other) + other = op.Mul(other, alpha) + # To distinguish the custom implementation from the builtin one, we switch the order of the inputs + return op.Add(other, self) + + +x = torch.tensor([1.0]) +y = torch.tensor([2.0]) + +# Then we provide the custom implementation to the ONNX exporter as a ``custom_translation_table``. +onnx_program = torch.onnx.export( + Model().eval(), + (x, y), + dynamo=True, + custom_translation_table={ + torch.ops.aten.add.Tensor: custom_aten_add, + }, +) +# Optimize the ONNX graph to remove redundant nodes +onnx_program.optimize() ###################################################################### -# Now let's inspect the model and verify the model has a ``custom_aten_add`` instead of ``aten::add.Tensor``. 
-# The graph has one graph node for ``custom_aten_add``, and inside of it there are four function nodes, one for each -# operator, and one for constant attribute. -# - -# graph node domain is the custom domain we registered -assert onnx_program.model_proto.graph.node[0].domain == "custom.aten" -assert len(onnx_program.model_proto.graph.node) == 1 -# graph node name is the function name -assert onnx_program.model_proto.graph.node[0].op_type == "custom_aten_add" -# function node domain is empty because we use standard ONNX operators -assert onnx_program.model_proto.functions[0].node[3].domain == "" -# function node name is the standard ONNX operator name -assert onnx_program.model_proto.functions[0].node[3].op_type == "Add" +# Now let's inspect the model and verify the model is using the custom implementation. +print(onnx_program.model) ###################################################################### -# This is how ``custom_aten_add_model`` looks in the ONNX graph using Netron: -# -# .. image:: /_static/img/onnx/custom_aten_add_model.png -# :width: 70% -# :align: center -# -# Inside the ``custom_aten_add`` function, we can see the three ONNX nodes we -# used in the function (``CastLike``, ``Add``, and ``Mul``), and one ``Constant`` attribute: +# The translation is using our custom implementation: In node ``node_Add_0``, ``input_y`` now +# comes first, and ``input_x`` comes second. # -# .. image:: /_static/img/onnx/custom_aten_add_function.png -# :width: 70% -# :align: center -# -# This was all that we needed to register the new ATen operator into the ONNX Registry. -# As an additional step, we can use ONNX Runtime to run the model, and compare the results with PyTorch. -# - - -# Use ONNX Runtime to run the model, and compare the results with PyTorch -onnx_program.save("./custom_add_model.onnx") -ort_session = onnxruntime.InferenceSession( - "./custom_add_model.onnx", providers=['CPUExecutionProvider'] - ) - -def to_numpy(tensor): - return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() +# We can use ONNX Runtime to run the model and verify the results by calling +# the :class:`torch.onnx.ONNXProgram` directly on the input tensors. -onnx_input = onnx_program.adapt_torch_inputs_to_onnx(input_add_x, input_add_y) -onnxruntime_input = {k.name: to_numpy(v) for k, v in zip(ort_session.get_inputs(), onnx_input)} -onnxruntime_outputs = ort_session.run(None, onnxruntime_input) - -torch_outputs = aten_add_model(input_add_x, input_add_y) -torch_outputs = onnx_program.adapt_torch_outputs_to_onnx(torch_outputs) - -assert len(torch_outputs) == len(onnxruntime_outputs) -for torch_output, onnxruntime_output in zip(torch_outputs, onnxruntime_outputs): - torch.testing.assert_close(torch_output, torch.tensor(onnxruntime_output)) +result = onnx_program(x, y)[0] +torch.testing.assert_close(result, torch.tensor([3.0])) ###################################################################### -# Custom operators with existing ONNX Runtime support -# --------------------------------------------------- -# -# In this case, the user creates a model with standard PyTorch operators, but the ONNX runtime -# (e.g. Microsoft's ONNX Runtime) can provide a custom implementation for that kernel, effectively replacing the -# existing implementation in the ONNX Registry. Another use case is when the user wants to use a custom implementation -# of an existing ONNX operator to fix a bug or improve performance of a specific operator. 
-# To achieve this, we only need to register the new implementation with the existing ATen fully qualified name. +# Using custom ONNX operators +# --------------------------- # -# In the following example, we use the ``com.microsoft.Gelu`` from ONNX Runtime, -# which is not the same ``Gelu`` from ONNX spec. Thus, we register the Gelu with -# the namespace ``com.microsoft`` and operator name ``Gelu``. +# In this case, we create a model with standard PyTorch operators, but the runtime +# (such as Microsoft's ONNX Runtime) can provide a custom implementation for that kernel, effectively replacing the +# existing implementation. # -# Before we begin, let's check whether ``aten::gelu.default`` is really supported by the ONNX registry. - -onnx_registry = torch.onnx.OnnxRegistry() -print(f"aten::gelu.default is supported by ONNX registry: \ - {onnx_registry.is_registered_op(namespace='aten', op_name='gelu', overload='default')}") - +# In the following example, we use the ``com.microsoft.Gelu`` operator provided by ONNX Runtime, +# which is not the same ``Gelu`` from ONNX spec. -###################################################################### -# In our example, ``aten::gelu.default`` operator is supported by the ONNX registry, -# so :meth:`onnx_registry.is_registered_op` returns ``True``. -class CustomGelu(torch.nn.Module): +class GeluModel(torch.nn.Module): def forward(self, input_x): return torch.ops.aten.gelu(input_x) -# com.microsoft is an official ONNX Runtime namspace -custom_ort = onnxscript.values.Opset(domain="com.microsoft", version=1) -# NOTE: The function signature must match the signature of the unsupported ATen operator. +# Create a namespace for the custom operator using ONNX Script +# ``com.microsoft`` is an official ONNX Runtime namespace +microsoft_op = onnxscript.values.Opset(domain="com.microsoft", version=1) + +# NOTE: The function signature (including parameter names) must match the signature of the unsupported PyTorch operator. # https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml # NOTE: All attributes must be annotated with type hints. -@onnxscript.script(custom_ort) -def custom_aten_gelu(input_x, approximate: str = "none"): - # We know com.microsoft::Gelu is supported by ONNX Runtime - # It's only not supported by ONNX - return custom_ort.Gelu(input_x) - +# The function must be scripted using the ``@onnxscript.script()`` decorator when +# using operators from custom domains. This may be improved in future versions. +from onnxscript import FLOAT -onnx_registry = torch.onnx.OnnxRegistry() -onnx_registry.register_op( - namespace="aten", op_name="gelu", overload="default", function=custom_aten_gelu) -export_options = torch.onnx.ExportOptions(onnx_registry=onnx_registry) -aten_gelu_model = CustomGelu() -input_gelu_x = torch.randn(3, 3) +@onnxscript.script(microsoft_op) +def custom_aten_gelu(self: FLOAT, approximate: str = "none") -> FLOAT: + return microsoft_op.Gelu(self) -onnx_program = torch.onnx.dynamo_export( - aten_gelu_model, input_gelu_x, export_options=export_options - ) +onnx_program = torch.onnx.export( + GeluModel().eval(), + (x,), + dynamo=True, + custom_translation_table={ + torch.ops.aten.gelu.default: custom_aten_gelu, + }, +) -###################################################################### -# Let's inspect the model and verify the model uses :func:`custom_aten_gelu` instead of -# :class:`aten::gelu`. 
Note the graph has one graph nodes for -# ``custom_aten_gelu``, and inside ``custom_aten_gelu``, there is a function -# node for ``Gelu`` with namespace ``com.microsoft``. -# - -# graph node domain is the custom domain we registered -assert onnx_program.model_proto.graph.node[0].domain == "com.microsoft" -# graph node name is the function name -assert onnx_program.model_proto.graph.node[0].op_type == "custom_aten_gelu" -# function node domain is the custom domain we registered -assert onnx_program.model_proto.functions[0].node[0].domain == "com.microsoft" -# function node name is the node name used in the function -assert onnx_program.model_proto.functions[0].node[0].op_type == "Gelu" +# Optimize the ONNX graph to remove redundant nodes +onnx_program.optimize() ###################################################################### -# The following diagram shows ``custom_aten_gelu_model`` ONNX graph using Netron: -# -# .. image:: /_static/img/onnx/custom_aten_gelu_model.png -# :width: 70% -# :align: center -# -# Inside the ``custom_aten_gelu`` function, we can see the ``Gelu`` node from module -# ``com.microsoft`` used in the function: -# -# .. image:: /_static/img/onnx/custom_aten_gelu_function.png -# -# That is all we need to do. As an additional step, we can use ONNX Runtime to run the model, -# and compare the results with PyTorch. +# Let's inspect the model and verify the model uses op_type ``Gelu`` +# from namespace ``com.microsoft``. # -onnx_program.save("./custom_gelu_model.onnx") -ort_session = onnxruntime.InferenceSession( - "./custom_gelu_model.onnx", providers=['CPUExecutionProvider'] - ) +print(onnx_program.model) -def to_numpy(tensor): - return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() - -onnx_input = onnx_program.adapt_torch_inputs_to_onnx(input_gelu_x) -onnxruntime_input = {k.name: to_numpy(v) for k, v in zip(ort_session.get_inputs(), onnx_input)} -onnxruntime_outputs = ort_session.run(None, onnxruntime_input) +###################################################################### +# Similar to the previous example, we can use ONNX Runtime to run the model and verify the results. -torch_outputs = aten_gelu_model(input_gelu_x) -torch_outputs = onnx_program.adapt_torch_outputs_to_onnx(torch_outputs) +result = onnx_program(x)[0] +torch.testing.assert_close(result, torch.ops.aten.gelu(x)) -assert len(torch_outputs) == len(onnxruntime_outputs) -for torch_output, onnxruntime_output in zip(torch_outputs, onnxruntime_outputs): - torch.testing.assert_close(torch_output, torch.tensor(onnxruntime_output)) ###################################################################### -# Custom operators without ONNX Runtime support -# --------------------------------------------- -# -# In this case, the operator is not supported by any ONNX runtime, but we -# would like to use it as custom operator in ONNX graph. Therefore, we need to implement -# the operator in three places: +# Supporting a custom PyTorch operator +# ------------------------------------ # -# 1. PyTorch FX graph -# 2. ONNX Registry -# 3. ONNX Runtime +# In this case, the operator is an operator that is user implemented and registered to PyTorch. # # In the following example, we would like to use a custom operator # that takes one tensor input, and returns one output. The operator adds # the input to itself, and returns the rounded result. 
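#
# Concretely, the behavior we want (shown here with plain tensor operations, before wiring it
# up as a custom operator below) is:
#
# .. code-block:: python
#
#    x = torch.tensor([0.7, 1.2])
#    torch.round(x + x)  # round(1.4) -> 1.0, round(2.4) -> 2.0, i.e. tensor([1., 2.])
#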
# -# -# Custom Ops Registration in PyTorch FX Graph (Beta) -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# Firstly, we need to implement the operator in PyTorch FX graph. -# This can be done by using ``torch._custom_op``. -# - -# NOTE: This is a beta feature in PyTorch, and is subject to change. -from torch._custom_op import impl as custom_op - -@custom_op.custom_op("mylibrary::addandround_op") -def addandround_op(tensor_x: torch.Tensor) -> torch.Tensor: - ... - -@addandround_op.impl_abstract() -def addandround_op_impl_abstract(tensor_x): - return torch.empty_like(tensor_x) +# Firstly, we assume the custom operator is implemented and registered with ``torch.library.custom_op()``. +# You can refer to `Creating new custom ops in Python `_ +# for a detailed guide on how to create custom operators. -@addandround_op.impl("cpu") -def addandround_op_impl(tensor_x): - return torch.round(tensor_x + tensor_x) # add x to itself, and round the result -torch._dynamo.allow_in_graph(addandround_op) +# Define and use the operator in PyTorch +@torch.library.custom_op("mylibrary::add_and_round_op", mutates_args=()) +def add_and_round_op(input: torch.Tensor) -> torch.Tensor: + return torch.round(input + input) -class CustomFoo(torch.nn.Module): - def forward(self, tensor_x): - return addandround_op(tensor_x) -input_addandround_x = torch.randn(3) -custom_addandround_model = CustomFoo() - - -###################################################################### -# -# Custom Ops Registration in ONNX Registry -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# For the step 2 and 3, we need to implement the operator in ONNX registry. -# In this example, we will implement the operator in ONNX registry -# with the namespace ``test.customop`` and operator name ``CustomOpOne``, -# and ``CustomOpTwo``. These two ops are registered and built in -# `cpu_ops.cc `__. -# +@add_and_round_op.register_fake +def _add_and_round_op_fake(tensor_x): + return torch.empty_like(tensor_x) -custom_opset = onnxscript.values.Opset(domain="test.customop", version=1) +class AddAndRoundModel(torch.nn.Module): + def forward(self, input): + return add_and_round_op(input) -# NOTE: The function signature must match the signature of the unsupported ATen operator. -# https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml -# NOTE: All attributes must be annotated with type hints. 
-@onnxscript.script(custom_opset) -def custom_addandround(input_x): - # The same as opset18.Add(x, x) - add_x = custom_opset.CustomOpOne(input_x, input_x) - # The same as opset18.Round(x, x) - round_x = custom_opset.CustomOpTwo(add_x) - # Cast to FLOAT to match the ONNX type - return opset18.Cast(round_x, to=1) +# Implement the custom operator in ONNX using ONNX Script +def onnx_add_and_round(input): + return op.Round(op.Add(input, input)) -onnx_registry = torch.onnx.OnnxRegistry() -onnx_registry.register_op( - namespace="mylibrary", op_name="addandround_op", overload="default", function=custom_addandround - ) -export_options = torch.onnx.ExportOptions(onnx_registry=onnx_registry) -onnx_program = torch.onnx.dynamo_export( - custom_addandround_model, input_addandround_x, export_options=export_options - ) -onnx_program.save("./custom_addandround_model.onnx") +onnx_program = torch.onnx.export( + AddAndRoundModel().eval(), + (x,), + dynamo=True, + custom_translation_table={ + torch.ops.mylibrary.add_and_round_op.default: onnx_add_and_round, + }, +) +# Optimize the ONNX graph to remove redundant nodes +onnx_program.optimize() +print(onnx_program) ###################################################################### -# The ``onnx_program`` exposes the exported model as protobuf through ``onnx_program.model_proto``. -# The graph has one graph nodes for ``custom_addandround``, and inside ``custom_addandround``, -# there are two function nodes, one for each operator. +# The translation is using our custom implementation to translate the ``torch.ops.mylibrary.add_and_round_op.default`` +# operator in the :class:`torch.export.ExportedProgram`` to the ONNX operator ``Add`` and ``Round``. # -assert onnx_program.model_proto.graph.node[0].domain == "test.customop" -assert onnx_program.model_proto.graph.node[0].op_type == "custom_addandround" -assert onnx_program.model_proto.functions[0].node[0].domain == "test.customop" -assert onnx_program.model_proto.functions[0].node[0].op_type == "CustomOpOne" -assert onnx_program.model_proto.functions[0].node[1].domain == "test.customop" -assert onnx_program.model_proto.functions[0].node[1].op_type == "CustomOpTwo" +###################################################################### +# Finally we verify the results. +result = onnx_program(x)[0] +torch.testing.assert_close(result, add_and_round_op(x)) ###################################################################### -# This is how ``custom_addandround_model`` ONNX graph looks using Netron: -# -# .. image:: /_static/img/onnx/custom_addandround_model.png -# :width: 70% -# :align: center -# -# Inside the ``custom_addandround`` function, we can see the two custom operators we -# used in the function (``CustomOpOne``, and ``CustomOpTwo``), and they are from module -# ``test.customop``: -# -# .. image:: /_static/img/onnx/custom_addandround_function.png -# -# Custom Ops Registration in ONNX Runtime -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# To link your custom op library to ONNX Runtime, you need to -# compile your C++ code into a shared library and link it to ONNX Runtime. -# Follow the instructions below: -# -# 1. Implement your custom op in C++ by following -# `ONNX Runtime instructions <`https://github.com/microsoft/onnxruntime/blob/gh-pages/docs/reference/operators/add-custom-op.md>`__. -# 2. Download ONNX Runtime source distribution from -# `ONNX Runtime releases `__. -# 3. Compile and link your custom op library to ONNX Runtime, for example: -# -# .. 
code-block:: bash -# -# $ gcc -shared -o libcustom_op_library.so custom_op_library.cc -L /path/to/downloaded/ort/lib/ -lonnxruntime -fPIC -# -# 4. Run the model with ONNX Runtime Python API and compare the results with PyTorch. -# -# .. code-block:: python -# -# ort_session_options = onnxruntime.SessionOptions() -# -# # NOTE: Link the custom op library to ONNX Runtime and replace the path -# # with the path to your custom op library -# ort_session_options.register_custom_ops_library( -# "/path/to/libcustom_op_library.so" -# ) -# ort_session = onnxruntime.InferenceSession( -# "./custom_addandround_model.onnx", providers=['CPUExecutionProvider'], sess_options=ort_session_options) -# -# def to_numpy(tensor): -# return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() -# -# onnx_input = onnx_program.adapt_torch_inputs_to_onnx(input_addandround_x) -# onnxruntime_input = {k.name: to_numpy(v) for k, v in zip(ort_session.get_inputs(), onnx_input)} -# onnxruntime_outputs = ort_session.run(None, onnxruntime_input) -# -# torch_outputs = custom_addandround_model(input_addandround_x) -# torch_outputs = onnx_program.adapt_torch_outputs_to_onnx(torch_outputs) -# -# assert len(torch_outputs) == len(onnxruntime_outputs) -# for torch_output, onnxruntime_output in zip(torch_outputs, onnxruntime_outputs): -# torch.testing.assert_close(torch_output, torch.tensor(onnxruntime_output)) -# # Conclusion # ---------- # -# Congratulations! In this tutorial, we explored the :class:`ONNXRegistry` API and -# discovered how to create custom implementations for unsupported or existing ATen operators +# Congratulations! In this tutorial, we explored the ``custom_translation_table`` option and +# discovered how to create custom implementations for unsupported or existing PyTorch operators # using ONNX Script. +# # Finally, we leveraged ONNX Runtime to execute the model and compare the results with PyTorch, # providing us with a comprehensive understanding of handling unsupported # operators in the ONNX ecosystem. diff --git a/beginner_source/onnx/onnx_toc.txt b/beginner_source/onnx/onnx_toc.txt index 674f7752c5d..ac293fbedd7 100644 --- a/beginner_source/onnx/onnx_toc.txt +++ b/beginner_source/onnx/onnx_toc.txt @@ -1,2 +1,3 @@ | 1. `Exporting a PyTorch model to ONNX `_ -| 2. `Extending the ONNX registry `_ +| 2. `Extending the ONNX exporter operator support `_ +| 3. `Export a model with control flow to ONNX `_ \ No newline at end of file diff --git a/beginner_source/profiler.py b/beginner_source/profiler.py index b395edbaca6..0b690097200 100644 --- a/beginner_source/profiler.py +++ b/beginner_source/profiler.py @@ -2,7 +2,7 @@ Profiling your PyTorch Module ----------------------------- -**Author:** `Suraj Subramanian `_ +**Author:** `Suraj Subramanian `_ PyTorch includes a profiler API that is useful to identify the time and memory costs of various PyTorch operations in your code. Profiler can be @@ -316,6 +316,6 @@ def forward(self, input, mask): # We have seen how Profiler can be used to investigate time and memory bottlenecks in PyTorch models. 
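#
# As a quick recap, the core profiling pattern used throughout this tutorial looks like the
# sketch below (the small model and input are placeholders, not the tutorial's model):
#
# .. code-block:: python
#
#    import torch
#    import torch.autograd.profiler as profiler
#
#    model = torch.nn.Linear(128, 64)
#    x = torch.randn(32, 128)
#
#    with profiler.profile(profile_memory=True, record_shapes=True) as prof:
#        with profiler.record_function("forward_pass"):
#            model(x)
#
#    print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=5))
#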
# Read more about Profiler here: # -# - `Profiler Usage Recipe `__ +# - `Profiler Usage Recipe `__ # - `Profiling RPC-Based Workloads `__ # - `Profiler API Docs `__ diff --git a/beginner_source/ptcheat.rst b/beginner_source/ptcheat.rst deleted file mode 100644 index 91a05866181..00000000000 --- a/beginner_source/ptcheat.rst +++ /dev/null @@ -1,288 +0,0 @@ -PyTorch Cheat Sheet -****************************** - -Imports -========= - -General -------- - -.. code-block:: python - - import torch # root package - from torch.utils.data import Dataset, DataLoader # dataset representation and loading - -Neural Network API ------------------- - -.. code-block:: python - - import torch.autograd as autograd # computation graph - from torch import Tensor # tensor node in the computation graph - import torch.nn as nn # neural networks - import torch.nn.functional as F # layers, activations and more - import torch.optim as optim # optimizers e.g. gradient descent, ADAM, etc. - from torch.jit import script, trace # hybrid frontend decorator and tracing jit - -See `autograd `__, -`nn `__, -`functional `__ -and `optim `__ - -TorchScript and JIT -------------------- - -.. code-block:: python - - torch.jit.trace() # takes your module or function and an example - # data input, and traces the computational steps - # that the data encounters as it progresses through the model - - @script # decorator used to indicate data-dependent - # control flow within the code being traced - -See `Torchscript `__ - -ONNX ----- - -.. code-block:: python - - torch.onnx.export(model, dummy data, xxxx.proto) # exports an ONNX formatted - # model using a trained model, dummy - # data and the desired file name - - model = onnx.load("alexnet.proto") # load an ONNX model - onnx.checker.check_model(model) # check that the model - # IR is well formed - - onnx.helper.printable_graph(model.graph) # print a human readable - # representation of the graph - -See `onnx `__ - -Vision ------- - -.. code-block:: python - - from torchvision import datasets, models, transforms # vision datasets, - # architectures & - # transforms - - import torchvision.transforms as transforms # composable transforms - -See -`torchvision `__ - -Distributed Training --------------------- - -.. code-block:: python - - import torch.distributed as dist # distributed communication - from torch.multiprocessing import Process # memory sharing processes - -See `distributed `__ -and -`multiprocessing `__ - -Tensors -========= - -Creation --------- - -.. code-block:: python - - x = torch.randn(*size) # tensor with independent N(0,1) entries - x = torch.[ones|zeros](*size) # tensor with all 1's [or 0's] - x = torch.tensor(L) # create tensor from [nested] list or ndarray L - y = x.clone() # clone of x - with torch.no_grad(): # code wrap that stops autograd from tracking tensor history - requires_grad=True # arg, when set to True, tracks computation - # history for future derivative calculations - -See `tensor `__ - -Dimensionality --------------- - -.. code-block:: python - - x.size() # return tuple-like object of dimensions - x = torch.cat(tensor_seq, dim=0) # concatenates tensors along dim - y = x.view(a,b,...) # reshapes x into size (a,b,...) 
- y = x.view(-1,a) # reshapes x into size (b,a) for some b - y = x.transpose(a,b) # swaps dimensions a and b - y = x.permute(*dims) # permutes dimensions - y = x.unsqueeze(dim) # tensor with added axis - y = x.unsqueeze(dim=2) # (a,b,c) tensor -> (a,b,1,c) tensor - y = x.squeeze() # removes all dimensions of size 1 (a,1,b,1) -> (a,b) - y = x.squeeze(dim=1) # removes specified dimension of size 1 (a,1,b,1) -> (a,b,1) - -See `tensor `__ - -Algebra -------- - - -.. code-block:: python - - ret = A.mm(B) # matrix multiplication - ret = A.mv(x) # matrix-vector multiplication - x = x.t() # matrix transpose - -See `math -operations `__ - -GPU Usage ---------- - -.. code-block:: python - - torch.cuda.is_available # check for cuda - x = x.cuda() # move x's data from - # CPU to GPU and return new object - - x = x.cpu() # move x's data from GPU to CPU - # and return new object - - if not args.disable_cuda and torch.cuda.is_available(): # device agnostic code - args.device = torch.device('cuda') # and modularity - else: # - args.device = torch.device('cpu') # - - net.to(device) # recursively convert their - # parameters and buffers to - # device specific tensors - - x = x.to(device) # copy your tensors to a device - # (gpu, cpu) - -See `cuda `__ - -Deep Learning -============= - -.. code-block:: python - - nn.Linear(m,n) # fully connected layer from - # m to n units - - nn.ConvXd(m,n,s) # X dimensional conv layer from - # m to n channels where X⍷{1,2,3} - # and the kernel size is s - - nn.MaxPoolXd(s) # X dimension pooling layer - # (notation as above) - - nn.BatchNormXd # batch norm layer - nn.RNN/LSTM/GRU # recurrent layers - nn.Dropout(p=0.5, inplace=False) # dropout layer for any dimensional input - nn.Dropout2d(p=0.5, inplace=False) # 2-dimensional channel-wise dropout - nn.Embedding(num_embeddings, embedding_dim) # (tensor-wise) mapping from - # indices to embedding vectors - -See `nn `__ - -Loss Functions --------------- - -.. code-block:: python - - nn.X # where X is L1Loss, MSELoss, CrossEntropyLoss - # CTCLoss, NLLLoss, PoissonNLLLoss, - # KLDivLoss, BCELoss, BCEWithLogitsLoss, - # MarginRankingLoss, HingeEmbeddingLoss, - # MultiLabelMarginLoss, SmoothL1Loss, - # SoftMarginLoss, MultiLabelSoftMarginLoss, - # CosineEmbeddingLoss, MultiMarginLoss, - # or TripletMarginLoss - - -See `loss -functions `__ - -Activation Functions --------------------- - -.. code-block:: python - - nn.X # where X is ReLU, ReLU6, ELU, SELU, PReLU, LeakyReLU, - # RReLu, CELU, GELU, Threshold, Hardshrink, HardTanh, - # Sigmoid, LogSigmoid, Softplus, SoftShrink, - # Softsign, Tanh, TanhShrink, Softmin, Softmax, - # Softmax2d, LogSoftmax or AdaptiveSoftmaxWithLoss - -See `activation -functions `__ - -Optimizers ----------- - -.. code-block:: python - - opt = optim.x(model.parameters(), ...) # create optimizer - opt.step() # update weights - optim.X # where X is SGD, Adadelta, Adagrad, Adam, - # AdamW, SparseAdam, Adamax, ASGD, - # LBFGS, RMSprop or Rprop - -See `optimizers `__ - -Learning rate scheduling ------------------------- - -.. code-block:: python - - scheduler = optim.X(optimizer,...) # create lr scheduler - scheduler.step() # update lr after optimizer updates weights - optim.lr_scheduler.X # where X is LambdaLR, MultiplicativeLR, - # StepLR, MultiStepLR, ExponentialLR, - # CosineAnnealingLR, ReduceLROnPlateau, CyclicLR, - # OneCycleLR, CosineAnnealingWarmRestarts, - -See `learning rate -scheduler `__ - -Data Utilities -============== - -Datasets --------- - -.. 
code-block:: python - - Dataset # abstract class representing dataset - TensorDataset # labelled dataset in the form of tensors - Concat Dataset # concatenation of Datasets - -See -`datasets `__ - -Dataloaders and ``DataSamplers`` --------------------------------- - -.. code-block:: python - - DataLoader(dataset, batch_size=1, ...) # loads data batches agnostic - # of structure of individual data points - - sampler.Sampler(dataset,...) # abstract class dealing with - # ways to sample from dataset - - sampler.XSampler where ... # Sequential, Random, SubsetRandom, - # WeightedRandom, Batch, Distributed - -See -`dataloader `__ - -Also see --------- - -- `Deep Learning with PyTorch: A 60 Minute - Blitz `__ -- `PyTorch Forums `__ -- `PyTorch for Numpy - users `__ diff --git a/beginner_source/pytorch_with_examples.rst b/beginner_source/pytorch_with_examples.rst index 6705b5b21a4..d65a959b957 100644 --- a/beginner_source/pytorch_with_examples.rst +++ b/beginner_source/pytorch_with_examples.rst @@ -26,6 +26,12 @@ between the network output and the true output. You can browse the individual examples at the :ref:`end of this page `. +To run the tutorials below, make sure you have the `torch`_ +and `numpy`_ packages installed. + +.. _torch: https://github.com/pytorch/pytorch +.. _numpy: https://github.com/numpy/numpy + .. contents:: Table of Contents :local: @@ -149,7 +155,7 @@ which will be optimized during learning. In TensorFlow, packages like `Keras `__, -`TensorFlow-Slim `__, +`TensorFlow-Slim `__, and `TFLearn `__ provide higher-level abstractions over raw computational graphs that are useful for building neural networks. @@ -217,6 +223,8 @@ We can easily implement this model as a Module subclass: .. includenodoc:: /beginner/examples_nn/dynamic_net.py + + .. _examples-download: Examples @@ -229,7 +237,6 @@ Tensors .. toctree:: :maxdepth: 2 - :hidden: /beginner/examples_tensor/polynomial_numpy /beginner/examples_tensor/polynomial_tensor @@ -247,7 +254,6 @@ Autograd .. toctree:: :maxdepth: 2 - :hidden: /beginner/examples_autograd/polynomial_autograd /beginner/examples_autograd/polynomial_custom_function @@ -266,7 +272,6 @@ Autograd .. toctree:: :maxdepth: 2 - :hidden: /beginner/examples_nn/polynomial_nn /beginner/examples_nn/polynomial_optim @@ -284,4 +289,4 @@ Autograd .. raw:: html -
+
\ No newline at end of file diff --git a/beginner_source/saving_loading_models.py b/beginner_source/saving_loading_models.py index fcd33be2537..d09f9ca4491 100644 --- a/beginner_source/saving_loading_models.py +++ b/beginner_source/saving_loading_models.py @@ -153,7 +153,7 @@ # .. code:: python # # model = TheModelClass(*args, **kwargs) -# model.load_state_dict(torch.load(PATH)) +# model.load_state_dict(torch.load(PATH, weights_only=True)) # model.eval() # # .. note:: @@ -206,7 +206,7 @@ # .. code:: python # # # Model class must be defined somewhere -# model = torch.load(PATH) +# model = torch.load(PATH, weights_only=False) # model.eval() # # This save/load process uses the most intuitive syntax and involves the @@ -227,43 +227,30 @@ # normalization layers to evaluation mode before running inference. # Failing to do this will yield inconsistent inference results. # -# Export/Load Model in TorchScript Format -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# Saving an Exported Program +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# One common way to do inference with a trained model is to use -# `TorchScript `__, an intermediate -# representation of a PyTorch model that can be run in Python as well as in a -# high performance environment like C++. TorchScript is actually the recommended model format -# for scaled inference and deployment. +# If you are using ``torch.export``, you can save and load your ``ExportedProgram`` using the +# ``torch.export.save()`` and ``torch.export.load()`` APIs. with the ``.pt2`` file extension: # -# .. note:: -# Using the TorchScript format, you will be able to load the exported model and -# run inference without defining the model class. -# -# **Export:** -# -# .. code:: python -# -# model_scripted = torch.jit.script(model) # Export to TorchScript -# model_scripted.save('model_scripted.pt') # Save -# -# **Load:** +# .. code-block:: python +# +# class SimpleModel(torch.nn.Module): +# def forward(self, x): +# return x + 10 # -# .. code:: python +# # Create a sample input +# sample_input = torch.randn(5) +# +# # Export the model +# exported_program = torch.export.export(SimpleModel(), sample_input) # -# model = torch.jit.load('model_scripted.pt') -# model.eval() +# # Save the exported program +# torch.export.save(exported_program, 'exported_program.pt2') # -# Remember that you must call ``model.eval()`` to set dropout and batch -# normalization layers to evaluation mode before running inference. -# Failing to do this will yield inconsistent inference results. +# # Load the exported program +# saved_exported_program = torch.export.load('exported_program.pt2') # -# For more information on TorchScript, feel free to visit the dedicated -# `tutorials `__. -# You will get familiar with the tracing conversion and learn how to -# run a TorchScript module in a `C++ environment `__. 
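#
# Once saved, the ``.pt2`` file can be loaded back and executed without redefining the model
# class. A minimal sketch, continuing the ``SimpleModel`` example above (note that
# ``torch.export.export`` expects the example inputs to be passed as a tuple):
#
# .. code-block:: python
#
#    # Load the exported program and run it as a regular callable module
#    loaded_program = torch.export.load('exported_program.pt2')
#    output = loaded_program.module()(torch.randn(5))
#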
- - ###################################################################### # Saving & Loading a General Checkpoint for Inference and/or Resuming Training @@ -290,7 +277,7 @@ # model = TheModelClass(*args, **kwargs) # optimizer = TheOptimizerClass(*args, **kwargs) # -# checkpoint = torch.load(PATH) +# checkpoint = torch.load(PATH, weights_only=True) # model.load_state_dict(checkpoint['model_state_dict']) # optimizer.load_state_dict(checkpoint['optimizer_state_dict']) # epoch = checkpoint['epoch'] @@ -354,7 +341,7 @@ # optimizerA = TheOptimizerAClass(*args, **kwargs) # optimizerB = TheOptimizerBClass(*args, **kwargs) # -# checkpoint = torch.load(PATH) +# checkpoint = torch.load(PATH, weights_only=True) # modelA.load_state_dict(checkpoint['modelA_state_dict']) # modelB.load_state_dict(checkpoint['modelB_state_dict']) # optimizerA.load_state_dict(checkpoint['optimizerA_state_dict']) @@ -407,7 +394,7 @@ # .. code:: python # # modelB = TheModelBClass(*args, **kwargs) -# modelB.load_state_dict(torch.load(PATH), strict=False) +# modelB.load_state_dict(torch.load(PATH, weights_only=True), strict=False) # # Partially loading a model or loading a partial model are common # scenarios when transfer learning or training a new complex model. @@ -446,7 +433,7 @@ # # device = torch.device('cpu') # model = TheModelClass(*args, **kwargs) -# model.load_state_dict(torch.load(PATH, map_location=device)) +# model.load_state_dict(torch.load(PATH, map_location=device, weights_only=True)) # # When loading a model on a CPU that was trained with a GPU, pass # ``torch.device('cpu')`` to the ``map_location`` argument in the @@ -469,7 +456,7 @@ # # device = torch.device("cuda") # model = TheModelClass(*args, **kwargs) -# model.load_state_dict(torch.load(PATH)) +# model.load_state_dict(torch.load(PATH, weights_only=True)) # model.to(device) # # Make sure to call input = input.to(device) on any input tensors that you feed to the model # @@ -497,7 +484,7 @@ # # device = torch.device("cuda") # model = TheModelClass(*args, **kwargs) -# model.load_state_dict(torch.load(PATH, map_location="cuda:0")) # Choose whatever GPU device number you want +# model.load_state_dict(torch.load(PATH, weights_only=True, map_location="cuda:0")) # Choose whatever GPU device number you want # model.to(device) # # Make sure to call input = input.to(device) on any input tensors that you feed to the model # diff --git a/beginner_source/t5_tutoria.rst b/beginner_source/t5_tutoria.rst new file mode 100644 index 00000000000..65de42b9320 --- /dev/null +++ b/beginner_source/t5_tutoria.rst @@ -0,0 +1,10 @@ +T5-Base Model for Summarization, Sentiment Classification, and Translation +========================================================================== + +This tutorial has been deprecated. + +Redirecting in 3 seconds... + +.. raw:: html + + diff --git a/beginner_source/t5_tutorial.py b/beginner_source/t5_tutorial.py deleted file mode 100644 index 1387975ad3d..00000000000 --- a/beginner_source/t5_tutorial.py +++ /dev/null @@ -1,458 +0,0 @@ -""" -T5-Base Model for Summarization, Sentiment Classification, and Translation -========================================================================== - -**Authors**: `Pendo Abbo `__, `Joe Cummings `__ - -""" - -###################################################################### -# Overview -# -------- -# -# This tutorial demonstrates how to use a pretrained T5 Model for summarization, sentiment classification, and -# translation tasks. We will demonstrate how to use the torchtext library to: -# -# 1. 
Build a text preprocessing pipeline for a T5 model -# 2. Instantiate a pretrained T5 model with base configuration -# 3. Read in the CNNDM, IMDB, and Multi30k datasets and preprocess their texts in preparation for the model -# 4. Perform text summarization, sentiment classification, and translation -# -# .. note:: -# This tutorial requires PyTorch 2.0.0 or later. -# -####################################################################### -# Data Transformation -# ------------------- -# -# The T5 model does not work with raw text. Instead, it requires the text to be transformed into numerical form -# in order to perform training and inference. The following transformations are required for the T5 model: -# -# 1. Tokenize text -# 2. Convert tokens into (integer) IDs -# 3. Truncate the sequences to a specified maximum length -# 4. Add end-of-sequence (EOS) and padding token IDs -# -# T5 uses a ``SentencePiece`` model for text tokenization. Below, we use a pretrained ``SentencePiece`` model to build -# the text preprocessing pipeline using torchtext's T5Transform. Note that the transform supports both -# batched and non-batched text input (for example, one can either pass a single sentence or a list of sentences), however the T5 model expects the input to be batched. -# - -from torchtext.models import T5Transform - -padding_idx = 0 -eos_idx = 1 -max_seq_len = 512 -t5_sp_model_path = "https://download.pytorch.org/models/text/t5_tokenizer_base.model" - -transform = T5Transform( - sp_model_path=t5_sp_model_path, - max_seq_len=max_seq_len, - eos_idx=eos_idx, - padding_idx=padding_idx, -) - -####################################################################### -# Alternatively, we can also use the transform shipped with the pretrained models that does all of the above out-of-the-box -# -# .. code-block:: -# -# from torchtext.models import T5_BASE_GENERATION -# transform = T5_BASE_GENERATION.transform() -# - - -###################################################################### -# Model Preparation -# ----------------- -# -# torchtext provides SOTA pretrained models that can be used directly for NLP tasks or fine-tuned on downstream tasks. Below -# we use the pretrained T5 model with standard base configuration to perform text summarization, sentiment classification, and -# translation. For additional details on available pretrained models, see `the torchtext documentation `__ -# -# -from torchtext.models import T5_BASE_GENERATION - - -t5_base = T5_BASE_GENERATION -transform = t5_base.transform() -model = t5_base.get_model() -model.eval() - - -####################################################################### -# Using ``GenerationUtils`` -# ------------------------- -# -# We can use torchtext's ``GenerationUtils`` to produce an output sequence based on the input sequence provided. This calls on the -# model's encoder and decoder, and iteratively expands the decoded sequences until the end-of-sequence token is generated -# for all sequences in the batch. The ``generate`` method shown below uses greedy search to generate the sequences. Beam search and -# other decoding strategies are also supported. -# -# -from torchtext.prototype.generate import GenerationUtils - -sequence_generator = GenerationUtils(model) - - -####################################################################### -# Datasets -# -------- -# torchtext provides several standard NLP datasets. For a complete list, refer to the documentation -# at https://pytorch.org/text/stable/datasets.html. 
These datasets are built using composable torchdata -# datapipes and hence support standard flow-control and mapping/transformation using user defined -# functions and transforms. -# -# Below we demonstrate how to preprocess the CNNDM dataset to include the prefix necessary for the -# model to identify the task it is performing. The CNNDM dataset has a train, validation, and test -# split. Below we demo on the test split. -# -# The T5 model uses the prefix "summarize" for text summarization. For more information on task -# prefixes, please visit Appendix D of the `T5 Paper `__ -# -# .. note:: -# Using datapipes is still currently subject to a few caveats. If you wish -# to extend this example to include shuffling, multi-processing, or -# distributed learning, please see :ref:`this note ` -# for further instructions. - -from functools import partial - -from torch.utils.data import DataLoader -from torchtext.datasets import CNNDM - -cnndm_batch_size = 5 -cnndm_datapipe = CNNDM(split="test") -task = "summarize" - - -def apply_prefix(task, x): - return f"{task}: " + x[0], x[1] - - -cnndm_datapipe = cnndm_datapipe.map(partial(apply_prefix, task)) -cnndm_datapipe = cnndm_datapipe.batch(cnndm_batch_size) -cnndm_datapipe = cnndm_datapipe.rows2columnar(["article", "abstract"]) -cnndm_dataloader = DataLoader(cnndm_datapipe, shuffle=True, batch_size=None) - -####################################################################### -# Alternately, we can also use batched API, for example, apply the prefix on the whole batch: -# -# .. code-block:: -# -# def batch_prefix(task, x): -# return { -# "article": [f'{task}: ' + y for y in x["article"]], -# "abstract": x["abstract"] -# } -# -# cnndm_batch_size = 5 -# cnndm_datapipe = CNNDM(split="test") -# task = 'summarize' -# -# cnndm_datapipe = cnndm_datapipe.batch(cnndm_batch_size).rows2columnar(["article", "abstract"]) -# cnndm_datapipe = cnndm_datapipe.map(partial(batch_prefix, task)) -# cnndm_dataloader = DataLoader(cnndm_datapipe, batch_size=None) -# - -####################################################################### -# We can also load the IMDB dataset, which will be used to demonstrate sentiment classification using the T5 model. -# This dataset has a train and test split. Below we demo on the test split. -# -# The T5 model was trained on the SST2 dataset (also available in torchtext) for sentiment classification using the -# prefix ``sst2 sentence``. Therefore, we will use this prefix to perform sentiment classification on the IMDB dataset. -# - -from torchtext.datasets import IMDB - -imdb_batch_size = 3 -imdb_datapipe = IMDB(split="test") -task = "sst2 sentence" -labels = {"1": "negative", "2": "positive"} - - -def process_labels(labels, x): - return x[1], labels[str(x[0])] - - -imdb_datapipe = imdb_datapipe.map(partial(process_labels, labels)) -imdb_datapipe = imdb_datapipe.map(partial(apply_prefix, task)) -imdb_datapipe = imdb_datapipe.batch(imdb_batch_size) -imdb_datapipe = imdb_datapipe.rows2columnar(["text", "label"]) -imdb_dataloader = DataLoader(imdb_datapipe, batch_size=None) - -####################################################################### -# Finally, we can also load the Multi30k dataset to demonstrate English to German translation using the T5 model. -# This dataset has a train, validation, and test split. Below we demo on the test split. -# -# The T5 model uses the prefix "translate English to German" for this task. 
- -from torchtext.datasets import Multi30k - -multi_batch_size = 5 -language_pair = ("en", "de") -multi_datapipe = Multi30k(split="test", language_pair=language_pair) -task = "translate English to German" - -multi_datapipe = multi_datapipe.map(partial(apply_prefix, task)) -multi_datapipe = multi_datapipe.batch(multi_batch_size) -multi_datapipe = multi_datapipe.rows2columnar(["english", "german"]) -multi_dataloader = DataLoader(multi_datapipe, batch_size=None) - -####################################################################### -# Generate Summaries -# ------------------ -# -# We can put all of the components together to generate summaries on the first batch of articles in the CNNDM test set -# using a beam size of 1. -# - -batch = next(iter(cnndm_dataloader)) -input_text = batch["article"] -target = batch["abstract"] -beam_size = 1 - -model_input = transform(input_text) -model_output = sequence_generator.generate(model_input, eos_idx=eos_idx, num_beams=beam_size) -output_text = transform.decode(model_output.tolist()) - -for i in range(cnndm_batch_size): - print(f"Example {i+1}:\n") - print(f"prediction: {output_text[i]}\n") - print(f"target: {target[i]}\n\n") - - -####################################################################### -# Summarization Output -# -------------------- -# -# Summarization output might vary since we shuffle the dataloader. -# -# .. code-block:: -# -# Example 1: -# -# prediction: the 24-year-old has been tattooed for over a decade . he has landed in australia -# to start work on a new campaign . he says he is 'taking it in your stride' to be honest . -# -# target: London-based model Stephen James Hendry famed for his full body tattoo . The supermodel -# is in Sydney for a new modelling campaign . Australian fans understood to have already located -# him at his hotel . The 24-year-old heartthrob is recently single . -# -# -# Example 2: -# -# prediction: a stray pooch has used up at least three of her own after being hit by a -# car and buried in a field . the dog managed to stagger to a nearby farm, dirt-covered -# and emaciated, where she was found . she suffered a dislocated jaw, leg injuries and a -# caved-in sinus cavity -- and still requires surgery to help her breathe . -# -# target: Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer -# and buried in a field . "She's a true miracle dog and she deserves a good life," says -# Sara Mellado, who is looking for a home for Theia . -# -# -# Example 3: -# -# prediction: mohammad Javad Zarif arrived in Iran on a sunny friday morning . he has gone -# a long way to bring Iran in from the cold and allow it to rejoin the international -# community . but there are some facts about him that are less well-known . -# -# target: Mohammad Javad Zarif has spent more time with John Kerry than any other -# foreign minister . He once participated in a takeover of the Iranian Consulate in San -# Francisco . The Iranian foreign minister tweets in English . -# -# -# Example 4: -# -# prediction: five americans were monitored for three weeks after being exposed to Ebola in -# west africa . one of the five had a heart-related issue and has been discharged but hasn't -# left the area . they are clinicians for Partners in Health, a Boston-based aid group . -# -# target: 17 Americans were exposed to the Ebola virus while in Sierra Leone in March . -# Another person was diagnosed with the disease and taken to hospital in Maryland . 
-# National Institutes of Health says the patient is in fair condition after weeks of -# treatment . -# -# -# Example 5: -# -# prediction: the student was identified during an investigation by campus police and -# the office of student affairs . he admitted to placing the noose on the tree early -# Wednesday morning . the incident is one of several recent racist events to affect -# college students . -# -# target: Student is no longer on Duke University campus and will face disciplinary -# review . School officials identified student during investigation and the person -# admitted to hanging the noose, Duke says . The noose, made of rope, was discovered on -# campus about 2 a.m. -# - - -####################################################################### -# Generate Sentiment Classifications -# ---------------------------------- -# -# Similarly, we can use the model to generate sentiment classifications on the first batch of reviews from the IMDB test set -# using a beam size of 1. -# - -batch = next(iter(imdb_dataloader)) -input_text = batch["text"] -target = batch["label"] -beam_size = 1 - -model_input = transform(input_text) -model_output = sequence_generator.generate(model_input, eos_idx=eos_idx, num_beams=beam_size) -output_text = transform.decode(model_output.tolist()) - -for i in range(imdb_batch_size): - print(f"Example {i+1}:\n") - print(f"input_text: {input_text[i]}\n") - print(f"prediction: {output_text[i]}\n") - print(f"target: {target[i]}\n\n") - - -####################################################################### -# Sentiment Output -# ---------------- -# -# .. code-block:: bash -# -# Example 1: -# -# input_text: sst2 sentence: I love sci-fi and am willing to put up with a lot. Sci-fi -# movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like -# this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). -# Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the -# background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' -# setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. -# It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character -# development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may -# treat important issues, yet not as a serious philosophy. It's really difficult to care about -# the characters here as they are not simply foolish, just missing a spark of life. Their -# actions and reactions are wooden and predictable, often painful to watch. The makers of Earth -# KNOW it's rubbish as they have to always say "Gene Roddenberry's Earth..." otherwise people -# would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, -# cheap, poorly edited (watching it without advert breaks really brings this home) trudging -# Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring -# him back as another actor. Jeeez. Dallas all over again. -# -# prediction: negative -# -# target: negative -# -# -# Example 2: -# -# input_text: sst2 sentence: Worth the entertainment value of a rental, especially if you like -# action movies. This one features the usual car chases, fights with the great Van Damme kick -# style, shooting battles with the 40 shell load shotgun, and even terrorist style bombs. 
-# All of this is entertaining and competently handled but there is nothing that really blows you
-# away if you've seen your share before.
-#
-# The plot is made interesting by the inclusion of a rabbit, which is clever but hardly profound.
-# Many of the characters are heavily stereotyped -- the angry veterans, the terrified illegal aliens,
-# the crooked cops, the indifferent feds, the bitchy tough lady station head, the crooked politician,
-# the fat federale who looks like he was typecast as the Mexican in a Hollywood movie from the 1940s.
-# All passably acted but again nothing special.
-#
-# I thought the main villains were pretty well done and fairly well acted. By the end of the movie
-# you certainly knew who the good guys were and weren't. There was an emotional lift as the really
-# bad ones got their just deserts. Very simplistic, but then you weren't expecting Hamlet, right?
-# The only thing I found really annoying was the constant cuts to VDs daughter during the last
-# fight scene.
-#
-# Not bad. Not good. Passable 4. -# -# prediction: positive -# -# target: negative -# -# -# Example 3: -# -# input_text: sst2 sentence: its a totally average film with a few semi-alright action sequences -# that make the plot seem a little better and remind the viewer of the classic van dam films. -# parts of the plot don't make sense and seem to be added in to use up time. the end plot is that -# of a very basic type that doesn't leave the viewer guessing and any twists are obvious from the -# beginning. the end scene with the flask backs don't make sense as they are added in and seem to -# have little relevance to the history of van dam's character. not really worth watching again, -# bit disappointed in the end production, even though it is apparent it was shot on a low budget -# certain shots and sections in the film are of poor directed quality. -# -# prediction: negative -# -# target: negative -# - - -####################################################################### -# Generate Translations -# --------------------- -# -# Finally, we can also use the model to generate English to German translations on the first batch of examples from the Multi30k -# test set. -# - -batch = next(iter(multi_dataloader)) -input_text = batch["english"] -target = batch["german"] - -model_input = transform(input_text) -model_output = sequence_generator.generate(model_input, eos_idx=eos_idx, num_beams=beam_size) -output_text = transform.decode(model_output.tolist()) - -for i in range(multi_batch_size): - print(f"Example {i+1}:\n") - print(f"input_text: {input_text[i]}\n") - print(f"prediction: {output_text[i]}\n") - print(f"target: {target[i]}\n\n") - - -####################################################################### -# Translation Output -# ------------------ -# -# .. code-block:: bash -# -# Example 1: -# -# input_text: translate English to German: A man in an orange hat starring at something. -# -# prediction: Ein Mann in einem orangen Hut, der an etwas schaut. -# -# target: Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt. -# -# -# Example 2: -# -# input_text: translate English to German: A Boston Terrier is running on lush green grass in front of a white fence. -# -# prediction: Ein Boston Terrier läuft auf üppigem grünem Gras vor einem weißen Zaun. -# -# target: Ein Boston Terrier läuft über saftig-grünes Gras vor einem weißen Zaun. -# -# -# Example 3: -# -# input_text: translate English to German: A girl in karate uniform breaking a stick with a front kick. -# -# prediction: Ein Mädchen in Karate-Uniform bricht einen Stöck mit einem Frontkick. -# -# target: Ein Mädchen in einem Karateanzug bricht ein Brett mit einem Tritt. -# -# -# Example 4: -# -# input_text: translate English to German: Five people wearing winter jackets and helmets stand in the snow, with snowmobiles in the background. -# -# prediction: Fünf Menschen mit Winterjacken und Helmen stehen im Schnee, mit Schneemobilen im Hintergrund. -# -# target: Fünf Leute in Winterjacken und mit Helmen stehen im Schnee mit Schneemobilen im Hintergrund. -# -# -# Example 5: -# -# input_text: translate English to German: People are fixing the roof of a house. -# -# prediction: Die Leute fixieren das Dach eines Hauses. -# -# target: Leute Reparieren das Dach eines Hauses. 
-# diff --git a/beginner_source/text_sentiment_ngrams_tutorial.py b/beginner_source/text_sentiment_ngrams_tutorial.py deleted file mode 100644 index 9cc5d6c8671..00000000000 --- a/beginner_source/text_sentiment_ngrams_tutorial.py +++ /dev/null @@ -1,372 +0,0 @@ -""" -Text classification with the torchtext library -============================================== - -In this tutorial, we will show how to use the torchtext library to build the dataset for the text classification analysis. Users will have the flexibility to - - - Access to the raw data as an iterator - - Build data processing pipeline to convert the raw text strings into ``torch.Tensor`` that can be used to train the model - - Shuffle and iterate the data with `torch.utils.data.DataLoader `__ - - -Prerequisites -~~~~~~~~~~~~~~~~ - -A recent 2.x version of the ``portalocker`` package needs to be installed prior to running the tutorial. -For example, in the Colab environment, this can be done by adding the following line at the top of the script: - -.. code-block:: bash - - !pip install -U portalocker>=2.0.0` - -""" - - -###################################################################### -# Access to the raw dataset iterators -# ----------------------------------- -# -# The torchtext library provides a few raw dataset iterators, which yield the raw text strings. For example, the ``AG_NEWS`` dataset iterators yield the raw data as a tuple of label and text. -# -# To access torchtext datasets, please install torchdata following instructions at https://github.com/pytorch/data. -# - -import torch -from torchtext.datasets import AG_NEWS - -train_iter = iter(AG_NEWS(split="train")) - -###################################################################### -# .. code-block:: sh -# -# next(train_iter) -# >>> (3, "Fears for T N pension after talks Unions representing workers at Turner -# Newall say they are 'disappointed' after talks with stricken parent firm Federal -# Mogul.") -# -# next(train_iter) -# >>> (4, "The Race is On: Second Private Team Sets Launch Date for Human -# Spaceflight (SPACE.com) SPACE.com - TORONTO, Canada -- A second\\team of -# rocketeers competing for the #36;10 million Ansari X Prize, a contest -# for\\privately funded suborbital space flight, has officially announced -# the first\\launch date for its manned rocket.") -# -# next(train_iter) -# >>> (4, 'Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded -# by a chemistry researcher at the University of Louisville won a grant to develop -# a method of producing better peptides, which are short chains of amino acids, the -# building blocks of proteins.') -# - - -###################################################################### -# Prepare data processing pipelines -# --------------------------------- -# -# We have revisited the very basic components of the torchtext library, including vocab, word vectors, tokenizer. Those are the basic data processing building blocks for raw text string. -# -# Here is an example for typical NLP data processing with tokenizer and vocabulary. The first step is to build a vocabulary with the raw training dataset. Here we use built in -# factory function `build_vocab_from_iterator` which accepts iterator that yield list or iterator of tokens. Users can also pass any special symbols to be added to the -# vocabulary. 
- - -from torchtext.data.utils import get_tokenizer -from torchtext.vocab import build_vocab_from_iterator - -tokenizer = get_tokenizer("basic_english") -train_iter = AG_NEWS(split="train") - - -def yield_tokens(data_iter): - for _, text in data_iter: - yield tokenizer(text) - - -vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=[""]) -vocab.set_default_index(vocab[""]) - -###################################################################### -# The vocabulary block converts a list of tokens into integers. -# -# .. code-block:: sh -# -# vocab(['here', 'is', 'an', 'example']) -# >>> [475, 21, 30, 5297] -# -# Prepare the text processing pipeline with the tokenizer and vocabulary. The text and label pipelines will be used to process the raw data strings from the dataset iterators. - -text_pipeline = lambda x: vocab(tokenizer(x)) -label_pipeline = lambda x: int(x) - 1 - - -###################################################################### -# The text pipeline converts a text string into a list of integers based on the lookup table defined in the vocabulary. The label pipeline converts the label into integers. For example, -# -# .. code-block:: sh -# -# text_pipeline('here is the an example') -# >>> [475, 21, 2, 30, 5297] -# label_pipeline('10') -# >>> 9 -# - - -###################################################################### -# Generate data batch and iterator -# -------------------------------- -# -# `torch.utils.data.DataLoader `__ -# is recommended for PyTorch users (a tutorial is `here `__). -# It works with a map-style dataset that implements the ``getitem()`` and ``len()`` protocols, and represents a map from indices/keys to data samples. It also works with an iterable dataset with the shuffle argument of ``False``. -# -# Before sending to the model, ``collate_fn`` function works on a batch of samples generated from ``DataLoader``. The input to ``collate_fn`` is a batch of data with the batch size in ``DataLoader``, and ``collate_fn`` processes them according to the data processing pipelines declared previously. Pay attention here and make sure that ``collate_fn`` is declared as a top level def. This ensures that the function is available in each worker. -# -# In this example, the text entries in the original data batch input are packed into a list and concatenated as a single tensor for the input of ``nn.EmbeddingBag``. The offset is a tensor of delimiters to represent the beginning index of the individual sequence in the text tensor. Label is a tensor saving the labels of individual text entries. 
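# A minimal, self-contained sketch of the ``offsets`` idea described above (the token
# indices below are made up for illustration and are not taken from ``AG_NEWS``): two
# sequences of lengths 3 and 2 are concatenated into one flat tensor, and ``offsets``
# records where each sequence begins, so ``nn.EmbeddingBag`` can pool each one separately.

import torch

flat_tokens = torch.tensor([4, 8, 15, 16, 23])   # sample 1 (3 tokens) followed by sample 2 (2 tokens)
offsets = torch.tensor([0, 3])                   # sample 1 starts at index 0, sample 2 at index 3
bag = torch.nn.EmbeddingBag(num_embeddings=50, embedding_dim=8, mode="mean")
print(bag(flat_tokens, offsets).shape)           # torch.Size([2, 8]) -- one pooled embedding per sample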
- - -from torch.utils.data import DataLoader - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - -def collate_batch(batch): - label_list, text_list, offsets = [], [], [0] - for _label, _text in batch: - label_list.append(label_pipeline(_label)) - processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64) - text_list.append(processed_text) - offsets.append(processed_text.size(0)) - label_list = torch.tensor(label_list, dtype=torch.int64) - offsets = torch.tensor(offsets[:-1]).cumsum(dim=0) - text_list = torch.cat(text_list) - return label_list.to(device), text_list.to(device), offsets.to(device) - - -train_iter = AG_NEWS(split="train") -dataloader = DataLoader( - train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch -) - - -###################################################################### -# Define the model -# ---------------- -# -# The model is composed of the `nn.EmbeddingBag `__ layer plus a linear layer for the classification purpose. ``nn.EmbeddingBag`` with the default mode of "mean" computes the mean value of a “bag” of embeddings. Although the text entries here have different lengths, ``nn.EmbeddingBag`` module requires no padding here since the text lengths are saved in offsets. -# -# Additionally, since ``nn.EmbeddingBag`` accumulates the average across -# the embeddings on the fly, ``nn.EmbeddingBag`` can enhance the -# performance and memory efficiency to process a sequence of tensors. -# -# .. image:: ../_static/img/text_sentiment_ngrams_model.png -# - -from torch import nn - - -class TextClassificationModel(nn.Module): - def __init__(self, vocab_size, embed_dim, num_class): - super(TextClassificationModel, self).__init__() - self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False) - self.fc = nn.Linear(embed_dim, num_class) - self.init_weights() - - def init_weights(self): - initrange = 0.5 - self.embedding.weight.data.uniform_(-initrange, initrange) - self.fc.weight.data.uniform_(-initrange, initrange) - self.fc.bias.data.zero_() - - def forward(self, text, offsets): - embedded = self.embedding(text, offsets) - return self.fc(embedded) - - -###################################################################### -# Initiate an instance -# -------------------- -# -# The ``AG_NEWS`` dataset has four labels and therefore the number of classes is four. -# -# .. code-block:: sh -# -# 1 : World -# 2 : Sports -# 3 : Business -# 4 : Sci/Tec -# -# We build a model with the embedding dimension of 64. The vocab size is equal to the length of the vocabulary instance. The number of classes is equal to the number of labels, -# - -train_iter = AG_NEWS(split="train") -num_class = len(set([label for (label, text) in train_iter])) -vocab_size = len(vocab) -emsize = 64 -model = TextClassificationModel(vocab_size, emsize, num_class).to(device) - - -###################################################################### -# Define functions to train the model and evaluate results. 
-# --------------------------------------------------------- -# - - -import time - - -def train(dataloader): - model.train() - total_acc, total_count = 0, 0 - log_interval = 500 - start_time = time.time() - - for idx, (label, text, offsets) in enumerate(dataloader): - optimizer.zero_grad() - predicted_label = model(text, offsets) - loss = criterion(predicted_label, label) - loss.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) - optimizer.step() - total_acc += (predicted_label.argmax(1) == label).sum().item() - total_count += label.size(0) - if idx % log_interval == 0 and idx > 0: - elapsed = time.time() - start_time - print( - "| epoch {:3d} | {:5d}/{:5d} batches " - "| accuracy {:8.3f}".format( - epoch, idx, len(dataloader), total_acc / total_count - ) - ) - total_acc, total_count = 0, 0 - start_time = time.time() - - -def evaluate(dataloader): - model.eval() - total_acc, total_count = 0, 0 - - with torch.no_grad(): - for idx, (label, text, offsets) in enumerate(dataloader): - predicted_label = model(text, offsets) - loss = criterion(predicted_label, label) - total_acc += (predicted_label.argmax(1) == label).sum().item() - total_count += label.size(0) - return total_acc / total_count - - -###################################################################### -# Split the dataset and run the model -# ----------------------------------- -# -# Since the original ``AG_NEWS`` has no valid dataset, we split the training -# dataset into train/valid sets with a split ratio of 0.95 (train) and -# 0.05 (valid). Here we use -# `torch.utils.data.dataset.random_split `__ -# function in PyTorch core library. -# -# `CrossEntropyLoss `__ -# criterion combines ``nn.LogSoftmax()`` and ``nn.NLLLoss()`` in a single class. -# It is useful when training a classification problem with C classes. -# `SGD `__ -# implements stochastic gradient descent method as the optimizer. The initial -# learning rate is set to 5.0. -# `StepLR `__ -# is used here to adjust the learning rate through epochs. 
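# To make the ``CrossEntropyLoss`` remark above concrete, here is a small self-contained
# check (with made-up logits and labels, not tied to the model above) showing that it
# matches ``NLLLoss`` applied to a log-softmax output:

import torch
import torch.nn.functional as F

logits = torch.randn(3, 4)                 # 3 samples, 4 classes
labels = torch.tensor([0, 2, 1])
ce = torch.nn.CrossEntropyLoss()(logits, labels)
nll = torch.nn.NLLLoss()(F.log_softmax(logits, dim=1), labels)
print(torch.allclose(ce, nll))             # True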
-# - - -from torch.utils.data.dataset import random_split -from torchtext.data.functional import to_map_style_dataset - -# Hyperparameters -EPOCHS = 10 # epoch -LR = 5 # learning rate -BATCH_SIZE = 64 # batch size for training - -criterion = torch.nn.CrossEntropyLoss() -optimizer = torch.optim.SGD(model.parameters(), lr=LR) -scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1) -total_accu = None -train_iter, test_iter = AG_NEWS() -train_dataset = to_map_style_dataset(train_iter) -test_dataset = to_map_style_dataset(test_iter) -num_train = int(len(train_dataset) * 0.95) -split_train_, split_valid_ = random_split( - train_dataset, [num_train, len(train_dataset) - num_train] -) - -train_dataloader = DataLoader( - split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch -) -valid_dataloader = DataLoader( - split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch -) -test_dataloader = DataLoader( - test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch -) - -for epoch in range(1, EPOCHS + 1): - epoch_start_time = time.time() - train(train_dataloader) - accu_val = evaluate(valid_dataloader) - if total_accu is not None and total_accu > accu_val: - scheduler.step() - else: - total_accu = accu_val - print("-" * 59) - print( - "| end of epoch {:3d} | time: {:5.2f}s | " - "valid accuracy {:8.3f} ".format( - epoch, time.time() - epoch_start_time, accu_val - ) - ) - print("-" * 59) - - -###################################################################### -# Evaluate the model with test dataset -# ------------------------------------ -# - - -###################################################################### -# Checking the results of the test dataset… - -print("Checking the results of test dataset.") -accu_test = evaluate(test_dataloader) -print("test accuracy {:8.3f}".format(accu_test)) - - -###################################################################### -# Test on a random news -# --------------------- -# -# Use the best model so far and test a golf news. -# - - -ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"} - - -def predict(text, text_pipeline): - with torch.no_grad(): - text = torch.tensor(text_pipeline(text)) - output = model(text, torch.tensor([0])) - return output.argmax(1).item() + 1 - - -ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \ - enduring the season’s worst weather conditions on Sunday at The \ - Open on his way to a closing 75 at Royal Portrush, which \ - considering the wind and the rain was a respectable showing. \ - Thursday’s first round at the WGC-FedEx St. Jude Invitational \ - was another story. With temperatures in the mid-80s and hardly any \ - wind, the Spaniard was 13 strokes better in a flawless round. \ - Thanks to his best putting performance on the PGA Tour, Rahm \ - finished with an 8-under 62 for a three-stroke lead, which \ - was even more impressive considering he’d never played the \ - front nine at TPC Southwind." - -model = model.to("cpu") - -print("This is a %s news" % ag_news_label[predict(ex_text_str, text_pipeline)]) diff --git a/beginner_source/text_sentiment_ngrams_tutorial.rst b/beginner_source/text_sentiment_ngrams_tutorial.rst new file mode 100644 index 00000000000..024d04056c5 --- /dev/null +++ b/beginner_source/text_sentiment_ngrams_tutorial.rst @@ -0,0 +1,12 @@ +:orphan: + +Text classification with the torchtext library +============================================== + +This tutorial has been deprecated. 
+ +Redirecting in 3 seconds... + +.. raw:: html + + diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py deleted file mode 100644 index 9875d8aa43a..00000000000 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ /dev/null @@ -1,384 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Preprocess custom text dataset using Torchtext -=============================================== - -**Author**: `Anupam Sharma `_ - -This tutorial illustrates the usage of torchtext on a dataset that is not built-in. In the tutorial, -we will preprocess a dataset that can be further utilized to train a sequence-to-sequence -model for machine translation (something like, in this tutorial: `Sequence to Sequence Learning -with Neural Networks `_) but without using legacy version -of torchtext. - -In this tutorial, we will learn how to: - -* Read a dataset -* Tokenize sentence -* Apply transforms to sentence -* Perform bucket batching - -Let us assume that we need to prepare a dataset to train a model that can perform English to -German translation. We will use a tab-delimited German - English sentence pairs provided by -the `Tatoeba Project `_ which can be downloaded from -`this link `__. - -Sentence pairs for other languages can be found in `this link `\ -__. -""" - -# %% -# Setup -# ----- -# -# First, download the dataset, extract the zip, and note the path to the file `deu.txt`. -# -# Ensure that following packages are installed: -# -# * `Torchdata 0.6.0 `_ (`Installation instructions \ -# `__) -# * `Torchtext 0.15.0 `_ (`Installation instructions \ -# `__) -# * `Spacy `__ -# -# Here, we are using `Spacy` to tokenize text. In simple words tokenization means to -# convert a sentence to list of words. Spacy is a python package used for various Natural -# Language Processing (NLP) tasks. -# -# Download the English and German models from Spacy as shown below: -# -# .. code-block:: shell -# -# python -m spacy download en_core_web_sm -# python -m spacy download de_core_news_sm -# - - -# %% -# Let us start by importing required modules: - -import torchdata.datapipes as dp -import torchtext.transforms as T -import spacy -from torchtext.vocab import build_vocab_from_iterator -eng = spacy.load("en_core_web_sm") # Load the English model to tokenize English text -de = spacy.load("de_core_news_sm") # Load the German model to tokenize German text - -# %% -# Now we will load the dataset - -FILE_PATH = 'data/deu.txt' -data_pipe = dp.iter.IterableWrapper([FILE_PATH]) -data_pipe = dp.iter.FileOpener(data_pipe, mode='rb') -data_pipe = data_pipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True) - -# %% -# In the above code block, we are doing following things: -# -# 1. At line 2, we are creating an iterable of filenames -# 2. At line 3, we pass the iterable to `FileOpener` which then -# opens the file in read mode -# 3. At line 4, we call a function to parse the file, which -# again returns an iterable of tuples representing each rows -# of the tab-delimited file -# -# DataPipes can be thought of something like a dataset object, on which -# we can perform various operations. -# Check `this tutorial `_ for more details on -# DataPipes. -# -# We can verify if the iterable has the pair of sentences as shown -# below: - -for sample in data_pipe: - print(sample) - break - -# %% -# Note that we also have attribution details along with pair of sentences. 
We will -# write a small function to remove the attribution details: - -def removeAttribution(row): - """ - Function to keep the first two elements in a tuple - """ - return row[:2] -data_pipe = data_pipe.map(removeAttribution) - -# %% -# The `map` function at line 6 in above code block can be used to apply some function -# on each elements of `data_pipe`. Now, we can verify that the `data_pipe` only contains -# pair of sentences. - - -for sample in data_pipe: - print(sample) - break - -# %% -# Now, let us define few functions to perform tokenization: - -def engTokenize(text): - """ - Tokenize an English text and return a list of tokens - """ - return [token.text for token in eng.tokenizer(text)] - -def deTokenize(text): - """ - Tokenize a German text and return a list of tokens - """ - return [token.text for token in de.tokenizer(text)] - -# %% -# Above function accepts a text and returns a list of words -# as shown below: - -print(engTokenize("Have a good day!!!")) -print(deTokenize("Haben Sie einen guten Tag!!!")) - -# %% -# Building the vocabulary -# ----------------------- -# Let us consider an English sentence as the source and a German sentence as the target. -# -# Vocabulary can be considered as the set of unique words we have in the dataset. -# We will build vocabulary for both our source and target now. -# -# Let us define a function to get tokens from elements of tuples in the iterator. - - -def getTokens(data_iter, place): - """ - Function to yield tokens from an iterator. Since, our iterator contains - tuple of sentences (source and target), `place` parameters defines for which - index to return the tokens for. `place=0` for source and `place=1` for target - """ - for english, german in data_iter: - if place == 0: - yield engTokenize(english) - else: - yield deTokenize(german) - -# %% -# Now, we will build vocabulary for source: - -source_vocab = build_vocab_from_iterator( - getTokens(data_pipe,0), - min_freq=2, - specials= ['', '', '', ''], - special_first=True -) -source_vocab.set_default_index(source_vocab['']) - -# %% -# The code above, builds the vocabulary from the iterator. In the above code block: -# -# * At line 2, we call the `getTokens()` function with `place=0` as we need vocabulary for -# source sentences. -# * At line 3, we set `min_freq=2`. This means, the function will skip those words that occurs -# less than 2 times. -# * At line 4, we specify some special tokens: -# -# * `` for start of sentence -# * `` for end of sentence -# * `` for unknown words. An example of unknown word is the one skipped because of -# `min_freq=2`. -# * `` is the padding token. While training, a model we mostly train in batches. In a -# batch, there can be sentences of different length. So, we pad the shorter sentences with -# `` token to make length of all sequences in the batch equal. -# -# * At line 5, we set `special_first=True`. Which means `` will get index 0, `` index 1, -# `` index 2, and will get index 3 in the vocabulary. -# * At line 7, we set default index as index of ``. That means if some word is not in -# vocabulary, we will use `` instead of that unknown word. -# -# Similarly, we will build vocabulary for target sentences: - -target_vocab = build_vocab_from_iterator( - getTokens(data_pipe,1), - min_freq=2, - specials= ['', '', '', ''], - special_first=True -) -target_vocab.set_default_index(target_vocab['']) - -# %% -# Note that the example above shows how can we add special tokens to our vocabulary. The -# special tokens may change based on the requirements. 
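# One practical consequence of ``set_default_index`` above, shown as a quick sketch
# (the word below is an arbitrary made-up string that will not be in the corpus):
# any token missing from the vocabulary, for example because it occurred fewer than
# ``min_freq`` times, is silently mapped to the default (unknown-token) index instead
# of raising an error.

print(source_vocab["qwertyxyz"])                                       # the unknown-token index
print(source_vocab["qwertyxyz"] == source_vocab.get_default_index())   # True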
-# -# Now, we can verify that special tokens are placed at the beginning and then other words. -# In the below code, `source_vocab.get_itos()` returns a list with tokens at index based on -# vocabulary. - -print(source_vocab.get_itos()[:9]) - -# %% -# Numericalize sentences using vocabulary -# --------------------------------------- -# After building the vocabulary, we need to convert our sentences to corresponding indices. -# Let us define some functions for this: - -def getTransform(vocab): - """ - Create transforms based on given vocabulary. The returned transform is applied to sequence - of tokens. - """ - text_tranform = T.Sequential( - ## converts the sentences to indices based on given vocabulary - T.VocabTransform(vocab=vocab), - ## Add at beginning of each sentence. 1 because the index for in vocabulary is - # 1 as seen in previous section - T.AddToken(1, begin=True), - ## Add at beginning of each sentence. 2 because the index for in vocabulary is - # 2 as seen in previous section - T.AddToken(2, begin=False) - ) - return text_tranform - -# %% -# Now, let us see how to use the above function. The function returns an object of `Transforms` -# which we will use on our sentence. Let us take a random sentence and check how the transform -# works. - -temp_list = list(data_pipe) -some_sentence = temp_list[798][0] -print("Some sentence=", end="") -print(some_sentence) -transformed_sentence = getTransform(source_vocab)(engTokenize(some_sentence)) -print("Transformed sentence=", end="") -print(transformed_sentence) -index_to_string = source_vocab.get_itos() -for index in transformed_sentence: - print(index_to_string[index], end=" ") - -# %% -# In the above code,: -# -# * At line 2, we take a source sentence from list that we created from `data_pipe` at line 1 -# * At line 5, we get a transform based on a source vocabulary and apply it to a tokenized -# sentence. Note that transforms take list of words and not a sentence. -# * At line 8, we get the mapping of index to string and then use it get the transformed -# sentence -# -# Now we will use DataPipe functions to apply transform to all our sentences. -# Let us define some more functions for this. - -def applyTransform(sequence_pair): - """ - Apply transforms to sequence of tokens in a sequence pair - """ - - return ( - getTransform(source_vocab)(engTokenize(sequence_pair[0])), - getTransform(target_vocab)(deTokenize(sequence_pair[1])) - ) -data_pipe = data_pipe.map(applyTransform) ## Apply the function to each element in the iterator -temp_list = list(data_pipe) -print(temp_list[0]) - -# %% -# Make batches (with bucket batch) -# -------------------------------- -# Generally, we train models in batches. While working for sequence to sequence models, it is -# recommended to keep the length of sequences in a batch similar. For that we will use -# `bucketbatch` function of `data_pipe`. -# -# Let us define some functions that will be used by the `bucketbatch` function. - -def sortBucket(bucket): - """ - Function to sort a given bucket. Here, we want to sort based on the length of - source and target sequence. - """ - return sorted(bucket, key=lambda x: (len(x[0]), len(x[1]))) - -# %% -# Now, we will apply the `bucketbatch` function: - -data_pipe = data_pipe.bucketbatch( - batch_size = 4, batch_num=5, bucket_num=1, - use_in_batch_shuffle=False, sort_key=sortBucket -) - -# %% -# In the above code block: -# -# * We keep batch size = 4. 
-# * `batch_num` is the number of batches to keep in a bucket -# * `bucket_num` is the number of buckets to keep in a pool for shuffling -# * `sort_key` specifies the function that takes a bucket and sorts it -# -# Now, let us consider a batch of source sentences as `X` and a batch of target sentences as `y`. -# Generally, while training a model, we predict on a batch of `X` and compare the result with `y`. -# But, a batch in our `data_pipe` is of the form `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`: - -print(list(data_pipe)[0]) -# %% -# So, we will now convert them into the form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`. -# For this we will write a small function: - -def separateSourceTarget(sequence_pairs): - """ - input of form: `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]` - output of form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))` - """ - sources,targets = zip(*sequence_pairs) - return sources,targets - -## Apply the function to each element in the iterator -data_pipe = data_pipe.map(separateSourceTarget) -print(list(data_pipe)[0]) - -# %% -# Now, we have the data as desired. -# -# Padding -# ------- -# As discussed earlier while building vocabulary, we need to pad shorter sentences in a batch to -# make all the sequences in a batch of equal length. We can perform padding as follows: - -def applyPadding(pair_of_sequences): - """ - Convert sequences to tensors and apply padding - """ - return (T.ToTensor(0)(list(pair_of_sequences[0])), T.ToTensor(0)(list(pair_of_sequences[1]))) -## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies -# padding. Here, `0` is passed to the constructor to specify the index of the `` token in the -# vocabulary. -data_pipe = data_pipe.map(applyPadding) - -# %% -# Now, we can use the index to string mapping to see how the sequence would look with tokens -# instead of indices: - -source_index_to_string = source_vocab.get_itos() -target_index_to_string = target_vocab.get_itos() - -def showSomeTransformedSentences(data_pipe): - """ - Function to show how the sentences look like after applying all transforms. - Here we try to print actual words instead of corresponding index - """ - for sources,targets in data_pipe: - if sources[0][-1] != 0: - continue # Just to visualize padding of shorter sentences - for i in range(4): - source = "" - for token in sources[i]: - source += " " + source_index_to_string[token] - target = "" - for token in targets[i]: - target += " " + target_index_to_string[token] - print(f"Source: {source}") - print(f"Traget: {target}") - break - -showSomeTransformedSentences(data_pipe) -# %% -# In the above output we can observe that the shorter sentences are padded with ``. Now, we -# can use `data_pipe` while writing our training function. -# -# Some parts of this tutorial was inspired from `this article -# `__. diff --git a/beginner_source/torchtext_custom_dataset_tutorial.rst b/beginner_source/torchtext_custom_dataset_tutorial.rst new file mode 100644 index 00000000000..9f014f3ff9a --- /dev/null +++ b/beginner_source/torchtext_custom_dataset_tutorial.rst @@ -0,0 +1,12 @@ +:orphan: + +Preprocess custom text dataset using torchtext +============================================== + +This tutorial has been deprecated. + +Redirecting in 3 seconds... + +.. 
raw:: html + + diff --git a/beginner_source/transfer_learning_tutorial.py b/beginner_source/transfer_learning_tutorial.py index 7a2b053763a..8a344d3d88a 100644 --- a/beginner_source/transfer_learning_tutorial.py +++ b/beginner_source/transfer_learning_tutorial.py @@ -98,7 +98,11 @@ dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']} class_names = image_datasets['train'].classes -device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") +# We want to be able to train our model on an `accelerator `__ +# such as CUDA, MPS, MTIA, or XPU. If the current accelerator is available, we will use it. Otherwise, we use the CPU. + +device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu" +print(f"Using {device} device") ###################################################################### # Visualize a few images @@ -209,7 +213,7 @@ def train_model(model, criterion, optimizer, scheduler, num_epochs=25): print(f'Best val Acc: {best_acc:4f}') # load best model weights - model.load_state_dict(torch.load(best_model_params_path)) + model.load_state_dict(torch.load(best_model_params_path, weights_only=True)) return model diff --git a/beginner_source/translation_transformer.py b/beginner_source/translation_transformer.py deleted file mode 100644 index c5553246e38..00000000000 --- a/beginner_source/translation_transformer.py +++ /dev/null @@ -1,404 +0,0 @@ -""" -Language Translation with ``nn.Transformer`` and torchtext -========================================================== - -This tutorial shows: - - How to train a translation model from scratch using Transformer. - - Use torchtext library to access `Multi30k `__ dataset to train a German to English translation model. -""" - - -###################################################################### -# Data Sourcing and Processing -# ---------------------------- -# -# `torchtext library `__ has utilities for creating datasets that can be easily -# iterated through for the purposes of creating a language translation -# model. In this example, we show how to use torchtext's inbuilt datasets, -# tokenize a raw text sentence, build vocabulary, and numericalize tokens into tensor. We will use -# `Multi30k dataset from torchtext library `__ -# that yields a pair of source-target raw sentences. -# -# To access torchtext datasets, please install torchdata following instructions at https://github.com/pytorch/data. -# - -from torchtext.data.utils import get_tokenizer -from torchtext.vocab import build_vocab_from_iterator -from torchtext.datasets import multi30k, Multi30k -from typing import Iterable, List - - -# We need to modify the URLs for the dataset since the links to the original dataset are broken -# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info -multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz" -multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz" - -SRC_LANGUAGE = 'de' -TGT_LANGUAGE = 'en' - -# Place-holders -token_transform = {} -vocab_transform = {} - -################################################################################### -# Create source and target language tokenizer. Make sure to install the dependencies. -# -# .. 
code-block:: python -# -# pip install -U torchdata -# pip install -U spacy -# python -m spacy download en_core_web_sm -# python -m spacy download de_core_news_sm - -token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm') -token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm') - - -# helper function to yield list of tokens -def yield_tokens(data_iter: Iterable, language: str) -> List[str]: - language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1} - - for data_sample in data_iter: - yield token_transform[language](data_sample[language_index[language]]) - -# Define special symbols and indices -UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3 -# Make sure the tokens are in order of their indices to properly insert them in vocab -special_symbols = ['', '', '', ''] - -for ln in [SRC_LANGUAGE, TGT_LANGUAGE]: - # Training data Iterator - train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE)) - # Create torchtext's Vocab object - vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln), - min_freq=1, - specials=special_symbols, - special_first=True) - -# Set ``UNK_IDX`` as the default index. This index is returned when the token is not found. -# If not set, it throws ``RuntimeError`` when the queried token is not found in the Vocabulary. -for ln in [SRC_LANGUAGE, TGT_LANGUAGE]: - vocab_transform[ln].set_default_index(UNK_IDX) - -###################################################################### -# Seq2Seq Network using Transformer -# --------------------------------- -# -# Transformer is a Seq2Seq model introduced in `“Attention is all you -# need” `__ -# paper for solving machine translation tasks. -# Below, we will create a Seq2Seq network that uses Transformer. The network -# consists of three parts. First part is the embedding layer. This layer converts tensor of input indices -# into corresponding tensor of input embeddings. These embedding are further augmented with positional -# encodings to provide position information of input tokens to the model. The second part is the -# actual `Transformer `__ model. -# Finally, the output of the Transformer model is passed through linear layer -# that gives unnormalized probabilities for each token in the target language. -# - - -from torch import Tensor -import torch -import torch.nn as nn -from torch.nn import Transformer -import math -DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -# helper Module that adds positional encoding to the token embedding to introduce a notion of word order. 
-class PositionalEncoding(nn.Module): - def __init__(self, - emb_size: int, - dropout: float, - maxlen: int = 5000): - super(PositionalEncoding, self).__init__() - den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size) - pos = torch.arange(0, maxlen).reshape(maxlen, 1) - pos_embedding = torch.zeros((maxlen, emb_size)) - pos_embedding[:, 0::2] = torch.sin(pos * den) - pos_embedding[:, 1::2] = torch.cos(pos * den) - pos_embedding = pos_embedding.unsqueeze(-2) - - self.dropout = nn.Dropout(dropout) - self.register_buffer('pos_embedding', pos_embedding) - - def forward(self, token_embedding: Tensor): - return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :]) - -# helper Module to convert tensor of input indices into corresponding tensor of token embeddings -class TokenEmbedding(nn.Module): - def __init__(self, vocab_size: int, emb_size): - super(TokenEmbedding, self).__init__() - self.embedding = nn.Embedding(vocab_size, emb_size) - self.emb_size = emb_size - - def forward(self, tokens: Tensor): - return self.embedding(tokens.long()) * math.sqrt(self.emb_size) - -# Seq2Seq Network -class Seq2SeqTransformer(nn.Module): - def __init__(self, - num_encoder_layers: int, - num_decoder_layers: int, - emb_size: int, - nhead: int, - src_vocab_size: int, - tgt_vocab_size: int, - dim_feedforward: int = 512, - dropout: float = 0.1): - super(Seq2SeqTransformer, self).__init__() - self.transformer = Transformer(d_model=emb_size, - nhead=nhead, - num_encoder_layers=num_encoder_layers, - num_decoder_layers=num_decoder_layers, - dim_feedforward=dim_feedforward, - dropout=dropout) - self.generator = nn.Linear(emb_size, tgt_vocab_size) - self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size) - self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size) - self.positional_encoding = PositionalEncoding( - emb_size, dropout=dropout) - - def forward(self, - src: Tensor, - trg: Tensor, - src_mask: Tensor, - tgt_mask: Tensor, - src_padding_mask: Tensor, - tgt_padding_mask: Tensor, - memory_key_padding_mask: Tensor): - src_emb = self.positional_encoding(self.src_tok_emb(src)) - tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg)) - outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None, - src_padding_mask, tgt_padding_mask, memory_key_padding_mask) - return self.generator(outs) - - def encode(self, src: Tensor, src_mask: Tensor): - return self.transformer.encoder(self.positional_encoding( - self.src_tok_emb(src)), src_mask) - - def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor): - return self.transformer.decoder(self.positional_encoding( - self.tgt_tok_emb(tgt)), memory, - tgt_mask) - - -###################################################################### -# During training, we need a subsequent word mask that will prevent the model from looking into -# the future words when making predictions. We will also need masks to hide -# source and target padding tokens. Below, let's define a function that will take care of both. 
-# - - -def generate_square_subsequent_mask(sz): - mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) - return mask - - -def create_mask(src, tgt): - src_seq_len = src.shape[0] - tgt_seq_len = tgt.shape[0] - - tgt_mask = generate_square_subsequent_mask(tgt_seq_len) - src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool) - - src_padding_mask = (src == PAD_IDX).transpose(0, 1) - tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1) - return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask - - -###################################################################### -# Let's now define the parameters of our model and instantiate the same. Below, we also -# define our loss function which is the cross-entropy loss and the optimizer used for training. -# -torch.manual_seed(0) - -SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE]) -TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE]) -EMB_SIZE = 512 -NHEAD = 8 -FFN_HID_DIM = 512 -BATCH_SIZE = 128 -NUM_ENCODER_LAYERS = 3 -NUM_DECODER_LAYERS = 3 - -transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, - NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM) - -for p in transformer.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) - -transformer = transformer.to(DEVICE) - -loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX) - -optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9) - -###################################################################### -# Collation -# --------- -# -# As seen in the ``Data Sourcing and Processing`` section, our data iterator yields a pair of raw strings. -# We need to convert these string pairs into the batched tensors that can be processed by our ``Seq2Seq`` network -# defined previously. Below we define our collate function that converts a batch of raw strings into batch tensors that -# can be fed directly into our model. -# - - -from torch.nn.utils.rnn import pad_sequence - -# helper function to club together sequential operations -def sequential_transforms(*transforms): - def func(txt_input): - for transform in transforms: - txt_input = transform(txt_input) - return txt_input - return func - -# function to add BOS/EOS and create tensor for input sequence indices -def tensor_transform(token_ids: List[int]): - return torch.cat((torch.tensor([BOS_IDX]), - torch.tensor(token_ids), - torch.tensor([EOS_IDX]))) - -# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices -text_transform = {} -for ln in [SRC_LANGUAGE, TGT_LANGUAGE]: - text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization - vocab_transform[ln], #Numericalization - tensor_transform) # Add BOS/EOS and create tensor - - -# function to collate data samples into batch tensors -def collate_fn(batch): - src_batch, tgt_batch = [], [] - for src_sample, tgt_sample in batch: - src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n"))) - tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n"))) - - src_batch = pad_sequence(src_batch, padding_value=PAD_IDX) - tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX) - return src_batch, tgt_batch - -###################################################################### -# Let's define training and evaluation loop that will be called for each -# epoch. 
-# - -from torch.utils.data import DataLoader - -def train_epoch(model, optimizer): - model.train() - losses = 0 - train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE)) - train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn) - - for src, tgt in train_dataloader: - src = src.to(DEVICE) - tgt = tgt.to(DEVICE) - - tgt_input = tgt[:-1, :] - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input) - - logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask) - - optimizer.zero_grad() - - tgt_out = tgt[1:, :] - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - loss.backward() - - optimizer.step() - losses += loss.item() - - return losses / len(list(train_dataloader)) - - -def evaluate(model): - model.eval() - losses = 0 - - val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE)) - val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn) - - for src, tgt in val_dataloader: - src = src.to(DEVICE) - tgt = tgt.to(DEVICE) - - tgt_input = tgt[:-1, :] - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input) - - logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask) - - tgt_out = tgt[1:, :] - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - losses += loss.item() - - return losses / len(list(val_dataloader)) - -###################################################################### -# Now we have all the ingredients to train our model. Let's do it! -# - -from timeit import default_timer as timer -NUM_EPOCHS = 18 - -for epoch in range(1, NUM_EPOCHS+1): - start_time = timer() - train_loss = train_epoch(transformer, optimizer) - end_time = timer() - val_loss = evaluate(transformer) - print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s")) - - -# function to generate output sequence using greedy algorithm -def greedy_decode(model, src, src_mask, max_len, start_symbol): - src = src.to(DEVICE) - src_mask = src_mask.to(DEVICE) - - memory = model.encode(src, src_mask) - ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE) - for i in range(max_len-1): - memory = memory.to(DEVICE) - tgt_mask = (generate_square_subsequent_mask(ys.size(0)) - .type(torch.bool)).to(DEVICE) - out = model.decode(ys, memory, tgt_mask) - out = out.transpose(0, 1) - prob = model.generator(out[:, -1]) - _, next_word = torch.max(prob, dim=1) - next_word = next_word.item() - - ys = torch.cat([ys, - torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) - if next_word == EOS_IDX: - break - return ys - - -# actual function to translate input sentence into target language -def translate(model: torch.nn.Module, src_sentence: str): - model.eval() - src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1) - num_tokens = src.shape[0] - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool) - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten() - return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("", "").replace("", "") - - -###################################################################### -# - -print(translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .")) - - 
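# As an optional usage sketch, the same helper can be applied to any German input
# sentence; the two sentences below are made-up examples and are not drawn from
# the Multi30k data:

print(translate(transformer, "Ein kleines Mädchen spielt im Schnee ."))
print(translate(transformer, "Zwei Männer sitzen auf einer Bank im Park ."))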
-###################################################################### -# References -# ---------- -# -# 1. Attention is all you need paper. -# https://papers.nips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf -# 2. The annotated transformer. https://nlp.seas.harvard.edu/2018/04/03/attention.html#positional-encoding diff --git a/beginner_source/translation_transformer.rst b/beginner_source/translation_transformer.rst new file mode 100644 index 00000000000..892c1b73ca5 --- /dev/null +++ b/beginner_source/translation_transformer.rst @@ -0,0 +1,10 @@ +Language Translation with ``nn.Transformer`` and torchtext +========================================================== + +This tutorial has been deprecated. + +Redirecting in 3 seconds... + +.. raw:: html + + diff --git a/beginner_source/understanding_leaf_vs_nonleaf_tutorial.py b/beginner_source/understanding_leaf_vs_nonleaf_tutorial.py new file mode 100644 index 00000000000..740c4d4bd76 --- /dev/null +++ b/beginner_source/understanding_leaf_vs_nonleaf_tutorial.py @@ -0,0 +1,339 @@ +""" +Understanding requires_grad, retain_grad, Leaf, and Non-leaf Tensors +==================================================================== + +**Author:** `Justin Silver `__ + +This tutorial explains the subtleties of ``requires_grad``, +``retain_grad``, leaf, and non-leaf tensors using a simple example. + +Before starting, make sure you understand `tensors and how to manipulate +them `__. +A basic knowledge of `how autograd +works `__ +would also be useful. + +""" + + +###################################################################### +# Setup +# ----- +# +# First, make sure `PyTorch is +# installed `__ and then import +# the necessary libraries. +# + +import torch +import torch.nn.functional as F + + +###################################################################### +# Next, we instantiate a simple network to focus on the gradients. This +# will be an affine layer, followed by a ReLU activation, and ending with +# a MSE loss between prediction and label tensors. +# +# .. math:: +# +# \mathbf{y}_{\text{pred}} = \text{ReLU}(\mathbf{x} \mathbf{W} + \mathbf{b}) +# +# .. math:: +# +# L = \text{MSE}(\mathbf{y}_{\text{pred}}, \mathbf{y}) +# +# Note that the ``requires_grad=True`` is necessary for the parameters +# (``W`` and ``b``) so that PyTorch tracks operations involving those +# tensors. We’ll discuss more about this in a future +# `section <#requires-grad>`__. +# + +# tensor setup +x = torch.ones(1, 3) # input with shape: (1, 3) +W = torch.ones(3, 2, requires_grad=True) # weights with shape: (3, 2) +b = torch.ones(1, 2, requires_grad=True) # bias with shape: (1, 2) +y = torch.ones(1, 2) # output with shape: (1, 2) + +# forward pass +z = (x @ W) + b # pre-activation with shape: (1, 2) +y_pred = F.relu(z) # activation with shape: (1, 2) +loss = F.mse_loss(y_pred, y) # scalar loss + + +###################################################################### +# Leaf vs. non-leaf tensors +# ------------------------- +# +# After running the forward pass, PyTorch autograd has built up a `dynamic +# computational +# graph `__ +# which is shown below. This is a `Directed Acyclic Graph +# (DAG) `__ which +# keeps a record of input tensors (leaf nodes), all subsequent operations +# on those tensors, and the intermediate/output tensors (non-leaf nodes). +# The graph is used to compute gradients for each tensor starting from the +# graph roots (outputs) to the leaves (inputs) using the `chain +# rule `__ from calculus: +# +# .. 
math:: +# +# \mathbf{y} = \mathbf{f}_k\bigl(\mathbf{f}_{k-1}(\dots \mathbf{f}_1(\mathbf{x}) \dots)\bigr) +# +# .. math:: +# +# \frac{\partial \mathbf{y}}{\partial \mathbf{x}} = +# \frac{\partial \mathbf{f}_k}{\partial \mathbf{f}_{k-1}} \cdot +# \frac{\partial \mathbf{f}_{k-1}}{\partial \mathbf{f}_{k-2}} \cdot +# \cdots \cdot +# \frac{\partial \mathbf{f}_1}{\partial \mathbf{x}} +# +# .. figure:: /_static/img/understanding_leaf_vs_nonleaf/comp-graph-1.png +# :alt: Computational graph after forward pass +# +# Computational graph after forward pass +# +# PyTorch considers a node to be a *leaf* if it is not the result of a +# tensor operation with at least one input having ``requires_grad=True`` +# (e.g. ``x``, ``W``, ``b``, and ``y``), and everything else to be +# *non-leaf* (e.g. ``z``, ``y_pred``, and ``loss``). You can verify this +# programmatically by probing the ``is_leaf`` attribute of the tensors: +# + +# prints True because new tensors are leafs by convention +print(f"{x.is_leaf=}") + +# prints False because tensor is the result of an operation with at +# least one input having requires_grad=True +print(f"{z.is_leaf=}") + + +###################################################################### +# The distinction between leaf and non-leaf determines whether the +# tensor’s gradient will be stored in the ``grad`` property after the +# backward pass, and thus be usable for `gradient +# descent `__. We’ll cover +# this some more in the `following section <#retain-grad>`__. +# +# Let’s now investigate how PyTorch calculates and stores gradients for +# the tensors in its computational graph. +# + + +###################################################################### +# ``requires_grad`` +# ----------------- +# +# To build the computational graph which can be used for gradient +# calculation, we need to pass in the ``requires_grad=True`` parameter to +# a tensor constructor. By default, the value is ``False``, and thus +# PyTorch does not track gradients on any created tensors. To verify this, +# try not setting ``requires_grad``, re-run the forward pass, and then run +# backpropagation. You will see: +# +# :: +# +# >>> loss.backward() +# RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn +# +# This error means that autograd can’t backpropagate to any leaf tensors +# because ``loss`` is not tracking gradients. If you need to change the +# property, you can call ``requires_grad_()`` on the tensor (notice the \_ +# suffix). +# +# We can sanity check which nodes require gradient calculation, just like +# we did above with the ``is_leaf`` attribute: +# + +print(f"{x.requires_grad=}") # prints False because requires_grad=False by default +print(f"{W.requires_grad=}") # prints True because we set requires_grad=True in constructor +print(f"{z.requires_grad=}") # prints True because tensor is a non-leaf node + + +###################################################################### +# It’s useful to remember that a non-leaf tensor has +# ``requires_grad=True`` by definition, since backpropagation would fail +# otherwise. If the tensor is a leaf, then it will only have +# ``requires_grad=True`` if it was specifically set by the user. Another +# way to phrase this is that if at least one of the inputs to a tensor +# requires the gradient, then it will require the gradient as well. +# +# There are two exceptions to this rule: +# +# 1. Any ``nn.Module`` that has ``nn.Parameter`` will have +# ``requires_grad=True`` for its parameters (see +# `here `__) +# 2. 
Locally disabling gradient computation with context managers (see +# `here `__) +# +# In summary, ``requires_grad`` tells autograd which tensors need to have +# their gradients calculated for backpropagation to work. This is +# different from which tensors have their ``grad`` field populated, which +# is the topic of the next section. +# + + +###################################################################### +# ``retain_grad`` +# --------------- +# +# To actually perform optimization (e.g. SGD, Adam, etc.), we need to run +# the backward pass so that we can extract the gradients. +# + +loss.backward() + + +###################################################################### +# Calling ``backward()`` populates the ``grad`` field of all leaf tensors +# which had ``requires_grad=True``. The ``grad`` is the gradient of the +# loss with respect to the tensor we are probing. Before running +# ``backward()``, this attribute is set to ``None``. +# + +print(f"{W.grad=}") +print(f"{b.grad=}") + + +###################################################################### +# You might be wondering about the other tensors in our network. Let’s +# check the remaining leaf nodes: +# + +# prints all None because requires_grad=False +print(f"{x.grad=}") +print(f"{y.grad=}") + + +###################################################################### +# The gradients for these tensors haven’t been populated because we did +# not explicitly tell PyTorch to calculate their gradient +# (``requires_grad=False``). +# +# Let’s now look at an intermediate non-leaf node: +# + +print(f"{z.grad=}") + + +###################################################################### +# PyTorch returns ``None`` for the gradient and also warns us that a +# non-leaf node’s ``grad`` attribute is being accessed. Although autograd +# has to calculate intermediate gradients for backpropagation to work, it +# assumes you don’t need to access the values afterwards. To change this +# behavior, we can use the ``retain_grad()`` function on a tensor. This +# tells the autograd engine to populate that tensor’s ``grad`` after +# calling ``backward()``. +# + +# we have to re-run the forward pass +z = (x @ W) + b +y_pred = F.relu(z) +loss = F.mse_loss(y_pred, y) + +# tell PyTorch to store the gradients after backward() +z.retain_grad() +y_pred.retain_grad() +loss.retain_grad() + +# have to zero out gradients otherwise they would accumulate +W.grad = None +b.grad = None + +# backpropagation +loss.backward() + +# print gradients for all tensors that have requires_grad=True +print(f"{W.grad=}") +print(f"{b.grad=}") +print(f"{z.grad=}") +print(f"{y_pred.grad=}") +print(f"{loss.grad=}") + + +###################################################################### +# We get the same result for ``W.grad`` as before. Also note that because +# the loss is scalar, the gradient of the loss with respect to itself is +# simply ``1.0``. +# +# If we look at the state of the computational graph now, we see that the +# ``retains_grad`` attribute has changed for the intermediate tensors. By +# convention, this attribute will print ``False`` for any leaf node, even +# if it requires its gradient. +# +# .. figure:: /_static/img/understanding_leaf_vs_nonleaf/comp-graph-2.png +# :alt: Computational graph after backward pass +# +# Computational graph after backward pass +# +# If you call ``retain_grad()`` on a non-leaf node, it results in a no-op. 
+# If we call ``retain_grad()`` on a node that has ``requires_grad=False``, +# PyTorch actually throws an error, since it can’t store the gradient if +# it is never calculated. +# +# :: +# +# >>> x.retain_grad() +# RuntimeError: can't retain_grad on Tensor that has requires_grad=False +# + + +###################################################################### +# Summary table +# ------------- +# +# Using ``retain_grad()`` and ``retains_grad`` only make sense for +# non-leaf nodes, since the ``grad`` attribute will already be populated +# for leaf tensors that have ``requires_grad=True``. By default, these +# non-leaf nodes do not retain (store) their gradient after +# backpropagation. We can change that by rerunning the forward pass, +# telling PyTorch to store the gradients, and then performing +# backpropagation. +# +# The following table can be used as a reference which summarizes the +# above discussions. The following scenarios are the only ones that are +# valid for PyTorch tensors. +# +# +# +# +----------------+------------------------+------------------------+---------------------------------------------------+-------------------------------------+ +# | ``is_leaf`` | ``requires_grad`` | ``retains_grad`` | ``require_grad()`` | ``retain_grad()`` | +# +================+========================+========================+===================================================+=====================================+ +# | ``True`` | ``False`` | ``False`` | sets ``requires_grad`` to ``True`` or ``False`` | no-op | +# +----------------+------------------------+------------------------+---------------------------------------------------+-------------------------------------+ +# | ``True`` | ``True`` | ``False`` | sets ``requires_grad`` to ``True`` or ``False`` | no-op | +# +----------------+------------------------+------------------------+---------------------------------------------------+-------------------------------------+ +# | ``False`` | ``True`` | ``False`` | no-op | sets ``retains_grad`` to ``True`` | +# +----------------+------------------------+------------------------+---------------------------------------------------+-------------------------------------+ +# | ``False`` | ``True`` | ``True`` | no-op | no-op | +# +----------------+------------------------+------------------------+---------------------------------------------------+-------------------------------------+ +# + + +###################################################################### +# Conclusion +# ---------- +# +# In this tutorial, we covered when and how PyTorch computes gradients for +# leaf and non-leaf tensors. By using ``retain_grad``, we can access the +# gradients of intermediate tensors within autograd’s computational graph. +# +# If you would like to learn more about how PyTorch’s autograd system +# works, please visit the `references <#references>`__ below. If you have +# any feedback for this tutorial (improvements, typo fixes, etc.) then +# please use the `PyTorch Forums `__ and/or +# the `issue tracker `__ to +# reach out. 
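To make the behaviors summarized in the table above easy to try outside the tutorial file, here is a small self-contained sketch. The tensor names mirror the ones used in the tutorial (``x``, ``y``, ``W``, ``b``, ``z``, ``y_pred``, ``loss``), but the shapes and initial values are illustrative assumptions rather than the exact definitions from the earlier forward-pass section:

.. code-block:: python

    import torch
    import torch.nn.functional as F

    # assumed shapes: a single 1x3 input mapped to a 1x2 prediction
    x = torch.ones(1, 3)                       # leaf, requires_grad=False
    y = torch.zeros(1, 2)                      # leaf, requires_grad=False (target)
    W = torch.randn(3, 2, requires_grad=True)  # leaf, requires_grad=True
    b = torch.randn(1, 2, requires_grad=True)  # leaf, requires_grad=True

    z = (x @ W) + b       # non-leaf: produced by an op with a requires_grad input
    y_pred = F.relu(z)    # non-leaf
    loss = F.mse_loss(y_pred, y)

    print(x.is_leaf, W.is_leaf, z.is_leaf)      # True True False
    print(x.requires_grad, z.requires_grad)     # False True

    # ask autograd to keep the intermediate gradient around
    z.retain_grad()
    print(z.retains_grad)                       # True

    loss.backward()
    print(W.grad.shape, b.grad.shape)           # gradients on the leaf parameters
    print(z.grad.shape)                         # populated because of retain_grad()

    # retain_grad() is a no-op on a leaf that already requires grad ...
    W.retain_grad()
    # ... and raises a RuntimeError on a tensor with requires_grad=False
    try:
        x.retain_grad()
    except RuntimeError as e:
        print(e)

Only the parameter leaves ``W`` and ``b`` get their ``grad`` populated automatically; the intermediate ``z`` needs the explicit ``retain_grad()`` call, which matches the summary table above.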
+# + + +###################################################################### +# References +# ---------- +# +# - `A Gentle Introduction to +# torch.autograd `__ +# - `Automatic Differentiation with +# torch.autograd `__ +# - `Autograd +# mechanics `__ +# \ No newline at end of file diff --git a/beginner_source/vt_tutorial.py b/beginner_source/vt_tutorial.py deleted file mode 100644 index 777098be946..00000000000 --- a/beginner_source/vt_tutorial.py +++ /dev/null @@ -1,293 +0,0 @@ -""" -Optimizing Vision Transformer Model for Deployment -================================================== - -`Jeff Tang `_, -`Geeta Chauhan `_ - -Vision Transformer models apply the cutting-edge attention-based -transformer models, introduced in Natural Language Processing to achieve -all kinds of the state of the art (SOTA) results, to Computer Vision -tasks. Facebook Data-efficient Image Transformers `DeiT `_ -is a Vision Transformer model trained on ImageNet for image -classification. - -In this tutorial, we will first cover what DeiT is and how to use it, -then go through the complete steps of scripting, quantizing, optimizing, -and using the model in iOS and Android apps. We will also compare the -performance of quantized, optimized and non-quantized, non-optimized -models, and show the benefits of applying quantization and optimization -to the model along the steps. - -""" - - - -############################################################################### -# What is DeiT -# --------------------- -# -# Convolutional Neural Networks (CNNs) have been the main models for image -# classification since deep learning took off in 2012, but CNNs typically -# require hundreds of millions of images for training to achieve the -# SOTA results. DeiT is a vision transformer model that requires a lot less -# data and computing resources for training to compete with the leading -# CNNs in performing image classification, which is made possible by two -# key components of of DeiT: -# -# - Data augmentation that simulates training on a much larger dataset; -# - Native distillation that allows the transformer network to learn from -# a CNN’s output. -# -# DeiT shows that Transformers can be successfully applied to computer -# vision tasks, with limited access to data and resources. For more -# details on DeiT, see the `repo `_ -# and `paper `_. -# - - -###################################################################### -# Classifying Images with DeiT -# ------------------------------- -# -# Follow the ``README.md`` at the DeiT repository for detailed information on how to -# classify images using DeiT, or for a quick test, first install the -# required packages: -# -# .. code-block:: python -# -# pip install torch torchvision timm pandas requests - -####################################################### -# To run in Google Colab, install dependencies by running the following command: -# -# .. 
code-block:: python -# -# !pip install timm pandas requests - -############################# -# then run the script below: - -from PIL import Image -import torch -import timm -import requests -import torchvision.transforms as transforms -from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD - -print(torch.__version__) -# should be 1.8.0 - - -model = torch.hub.load('facebookresearch/deit:main', 'deit_base_patch16_224', pretrained=True) -model.eval() - -transform = transforms.Compose([ - transforms.Resize(256, interpolation=3), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), -]) - -img = Image.open(requests.get("https://raw.githubusercontent.com/pytorch/ios-demo-app/master/HelloWorld/HelloWorld/HelloWorld/image.png", stream=True).raw) -img = transform(img)[None,] -out = model(img) -clsidx = torch.argmax(out) -print(clsidx.item()) - - -###################################################################### -# The output should be 269, which, according to the ImageNet list of class -# index to `labels file `_, maps to ``timber -# wolf, grey wolf, gray wolf, Canis lupus``. -# -# Now that we have verified that we can use the DeiT model to classify -# images, let’s see how to modify the model so it can run on iOS and -# Android apps. -# - - -###################################################################### -# Scripting DeiT -# ---------------------- -# To use the model on mobile, we first need to script the -# model. See the `Script and Optimize recipe `_ for a -# quick overview. Run the code below to convert the DeiT model used in the -# previous step to the TorchScript format that can run on mobile. -# - - -model = torch.hub.load('facebookresearch/deit:main', 'deit_base_patch16_224', pretrained=True) -model.eval() -scripted_model = torch.jit.script(model) -scripted_model.save("fbdeit_scripted.pt") - - -###################################################################### -# The scripted model file ``fbdeit_scripted.pt`` of size about 346MB is -# generated. -# - - -###################################################################### -# Quantizing DeiT -# --------------------- -# To reduce the trained model size significantly while -# keeping the inference accuracy about the same, quantization can be -# applied to the model. Thanks to the transformer model used in DeiT, we -# can easily apply dynamic-quantization to the model, because dynamic -# quantization works best for LSTM and transformer models (see `here `_ -# for more details). -# -# Now run the code below: -# - -# Use 'x86' for server inference (the old 'fbgemm' is still available but 'x86' is the recommended default) and ``qnnpack`` for mobile inference. -backend = "x86" # replaced with ``qnnpack`` causing much worse inference speed for quantized model on this notebook -model.qconfig = torch.quantization.get_default_qconfig(backend) -torch.backends.quantized.engine = backend - -quantized_model = torch.quantization.quantize_dynamic(model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8) -scripted_quantized_model = torch.jit.script(quantized_model) -scripted_quantized_model.save("fbdeit_scripted_quantized.pt") - - -###################################################################### -# This generates the scripted and quantized version of the model -# ``fbdeit_quantized_scripted.pt``, with size about 89MB, a 74% reduction of -# the non-quantized model size of 346MB! 
-# - -###################################################################### -# You can use the ``scripted_quantized_model`` to generate the same -# inference result: -# - -out = scripted_quantized_model(img) -clsidx = torch.argmax(out) -print(clsidx.item()) -# The same output 269 should be printed - -###################################################################### -# Optimizing DeiT -# --------------------- -# The final step before using the quantized and scripted -# model on mobile is to optimize it: -# - -from torch.utils.mobile_optimizer import optimize_for_mobile -optimized_scripted_quantized_model = optimize_for_mobile(scripted_quantized_model) -optimized_scripted_quantized_model.save("fbdeit_optimized_scripted_quantized.pt") - - -###################################################################### -# The generated ``fbdeit_optimized_scripted_quantized.pt`` file has about the -# same size as the quantized, scripted, but non-optimized model. The -# inference result remains the same. -# - - - -out = optimized_scripted_quantized_model(img) -clsidx = torch.argmax(out) -print(clsidx.item()) -# Again, the same output 269 should be printed - - -###################################################################### -# Using Lite Interpreter -# ------------------------ -# -# To see how much model size reduction and inference speed up the Lite -# Interpreter can result in, let’s create the lite version of the model. -# - -optimized_scripted_quantized_model._save_for_lite_interpreter("fbdeit_optimized_scripted_quantized_lite.ptl") -ptl = torch.jit.load("fbdeit_optimized_scripted_quantized_lite.ptl") - - -###################################################################### -# Although the lite model size is comparable to the non-lite version, when -# running the lite version on mobile, the inference speed up is expected. -# - - -###################################################################### -# Comparing Inference Speed -# --------------------------- -# -# To see how the inference speed differs for the four models - the -# original model, the scripted model, the quantized-and-scripted model, -# the optimized-quantized-and-scripted model - run the code below: -# - -with torch.autograd.profiler.profile(use_cuda=False) as prof1: - out = model(img) -with torch.autograd.profiler.profile(use_cuda=False) as prof2: - out = scripted_model(img) -with torch.autograd.profiler.profile(use_cuda=False) as prof3: - out = scripted_quantized_model(img) -with torch.autograd.profiler.profile(use_cuda=False) as prof4: - out = optimized_scripted_quantized_model(img) -with torch.autograd.profiler.profile(use_cuda=False) as prof5: - out = ptl(img) - -print("original model: {:.2f}ms".format(prof1.self_cpu_time_total/1000)) -print("scripted model: {:.2f}ms".format(prof2.self_cpu_time_total/1000)) -print("scripted & quantized model: {:.2f}ms".format(prof3.self_cpu_time_total/1000)) -print("scripted & quantized & optimized model: {:.2f}ms".format(prof4.self_cpu_time_total/1000)) -print("lite model: {:.2f}ms".format(prof5.self_cpu_time_total/1000)) - -###################################################################### -# The results running on a Google Colab are: -# -# .. 
code-block:: sh -# -# original model: 1236.69ms -# scripted model: 1226.72ms -# scripted & quantized model: 593.19ms -# scripted & quantized & optimized model: 598.01ms -# lite model: 600.72ms -# - - -###################################################################### -# The following results summarize the inference time taken by each model -# and the percentage reduction of each model relative to the original -# model. -# - -import pandas as pd -import numpy as np - -df = pd.DataFrame({'Model': ['original model','scripted model', 'scripted & quantized model', 'scripted & quantized & optimized model', 'lite model']}) -df = pd.concat([df, pd.DataFrame([ - ["{:.2f}ms".format(prof1.self_cpu_time_total/1000), "0%"], - ["{:.2f}ms".format(prof2.self_cpu_time_total/1000), - "{:.2f}%".format((prof1.self_cpu_time_total-prof2.self_cpu_time_total)/prof1.self_cpu_time_total*100)], - ["{:.2f}ms".format(prof3.self_cpu_time_total/1000), - "{:.2f}%".format((prof1.self_cpu_time_total-prof3.self_cpu_time_total)/prof1.self_cpu_time_total*100)], - ["{:.2f}ms".format(prof4.self_cpu_time_total/1000), - "{:.2f}%".format((prof1.self_cpu_time_total-prof4.self_cpu_time_total)/prof1.self_cpu_time_total*100)], - ["{:.2f}ms".format(prof5.self_cpu_time_total/1000), - "{:.2f}%".format((prof1.self_cpu_time_total-prof5.self_cpu_time_total)/prof1.self_cpu_time_total*100)]], - columns=['Inference Time', 'Reduction'])], axis=1) - -print(df) - -""" - Model Inference Time Reduction -0 original model 1236.69ms 0% -1 scripted model 1226.72ms 0.81% -2 scripted & quantized model 593.19ms 52.03% -3 scripted & quantized & optimized model 598.01ms 51.64% -4 lite model 600.72ms 51.43% -""" - -###################################################################### -# Learn More -# ~~~~~~~~~~~~~~~~~ -# -# - `Facebook Data-efficient Image Transformers `__ -# - `Vision Transformer with ImageNet and MNIST on iOS `__ -# - `Vision Transformer with ImageNet and MNIST on Android `__ diff --git a/compilers_index.rst b/compilers_index.rst new file mode 100644 index 00000000000..ec426cecc80 --- /dev/null +++ b/compilers_index.rst @@ -0,0 +1,224 @@ +Compilers +========= + +Explore PyTorch compilers to optimize and deploy models efficiently. +Learn about APIs like ``torch.compile`` and ``torch.export`` +that let you enhance model performance and streamline deployment +processes. +Explore advanced topics such as compiled autograd, dynamic compilation +control, as well as third-party backend solutions. + +.. warning:: + + TorchScript is no longer in active development. + +.. raw:: html + +
+ + + +
+ +
+ +
+
+ +.. customcarditem:: + :header: torch.compile Tutorial + :card_description: Speed up your models with minimal code changes using torch.compile, the latest PyTorch compiler solution. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/torch_compile_tutorial.html + :tags: Model-Optimization,torch.compile + +.. customcarditem:: + :header: torch.compile End-to-End Tutorial + :card_description: An example of applying torch.compile to a real model, demonstrating speedups. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/torch_compile_full_example.html + :tags: Model-Optimization,torch.compile + +.. customcarditem:: + :header: Compiled Autograd: Capturing a larger backward graph for torch.compile + :card_description: Learn how to use compiled autograd to capture a larger backward graph. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/compiled_autograd_tutorial.html + :tags: Model-Optimization,CUDA,torch.compile + +.. customcarditem:: + :header: Inductor CPU Backend Debugging and Profiling + :card_description: Learn the usage, debugging and performance profiling for ``torch.compile`` with Inductor CPU backend. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/inductor_debug_cpu.html + :tags: Model-Optimization,torch.compile + +.. customcarditem:: + :header: Dynamic Compilation Control with torch.compiler.set_stance + :card_description: Learn how to use torch.compiler.set_stance + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/torch_compiler_set_stance_tutorial.html + :tags: Model-Optimization,torch.compile + +.. customcarditem:: + :header: Demonstration of torch.export flow, common challenges and the solutions to address them + :card_description: Learn how to export models for popular usecases + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/torch_export_challenges_solutions.html + :tags: Model-Optimization,torch.compile + +.. customcarditem:: + :header: (beta) Compiling the Optimizer with torch.compile + :card_description: Speed up the optimizer using torch.compile + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/compiling_optimizer.html + :tags: Model-Optimization,torch.compile + +.. customcarditem:: + :header: (beta) Running the compiled optimizer with an LR Scheduler + :card_description: Speed up training with LRScheduler and torch.compiled optimizer + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/compiling_optimizer_lr_scheduler.html + :tags: Model-Optimization,torch.compile + +.. customcarditem:: + :header: Using User-Defined Triton Kernels with ``torch.compile`` + :card_description: Learn how to use user-defined kernels with ``torch.compile`` + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/torch_compile_user_defined_triton_kernel_tutorial.html + :tags: Model-Optimization,torch.compile + +.. customcarditem:: + :header: Compile Time Caching in ``torch.compile`` + :card_description: Learn how to use compile time caching in ``torch.compile`` + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/torch_compile_caching_tutorial.html + :tags: Model-Optimization,torch.compile + +.. 
customcarditem:: + :header: Compile Time Caching Configurations + :card_description: Learn how to configure compile time caching in ``torch.compile`` + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/torch_compile_caching_configuration_tutorial.html + :tags: Model-Optimization,torch.compile + +.. customcarditem:: + :header: Reducing torch.compile cold start compilation time with regional compilation + :card_description: Learn how to use regional compilation to control cold start compile time + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/regional_compilation.html + :tags: Model-Optimization,torch.compile + +.. Export +.. customcarditem:: + :header: torch.export AOTInductor Tutorial for Python runtime + :card_description: Learn an end-to-end example of how to use AOTInductor for python runtime. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/torch_export_aoti_python.html + :tags: Basics,torch.export + +.. customcarditem:: + :header: Deep dive into torch.export + :card_description: Learn how to use torch.export to export PyTorch models into standardized model representations. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/torch_export_tutorial.html + :tags: Basics,torch.export + +.. ONNX +.. customcarditem:: + :header: (optional) Exporting a PyTorch model to ONNX using TorchDynamo backend and Running it using ONNX Runtime + :card_description: Build a image classifier model in PyTorch and convert it to ONNX before deploying it with ONNX Runtime. + :image: _static/img/thumbnails/cropped/Exporting-PyTorch-Models-to-ONNX-Graphs.png + :link: beginner/onnx/export_simple_model_to_onnx_tutorial.html + :tags: Production,ONNX,Backends + +.. customcarditem:: + :header: Extending the ONNX exporter operator support + :card_description: Demonstrate end-to-end how to address unsupported operators in ONNX. + :image: _static/img/thumbnails/cropped/Exporting-PyTorch-Models-to-ONNX-Graphs.png + :link: beginner/onnx/onnx_registry_tutorial.html + :tags: Production,ONNX,Backends + +.. customcarditem:: + :header: Exporting a model with control flow to ONNX + :card_description: Demonstrate how to handle control flow logic while exporting a PyTorch model to ONNX. + :image: _static/img/thumbnails/cropped/Exporting-PyTorch-Models-to-ONNX-Graphs.png + :link: beginner/onnx/export_control_flow_model_to_onnx_tutorial.html + :tags: Production,ONNX,Backends + +.. Code Transformations with FX +.. customcarditem:: + :header: Building a Convolution/Batch Norm fuser in FX + :card_description: Build a simple FX pass that fuses batch norm into convolution to improve performance during inference. + :image: _static/img/thumbnails/cropped/Deploying-PyTorch-in-Python-via-a-REST-API-with-Flask.png + :link: intermediate/torch_compile_conv_bn_fuser + :tags: FX + +.. customcarditem:: + :header: Building a Simple Performance Profiler with FX + :card_description: Build a simple FX interpreter to record the runtime of op, module, and function calls and report statistics + :image: _static/img/thumbnails/cropped/Deploying-PyTorch-in-Python-via-a-REST-API-with-Flask.png + :link: intermediate/fx_profiling_tutorial.html + :tags: FX + +.. raw:: html + +
+
+ +.. End of tutorial cards section +.. ----------------------------------------- +.. Page TOC +.. ----------------------------------------- +.. toctree:: + :maxdepth: 2 + :hidden: + :caption: torch.compile + + intermediate/torch_compile_tutorial + intermediate/torch_compile_full_example + intermediate/compiled_autograd_tutorial + intermediate/inductor_debug_cpu + recipes/torch_compiler_set_stance_tutorial + recipes/torch_export_challenges_solutions + recipes/compiling_optimizer + recipes/compiling_optimizer_lr_scheduler + recipes/torch_compile_user_defined_triton_kernel_tutorial + recipes/torch_compile_caching_tutorial + recipes/regional_compilation + +.. toctree:: + :maxdepth: 2 + :hidden: + :caption: torch.export + + intermediate/torch_export_tutorial + recipes/torch_export_aoti_python + recipes/torch_export_challenges_solutions + +.. toctree:: + :maxdepth: 2 + :hidden: + :caption: ONNX + + beginner/onnx/intro_onnx + beginner/onnx/export_simple_model_to_onnx_tutorial + beginner/onnx/onnx_registry_tutorial + beginner/onnx/export_control_flow_model_to_onnx_tutorial + +.. toctree:: + :maxdepth: 2 + :includehidden: + :hidden: + :caption: Code Transforms with FX + + intermediate/torch_compile_conv_bn_fuser + intermediate/fx_profiling_tutorial diff --git a/conf.py b/conf.py index f0f4905844c..0cb34fd02b3 100644 --- a/conf.py +++ b/conf.py @@ -29,33 +29,89 @@ # import os import sys -sys.path.insert(0, os.path.abspath('.')) -sys.path.insert(0, os.path.abspath('./.jenkins')) -import pytorch_sphinx_theme -import torch -import numpy -import gc + +sys.path.insert(0, os.path.abspath(".")) +sys.path.insert(0, os.path.abspath("./.jenkins")) +import pytorch_sphinx_theme2 + +html_theme = "pytorch_sphinx_theme2" +html_theme_path = [pytorch_sphinx_theme2.get_html_theme_path()] +import distutils.file_util import glob import random -import shutil -from custom_directives import IncludeDirective, GalleryItemDirective, CustomGalleryItemDirective, CustomCalloutItemDirective, CustomCardItemDirective -import distutils.file_util import re -from get_sphinx_filenames import SPHINX_SHOULD_RUN +import shutil +from pathlib import Path + import pandocfilters -import pypandoc import plotly.io as pio -pio.renderers.default = 'sphinx_gallery' +import pypandoc +import torch +from get_sphinx_filenames import SPHINX_SHOULD_RUN + +pio.renderers.default = "sphinx_gallery" +import multiprocessing + +import sphinx_gallery.gen_rst +from redirects import redirects + +# Monkey patch sphinx gallery to run each example in an isolated process so that +# we don't need to worry about examples changing global state. +# +# Alt option 1: Parallelism was added to sphinx gallery (a later version that we +# are not using yet) using joblib, but it seems to result in errors for us, and +# it has no effect if you set parallel = 1 (it will not put each file run into +# its own process and run singly) so you need parallel >= 2, and there may be +# tutorials that cannot be run in parallel. 
+# +# Alt option 2: Run sphinx gallery once per file (similar to how we shard in CI +# but with shard sizes of 1), but running sphinx gallery for each file has a +# ~5min overhead, resulting in the entire suite taking ~2x time +def call_fn(func, args, kwargs, result_queue): + try: + result = func(*args, **kwargs) + result_queue.put((True, result)) + except Exception as e: + result_queue.put((False, str(e))) + + +def call_in_subprocess(func): + def wrapper(*args, **kwargs): + result_queue = multiprocessing.Queue() + p = multiprocessing.Process( + target=call_fn, args=(func, args, kwargs, result_queue) + ) + p.start() + p.join() + success, result = result_queue.get() + if success: + return result + else: + raise RuntimeError(f"Error in subprocess: {result}") + + return wrapper + + +# Windows does not support multiprocessing with fork and mac has issues with +# fork so we do not monkey patch sphinx gallery to run in subprocesses. +if ( + os.getenv("TUTORIALS_ISOLATE_BUILD", "1") == "1" + and not sys.platform.startswith("win") + and not sys.platform == "darwin" +): + sphinx_gallery.gen_rst.generate_file_rst = call_in_subprocess( + sphinx_gallery.gen_rst.generate_file_rst + ) try: import torchvision except ImportError: import warnings + warnings.warn('unable to load "torchvision" package') -import pytorch_sphinx_theme -rst_epilog =""" +rst_epilog = """ .. |edit| image:: /_static/pencil-16.png :width: 16px :height: 16px @@ -67,54 +123,139 @@ # # needs_sphinx = '1.0' +html_meta = { + "description": "Master PyTorch with our step-by-step tutorials for all skill levels. Start your journey to becoming a PyTorch expert today!", + "keywords": "PyTorch, tutorials, Getting Started, deep learning, AI", + "author": "PyTorch Contributors", +} + # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinxcontrib.katex', - 'sphinx.ext.intersphinx', - 'sphinx_copybutton', - 'sphinx_gallery.gen_gallery', - 'sphinx_design', + "sphinxcontrib.katex", + "sphinx.ext.intersphinx", + "sphinx_copybutton", + "sphinx_gallery.gen_gallery", + "sphinx_design", + "sphinx_sitemap", + "sphinx_reredirects", + "sphinxcontrib.mermaid", ] intersphinx_mapping = { - "torch": ("https://pytorch.org/docs/stable/", None), - "tensordict": ("https://pytorch.github.io/tensordict/", None), - "torchrl": ("https://pytorch.org/rl/", None), - "torchaudio": ("https://pytorch.org/audio/stable/", None), - "torchtext": ("https://pytorch.org/text/stable/", None), - "torchvision": ("https://pytorch.org/vision/stable/", None), + "torch": ("https://docs.pytorch.org/docs/stable/", None), + "tensordict": ("https://docs.pytorch.org/tensordict/stable", None), + "torchrl": ("https://docs.pytorch.org/rl/stable", None), + "torchaudio": ("https://docs.pytorch.org/audio/stable/", None), + "torchtext": ("https://docs.pytorch.org/text/stable/", None), + "torchvision": ("https://docs.pytorch.org/vision/stable/", None), } -# -- Sphinx-gallery configuration -------------------------------------------- +html_meta = { + "description": "Master PyTorch with our step-by-step tutorials for all skill levels. 
Start your journey to becoming a PyTorch expert today!", + "keywords": "PyTorch, tutorials, Getting Started, deep learning, AI", + "author": "PyTorch Contributors", +} -def reset_seeds(gallery_conf, fname): - torch.cuda.empty_cache() - torch.manual_seed(42) - torch.set_default_device(None) - random.seed(10) - numpy.random.seed(10) - gc.collect() + + +# -- Sphinx-gallery configuration -------------------------------------------- sphinx_gallery_conf = { - 'examples_dirs': ['beginner_source', 'intermediate_source', - 'advanced_source', 'recipes_source', 'prototype_source'], - 'gallery_dirs': ['beginner', 'intermediate', 'advanced', 'recipes', 'prototype'], - 'filename_pattern': re.compile(SPHINX_SHOULD_RUN), - 'promote_jupyter_magic': True, - 'backreferences_dir': None, - 'first_notebook_cell': ("# For tips on running notebooks in Google Colab, see\n" - "# https://pytorch.org/tutorials/beginner/colab\n" - "%matplotlib inline"), - 'reset_modules': (reset_seeds), - 'ignore_pattern': r'_torch_export_nightly_tutorial.py', - 'pypandoc': {'extra_args': ['--mathjax', '--toc'], - 'filters': ['.jenkins/custom_pandoc_filter.py'], + "examples_dirs": [ + "beginner_source", + "intermediate_source", + "advanced_source", + "recipes_source", + "unstable_source", + ], + "gallery_dirs": ["beginner", "intermediate", "advanced", "recipes", "unstable"], + "filename_pattern": re.compile(SPHINX_SHOULD_RUN), + "promote_jupyter_magic": True, + "backreferences_dir": None, + "write_computation_times": True, + "download_all_examples": False, + "show_signature": False, + "first_notebook_cell": ( + "# For tips on running notebooks in Google Colab, see\n" + "# https://docs.pytorch.org/tutorials/beginner/colab\n" + "%matplotlib inline" + ), + "ignore_pattern": r"_torch_export_nightly_tutorial.py", + "pypandoc": { + "extra_args": ["--mathjax", "--toc"], + "filters": [".jenkins/custom_pandoc_filter.py"], + }, +} + +html_additional_pages = { + "404": "404.html", +} + + +html_baseurl = "https://docs.pytorch.org/tutorials/" # needed for sphinx-sitemap +sitemap_locales = [None] +sitemap_excludes = [ + "search.html", + "genindex.html", +] +sitemap_url_scheme = "{link}" + +html_theme_options = { + "navigation_with_keys": False, + "analytics_id": "GTM-T8XT4PS", + "logo": { + "text": "", }, + "icon_links": [ + { + "name": "X", + "url": "https://x.com/PyTorch", + "icon": "fa-brands fa-x-twitter", + }, + { + "name": "GitHub", + "url": "https://github.com/pytorch/tutorials", + "icon": "fa-brands fa-github", + }, + { + "name": "Discourse", + "url": "https://dev-discuss.pytorch.org/", + "icon": "fa-brands fa-discourse", + }, + { + "name": "PyPi", + "url": "https://pypi.org/project/torch/", + "icon": "fa-brands fa-python", + }, + ], + "use_edit_page_button": True, + "header_links_before_dropdown": 9, + "navbar_start": ["pytorch_version"], + "navbar_center": "navbar-nav", + "display_version": True, + "pytorch_project": "tutorials", + "canonical_url": "https://docs.pytorch.org/tutorials/", } -if os.getenv('GALLERY_PATTERN'): +theme_variables = pytorch_sphinx_theme2.get_theme_variables() + +html_context = { + "theme_variables": theme_variables, + "display_github": True, + "github_url": "https://github.com", + "github_user": "pytorch", + "github_repo": "tutorials", + "feedback_url": "https://github.com/pytorch/tutorials", + "github_version": "main", + "doc_path": ".", + "library_links": theme_variables.get("library_links", []), + #"pytorch_project": "tutorials", +} + + +if os.getenv("GALLERY_PATTERN"): # GALLERY_PATTERN is to be used when you 
want to work on a single # tutorial. Previously this was fed into filename_pattern, but # if you do that, you still end up parsing all of the other Python @@ -122,47 +263,47 @@ def reset_seeds(gallery_conf, fname): # ignore_pattern also skips parsing. # See https://github.com/sphinx-gallery/sphinx-gallery/issues/721 # for a more detailed description of the issue. - sphinx_gallery_conf['ignore_pattern'] = r'/(?!' + re.escape(os.getenv('GALLERY_PATTERN')) + r')[^/]+$' + sphinx_gallery_conf["ignore_pattern"] = ( + r"/(?!" + re.escape(os.getenv("GALLERY_PATTERN")) + r")[^/]+$" + ) -for i in range(len(sphinx_gallery_conf['examples_dirs'])): - gallery_dir = sphinx_gallery_conf['gallery_dirs'][i] - source_dir = sphinx_gallery_conf['examples_dirs'][i] - # Create gallery dirs if it doesn't exist - try: - os.mkdir(gallery_dir) - except OSError: - pass +for i in range(len(sphinx_gallery_conf["examples_dirs"])): + gallery_dir = Path(sphinx_gallery_conf["gallery_dirs"][i]) + source_dir = Path(sphinx_gallery_conf["examples_dirs"][i]) # Copy rst files from source dir to gallery dir - for f in glob.glob(os.path.join(source_dir, '*.rst')): - distutils.file_util.copy_file(f, gallery_dir, update=True) - + for f in source_dir.rglob("*.rst"): + f_dir = Path(f).parent + gallery_subdir_path = gallery_dir / f_dir.relative_to(source_dir) + gallery_subdir_path.mkdir(parents=True, exist_ok=True) + distutils.file_util.copy_file(f, gallery_subdir_path, update=True) # Add any paths that contain templates here, relative to this directory. - - -templates_path = ['_templates'] +templates_path = [ + "_templates", + os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates"), +] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'PyTorch Tutorials' -copyright = '2024, PyTorch' -author = 'PyTorch contributors' +project = "PyTorch Tutorials" +copyright = "2024, PyTorch" +author = "PyTorch contributors" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = str(torch.__version__) +version = "v" + str(torch.__version__) # The full version, including alpha/beta/rc tags. release = str(torch.__version__) @@ -171,17 +312,30 @@ def reset_seeds(gallery_conf, fname): # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = 'en' +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] -exclude_patterns += sphinx_gallery_conf['examples_dirs'] -exclude_patterns += ['*/index.rst'] +exclude_patterns = [ + "_build", + "Thumbs.db", + ".DS_Store", + "src/pytorch-sphinx-theme/docs*", +] +exclude_patterns += sphinx_gallery_conf["examples_dirs"] +exclude_patterns += ["*/index.rst"] + + +# Handling for HuggingFace Hub jinja templates +def handle_jinja_templates(app, docname, source): + if "huggingface_hub/templates" in docname: + # Replace Jinja templates with quoted strings + source[0] = re.sub(r"(\{\{.*?\}\})", r'"\1"', source[0]) + # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False @@ -209,7 +363,7 @@ def reset_seeds(gallery_conf, fname): # # Add any paths that contain custom static files (such as style sheets) here, # # relative to this directory. They are copied after the builtin static files, # # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # # Custom sidebar templates, maps document names to template names. # html_sidebars = { @@ -218,23 +372,10 @@ def reset_seeds(gallery_conf, fname): # } -html_theme = 'pytorch_sphinx_theme' -html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] -html_logo = '_static/img/pytorch-logo-dark.svg' -html_theme_options = { - 'pytorch_project': 'tutorials', - 'collapse_navigation': False, - 'display_version': True, - 'navigation_with_keys': True, - 'logo_only': False, - 'analytics_id': 'GTM-T8XT4PS', -} - - # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. -htmlhelp_basename = 'PyTorchTutorialsdoc' +htmlhelp_basename = "PyTorchTutorialsdoc" # -- Options for LaTeX output --------------------------------------------- @@ -243,15 +384,12 @@ def reset_seeds(gallery_conf, fname): # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -261,8 +399,13 @@ def reset_seeds(gallery_conf, fname): # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'PyTorchTutorials.tex', 'PyTorch Tutorials', - 'Sasank, PyTorch contributors', 'manual'), + ( + master_doc, + "PyTorchTutorials.tex", + "PyTorch Tutorials", + "Sasank, PyTorch contributors", + "manual", + ), ] @@ -270,10 +413,7 @@ def reset_seeds(gallery_conf, fname): # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). 
-man_pages = [ - (master_doc, 'pytorchtutorials', 'PyTorch Tutorials', - [author], 1) -] +man_pages = [(master_doc, "pytorchtutorials", "PyTorch Tutorials", [author], 1)] # -- Options for Texinfo output ------------------------------------------- @@ -282,36 +422,48 @@ def reset_seeds(gallery_conf, fname): # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'PyTorchTutorials', 'PyTorch Tutorials', - author, 'PyTorchTutorials', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "PyTorchTutorials", + "PyTorch Tutorials", + author, + "PyTorchTutorials", + "One line description of project.", + "Miscellaneous", + ), ] html_css_files = [ - 'https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css', - 'css/custom.css', - 'css/custom2.css' - ] + "https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css", +] + + +def html_page_context(app, pagename, templatename, context, doctree): + # Check if the page is in gallery directories + for gallery_dir in sphinx_gallery_conf["gallery_dirs"]: + if pagename.startswith(gallery_dir): + # Get corresponding examples directory + examples_dir = sphinx_gallery_conf["examples_dirs"][ + sphinx_gallery_conf["gallery_dirs"].index(gallery_dir) + ] + + # Calculate relative path within the gallery + rel_path = ( + pagename[len(gallery_dir) + 1 :] if pagename != gallery_dir else "" + ) + + # Check for .py file in examples directory + py_path = os.path.join(app.srcdir, examples_dir, rel_path + ".py") + + # If a .py file exists, this page was generated from Python + if os.path.exists(py_path): + context["display_github"] = False + return + + # Enable for all other pages + context["display_github"] = True + def setup(app): - # NOTE: in Sphinx 1.8+ `html_css_files` is an official configuration value - # and can be moved outside of this function (and the setup(app) function - # can be deleted). - #html_css_files = [ - # 'https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css', - # 'css/custom.css' - #] - # In Sphinx 1.8 it was renamed to `add_css_file`, 1.7 and prior it is - # `add_stylesheet` (deprecated in 1.8). - #add_css = getattr(app, 'add_css_file', app.add_stylesheet) - #for css_file in html_css_files: - # add_css(css_file) - # Custom CSS - #app.add_stylesheet('css/pytorch_theme.css') - # app.add_stylesheet('https://fonts.googleapis.com/css?family=Lato') - # Custom directives - app.add_directive('includenodoc', IncludeDirective) - app.add_directive('galleryitem', GalleryItemDirective) - app.add_directive('customgalleryitem', CustomGalleryItemDirective) - app.add_directive('customcarditem', CustomCardItemDirective) - app.add_directive('customcalloutitem', CustomCalloutItemDirective) + app.connect("source-read", handle_jinja_templates) + app.connect("html-page-context", html_page_context) diff --git a/custom_directives.py b/custom_directives.py deleted file mode 100644 index 388aa262e6e..00000000000 --- a/custom_directives.py +++ /dev/null @@ -1,350 +0,0 @@ -from docutils.parsers.rst import Directive, directives -from docutils.statemachine import StringList -from docutils import nodes -import re -import os -import sphinx_gallery - -try: - FileNotFoundError -except NameError: - FileNotFoundError = IOError - - -class IncludeDirective(Directive): - """Include source file without docstring at the top of file. - - Implementation just replaces the first docstring found in file - with '' once. - - Example usage: - - .. 
includenodoc:: /beginner/examples_tensor/two_layer_net_tensor.py - - """ - - # defines the parameter the directive expects - # directives.unchanged means you get the raw value from RST - required_arguments = 1 - optional_arguments = 0 - final_argument_whitespace = True - has_content = False - add_index = False - - docstring_pattern = r'"""(?P(?:.|[\r\n])*?)"""\n' - docstring_regex = re.compile(docstring_pattern) - - def run(self): - document = self.state.document - env = document.settings.env - rel_filename, filename = env.relfn2path(self.arguments[0]) - - try: - text = open(filename).read() - text_no_docstring = self.docstring_regex.sub('', text, count=1) - - code_block = nodes.literal_block(text=text_no_docstring) - return [code_block] - except FileNotFoundError as e: - print(e) - return [] - - -class GalleryItemDirective(Directive): - """ - Create a sphinx gallery thumbnail for insertion anywhere in docs. - - Optionally, you can specify the custom figure and intro/tooltip for the - thumbnail. - - Example usage: - - .. galleryitem:: intermediate/char_rnn_generation_tutorial.py - :figure: _static/img/char_rnn_generation.png - :intro: Put your custom intro here. - - If figure is specified, a thumbnail will be made out of it and stored in - _static/thumbs. Therefore, consider _static/thumbs as a 'built' directory. - """ - - required_arguments = 1 - optional_arguments = 0 - final_argument_whitespace = True - option_spec = {'figure': directives.unchanged, - 'intro': directives.unchanged} - has_content = False - add_index = False - - def run(self): - args = self.arguments - fname = args[-1] - - env = self.state.document.settings.env - fname, abs_fname = env.relfn2path(fname) - basename = os.path.basename(fname) - dirname = os.path.dirname(fname) - - try: - if 'intro' in self.options: - intro = self.options['intro'][:195] + '...' - else: - _, blocks = sphinx_gallery.gen_rst.split_code_and_text_blocks(abs_fname) - intro, _ = sphinx_gallery.gen_rst.extract_intro_and_title(abs_fname, blocks[0][1]) - - thumbnail_rst = '' - #sphinx_gallery.backreferences._thumbnail_div( - # dirname, basename, intro) - - if 'figure' in self.options: - rel_figname, figname = env.relfn2path(self.options['figure']) - save_figname = os.path.join('_static/thumbs/', - os.path.basename(figname)) - - try: - os.makedirs('_static/thumbs') - except OSError: - pass - - sphinx_gallery.gen_rst.scale_image(figname, save_figname, - 400, 280) - # replace figure in rst with simple regex - thumbnail_rst = re.sub(r'..\sfigure::\s.*\.png', - '.. figure:: /{}'.format(save_figname), - thumbnail_rst) - - thumbnail = StringList(thumbnail_rst.split('\n')) - thumb = nodes.paragraph() - self.state.nested_parse(thumbnail, self.content_offset, thumb) - - return [thumb] - except FileNotFoundError as e: - print(e) - return [] - - -GALLERY_TEMPLATE = """ -.. raw:: html - -
- -.. only:: html - - .. figure:: {thumbnail} - - {description} - -.. raw:: html - -
-""" - - -class CustomGalleryItemDirective(Directive): - """Create a sphinx gallery style thumbnail. - - tooltip and figure are self explanatory. Description could be a link to - a document like in below example. - - Example usage: - - .. customgalleryitem:: - :tooltip: I am writing this tutorial to focus specifically on NLP for people who have never written code in any deep learning framework - :figure: /_static/img/thumbnails/babel.jpg - :description: :doc:`/beginner/deep_learning_nlp_tutorial` - - If figure is specified, a thumbnail will be made out of it and stored in - _static/thumbs. Therefore, consider _static/thumbs as a 'built' directory. - """ - - required_arguments = 0 - optional_arguments = 0 - final_argument_whitespace = True - option_spec = {'tooltip': directives.unchanged, - 'figure': directives.unchanged, - 'description': directives.unchanged} - - has_content = False - add_index = False - - def run(self): - try: - if 'tooltip' in self.options: - tooltip = self.options['tooltip'][:195] + '...' - else: - raise ValueError('tooltip not found') - - if 'figure' in self.options: - env = self.state.document.settings.env - rel_figname, figname = env.relfn2path(self.options['figure']) - thumbnail = os.path.join('_static/thumbs/', os.path.basename(figname)) - - try: - os.makedirs('_static/thumbs') - except FileExistsError: - pass - - sphinx_gallery.gen_rst.scale_image(figname, thumbnail, 400, 280) - else: - thumbnail = '_static/img/thumbnails/default.png' - - if 'description' in self.options: - description = self.options['description'] - else: - raise ValueError('description not doc found') - - except FileNotFoundError as e: - print(e) - return [] - except ValueError as e: - print(e) - raise - return [] - - thumbnail_rst = GALLERY_TEMPLATE.format(tooltip=tooltip, - thumbnail=thumbnail, - description=description) - thumbnail = StringList(thumbnail_rst.split('\n')) - thumb = nodes.paragraph() - self.state.nested_parse(thumbnail, self.content_offset, thumb) - return [thumb] - - -class CustomCardItemDirective(Directive): - option_spec = {'header': directives.unchanged, - 'image': directives.unchanged, - 'link': directives.unchanged, - 'card_description': directives.unchanged, - 'tags': directives.unchanged} - - def run(self): - try: - if 'header' in self.options: - header = self.options['header'] - else: - raise ValueError('header not doc found') - - if 'image' in self.options: - image = "" - else: - image = '_static/img/thumbnails/default.png' - - if 'link' in self.options: - link = self.options['link'] - else: - link = '' - - if 'card_description' in self.options: - card_description = self.options['card_description'] - else: - card_description = '' - - if 'tags' in self.options: - tags = self.options['tags'] - else: - tags = '' - - except FileNotFoundError as e: - print(e) - return [] - except ValueError as e: - print(e) - raise - return [] - - card_rst = CARD_TEMPLATE.format(header=header, - image=image, - link=link, - card_description=card_description, - tags=tags) - card_list = StringList(card_rst.split('\n')) - card = nodes.paragraph() - self.state.nested_parse(card_list, self.content_offset, card) - return [card] - - -CARD_TEMPLATE = """ -.. 
raw:: html - - -""" - -class CustomCalloutItemDirective(Directive): - option_spec = {'header': directives.unchanged, - 'description': directives.unchanged, - 'button_link': directives.unchanged, - 'button_text': directives.unchanged} - - def run(self): - try: - if 'description' in self.options: - description = self.options['description'] - else: - description = '' - - if 'header' in self.options: - header = self.options['header'] - else: - raise ValueError('header not doc found') - - if 'button_link' in self.options: - button_link = self.options['button_link'] - else: - button_link = '' - - if 'button_text' in self.options: - button_text = self.options['button_text'] - else: - button_text = '' - - except FileNotFoundError as e: - print(e) - return [] - except ValueError as e: - print(e) - raise - return [] - - callout_rst = CALLOUT_TEMPLATE.format(description=description, - header=header, - button_link=button_link, - button_text=button_text) - callout_list = StringList(callout_rst.split('\n')) - callout = nodes.paragraph() - self.state.nested_parse(callout_list, self.content_offset, callout) - return [callout] - -CALLOUT_TEMPLATE = """ -.. raw:: html - -
-
-

{header}

-

{description}

- {button_text} -
-
-""" diff --git a/deep-dive.rst b/deep-dive.rst new file mode 100644 index 00000000000..89a18d48e29 --- /dev/null +++ b/deep-dive.rst @@ -0,0 +1,163 @@ +:orphan: + +Deep Dive +========= + +Focused on enhancing model performance, this section includes +tutorials on profiling, hyperparameter tuning, quantization, +and other techniques to optimize PyTorch models for better efficiency +and speed. + +.. raw:: html + +
+ + + +
+ +
+ +
+
+ +.. Add tutorial cards below this line +.. customcarditem:: + :header: Profiling PyTorch + :card_description: Learn how to profile a PyTorch application + :link: beginner/profiler.html + :image: _static/img/thumbnails/cropped/pytorch-logo.png + :tags: Profiling + +.. customcarditem:: + :header: Parametrizations Tutorial + :card_description: Learn how to use torch.nn.utils.parametrize to put constraints on your parameters (e.g. make them orthogonal, symmetric positive definite, low-rank...) + :image: _static/img/thumbnails/cropped/parametrizations.png + :link: intermediate/parametrizations.html + :tags: Model-Optimization,Best-Practice + +.. customcarditem:: + :header: Pruning Tutorial + :card_description: Learn how to use torch.nn.utils.prune to sparsify your neural networks, and how to extend it to implement your own custom pruning technique. + :image: _static/img/thumbnails/cropped/Pruning-Tutorial.png + :link: intermediate/pruning_tutorial.html + :tags: Model-Optimization,Best-Practice + + +.. customcarditem:: + :header: Inductor CPU Backend Debugging and Profiling + :card_description: Learn the usage, debugging and performance profiling for ``torch.compile`` with Inductor CPU backend. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/inductor_debug_cpu.html + :tags: Model-Optimization,inductor + +.. customcarditem:: + :header: (beta) Implementing High-Performance Transformers with SCALED DOT PRODUCT ATTENTION + :card_description: This tutorial explores the new torch.nn.functional.scaled_dot_product_attention and how it can be used to construct Transformer components. + :image: _static/img/thumbnails/cropped/pytorch-logo.png + :link: intermediate/scaled_dot_product_attention_tutorial.html + :tags: Model-Optimization,Attention,Transformer + +.. customcarditem:: + :header: Knowledge Distillation in Convolutional Neural Networks + :card_description: Learn how to improve the accuracy of lightweight models using more powerful models as teachers. + :image: _static/img/thumbnails/cropped/knowledge_distillation_pytorch_logo.png + :link: beginner/knowledge_distillation_tutorial.html + :tags: Model-Optimization,Image/Video + +.. Frontend APIs +.. customcarditem:: + :header: (beta) Channels Last Memory Format in PyTorch + :card_description: Get an overview of Channels Last memory format and understand how it is used to order NCHW tensors in memory preserving dimensions. + :image: _static/img/thumbnails/cropped/experimental-Channels-Last-Memory-Format-in-PyTorch.png + :link: intermediate/memory_format_tutorial.html + :tags: Memory-Format,Best-Practice,Frontend-APIs + +.. customcarditem:: + :header: Forward-mode Automatic Differentiation + :card_description: Learn how to use forward-mode automatic differentiation. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/forward_ad_usage.html + :tags: Frontend-APIs + +.. customcarditem:: + :header: Jacobians, Hessians, hvp, vhp, and more + :card_description: Learn how to compute advanced autodiff quantities using torch.func + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/jacobians_hessians.html + :tags: Frontend-APIs + +.. customcarditem:: + :header: Model Ensembling + :card_description: Learn how to ensemble models using torch.vmap + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/ensembling.html + :tags: Frontend-APIs + +.. 
customcarditem:: + :header: Per-Sample-Gradients + :card_description: Learn how to compute per-sample-gradients using torch.func + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/per_sample_grads.html + :tags: Frontend-APIs + +.. customcarditem:: + :header: Neural Tangent Kernels + :card_description: Learn how to compute neural tangent kernels using torch.func + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/neural_tangent_kernels.html + :tags: Frontend-APIs + +.. customcarditem:: + :header: Using the PyTorch C++ Frontend + :card_description: Walk through an end-to-end example of training a model with the C++ frontend by training a DCGAN – a kind of generative model – to generate images of MNIST digits. + :image: _static/img/thumbnails/cropped/Using-the-PyTorch-Cpp-Frontend.png + :link: advanced/cpp_frontend.html + :tags: Frontend-APIs,C++ + +.. customcarditem:: + :header: Autograd in C++ Frontend + :card_description: The autograd package helps build flexible and dynamic neural networks. In this tutorial, explore several examples of doing autograd in the PyTorch C++ frontend. + :image: _static/img/thumbnails/cropped/Autograd-in-Cpp-Frontend.png + :link: advanced/cpp_autograd.html + :tags: Frontend-APIs,C++ + +.. End of tutorial card section +.. ----------------------------------------- +.. Page TOC +.. ----------------------------------------- +.. toctree:: + :maxdepth: 2 + :includehidden: + :hidden: + + beginner/profiler + beginner/vt_tutorial + intermediate/parametrizations + intermediate/pruning_tutorial + intermediate/inductor_debug_cpu + intermediate/scaled_dot_product_attention_tutorial + beginner/knowledge_distillation_tutorial + +.. toctree:: + :maxdepth: 2 + :includehidden: + :hidden: + :caption: Frontend APIs + + intermediate/memory_format_tutorial + intermediate/forward_ad_usage + intermediate/jacobians_hessians + intermediate/ensembling + intermediate/per_sample_grads + intermediate/neural_tangent_kernels + advanced/cpp_frontend + advanced/cpp_autograd diff --git a/distributed/home.rst b/distributed.rst similarity index 83% rename from distributed/home.rst rename to distributed.rst index 41e648ff625..8fe636d7255 100644 --- a/distributed/home.rst +++ b/distributed.rst @@ -1,5 +1,5 @@ -Distributed and Parallel Training Tutorials -=========================================== +Distributed +=========== Distributed training is a model training paradigm that involves spreading training workload across multiple worker nodes, therefore @@ -12,13 +12,14 @@ There are a few ways you can perform distributed training in PyTorch with each method having their advantages in certain use cases: * `DistributedDataParallel (DDP) <#learn-ddp>`__ -* `Fully Sharded Data Parallel (FSDP) <#learn-fsdp>`__ +* `Fully Sharded Data Parallel (FSDP2) <#learn-fsdp>`__ * `Tensor Parallel (TP) <#learn-tp>`__ * `Device Mesh <#device-mesh>`__ * `Remote Procedure Call (RPC) distributed training <#learn-rpc>`__ +* `Monarch Framework <#learn-monarch>`__ * `Custom Extensions <#custom-extensions>`__ -Read more about these options in `Distributed Overview <../beginner/dist_overview.html>`__. +Read more about these options in `Distributed Overview `__. .. _learn-ddp: @@ -60,28 +61,18 @@ Learn DDP .. _learn-fsdp: -Learn FSDP +Learn FSDP2 ---------- .. grid:: 3 .. 
grid-item-card:: :octicon:`file-code;1em` - Getting Started with FSDP + Getting Started with FSDP2 :link: https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html?utm_source=distr_landing&utm_medium=FSDP_getting_started :link-type: url This tutorial demonstrates how you can perform distributed training - with FSDP on a MNIST dataset. - +++ - :octicon:`code;1em` Code - - .. grid-item-card:: :octicon:`file-code;1em` - FSDP Advanced - :link: https://pytorch.org/tutorials/intermediate/FSDP_adavnced_tutorial.html?utm_source=distr_landing&utm_medium=FSDP_advanced - :link-type: url - - In this tutorial, you will learn how to fine-tune a HuggingFace (HF) T5 - model with FSDP for text summarization. + with FSDP2 on a transformer model. +++ :octicon:`code;1em` Code @@ -169,6 +160,22 @@ Learn RPC +++ :octicon:`code;1em` Code +.. _learn-monarch: + +Learn Monarch +------------- + +.. grid:: 3 + + .. grid-item-card:: :octicon:`file-code;1em` + Interactive Distributed Applications with Monarch + :link: https://docs.pytorch.org/tutorials/intermediate/monarch_distributed_tutorial.html + :link-type: url + + Learn how to use Monarch's actor framework. + +++ + :octicon:`code;1em` Code + .. _custom-extensions: Custom Extensions @@ -186,3 +193,23 @@ Custom Extensions cpp extensions. +++ :octicon:`code;1em` Code + +.. toctree:: + :hidden: + :maxdepth: 2 + + beginner/dist_overview + beginner/ddp_series_intro + intermediate/ddp_tutorial + intermediate/dist_tuto + intermediate/FSDP_tutorial + intermediate/TCPStore_libuv_backend + intermediate/TP_tutorial + intermediate/pipelining_tutorial + intermediate/process_group_cpp_extension_tutorial + intermediate/rpc_tutorial + intermediate/rpc_param_server_tutorial + intermediate/rpc_async_execution + intermediate/monarch_distributed_tutorial + advanced/rpc_ddp_tutorial + advanced/generic_join diff --git a/docathon-leaderboard.md b/docathon-leaderboard.md index 49912c2abfb..de12a13cca8 100644 --- a/docathon-leaderboard.md +++ b/docathon-leaderboard.md @@ -1,117 +1,164 @@ +# 🎉 PyTorch Docathon Leaderboard 2025 🎉 + +This is the list of the docathon contributors who have participated and contributed to the PyTorch H1 2025 docathon. A big shout out to everyone who participated! +We have awarded points for each merged PR as follows: + +* easy - 2 points +* medium - 5 points +* advanced - 10 points + +We have granted half points (1, 2, and 5 respectively) for all additional PRs merged against the same issue. +In some cases, we have awarded credit for the PRs that were not merged or issues that have been closed without a merged PR. 
+ +| Rank | Author | Points | PRs | +|:---:|:------------|------:|:----| +| 🥇 | [j-silv](https://github.com/j-silv) | 31 | [#155753](https://github.com/pytorch/pytorch/pull/155753), [#155659](https://github.com/pytorch/pytorch/pull/155659), [#155567](https://github.com/pytorch/pytorch/pull/155567), [#155540](https://github.com/pytorch/pytorch/pull/155540), [#155528](https://github.com/pytorch/pytorch/pull/155528), [#155198](https://github.com/pytorch/pytorch/pull/155198), [#155093](https://github.com/pytorch/pytorch/pull/155093), [#3389](https://github.com/pytorch/tutorials/pull/3389) | +| 🥇 | [windsonsea](https://github.com/windsonsea) | 19 | [#155789](https://github.com/pytorch/pytorch/pull/155789), [#155520](https://github.com/pytorch/pytorch/pull/155520), [#156039](https://github.com/pytorch/pytorch/pull/156039), [#156009](https://github.com/pytorch/pytorch/pull/156009), [#155653](https://github.com/pytorch/pytorch/pull/155653) | +| 🥇 | [kiszk](https://github.com/kiszk) | 16 | [#155762](https://github.com/pytorch/pytorch/pull/155762), [#155514](https://github.com/pytorch/pytorch/pull/155514), [#155351](https://github.com/pytorch/pytorch/pull/155351), [#155348](https://github.com/pytorch/pytorch/pull/155348), [#155347](https://github.com/pytorch/pytorch/pull/155347) | +| 🥈 | [Rachel0619](https://github.com/Rachel0619) | 14 | [#155764](https://github.com/pytorch/pytorch/pull/155764), [#155482](https://github.com/pytorch/pytorch/pull/155482), [#3385](https://github.com/pytorch/tutorials/pull/3385), [#3381](https://github.com/pytorch/tutorials/pull/3381) | +| 🥈 | [jafraustro](https://github.com/jafraustro) | 14 | [#155523](https://github.com/pytorch/pytorch/pull/155523), [#155369](https://github.com/pytorch/pytorch/pull/155369), [#133563](https://github.com/pytorch/pytorch/issues/133563), [#129446](https://github.com/pytorch/pytorch/issues/129446) | +| 🥈 | [Dhia-naouali](https://github.com/Dhia-naouali) | 12 | [#155911](https://github.com/pytorch/pytorch/pull/155911), [#155840](https://github.com/pytorch/pytorch/pull/155840), [#155505](https://github.com/pytorch/pytorch/pull/155505) | +| 🥈 | [loganthomas](https://github.com/loganthomas) | 12 | [#155702](https://github.com/pytorch/pytorch/pull/155702), [#155088](https://github.com/pytorch/pytorch/pull/155088), [#155649](https://github.com/pytorch/pytorch/pull/155649) | +| 🥈 | [nirajkamal](https://github.com/nirajkamal) | 12 | [#155430](https://github.com/pytorch/pytorch/pull/155430), [#155228](https://github.com/pytorch/pytorch/pull/155228), [#3376](https://github.com/pytorch/tutorials/pull/3376) | +| 🥉 | [Juliandlb](https://github.com/Juliandlb) | 10 | [#155987](https://github.com/pytorch/pytorch/pull/155987), [#155618](https://github.com/pytorch/pytorch/pull/155618) | +| 🥉 | [ggsmith842](https://github.com/ggsmith842) | 7 | [#155767](https://github.com/pytorch/pytorch/pull/155767), [#155297](https://github.com/pytorch/pytorch/pull/155297) | +| 🥉 | [ParagEkbote](https://github.com/ParagEkbote) | 7 | [#155683](https://github.com/pytorch/pytorch/pull/155683), [#155341](https://github.com/pytorch/pytorch/pull/155341) | +| ⭐ | [GdoongMathew](https://github.com/GdoongMathew) | 5 | [#155813](https://github.com/pytorch/pytorch/pull/155813) | +| ⭐ | [eromomon](https://github.com/eromomon) | 5 | [#155696](https://github.com/pytorch/pytorch/pull/155696) | +| ⭐ | [dggaytan](https://github.com/dggaytan) | 5 | [#155377](https://github.com/pytorch/pytorch/pull/155377) | +| ⭐ | [spzala](https://github.com/spzala) | 5 | 
[#155335](https://github.com/pytorch/pytorch/pull/155335) | +| ⭐ | [framoncg](https://github.com/framoncg) | 5 | [#155298](https://github.com/pytorch/pytorch/pull/155298) | +| ⭐ | [abhinav-TB](https://github.com/abhinav-TB) | 5 | [#155252](https://github.com/pytorch/pytorch/pull/155252) | +| ⭐ | [aagalleg](https://github.com/aagalleg) | 5 | [#155137](https://github.com/pytorch/pytorch/pull/155137) | +| ⭐ | [kiersten-stokes](https://github.com/kiersten-stokes) | 5 | [#155067](https://github.com/pytorch/pytorch/pull/155067) | +| ⭐ | [krishnakalyan3](https://github.com/krishnakalyan3) | 5 | [#3387](https://github.com/pytorch/tutorials/pull/3387) | +| ⭐ | [splion-360](https://github.com/splion-360) | 5 | [#3384](https://github.com/pytorch/tutorials/pull/3384) | +| ⭐ | [harshaljanjani](https://github.com/harshaljanjani) | 5 | [#3377](https://github.com/pytorch/tutorials/pull/3377) | +| ⭐ | [b-koopman](https://github.com/b-koopman) | 4 | [#155100](https://github.com/pytorch/pytorch/pull/155100), [#155889](https://github.com/pytorch/pytorch/pull/155889) | +| ⭐ | [thatgeeman](https://github.com/thatgeeman) | 4 | [#155404](https://github.com/pytorch/pytorch/pull/155404), [#156094](https://github.com/pytorch/pytorch/pull/156094) | +| ⭐ | [frost-intel](https://github.com/frost-intel) | 2 | [#3393](https://github.com/pytorch/tutorials/pull/3393) | +| ⭐ | [ANotFox](https://github.com/ANotFox) | 2 | [#155148](https://github.com/pytorch/pytorch/pull/155148) | +| ⭐ | [QasimKhan5x](https://github.com/QasimKhan5x) | 2 | [#155074](https://github.com/pytorch/pytorch/pull/155074) | +| ⭐ | [Ashish-Soni08](https://github.com/Ashish-Soni08) | 2 | [#3379](https://github.com/pytorch/tutorials/pull/3379) | +| ⭐ | [FORTFANOP](https://github.com/FORTFANOP) | 2 | [#3378](https://github.com/pytorch/tutorials/pull/3378) | +| ⭐ | [newtdms ](https://github.com/newtdms ) | 2 | [#155497](https://github.com/pytorch/pytorch/pull/155497) | +| ⭐ | [srini047](https://github.com/srini047) | 2 | [#155554](https://github.com/pytorch/pytorch/pull/155554) | + + # 🎉 Docathon H1 2024 Leaderboard 🎉 -This is the list of the docathon contributors that have participated and contributed to the PyTorch H1 2024 docathon. -A big shout out to everyone who have participated! We have awarded points for each merged PR. -For the **easy** label, we have awarded 2 points. For the **medium** label, we have awarded 5 points. -For the **advanced** label, we have awarded 10 points. In some cases, we have awarded credit for the PRs that -were not merged or issues that have been closed without a merged PR. +This is the list of the docathon contributors that have participated and contributed to the PyTorch H1 2024 docathon. +A big shout out to everyone who have participated! We have awarded points for each merged PR. +For the **easy** label, we have awarded 2 points. For the **medium** label, we have awarded 5 points. +For the **advanced** label, we have awarded 10 points. In some cases, we have awarded credit for the PRs that +were not merged or issues that have been closed without a merged PR. 
| Author | Points | PR | |--- | --- | ---| -| ahoblitz | 34 | https://github.com/pytorch/pytorch/pull/128566, https://github.com/pytorch/pytorch/pull/128408, https://github.com/pytorch/pytorch/pull/128171, https://github.com/pytorch/pytorch/pull/128083, https://github.com/pytorch/pytorch/pull/128082, https://github.com/pytorch/pytorch/pull/127983, https://github.com/pytorch/xla/pull/7214 | -| afrittoli | 25 | https://github.com/pytorch/pytorch/pull/128139, https://github.com/pytorch/pytorch/pull/128133, https://github.com/pytorch/pytorch/pull/128132, https://github.com/pytorch/pytorch/pull/128129, https://github.com/pytorch/pytorch/pull/128127 | -| kiszk | 20 | https://github.com/pytorch/pytorch/pull/128337, https://github.com/pytorch/pytorch/pull/128123, https://github.com/pytorch/pytorch/pull/128022, https://github.com/pytorch/pytorch/pull/128312 | -| loganthomas | 19 | https://github.com/pytorch/pytorch/pull/128676, https://github.com/pytorch/pytorch/pull/128192, https://github.com/pytorch/pytorch/pull/128189, https://github.com/pytorch/tutorials/pull/2922, https://github.com/pytorch/tutorials/pull/2910, https://github.com/pytorch/xla/pull/7195 | -| ignaciobartol | 17 | https://github.com/pytorch/pytorch/pull/128741, https://github.com/pytorch/pytorch/pull/128135, https://github.com/pytorch/pytorch/pull/127938, https://github.com/pytorch/tutorials/pull/2936 | -| arunppsg | 17 | https://github.com/pytorch/pytorch/pull/128391, https://github.com/pytorch/pytorch/pull/128021, https://github.com/pytorch/pytorch/pull/128018, https://github.com/pytorch-labs/torchfix/pull/59 | -| alperenunlu | 17 | https://github.com/pytorch/tutorials/pull/2934, https://github.com/pytorch/tutorials/pull/2909, https://github.com/pytorch/pytorch/pull/104043 | -| anandptl84 | 10 | https://github.com/pytorch/pytorch/pull/128196, https://github.com/pytorch/pytorch/pull/128098 | -| GdoongMathew | 10 | https://github.com/pytorch/pytorch/pull/128136, https://github.com/pytorch/pytorch/pull/128051 | -| ZhaoqiongZ | 10 | https://github.com/pytorch/pytorch/pull/127872 | -| ZailiWang | 10 | https://github.com/pytorch/tutorials/pull/2931 | -| jingxu10 | 8 | https://github.com/pytorch/pytorch/pull/127280, https://github.com/pytorch/pytorch/pull/127279, https://github.com/pytorch/pytorch/pull/127278, https://github.com/pytorch/tutorials/pull/2919 | -| sitamgithub-MSIT | 7 | https://github.com/pytorch/tutorials/pull/2900, https://github.com/pytorch/xla/pull/7208 | -| spzala | 5 | https://github.com/pytorch/pytorch/pull/128679, https://github.com/pytorch/pytorch/pull/128657 | -| TharinduRusira | 5 | https://github.com/pytorch/pytorch/pull/128197 | -| zabboud | 5 | https://github.com/pytorch/pytorch/pull/128055 | -| orion160 | 5 | https://github.com/pytorch/tutorials/pull/2912 | -| Ricktho1 | 5 | https://github.com/pytorch/xla/pull/7273 | -| IvanLauLinTiong | 4 | https://github.com/pytorch/pytorch/pull/128526, https://github.com/pytorch/tutorials/pull/2849 | -| sshkhr | 2 | https://github.com/pytorch/pytorch/pull/128155 | -| rk7697 | 2 | https://github.com/pytorch/pytorch/pull/127993 | -| hippocookie | 2 | https://github.com/pytorch/tutorials/pull/2937 | -| diningeachox | 2 | https://github.com/pytorch/tutorials/pull/2935 | -| akhil-maker | 2 | https://github.com/pytorch/tutorials/pull/2899 | -| saurabhkthakur | 2 | https://github.com/pytorch/tutorials/pull/2896 | +| ahoblitz | 34 | https://github.com/pytorch/pytorch/pull/128566, https://github.com/pytorch/pytorch/pull/128408, https://github.com/pytorch/pytorch/pull/128171, 
https://github.com/pytorch/pytorch/pull/128083, https://github.com/pytorch/pytorch/pull/128082, https://github.com/pytorch/pytorch/pull/127983, https://github.com/pytorch/xla/pull/7214 | +| afrittoli | 25 | https://github.com/pytorch/pytorch/pull/128139, https://github.com/pytorch/pytorch/pull/128133, https://github.com/pytorch/pytorch/pull/128132, https://github.com/pytorch/pytorch/pull/128129, https://github.com/pytorch/pytorch/pull/128127 | +| kiszk | 20 | https://github.com/pytorch/pytorch/pull/128337, https://github.com/pytorch/pytorch/pull/128123, https://github.com/pytorch/pytorch/pull/128022, https://github.com/pytorch/pytorch/pull/128312 | +| loganthomas | 19 | https://github.com/pytorch/pytorch/pull/128676, https://github.com/pytorch/pytorch/pull/128192, https://github.com/pytorch/pytorch/pull/128189, https://github.com/pytorch/tutorials/pull/2922, https://github.com/pytorch/tutorials/pull/2910, https://github.com/pytorch/xla/pull/7195 | +| ignaciobartol | 17 | https://github.com/pytorch/pytorch/pull/128741, https://github.com/pytorch/pytorch/pull/128135, https://github.com/pytorch/pytorch/pull/127938, https://github.com/pytorch/tutorials/pull/2936 | +| arunppsg | 17 | https://github.com/pytorch/pytorch/pull/128391, https://github.com/pytorch/pytorch/pull/128021, https://github.com/pytorch/pytorch/pull/128018, https://github.com/meta-pytorch/torchfix/pull/59 | +| alperenunlu | 17 | https://github.com/pytorch/tutorials/pull/2934, https://github.com/pytorch/tutorials/pull/2909, https://github.com/pytorch/pytorch/pull/104043 | +| anandptl84 | 10 | https://github.com/pytorch/pytorch/pull/128196, https://github.com/pytorch/pytorch/pull/128098 | +| GdoongMathew | 10 | https://github.com/pytorch/pytorch/pull/128136, https://github.com/pytorch/pytorch/pull/128051 | +| ZhaoqiongZ | 10 | https://github.com/pytorch/pytorch/pull/127872 | +| ZailiWang | 10 | https://github.com/pytorch/tutorials/pull/2931 | +| jingxu10 | 8 | https://github.com/pytorch/pytorch/pull/127280, https://github.com/pytorch/pytorch/pull/127279, https://github.com/pytorch/pytorch/pull/127278, https://github.com/pytorch/tutorials/pull/2919 | +| sitamgithub-MSIT | 7 | https://github.com/pytorch/tutorials/pull/2900, https://github.com/pytorch/xla/pull/7208 | +| spzala | 5 | https://github.com/pytorch/pytorch/pull/128679, https://github.com/pytorch/pytorch/pull/128657 | +| TharinduRusira | 5 | https://github.com/pytorch/pytorch/pull/128197 | +| zabboud | 5 | https://github.com/pytorch/pytorch/pull/128055 | +| orion160 | 5 | https://github.com/pytorch/tutorials/pull/2912 | +| Ricktho1 | 5 | https://github.com/pytorch/xla/pull/7273 | +| IvanLauLinTiong | 4 | https://github.com/pytorch/pytorch/pull/128526, https://github.com/pytorch/tutorials/pull/2849 | +| sshkhr | 2 | https://github.com/pytorch/pytorch/pull/128155 | +| rk7697 | 2 | https://github.com/pytorch/pytorch/pull/127993 | +| hippocookie | 2 | https://github.com/pytorch/tutorials/pull/2937 | +| diningeachox | 2 | https://github.com/pytorch/tutorials/pull/2935 | +| akhil-maker | 2 | https://github.com/pytorch/tutorials/pull/2899 | +| saurabhkthakur | 2 | https://github.com/pytorch/tutorials/pull/2896 | # 🎉 Docathon H2 2023 Leaderboard 🎉 -This is the list of the docathon contributors that have participated and contributed to the H2 2023 PyTorch docathon. -A big shout out to everyone who have participated! We have awarded points for each merged PR. -For the **easy** label, we have awarded 2 points. For the **medium** label, we have awarded 5 points. 
-For the **advanced** label, we have awarded 10 points. In some cases, we have awarded half credit for the PRs that +This is the list of the docathon contributors that have participated and contributed to the H2 2023 PyTorch docathon. +A big shout out to everyone who have participated! We have awarded points for each merged PR. +For the **easy** label, we have awarded 2 points. For the **medium** label, we have awarded 5 points. +For the **advanced** label, we have awarded 10 points. In some cases, we have awarded half credit for the PRs that were not merged or issues that have been closed without a merged PR. Thank you all for your awesome contributions! 🎉 | Author | Points | PR | |--- | --- | ---| -| ahoblitz | 25 | https://github.com/pytorch/pytorch/pull/112992, https://github.com/pytorch/tutorials/pull/2662, https://github.com/pytorch/tutorials/pull/2647, https://github.com/pytorch/tutorials/pull/2642, https://github.com/pytorch/tutorials/pull/2640, https://github.com/pytorch/pytorch/pull/113092, https://github.com/pytorch/pytorch/pull/113348 | -| ChanBong | 22 | https://github.com/pytorch/pytorch/pull/113337, https://github.com/pytorch/pytorch/pull/113336, https://github.com/pytorch/pytorch/pull/113335, https://github.com/pytorch/tutorials/pull/2644, https://github.com/pytorch/tutorials/pull/2639 | -| alperenunlu | 22 | https://github.com/pytorch/pytorch/pull/113260, https://github.com/pytorch/tutorials/pull/2673, https://github.com/pytorch/tutorials/pull/2660, https://github.com/pytorch/tutorials/pull/2656, https://github.com/pytorch/tutorials/pull/2649, https://github.com/pytorch/pytorch/pull/113505, https://github.com/pytorch/pytorch/pull/113218, https://github.com/pytorch/pytorch/pull/113505 | -| spzala | 22 | https://github.com/pytorch/pytorch/pull/113200, https://github.com/pytorch/pytorch/pull/112693, https://github.com/pytorch/tutorials/pull/2667, https://github.com/pytorch/tutorials/pull/2635 | -| bjhargrave | 21 | https://github.com/pytorch/pytorch/pull/113358, https://github.com/pytorch/pytorch/pull/113206, https://github.com/pytorch/pytorch/pull/112786, https://github.com/pytorch/tutorials/pull/2661, https://github.com/pytorch/tutorials/pull/1272 | -| zabboud | 21 | https://github.com/pytorch/pytorch/pull/113233, https://github.com/pytorch/pytorch/pull/113227, https://github.com/pytorch/pytorch/pull/113177, https://github.com/pytorch/pytorch/pull/113219, https://github.com/pytorch/pytorch/pull/113311 | -| nvs-abhilash | 20 | https://github.com/pytorch/pytorch/pull/113241, https://github.com/pytorch/pytorch/pull/112765, https://github.com/pytorch/pytorch/pull/112695, https://github.com/pytorch/pytorch/pull/112657 | -| guptaaryan16 | 19 | https://github.com/pytorch/pytorch/pull/112817, https://github.com/pytorch/pytorch/pull/112735, https://github.com/pytorch/tutorials/pull/2674, https://github.com/pytorch/pytorch/pull/113196, https://github.com/pytorch/pytorch/pull/113532 | -| min-jean-cho | 17 | https://github.com/pytorch/pytorch/pull/113195, https://github.com/pytorch/pytorch/pull/113183, https://github.com/pytorch/pytorch/pull/113178, https://github.com/pytorch/pytorch/pull/113109, https://github.com/pytorch/pytorch/pull/112892 | -| markstur | 14 | https://github.com/pytorch/pytorch/pull/113250, https://github.com/pytorch/tutorials/pull/2643, https://github.com/pytorch/tutorials/pull/2638, https://github.com/pytorch/tutorials/pull/2636 | -| RustyGrackle | 13 | https://github.com/pytorch/pytorch/pull/113371, https://github.com/pytorch/pytorch/pull/113266, 
https://github.com/pytorch/pytorch/pull/113435 | -| Viditagarwal7479 | 12 | https://github.com/pytorch/pytorch/pull/112860, https://github.com/pytorch/tutorials/pull/2659, https://github.com/pytorch/tutorials/pull/2671 | -| kiszk | 10 | https://github.com/pytorch/pytorch/pull/113523, https://github.com/pytorch/pytorch/pull/112751 | -| awaelchli | 10 | https://github.com/pytorch/pytorch/pull/113216, https://github.com/pytorch/pytorch/pull/112674 | -| pilot-j | 10 | https://github.com/pytorch/pytorch/pull/112964, https://github.com/pytorch/pytorch/pull/112856 | -| krishnakalyan3 | 7 | https://github.com/pytorch/tutorials/pull/2653, https://github.com/pytorch/tutorials/pull/1235, https://github.com/pytorch/tutorials/pull/1705 | -| ash-01xor | 5 | https://github.com/pytorch/pytorch/pull/113511 | -| IvanLauLinTiong | 5 | https://github.com/pytorch/pytorch/pull/113052 | -| Senthi1Kumar | 5 | https://github.com/pytorch/pytorch/pull/113021 | -| ooooo-create | 5 | https://github.com/pytorch/pytorch/pull/112953 | -| stanleyedward | 5 | https://github.com/pytorch/pytorch/pull/112864, https://github.com/pytorch/pytorch/pull/112617 | -| leslie-fang-intel | 5 | https://github.com/pytorch/tutorials/pull/2668 | -| measty | 5 | https://github.com/pytorch/tutorials/pull/2675 | -| Hhhhhhao | 5 | https://github.com/pytorch/tutorials/pull/2676 | -| andrewashere | 3 | https://github.com/pytorch/pytorch/pull/112721 | -| aalhendi | 3 | https://github.com/pytorch/pytorch/pull/112947 | -| sitamgithub-MSIT | 3 | https://github.com/pytorch/pytorch/pull/113264 | -| Jarlaze | 3 | https://github.com/pytorch/pytorch/pull/113531 | -| jingxu10 | 2 | https://github.com/pytorch/tutorials/pull/2657 | -| cirquit | 2 | https://github.com/pytorch/tutorials/pull/2529 | -| prithviraj-maurya | 1 | https://github.com/pytorch/tutorials/pull/2652 | -| MirMustafaAli | 1 | https://github.com/pytorch/tutorials/pull/2645 | +| ahoblitz | 25 | https://github.com/pytorch/pytorch/pull/112992, https://github.com/pytorch/tutorials/pull/2662, https://github.com/pytorch/tutorials/pull/2647, https://github.com/pytorch/tutorials/pull/2642, https://github.com/pytorch/tutorials/pull/2640, https://github.com/pytorch/pytorch/pull/113092, https://github.com/pytorch/pytorch/pull/113348 | +| ChanBong | 22 | https://github.com/pytorch/pytorch/pull/113337, https://github.com/pytorch/pytorch/pull/113336, https://github.com/pytorch/pytorch/pull/113335, https://github.com/pytorch/tutorials/pull/2644, https://github.com/pytorch/tutorials/pull/2639 | +| alperenunlu | 22 | https://github.com/pytorch/pytorch/pull/113260, https://github.com/pytorch/tutorials/pull/2673, https://github.com/pytorch/tutorials/pull/2660, https://github.com/pytorch/tutorials/pull/2656, https://github.com/pytorch/tutorials/pull/2649, https://github.com/pytorch/pytorch/pull/113505, https://github.com/pytorch/pytorch/pull/113218, https://github.com/pytorch/pytorch/pull/113505 | +| spzala | 22 | https://github.com/pytorch/pytorch/pull/113200, https://github.com/pytorch/pytorch/pull/112693, https://github.com/pytorch/tutorials/pull/2667, https://github.com/pytorch/tutorials/pull/2635 | +| bjhargrave | 21 | https://github.com/pytorch/pytorch/pull/113358, https://github.com/pytorch/pytorch/pull/113206, https://github.com/pytorch/pytorch/pull/112786, https://github.com/pytorch/tutorials/pull/2661, https://github.com/pytorch/tutorials/pull/1272 | +| zabboud | 21 | https://github.com/pytorch/pytorch/pull/113233, https://github.com/pytorch/pytorch/pull/113227, 
https://github.com/pytorch/pytorch/pull/113177, https://github.com/pytorch/pytorch/pull/113219, https://github.com/pytorch/pytorch/pull/113311 | +| nvs-abhilash | 20 | https://github.com/pytorch/pytorch/pull/113241, https://github.com/pytorch/pytorch/pull/112765, https://github.com/pytorch/pytorch/pull/112695, https://github.com/pytorch/pytorch/pull/112657 | +| guptaaryan16 | 19 | https://github.com/pytorch/pytorch/pull/112817, https://github.com/pytorch/pytorch/pull/112735, https://github.com/pytorch/tutorials/pull/2674, https://github.com/pytorch/pytorch/pull/113196, https://github.com/pytorch/pytorch/pull/113532 | +| min-jean-cho | 17 | https://github.com/pytorch/pytorch/pull/113195, https://github.com/pytorch/pytorch/pull/113183, https://github.com/pytorch/pytorch/pull/113178, https://github.com/pytorch/pytorch/pull/113109, https://github.com/pytorch/pytorch/pull/112892 | +| markstur | 14 | https://github.com/pytorch/pytorch/pull/113250, https://github.com/pytorch/tutorials/pull/2643, https://github.com/pytorch/tutorials/pull/2638, https://github.com/pytorch/tutorials/pull/2636 | +| RustyGrackle | 13 | https://github.com/pytorch/pytorch/pull/113371, https://github.com/pytorch/pytorch/pull/113266, https://github.com/pytorch/pytorch/pull/113435 | +| Viditagarwal7479 | 12 | https://github.com/pytorch/pytorch/pull/112860, https://github.com/pytorch/tutorials/pull/2659, https://github.com/pytorch/tutorials/pull/2671 | +| kiszk | 10 | https://github.com/pytorch/pytorch/pull/113523, https://github.com/pytorch/pytorch/pull/112751 | +| awaelchli | 10 | https://github.com/pytorch/pytorch/pull/113216, https://github.com/pytorch/pytorch/pull/112674 | +| pilot-j | 10 | https://github.com/pytorch/pytorch/pull/112964, https://github.com/pytorch/pytorch/pull/112856 | +| krishnakalyan3 | 7 | https://github.com/pytorch/tutorials/pull/2653, https://github.com/pytorch/tutorials/pull/1235, https://github.com/pytorch/tutorials/pull/1705 | +| ash-01xor | 5 | https://github.com/pytorch/pytorch/pull/113511 | +| IvanLauLinTiong | 5 | https://github.com/pytorch/pytorch/pull/113052 | +| Senthi1Kumar | 5 | https://github.com/pytorch/pytorch/pull/113021 | +| ooooo-create | 5 | https://github.com/pytorch/pytorch/pull/112953 | +| stanleyedward | 5 | https://github.com/pytorch/pytorch/pull/112864, https://github.com/pytorch/pytorch/pull/112617 | +| leslie-fang-intel | 5 | https://github.com/pytorch/tutorials/pull/2668 | +| measty | 5 | https://github.com/pytorch/tutorials/pull/2675 | +| Hhhhhhao | 5 | https://github.com/pytorch/tutorials/pull/2676 | +| andrewashere | 3 | https://github.com/pytorch/pytorch/pull/112721 | +| aalhendi | 3 | https://github.com/pytorch/pytorch/pull/112947 | +| sitamgithub-MSIT | 3 | https://github.com/pytorch/pytorch/pull/113264 | +| Jarlaze | 3 | https://github.com/pytorch/pytorch/pull/113531 | +| jingxu10 | 2 | https://github.com/pytorch/tutorials/pull/2657 | +| cirquit | 2 | https://github.com/pytorch/tutorials/pull/2529 | +| prithviraj-maurya | 1 | https://github.com/pytorch/tutorials/pull/2652 | +| MirMustafaAli | 1 | https://github.com/pytorch/tutorials/pull/2645 | # 🎉 Docathon H1 2023 Leaderboard 🎉 -This is the list of the docathon contributors that have participated and contributed to the PyTorch docathon. -A big shout out to everyone who have participated! We have awarded points for each merged PR. -For the **easy** label, we have awarded 2 points. For the **medium** label, we have awarded 5 points. -For the **advanced** label, we have awarded 10 points. 
In some cases, we have awarded half credit for the PRs that +This is the list of the docathon contributors that have participated and contributed to the PyTorch docathon. +A big shout out to everyone who have participated! We have awarded points for each merged PR. +For the **easy** label, we have awarded 2 points. For the **medium** label, we have awarded 5 points. +For the **advanced** label, we have awarded 10 points. In some cases, we have awarded half credit for the PRs that were not merged or issues that have been closed without a merged PR. | Author | Points | PR | |--- | --- | ---| -| JoseLuisC99 | 22 | https://github.com/pytorch/tutorials/pull/2468, https://github.com/pytorch/tutorials/pull/2404, https://github.com/pytorch/tutorials/pull/2403, https://github.com/pytorch/tutorials/pull/2372, https://github.com/pytorch/examples/pull/1163, https://github.com/pytorch/tutorials/pull/2432 | -| QasimKhan5x | 21 | https://github.com/pytorch/tutorials/pull/2452, https://github.com/pytorch/tutorials/pull/2419, https://github.com/pytorch/tutorials/pull/2408, https://github.com/pytorch/tutorials/pull/2397, https://github.com/pytorch/tutorials/pull/2385, https://github.com/pytorch/tutorials/pull/2383 | -| bjhargrave | 12 | https://github.com/pytorch/tutorials/pull/2428, https://github.com/pytorch/tutorials/pull/2424, https://github.com/pytorch/tutorials/pull/2423 | -| Aidyn-A | 10 | https://github.com/pytorch/tutorials/pull/2441 | -| CaoE | 10 | https://github.com/pytorch/tutorials/pull/2439 | -| HemanthSai7 | 10 | https://github.com/pytorch/tutorials/pull/2392, https://github.com/pytorch/tutorials/pull/2375 | -| leslie-fang-intel | 10 | https://github.com/pytorch/tutorials/pull/2354 | -| Valentine233 | 10 | https://github.com/pytorch/tutorials/pull/2430 | -| TheMemoryDealer | 9 | https://github.com/pytorch/tutorials/pull/2389, https://github.com/pytorch/tutorials/pull/2369, https://github.com/pytorch/tutorials/pull/2367 | -| arunppsg | 8 | https://github.com/pytorch/tutorials/pull/2384, https://github.com/pytorch/tutorials/pull/821 | -| noqqaqq | 7 | https://github.com/pytorch/tutorials/pull/2407, https://github.com/pytorch/tutorials/pull/2386 | -| zabboud | 7 | https://github.com/pytorch/tutorials/pull/2405, https://github.com/pytorch/tutorials/pull/2400 | -| kiersten-stokes | 7 | https://github.com/pytorch/tutorials/pull/2401, https://github.com/pytorch/tutorials/pull/2398 | -| frasertajima | 6 | https://github.com/pytorch/tutorials/pull/2370, https://github.com/pytorch/tutorials/pull/2368, https://github.com/pytorch/tutorials/pull/2363 | -| nairbv | 5 | https://github.com/pytorch/tutorials/pull/2413 | -| mikebrow | 5 | https://github.com/pytorch/tutorials/pull/2374 | -| NeoKish | 4 | https://github.com/pytorch/tutorials/pull/2364, https://github.com/pytorch/tutorials/pull/2361 | -| fabiogomez11c | 3 | https://github.com/pytorch/tutorials/pull/2362, https://github.com/pytorch/tutorials/pull/1011 | -| onurtore | 2 | https://github.com/pytorch/tutorials/pull/2458 | -| NM512 | 2 | https://github.com/pytorch/tutorials/pull/2451 | -| j3soon | 2 | https://github.com/pytorch/tutorials/pull/2420 | -| Samsonboadi | 2 | https://github.com/pytorch/tutorials/pull/2406 | -| neuralninja27 | 2 | https://github.com/pytorch/tutorials/pull/2381 | -| akjalok | 2 | https://github.com/pytorch/tutorials/pull/2380 | -| tcNickolas | 2 | https://github.com/pytorch/tutorials/pull/2378 | -| Suhas-G | 2 | https://github.com/pytorch/tutorials/pull/2371 | -| BeniaminC | 2 | https://github.com/pytorch/tutorials/pull/2366 
| -| ver2king | 2 | https://github.com/pytorch/tutorials/pull/2445, https://github.com/pytorch/tutorials/pull/2459 | -| mikgor | 1 | https://github.com/pytorch/tutorials/pull/2417 | -| spzala | 1 | https://github.com/pytorch/tutorials/pull/1579 | +| JoseLuisC99 | 22 | https://github.com/pytorch/tutorials/pull/2468, https://github.com/pytorch/tutorials/pull/2404, https://github.com/pytorch/tutorials/pull/2403, https://github.com/pytorch/tutorials/pull/2372, https://github.com/pytorch/examples/pull/1163, https://github.com/pytorch/tutorials/pull/2432 | +| QasimKhan5x | 21 | https://github.com/pytorch/tutorials/pull/2452, https://github.com/pytorch/tutorials/pull/2419, https://github.com/pytorch/tutorials/pull/2408, https://github.com/pytorch/tutorials/pull/2397, https://github.com/pytorch/tutorials/pull/2385, https://github.com/pytorch/tutorials/pull/2383 | +| bjhargrave | 12 | https://github.com/pytorch/tutorials/pull/2428, https://github.com/pytorch/tutorials/pull/2424, https://github.com/pytorch/tutorials/pull/2423 | +| Aidyn-A | 10 | https://github.com/pytorch/tutorials/pull/2441 | +| CaoE | 10 | https://github.com/pytorch/tutorials/pull/2439 | +| HemanthSai7 | 10 | https://github.com/pytorch/tutorials/pull/2392, https://github.com/pytorch/tutorials/pull/2375 | +| leslie-fang-intel | 10 | https://github.com/pytorch/tutorials/pull/2354 | +| Valentine233 | 10 | https://github.com/pytorch/tutorials/pull/2430 | +| TheMemoryDealer | 9 | https://github.com/pytorch/tutorials/pull/2389, https://github.com/pytorch/tutorials/pull/2369, https://github.com/pytorch/tutorials/pull/2367 | +| arunppsg | 8 | https://github.com/pytorch/tutorials/pull/2384, https://github.com/pytorch/tutorials/pull/821 | +| noqqaqq | 7 | https://github.com/pytorch/tutorials/pull/2407, https://github.com/pytorch/tutorials/pull/2386 | +| zabboud | 7 | https://github.com/pytorch/tutorials/pull/2405, https://github.com/pytorch/tutorials/pull/2400 | +| kiersten-stokes | 7 | https://github.com/pytorch/tutorials/pull/2401, https://github.com/pytorch/tutorials/pull/2398 | +| frasertajima | 6 | https://github.com/pytorch/tutorials/pull/2370, https://github.com/pytorch/tutorials/pull/2368, https://github.com/pytorch/tutorials/pull/2363 | +| nairbv | 5 | https://github.com/pytorch/tutorials/pull/2413 | +| mikebrow | 5 | https://github.com/pytorch/tutorials/pull/2374 | +| NeoKish | 4 | https://github.com/pytorch/tutorials/pull/2364, https://github.com/pytorch/tutorials/pull/2361 | +| fabiogomez11c | 3 | https://github.com/pytorch/tutorials/pull/2362, https://github.com/pytorch/tutorials/pull/1011 | +| onurtore | 2 | https://github.com/pytorch/tutorials/pull/2458 | +| NM512 | 2 | https://github.com/pytorch/tutorials/pull/2451 | +| j3soon | 2 | https://github.com/pytorch/tutorials/pull/2420 | +| Samsonboadi | 2 | https://github.com/pytorch/tutorials/pull/2406 | +| neuralninja27 | 2 | https://github.com/pytorch/tutorials/pull/2381 | +| akjalok | 2 | https://github.com/pytorch/tutorials/pull/2380 | +| tcNickolas | 2 | https://github.com/pytorch/tutorials/pull/2378 | +| Suhas-G | 2 | https://github.com/pytorch/tutorials/pull/2371 | +| BeniaminC | 2 | https://github.com/pytorch/tutorials/pull/2366 | +| ver2king | 2 | https://github.com/pytorch/tutorials/pull/2445, https://github.com/pytorch/tutorials/pull/2459 | +| mikgor | 1 | https://github.com/pytorch/tutorials/pull/2417 | +| spzala | 1 | https://github.com/pytorch/tutorials/pull/1579 | diff --git a/domains.rst b/domains.rst new file mode 100644 index 00000000000..fbc5fcdedd7 --- 
/dev/null +++ b/domains.rst @@ -0,0 +1,163 @@ +:orphan: + +Domains +======= + +This section contains specialized tutorials focused on applying +PyTorch to specific application areas. These guides demonstrate +how to use domain-specific libraries like torchvision, torchaudio, and +others. This section is for developers looking to implement PyTorch +in particular fields of deep learning. + +.. raw:: html + +
+ + + +
+ +
+ +
+
+ +.. Add cards below this line +.. customcarditem:: + :header: TorchVision Object Detection Finetuning Tutorial + :card_description: Finetune a pre-trained Mask R-CNN model. + :image: _static/img/thumbnails/cropped/TorchVision-Object-Detection-Finetuning-Tutorial.png + :link: intermediate/torchvision_tutorial.html + :tags: Image/Video + +.. customcarditem:: + :header: Transfer Learning for Computer Vision Tutorial + :card_description: Train a convolutional neural network for image classification using transfer learning. + :image: _static/img/thumbnails/cropped/Transfer-Learning-for-Computer-Vision-Tutorial.png + :link: beginner/transfer_learning_tutorial.html + :tags: Image/Video + +.. customcarditem:: + :header: Adversarial Example Generation + :card_description: Train a convolutional neural network for image classification using transfer learning. + :image: _static/img/thumbnails/cropped/Adversarial-Example-Generation.png + :link: beginner/fgsm_tutorial.html + :tags: Image/Video + +.. customcarditem:: + :header: DCGAN Tutorial + :card_description: Train a generative adversarial network (GAN) to generate new celebrities. + :image: _static/img/thumbnails/cropped/DCGAN-Tutorial.png + :link: beginner/dcgan_faces_tutorial.html + :tags: Image/Video + +.. customcarditem:: + :header: Spatial Transformer Networks Tutorial + :card_description: Learn how to augment your network using a visual attention mechanism. + :image: _static/img/stn/Five.gif + :link: intermediate/spatial_transformer_tutorial.html + :tags: Image/Video + +.. customcarditem:: + :header: Semi-Supervised Learning Tutorial Based on USB + :card_description: Learn how to train semi-supervised learning algorithms (on custom data) using USB and PyTorch. + :image: _static/img/usb_semisup_learn/code.png + :link: advanced/usb_semisup_learn.html + :tags: Image/Video + +.. Reinforcement Learning +.. customcarditem:: + :header: Reinforcement Learning (DQN) + :card_description: Learn how to use PyTorch to train a Deep Q Learning (DQN) agent on the CartPole-v0 task from the OpenAI Gym. + :image: _static/img/cartpole.gif + :link: intermediate/reinforcement_q_learning.html + :tags: Reinforcement-Learning + +.. customcarditem:: + :header: Reinforcement Learning (PPO) with TorchRL + :card_description: Learn how to use PyTorch and TorchRL to train a Proximal Policy Optimization agent on the Inverted Pendulum task from Gym. + :image: _static/img/invpendulum.gif + :link: intermediate/reinforcement_ppo.html + :tags: Reinforcement-Learning + +.. customcarditem:: + :header: Train a Mario-playing RL Agent + :card_description: Use PyTorch to train a Double Q-learning agent to play Mario. + :image: _static/img/mario.gif + :link: intermediate/mario_rl_tutorial.html + :tags: Reinforcement-Learning + +.. customcarditem:: + :header: Recurrent DQN + :card_description: Use TorchRL to train recurrent policies + :image: _static/img/rollout_recurrent.png + :link: intermediate/dqn_with_rnn_tutorial.html + :tags: Reinforcement-Learning + +.. customcarditem:: + :header: Code a DDPG Loss + :card_description: Use TorchRL to code a DDPG Loss + :image: _static/img/half_cheetah.gif + :link: advanced/coding_ddpg.html + :tags: Reinforcement-Learning + +.. customcarditem:: + :header: Writing your environment and transforms + :card_description: Use TorchRL to code a Pendulum + :image: _static/img/pendulum.gif + :link: advanced/pendulum.html + :tags: Reinforcement-Learning + +.. ----------------------------------------- +.. Page TOC +.. 
----------------------------------------- +.. toctree:: + :maxdepth: 1 + :includehidden: + :hidden: + :caption: Image and Video + + intermediate/torchvision_tutorial + beginner/transfer_learning_tutorial + beginner/fgsm_tutorial + beginner/dcgan_faces_tutorial + intermediate/spatial_transformer_tutorial + +.. toctree:: + :maxdepth: 2 + :includehidden: + :hidden: + :caption: Reinforcement Learning + + intermediate/reinforcement_q_learning + intermediate/reinforcement_ppo + intermediate/dqn_with_rnn_tutorial + intermediate/mario_rl_tutorial + advanced/pendulum + advanced/coding_ddpg + +.. toctree:: + :maxdepth: 2 + :includehidden: + :hidden: + :caption: Recommendation Systems + + intermediate/torchrec_intro_tutorial + advanced/sharding + +.. toctree:: + :maxdepth: 2 + :includehidden: + :hidden: + :caption: Other Domains + + See Audio tutorials on the audio website + See ExecuTorch tutorials on the ExecuTorch website diff --git a/ecosystem.rst b/ecosystem.rst new file mode 100644 index 00000000000..da2a926851a --- /dev/null +++ b/ecosystem.rst @@ -0,0 +1,70 @@ +Ecosystem +========= + +Explore tutorials that cover tools and frameworks in +the PyTorch ecosystem. These practical guides will help you leverage +PyTorch's extensive ecosystem for everything from experimentation +to production deployment. + +.. raw:: html + +
+ + + +
+ +
+ +
+
+ +.. Add tutorial cards below this line +.. customcarditem:: + :header: Hyperparameter Tuning Tutorial + :card_description: Learn how to use Ray Tune to find the best performing set of hyperparameters for your model. + :image: _static/img/ray-tune.png + :link: beginner/hyperparameter_tuning_tutorial.html + :tags: Model-Optimization,Best-Practice,Ecosystem + +.. customcarditem:: + :header: Multi-Objective Neural Architecture Search with Ax + :card_description: Learn how to use Ax to search over architectures find optimal tradeoffs between accuracy and latency. + :image: _static/img/ax_logo.png + :link: intermediate/ax_multiobjective_nas_tutorial.html + :tags: Model-Optimization,Best-Practice,Ax,TorchX,Ecosystem + +.. customcarditem:: + :header: Performance Profiling in TensorBoard + :card_description: Learn how to use the TensorBoard plugin to profile and analyze your model's performance. + :image: _static/img/thumbnails/cropped/profiler.png + :link: intermediate/tensorboard_profiler_tutorial.html + :tags: Model-Optimization,Best-Practice,Profiling,TensorBoard,Ecosystem + +.. customcarditem:: + :header: Real Time Inference on Raspberry Pi 4 + :card_description: This tutorial covers how to run quantized and fused models on a Raspberry Pi 4 at 30 fps. + :image: _static/img/thumbnails/cropped/realtime_rpi.png + :link: intermediate/realtime_rpi.html + :tags: Model-Optimization,Image/Video,Quantization,Ecosystem + +.. End of tutorial card section +.. ----------------------------------------- +.. Page TOC +.. ----------------------------------------- +.. toctree:: + :maxdepth: 2 + :hidden: + + beginner/hyperparameter_tuning_tutorial + intermediate/ax_multiobjective_nas_tutorial + intermediate/tensorboard_profiler_tutorial + intermediate/realtime_rpi diff --git a/en-wordlist.txt b/en-wordlist.txt index b52d8374d3e..baf75d75ac0 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -1,6 +1,8 @@ ACL ADI +ALiBi AOT +AOTInductor APIs ATen AVX @@ -31,6 +33,7 @@ Captum Captum's CartPole Cayley +CharRNN Chatbots Chen Colab @@ -50,6 +53,7 @@ DDP DDPG DDQN DLRM +DMA DNN DQN DataLoaders @@ -60,20 +64,27 @@ DeepLabV DeepMind DeiT DenseNet +DeviceMesh Dialogs DyNet EOS EPS Ecker +ExecuTorch ExportDB FC FGSM +tensordict +DataLoader's FLAVA FSDP FX FX's FairSeq Fastpath +FakeTensor +FakeTensors +FFN FloydHub FloydHub's Frobenius @@ -100,6 +111,7 @@ HVP Hao HistoEnc HistoEncoder +HSDP Hugging Face IMDB IOT @@ -122,6 +134,7 @@ Kihyuk Kiuk Kubernetes Kuei +KV LRSchedulers LSTM LSTMs @@ -139,6 +152,7 @@ MKLDNN MLP MLPs MNIST +MPS MUC MacBook MacOS @@ -146,6 +160,8 @@ MaskRCNN Minifier MobileNet ModelABC +MPS +MTIA Mypy NAS NCCL @@ -156,6 +172,7 @@ NLP NTK NUMA NaN +NaNs NanoGPT Netron NeurIPS @@ -181,6 +198,7 @@ PIL's PPO PatchPredictor PennFudan +Perfetto Pixman Plotly Pohjonen @@ -219,10 +237,12 @@ STR SVE SciPy Sequentials +Sharding Sigmoid SoTA Sohn Spacy +SwiGLU TCP THP TIAToolbox @@ -254,6 +274,7 @@ VLDB VQA VS Code ViT +Volterra WMT WSI WSIs @@ -267,6 +288,7 @@ Xcode Xeon Yidong YouTube +Zipf accelerometer accuracies activations @@ -296,6 +318,7 @@ bbAP benchmarked benchmarking bitwise +bool boolean breakpoint broadcasted @@ -312,6 +335,7 @@ codegen colorbar compilable composable +composability concat conda config @@ -324,6 +348,7 @@ csv cuDNN cuda customizable +customizations datafile dataflow dataframe @@ -336,11 +361,11 @@ dataset’s deallocation decompositions decorrelated -devicemesh deserialize deserialized desynchronization deterministically +devicemesh dimensionality dir discontiguous @@ -349,6 +374,8 
@@ downsample downsamples dropdown dtensor +dtype +dtypes duration elementwise embeddings @@ -359,6 +386,7 @@ enum eq equalities et +eval evaluateInput extensibility fastai @@ -368,14 +396,18 @@ fbgemm feedforward finetune finetuning +FlexAttention fp frontend functionalized +functionalizes +functionalization functorch fuser geomean globals grayscale +html handoff hardcode helpdesk @@ -384,6 +416,7 @@ hessian hessians histoencoder histologically +homonymous hotspot hvp hyperparameter @@ -395,6 +428,7 @@ inferencing initializations inlined interpretable +intra invariance io iter @@ -408,6 +442,7 @@ jpg json judgements jupyter +kernels keypoint kwargs labelled @@ -421,6 +456,7 @@ mAP macos manualSeed matmul +matmuls matplotlib memcpy memset @@ -436,6 +472,7 @@ modularized mpp mucosa multihead +MultiheadAttention multimodal multimodality multinode @@ -446,12 +483,17 @@ multithreading namespace natively ndarrays +nheads nightlies +NJT +NJTs +NJT's num numericalize numpy nvFuser nvFuser's +ok oneDNN opset optimizable @@ -459,6 +501,7 @@ optimizer's optimizers otsu overfitting +pageable parallelizable parallelization parametrization @@ -499,6 +542,7 @@ randint randn readably recomputation +reenable regressor reimplement reimplementing @@ -521,8 +565,8 @@ runtime runtime runtimes scalable +SDPA sharded -Sharding softmax sparsified sparsifier @@ -581,12 +625,16 @@ tradeoff tradeoffs triton uint +UX umap +unbacked uncomment uncommented underflowing unfused +unicode unimodal +unigram unnormalized unoptimized unparametrized @@ -608,5 +656,56 @@ warmstarted warmstarting warmup webp +wikitext wsi wsis +Meta's +RecSys +TorchRec +sharding +TBE +EBC +sharder +hyperoptimized +DMP +unsharded +lookups +KJTs +amongst +async +everytime +prototyped +GBs +HBM +gloo +nccl +Localhost +gpu +torchmetrics +url +colab +sharders +Criteo +torchrec +_batch_norm_impl_index +convolution_overrideable +aten +XPU +XPUs +impl +overrideable +TorchServe +Inductor’s +onwards +recompilations +BiasCorrection +ELU +GELU +NNCF +OpenVINO +OpenVINOQuantizer +PReLU +Quantizer +SmoothQuant +quantizer +quantizers \ No newline at end of file diff --git a/extension.rst b/extension.rst new file mode 100644 index 00000000000..ee4d4524418 --- /dev/null +++ b/extension.rst @@ -0,0 +1,106 @@ +:orphan: + +Extension +========= + +This section provides insights into extending PyTorch's capabilities. +It covers custom operations, frontend APIs, and advanced topics like +C++ extensions and dispatcher usage. + +.. raw:: html + +
+ + + +
+ +
+ +
+
+ +.. Add tutorial cards below this line +.. customcarditem:: + :header: PyTorch Custom Operators Landing Page + :card_description: This is the landing page for all things related to custom operators in PyTorch. + :image: _static/img/thumbnails/cropped/Custom-Cpp-and-CUDA-Extensions.png + :link: advanced/custom_ops_landing_page.html + :tags: Extending-PyTorch,Frontend-APIs,C++,CUDA + +.. customcarditem:: + :header: Custom Python Operators + :card_description: Create Custom Operators in Python. Useful for black-boxing a Python function for use with torch.compile. + :image: _static/img/thumbnails/cropped/Custom-Cpp-and-CUDA-Extensions.png + :link: advanced/python_custom_ops.html + :tags: Extending-PyTorch,Frontend-APIs,C++,CUDA + +.. customcarditem:: + :header: Custom C++ and CUDA Operators + :card_description: How to extend PyTorch with custom C++ and CUDA operators. + :image: _static/img/thumbnails/cropped/Custom-Cpp-and-CUDA-Extensions.png + :link: advanced/cpp_custom_ops.html + :tags: Extending-PyTorch,Frontend-APIs,C++,CUDA + +.. customcarditem:: + :header: Custom Function Tutorial: Double Backward + :card_description: Learn how to write a custom autograd Function that supports double backward. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/custom_function_double_backward_tutorial.html + :tags: Extending-PyTorch,Frontend-APIs + +.. customcarditem:: + :header: Custom Function Tutorial: Fusing Convolution and Batch Norm + :card_description: Learn how to create a custom autograd Function that fuses batch norm into a convolution to improve memory usage. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/custom_function_conv_bn_tutorial.html + :tags: Extending-PyTorch,Frontend-APIs + +.. customcarditem:: + :header: Registering a Dispatched Operator in C++ + :card_description: The dispatcher is an internal component of PyTorch which is responsible for figuring out what code should actually get run when you call a function like torch::add. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: advanced/dispatcher.html + :tags: Extending-PyTorch,Frontend-APIs,C++ + +.. customcarditem:: + :header: Extending Dispatcher For a New Backend in C++ + :card_description: Learn how to extend the dispatcher to add a new device living outside of the pytorch/pytorch repo and maintain it to keep in sync with native PyTorch devices. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: advanced/extend_dispatcher.html + :tags: Extending-PyTorch,Frontend-APIs,C++ + +.. customcarditem:: + :header: Facilitating New Backend Integration by PrivateUse1 + :card_description: Learn how to integrate a new backend living outside of the pytorch/pytorch repo and maintain it to keep in sync with the native PyTorch backend. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: advanced/privateuseone.html + :tags: Extending-PyTorch,Frontend-APIs,C++ + +.. End of tutorial card section +.. ----------------------------------------- +.. Page TOC +.. ----------------------------------------- +.. 
toctree:: + :maxdepth: 2 + :includehidden: + :hidden: + :caption: Extending PyTorch + + advanced/custom_ops_landing_page + advanced/python_custom_ops + advanced/cpp_custom_ops + intermediate/custom_function_double_backward_tutorial + intermediate/custom_function_conv_bn_tutorial + advanced/cpp_extension + advanced/dispatcher + advanced/extend_dispatcher + advanced/privateuseone diff --git a/index.rst b/index.rst index a231be4dc22..5a5e80abfbb 100644 --- a/index.rst +++ b/index.rst @@ -3,11 +3,11 @@ Welcome to PyTorch Tutorials **What's new in PyTorch tutorials?** -* `Using User-Defined Triton Kernels with torch.compile `__ -* `Large Scale Transformer model training with Tensor Parallel (TP) `__ -* `Accelerating BERT with semi-structured (2:4) sparsity `__ -* `torch.export Tutorial with torch.export.Dim `__ -* `Extension points in nn.Module for load_state_dict and tensor subclasses `__ +* `Integrating Custom Operators with SYCL for Intel GPU `__ +* `Supporting Custom C++ Classes in torch.compile/torch.export `__ +* `Accelerating torch.save and torch.load with GPUDirect Storage `__ +* `Getting Started with Fully Sharded Data Parallel (FSDP2) `__ +* `Interactive Distributed Applications with Monarch `__ .. raw:: html @@ -25,7 +25,7 @@ Welcome to PyTorch Tutorials .. customcalloutitem:: :description: Bite-size, ready-to-deploy PyTorch code examples. :header: PyTorch Recipes - :button_link: recipes/recipes_index.html + :button_link: recipes_index.html :button_text: Explore Recipes .. End of callout item section @@ -69,7 +69,7 @@ Welcome to PyTorch Tutorials :header: Introduction to PyTorch on YouTube :card_description: An introduction to building a complete ML workflow with PyTorch. Follows the PyTorch Beginner Series on YouTube. :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: beginner/introyt.html + :link: beginner/introyt/introyt_index.html :tags: Getting-Started .. customcarditem:: @@ -93,6 +93,27 @@ Welcome to PyTorch Tutorials :link: intermediate/tensorboard_tutorial.html :tags: Interpretability,Getting-Started,TensorBoard +.. customcarditem:: + :header: Good usage of `non_blocking` and `pin_memory()` in PyTorch + :card_description: A guide on best practices to copy data from CPU to GPU. + :image: _static/img/pinmem.png + :link: intermediate/pinmem_nonblock.html + :tags: Getting-Started + +.. customcarditem:: + :header: Understanding requires_grad, retain_grad, Leaf, and Non-leaf Tensors + :card_description: Learn the subtleties of requires_grad, retain_grad, leaf, and non-leaf tensors + :image: _static/img/thumbnails/cropped/understanding_leaf_vs_nonleaf.png + :link: beginner/understanding_leaf_vs_nonleaf_tutorial.html + :tags: Getting-Started + +.. customcarditem:: + :header: Visualizing Gradients in PyTorch + :card_description: Visualize the gradient flow of a network. + :image: _static/img/thumbnails/cropped/visualizing_gradients_tutorial.png + :link: intermediate/visualizing_gradients_tutorial.html + :tags: Getting-Started + .. Image/Video .. customcarditem:: @@ -109,13 +130,6 @@ Welcome to PyTorch Tutorials :link: beginner/transfer_learning_tutorial.html :tags: Image/Video -.. customcarditem:: - :header: Optimizing Vision Transformer Model - :card_description: Apply cutting-edge, attention-based transformer models to computer vision tasks. - :image: _static/img/thumbnails/cropped/60-min-blitz.png - :link: beginner/vt_tutorial.html - :tags: Image/Video - .. 
customcarditem:: :header: Adversarial Example Generation :card_description: Train a convolutional neural network for image classification using transfer learning. @@ -137,12 +151,6 @@ Welcome to PyTorch Tutorials :link: intermediate/spatial_transformer_tutorial.html :tags: Image/Video -.. customcarditem:: - :header: Inference on Whole Slide Images with TIAToolbox - :card_description: Learn how to use the TIAToolbox to perform inference on whole slide images. - :image: _static/img/thumbnails/cropped/TIAToolbox-Tutorial.png - :link: intermediate/tiatoolbox_tutorial.html - :tags: Image/Video .. customcarditem:: :header: Semi-Supervised Learning Tutorial Based on USB @@ -223,72 +231,50 @@ Welcome to PyTorch Tutorials :link: intermediate/forced_alignment_with_torchaudio_tutorial.html :tags: Audio -.. Text - -.. customcarditem:: - :header: Fast Transformer Inference with Better Transformer - :card_description: Deploy a PyTorch Transformer model using Better Transformer with high performance for inference - :image: _static/img/thumbnails/cropped/pytorch-logo.png - :link: beginner/bettertransformer_tutorial.html - :tags: Production,Text +.. NLP .. customcarditem:: :header: NLP from Scratch: Classifying Names with a Character-level RNN :card_description: Build and train a basic character-level RNN to classify word from scratch without the use of torchtext. First in a series of three tutorials. :image: _static/img/thumbnails/cropped/NLP-From-Scratch-Classifying-Names-with-a-Character-Level-RNN.png :link: intermediate/char_rnn_classification_tutorial - :tags: Text + :tags: NLP .. customcarditem:: :header: NLP from Scratch: Generating Names with a Character-level RNN :card_description: After using character-level RNN to classify names, learn how to generate names from languages. Second in a series of three tutorials. :image: _static/img/thumbnails/cropped/NLP-From-Scratch-Generating-Names-with-a-Character-Level-RNN.png :link: intermediate/char_rnn_generation_tutorial.html - :tags: Text + :tags: NLP .. customcarditem:: :header: NLP from Scratch: Translation with a Sequence-to-sequence Network and Attention :card_description: This is the third and final tutorial on doing “NLP From Scratch”, where we write our own classes and functions to preprocess the data to do our NLP modeling tasks. :image: _static/img/thumbnails/cropped/NLP-From-Scratch-Translation-with-a-Sequence-to-Sequence-Network-and-Attention.png :link: intermediate/seq2seq_translation_tutorial.html - :tags: Text - -.. customcarditem:: - :header: Text Classification with Torchtext - :card_description: Learn how to build the dataset and classify text using torchtext library. - :image: _static/img/thumbnails/cropped/Text-Classification-with-TorchText.png - :link: beginner/text_sentiment_ngrams_tutorial.html - :tags: Text - -.. customcarditem:: - :header: Language Translation with Transformer - :card_description: Train a language translation model from scratch using Transformer. - :image: _static/img/thumbnails/cropped/Language-Translation-with-TorchText.png - :link: beginner/translation_transformer.html - :tags: Text - -.. customcarditem:: - :header: Pre-process custom text dataset using Torchtext - :card_description: Learn how to use torchtext to prepare a custom dataset - :image: _static/img/thumbnails/cropped/torch_text_logo.png - :link: beginner/torchtext_custom_dataset_tutorial.html - :tags: Text - + :tags: NLP .. ONNX .. 
customcarditem:: - :header: (optional) Exporting a PyTorch model to ONNX using TorchDynamo backend and Running it using ONNX Runtime + :header: Exporting a PyTorch model to ONNX using TorchDynamo backend and Running it using ONNX Runtime :card_description: Build a image classifier model in PyTorch and convert it to ONNX before deploying it with ONNX Runtime. :image: _static/img/thumbnails/cropped/Exporting-PyTorch-Models-to-ONNX-Graphs.png :link: beginner/onnx/export_simple_model_to_onnx_tutorial.html :tags: Production,ONNX,Backends .. customcarditem:: - :header: Introduction to ONNX Registry - :card_description: Demonstrate end-to-end how to address unsupported operators by using ONNX Registry. + :header: Extending the ONNX exporter operator support + :card_description: Demonstrate end-to-end how to address unsupported operators in ONNX. :image: _static/img/thumbnails/cropped/Exporting-PyTorch-Models-to-ONNX-Graphs.png - :link: advanced/onnx_registry_tutorial.html + :link: beginner/onnx/onnx_registry_tutorial.html + :tags: Production,ONNX,Backends + +.. customcarditem:: + :header: Exporting a model with control flow to ONNX + :card_description: Demonstrate how to handle control flow logic while exporting a PyTorch model to ONNX. + :image: _static/img/thumbnails/cropped/Exporting-PyTorch-Models-to-ONNX-Graphs.png + :link: beginner/onnx/export_control_flow_model_to_onnx_tutorial.html :tags: Production,ONNX,Backends .. Reinforcement Learning @@ -337,38 +323,10 @@ Welcome to PyTorch Tutorials .. Deploying PyTorch Models in Production - -.. customcarditem:: - :header: Deploying PyTorch in Python via a REST API with Flask - :card_description: Deploy a PyTorch model using Flask and expose a REST API for model inference using the example of a pretrained DenseNet 121 model which detects the image. - :image: _static/img/thumbnails/cropped/Deploying-PyTorch-in-Python-via-a-REST-API-with-Flask.png - :link: intermediate/flask_rest_api_tutorial.html - :tags: Production - -.. customcarditem:: - :header: Introduction to TorchScript - :card_description: Introduction to TorchScript, an intermediate representation of a PyTorch model (subclass of nn.Module) that can then be run in a high-performance environment such as C++. - :image: _static/img/thumbnails/cropped/Introduction-to-TorchScript.png - :link: beginner/Intro_to_TorchScript_tutorial.html - :tags: Production,TorchScript - -.. customcarditem:: - :header: Loading a TorchScript Model in C++ - :card_description: Learn how PyTorch provides to go from an existing Python model to a serialized representation that can be loaded and executed purely from C++, with no dependency on Python. - :image: _static/img/thumbnails/cropped/Loading-a-TorchScript-Model-in-Cpp.png - :link: advanced/cpp_export.html - :tags: Production,TorchScript - -.. customcarditem:: - :header: (optional) Exporting a PyTorch Model to ONNX using TorchScript backend and Running it using ONNX Runtime - :card_description: Convert a model defined in PyTorch into the ONNX format and then run it with ONNX Runtime. - :image: _static/img/thumbnails/cropped/optional-Exporting-a-Model-from-PyTorch-to-ONNX-and-Running-it-using-ONNX-Runtime.png - :link: advanced/super_resolution_with_onnxruntime.html - :tags: Production,ONNX - .. customcarditem:: :header: Profiling PyTorch :card_description: Learn how to profile a PyTorch application + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png :link: beginner/profiler.html :tags: Profiling @@ -386,13 +344,6 @@ Welcome to PyTorch Tutorials .. 
Code Transformations with FX -.. customcarditem:: - :header: Building a Convolution/Batch Norm fuser in FX - :card_description: Build a simple FX pass that fuses batch norm into convolution to improve performance during inference. - :image: _static/img/thumbnails/cropped/Deploying-PyTorch-in-Python-via-a-REST-API-with-Flask.png - :link: intermediate/fx_conv_bn_fuser.html - :tags: FX - .. customcarditem:: :header: Building a Simple Performance Profiler with FX :card_description: Build a simple FX interpreter to record the runtime of op, module, and function calls and report statistics @@ -417,19 +368,26 @@ Welcome to PyTorch Tutorials :tags: Frontend-APIs,C++ .. customcarditem:: - :header: Python Custom Operators Landing Page + :header: PyTorch Custom Operators Landing Page :card_description: This is the landing page for all things related to custom operators in PyTorch. :image: _static/img/thumbnails/cropped/Custom-Cpp-and-CUDA-Extensions.png :link: advanced/custom_ops_landing_page.html :tags: Extending-PyTorch,Frontend-APIs,C++,CUDA .. customcarditem:: - :header: Python Custom Operators + :header: Custom Python Operators :card_description: Create Custom Operators in Python. Useful for black-boxing a Python function for use with torch.compile. :image: _static/img/thumbnails/cropped/Custom-Cpp-and-CUDA-Extensions.png :link: advanced/python_custom_ops.html :tags: Extending-PyTorch,Frontend-APIs,C++,CUDA +.. customcarditem:: + :header: Compiled Autograd: Capturing a larger backward graph for ``torch.compile`` + :card_description: Learn how to use compiled autograd to capture a larger backward graph. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/compiled_autograd_tutorial + :tags: Model-Optimization,CUDA + .. customcarditem:: :header: Custom C++ and CUDA Operators :card_description: How to extend PyTorch with custom C++ and CUDA operators. @@ -437,44 +395,9 @@ Welcome to PyTorch Tutorials :link: advanced/cpp_custom_ops.html :tags: Extending-PyTorch,Frontend-APIs,C++,CUDA -.. customcarditem:: - :header: Custom C++ and CUDA Extensions - :card_description: Create a neural network layer with no parameters using numpy. Then use scipy to create a neural network layer that has learnable weights. - :image: _static/img/thumbnails/cropped/Custom-Cpp-and-CUDA-Extensions.png - :link: advanced/cpp_extension.html - :tags: Extending-PyTorch,Frontend-APIs,C++,CUDA - -.. customcarditem:: - :header: Extending TorchScript with Custom C++ Operators - :card_description: Implement a custom TorchScript operator in C++, how to build it into a shared library, how to use it in Python to define TorchScript models and lastly how to load it into a C++ application for inference workloads. - :image: _static/img/thumbnails/cropped/Extending-TorchScript-with-Custom-Cpp-Operators.png - :link: advanced/torch_script_custom_ops.html - :tags: Extending-PyTorch,Frontend-APIs,TorchScript,C++ - -.. customcarditem:: - :header: Extending TorchScript with Custom C++ Classes - :card_description: This is a continuation of the custom operator tutorial, and introduces the API we’ve built for binding C++ classes into TorchScript and Python simultaneously. - :image: _static/img/thumbnails/cropped/Extending-TorchScript-with-Custom-Cpp-Classes.png - :link: advanced/torch_script_custom_classes.html - :tags: Extending-PyTorch,Frontend-APIs,TorchScript,C++ - -.. 
customcarditem:: - :header: Dynamic Parallelism in TorchScript - :card_description: This tutorial introduces the syntax for doing *dynamic inter-op parallelism* in TorchScript. - :image: _static/img/thumbnails/cropped/TorchScript-Parallelism.jpg - :link: advanced/torch-script-parallelism.html - :tags: Frontend-APIs,TorchScript,C++ - -.. customcarditem:: - :header: Real Time Inference on Raspberry Pi 4 - :card_description: This tutorial covers how to run quantized and fused models on a Raspberry Pi 4 at 30 fps. - :image: _static/img/thumbnails/cropped/realtime_rpi.png - :link: intermediate/realtime_rpi.html - :tags: TorchScript,Model-Optimization,Image/Video,Quantization - .. customcarditem:: :header: Autograd in C++ Frontend - :card_description: The autograd package helps build flexible and dynamic nerural netorks. In this tutorial, exploreseveral examples of doing autograd in PyTorch C++ frontend + :card_description: The autograd package helps build flexible and dynamic neural netorks. In this tutorial, explore several examples of doing autograd in PyTorch C++ frontend :image: _static/img/thumbnails/cropped/Autograd-in-Cpp-Frontend.png :link: advanced/cpp_autograd.html :tags: Frontend-APIs,C++ @@ -600,48 +523,6 @@ Welcome to PyTorch Tutorials :link: advanced/semi_structured_sparse.html :tags: Text,Model-Optimization -.. customcarditem:: - :header: (beta) Dynamic Quantization on an LSTM Word Language Model - :card_description: Apply dynamic quantization, the easiest form of quantization, to a LSTM-based next word prediction model. - :image: _static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-an-LSTM-Word-Language-Model.png - :link: advanced/dynamic_quantization_tutorial.html - :tags: Text,Quantization,Model-Optimization - -.. customcarditem:: - :header: (beta) Dynamic Quantization on BERT - :card_description: Apply the dynamic quantization on a BERT (Bidirectional Embedding Representations from Transformers) model. - :image: _static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-BERT.png - :link: intermediate/dynamic_quantization_bert_tutorial.html - :tags: Text,Quantization,Model-Optimization - -.. customcarditem:: - :header: (beta) Quantized Transfer Learning for Computer Vision Tutorial - :card_description: Extends the Transfer Learning for Computer Vision Tutorial using a quantized model. - :image: _static/img/thumbnails/cropped/60-min-blitz.png - :link: intermediate/quantized_transfer_learning_tutorial.html - :tags: Image/Video,Quantization,Model-Optimization - -.. customcarditem:: - :header: (beta) Static Quantization with Eager Mode in PyTorch - :card_description: This tutorial shows how to do post-training static quantization. - :image: _static/img/thumbnails/cropped/60-min-blitz.png - :link: advanced/static_quantization_tutorial.html - :tags: Quantization - -.. customcarditem:: - :header: Grokking PyTorch Intel CPU Performance from First Principles - :card_description: A case study on the TorchServe inference framework optimized with Intel® Extension for PyTorch. - :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: intermediate/torchserve_with_ipex - :tags: Model-Optimization,Production - -.. customcarditem:: - :header: Grokking PyTorch Intel CPU Performance from First Principles (Part 2) - :card_description: A case study on the TorchServe inference framework optimized with Intel® Extension for PyTorch (Part 2). 
- :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: intermediate/torchserve_with_ipex_2 - :tags: Model-Optimization,Production - .. customcarditem:: :header: Multi-Objective Neural Architecture Search with Ax :card_description: Learn how to use Ax to search over architectures find optimal tradeoffs between accuracy and latency. @@ -656,6 +537,20 @@ Welcome to PyTorch Tutorials :link: intermediate/torch_compile_tutorial.html :tags: Model-Optimization +.. customcarditem:: + :header: torch.compile End-to-End Tutorial + :card_description: An example of applying torch.compile to a real model, demonstrating speedups. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/torch_compile_full_example.html + :tags: Model-Optimization + +.. customcarditem:: + :header: Building a Convolution/Batch Norm fuser in torch.compile + :card_description: Build a simple pattern matcher pass that fuses batch norm into convolution to improve performance during inference. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/torch_compile_conv_bn_fuser.html + :tags: Model-Optimization + .. customcarditem:: :header: Inductor CPU Backend Debugging and Profiling :card_description: Learn the usage, debugging and performance profiling for ``torch.compile`` with Inductor CPU backend. @@ -677,6 +572,14 @@ Welcome to PyTorch Tutorials :link: beginner/knowledge_distillation_tutorial.html :tags: Model-Optimization,Image/Video + +.. customcarditem:: + :header: Accelerating PyTorch Transformers by replacing nn.Transformer with Nested Tensors and torch.compile() + :card_description: This tutorial goes over recommended best practices for implementing Transformers with native PyTorch. + :image: _static/img/thumbnails/cropped/pytorch-logo.png + :link: intermediate/transformer_building_blocks.html + :tags: Transformer + .. Parallel-and-Distributed-Training @@ -766,17 +669,32 @@ Welcome to PyTorch Tutorials :tags: Parallel-and-Distributed-Training .. customcarditem:: - :header: Getting Started with Fully Sharded Data Parallel(FSDP) - :card_description: Learn how to train models with Fully Sharded Data Parallel package. + :header: Getting Started with Fully Sharded Data Parallel (FSDP2) + :card_description: Learn how to train models with Fully Sharded Data Parallel (fully_shard) package. :image: _static/img/thumbnails/cropped/Getting-Started-with-FSDP.png :link: intermediate/FSDP_tutorial.html :tags: Parallel-and-Distributed-Training .. customcarditem:: - :header: Advanced Model Training with Fully Sharded Data Parallel (FSDP) - :card_description: Explore advanced model training with Fully Sharded Data Parallel package. - :image: _static/img/thumbnails/cropped/Getting-Started-with-FSDP.png - :link: intermediate/FSDP_adavnced_tutorial.html + :header: Introduction to Libuv TCPStore Backend + :card_description: TCPStore now uses a new server backend for faster connection and better scalability. + :image: _static/img/thumbnails/cropped/Introduction-to-Libuv-Backend-TCPStore.png + :link: intermediate/TCPStore_libuv_backend.html + :tags: Parallel-and-Distributed-Training + +.. customcarditem:: + :header: Interactive Distributed Applications with Monarch + :card_description: Learn how to spin up distributed applications using Monarch's singler controller model + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/monarch_distributed_tutorial.html + :tags: Parallel-and-Distributed-Training + + +.. 
customcarditem:: + :header: Interactive Distributed Applications with Monarch + :card_description: Learn how to use Monarch's actor framework with TorchTitan to simplify large-scale distributed training across SLURM clusters. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/monarch_distributed_tutorial.html :tags: Parallel-and-Distributed-Training .. Edge @@ -799,21 +717,21 @@ Welcome to PyTorch Tutorials :header: Using the ExecuTorch SDK to Profile a Model :card_description: Explore how to use the ExecuTorch SDK to profile, debug, and visualize ExecuTorch models :image: _static/img/ExecuTorch-Logo-cropped.svg - :link: https://pytorch.org/executorch/stable/tutorials/sdk-integration-tutorial.html + :link: https://docs.pytorch.org/executorch/main/tutorials/devtools-integration-tutorial.html :tags: Edge .. customcarditem:: :header: Building an ExecuTorch iOS Demo App :card_description: Explore how to set up the ExecuTorch iOS Demo App, which uses the MobileNet v3 model to process live camera images leveraging three different backends: XNNPACK, Core ML, and Metal Performance Shaders (MPS). :image: _static/img/ExecuTorch-Logo-cropped.svg - :link: https://pytorch.org/executorch/stable/demo-apps-ios.html + :link: https://github.com/meta-pytorch/executorch-examples/tree/main/mv3/apple/ExecuTorchDemo :tags: Edge .. customcarditem:: :header: Building an ExecuTorch Android Demo App :card_description: Learn how to set up the ExecuTorch Android Demo App for image segmentation tasks using the DeepLab v3 model and XNNPACK FP32 backend. :image: _static/img/ExecuTorch-Logo-cropped.svg - :link: https://pytorch.org/executorch/stable/demo-apps-android.html + :link: https://github.com/meta-pytorch/executorch-examples/tree/main/dl3/android/DeepLabV3Demo#executorch-android-demo-app :tags: Edge .. customcarditem:: @@ -830,7 +748,7 @@ Welcome to PyTorch Tutorials :header: Introduction to TorchRec :card_description: TorchRec is a PyTorch domain library built to provide common sparsity & parallelism primitives needed for large-scale recommender systems. :image: _static/img/thumbnails/torchrec.png - :link: intermediate/torchrec_tutorial.html + :link: intermediate/torchrec_intro_tutorial.html :tags: TorchRec,Recommender .. customcarditem:: @@ -840,15 +758,6 @@ Welcome to PyTorch Tutorials :link: advanced/sharding.html :tags: TorchRec,Recommender -.. Multimodality - -.. customcarditem:: - :header: Introduction to TorchMultimodal - :card_description: TorchMultimodal is a library that provides models, primitives and examples for training multimodal tasks - :image: _static/img/thumbnails/torchrec.png - :link: beginner/flava_finetuning_tutorial.html - :tags: TorchMultimodal - .. End of tutorial card section @@ -881,18 +790,6 @@ Additional Resources :button_link: https://pytorch.org/examples?utm_source=examples&utm_medium=examples-landing :button_text: Check Out Examples -.. customcalloutitem:: - :header: PyTorch Cheat Sheet - :description: Quick overview to essential PyTorch elements. - :button_link: beginner/ptcheat.html - :button_text: Open - -.. customcalloutitem:: - :header: Tutorials on GitHub - :description: Access PyTorch Tutorials from GitHub. - :button_link: https://github.com/pytorch/tutorials - :button_text: Go To GitHub - .. customcalloutitem:: :header: Run Tutorials on Google Colab :description: Learn how to copy tutorial data into Google Drive so that you can run tutorials on Google Colab. @@ -912,263 +809,56 @@ Additional Resources .. Page TOC .. 
----------------------------------------- .. toctree:: - :maxdepth: 2 + :glob: + :maxdepth: 1 :hidden: - :includehidden: - :caption: PyTorch Recipes - See All Recipes - See All Prototype Recipes + intro .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :hidden: - :includehidden: - :caption: Introduction to PyTorch - - beginner/basics/intro - beginner/basics/quickstart_tutorial - beginner/basics/tensorqs_tutorial - beginner/basics/data_tutorial - beginner/basics/transforms_tutorial - beginner/basics/buildmodel_tutorial - beginner/basics/autogradqs_tutorial - beginner/basics/optimization_tutorial - beginner/basics/saveloadrun_tutorial - advanced/custom_ops_landing_page -.. toctree:: - :maxdepth: 2 - :hidden: - :includehidden: - :caption: Introduction to PyTorch on YouTube - - beginner/introyt - beginner/introyt/introyt1_tutorial - beginner/introyt/tensors_deeper_tutorial - beginner/introyt/autogradyt_tutorial - beginner/introyt/modelsyt_tutorial - beginner/introyt/tensorboardyt_tutorial - beginner/introyt/trainingyt - beginner/introyt/captumyt + compilers_index .. toctree:: :maxdepth: 2 :hidden: - :includehidden: - :caption: Learning PyTorch - beginner/deep_learning_60min_blitz - beginner/pytorch_with_examples - beginner/nn_tutorial - intermediate/tensorboard_tutorial + domains -.. toctree:: - :maxdepth: 2 - :includehidden: +.. toctree:: 1 :hidden: - :caption: Image and Video - - intermediate/torchvision_tutorial - beginner/transfer_learning_tutorial - beginner/fgsm_tutorial - beginner/dcgan_faces_tutorial - intermediate/spatial_transformer_tutorial - beginner/vt_tutorial - intermediate/tiatoolbox_tutorial - -.. toctree:: :maxdepth: 2 - :includehidden: - :hidden: - :caption: Audio - - beginner/audio_io_tutorial - beginner/audio_resampling_tutorial - beginner/audio_data_augmentation_tutorial - beginner/audio_feature_extractions_tutorial - beginner/audio_feature_augmentation_tutorial - beginner/audio_datasets_tutorial - intermediate/speech_recognition_pipeline_tutorial - intermediate/speech_command_classification_with_torchaudio_tutorial - intermediate/text_to_speech_with_torchaudio - intermediate/forced_alignment_with_torchaudio_tutorial -.. toctree:: - :maxdepth: 2 - :includehidden: - :hidden: - :caption: Text - - beginner/bettertransformer_tutorial - intermediate/char_rnn_classification_tutorial - intermediate/char_rnn_generation_tutorial - intermediate/seq2seq_translation_tutorial - beginner/text_sentiment_ngrams_tutorial - beginner/translation_transformer - beginner/torchtext_custom_dataset_tutorial - - -.. toctree:: - :maxdepth: 2 - :includehidden: - :hidden: - :caption: Backends - - beginner/onnx/intro_onnx - -.. toctree:: - :maxdepth: 2 - :includehidden: - :hidden: - :caption: Reinforcement Learning - - intermediate/reinforcement_q_learning - intermediate/reinforcement_ppo - intermediate/mario_rl_tutorial - advanced/pendulum + distributed .. toctree:: - :maxdepth: 2 - :includehidden: + :maxdepth: 1 :hidden: - :caption: Deploying PyTorch Models in Production - beginner/onnx/intro_onnx - intermediate/flask_rest_api_tutorial - beginner/Intro_to_TorchScript_tutorial - advanced/cpp_export - advanced/super_resolution_with_onnxruntime - intermediate/realtime_rpi + deep-dive .. toctree:: - :maxdepth: 2 - :includehidden: + :maxdepth: 1 :hidden: - :caption: Profiling PyTorch - beginner/profiler - beginner/hta_intro_tutorial - beginner/hta_trace_diff_tutorial + extension .. 
toctree:: - :maxdepth: 2 - :includehidden: + :maxdepth: 1 :hidden: - :caption: Code Transforms with FX - intermediate/fx_conv_bn_fuser - intermediate/fx_profiling_tutorial + ecosystem .. toctree:: - :maxdepth: 2 - :includehidden: + :maxdepth: 1 :hidden: - :caption: Frontend APIs - - intermediate/memory_format_tutorial - intermediate/forward_ad_usage - intermediate/jacobians_hessians - intermediate/ensembling - intermediate/per_sample_grads - intermediate/neural_tangent_kernels.py - advanced/cpp_frontend - advanced/torch-script-parallelism - advanced/cpp_autograd -.. toctree:: - :maxdepth: 2 - :includehidden: - :hidden: - :caption: Extending PyTorch - - advanced/custom_ops_landing_page - advanced/python_custom_ops - advanced/cpp_custom_ops - intermediate/custom_function_double_backward_tutorial - intermediate/custom_function_conv_bn_tutorial - advanced/cpp_extension - advanced/torch_script_custom_ops - advanced/torch_script_custom_classes - advanced/dispatcher - advanced/extend_dispatcher - advanced/privateuseone + recipes_index .. toctree:: - :maxdepth: 2 - :includehidden: - :hidden: - :caption: Model Optimization - - beginner/profiler - intermediate/tensorboard_profiler_tutorial - beginner/hyperparameter_tuning_tutorial - beginner/vt_tutorial - intermediate/parametrizations - intermediate/pruning_tutorial - advanced/dynamic_quantization_tutorial - intermediate/dynamic_quantization_bert_tutorial - intermediate/quantized_transfer_learning_tutorial - advanced/static_quantization_tutorial - intermediate/torchserve_with_ipex - intermediate/torchserve_with_ipex_2 - intermediate/nvfuser_intro_tutorial - intermediate/ax_multiobjective_nas_tutorial - intermediate/torch_compile_tutorial - intermediate/inductor_debug_cpu - intermediate/scaled_dot_product_attention_tutorial - beginner/knowledge_distillation_tutorial - - -.. toctree:: - :maxdepth: 2 - :includehidden: - :hidden: - :caption: Parallel and Distributed Training - - distributed/home - beginner/dist_overview - beginner/ddp_series_intro - intermediate/model_parallel_tutorial - intermediate/ddp_tutorial - intermediate/dist_tuto - intermediate/FSDP_tutorial - intermediate/FSDP_adavnced_tutorial - intermediate/TP_tutorial - intermediate/pipelining_tutorial - intermediate/process_group_cpp_extension_tutorial - intermediate/rpc_tutorial - intermediate/rpc_param_server_tutorial - intermediate/rpc_async_execution - advanced/rpc_ddp_tutorial - advanced/generic_join - -.. toctree:: - :maxdepth: 2 - :includehidden: - :hidden: - :caption: Edge with ExecuTorch - - Exporting to ExecuTorch Tutorial - Running an ExecuTorch Model in C++ Tutorial < https://pytorch.org/executorch/stable/running-a-model-cpp-tutorial.html> - Using the ExecuTorch SDK to Profile a Model - Building an ExecuTorch iOS Demo App - Building an ExecuTorch Android Demo App - Lowering a Model as a Delegate - -.. toctree:: - :maxdepth: 2 - :includehidden: - :hidden: - :caption: Recommendation Systems - - intermediate/torchrec_tutorial - advanced/sharding - -.. 
toctree:: - :maxdepth: 2 - :includehidden: + :maxdepth: 1 :hidden: - :caption: Multimodality - beginner/flava_finetuning_tutorial + prototype/prototype_index diff --git a/intermediate_source/FSDP1_tutorial.rst b/intermediate_source/FSDP1_tutorial.rst new file mode 100644 index 00000000000..b983879a449 --- /dev/null +++ b/intermediate_source/FSDP1_tutorial.rst @@ -0,0 +1,448 @@ +Getting Started with Fully Sharded Data Parallel(FSDP) +====================================================== + +**Author**: `Hamid Shojanazeri `__, `Yanli Zhao `__, `Shen Li `__ + +.. note:: + FSDP1 is deprecated. Please check out `FSDP2 tutorial `_. + +Training AI models at a large scale is a challenging task that requires a lot of compute power and resources. +It also comes with considerable engineering complexity to handle the training of these very large models. +`PyTorch FSDP `__, released in PyTorch 1.11 makes this easier. + +In this tutorial, we show how to use `FSDP APIs `__, for simple MNIST models that can be extended to other larger models such as `HuggingFace BERT models `__, +`GPT 3 models up to 1T parameters `__ . The sample DDP MNIST code courtesy of `Patrick Hu `_. + + +How FSDP works +-------------- +In `DistributedDataParallel `__, (DDP) training, each process/ worker owns a replica of the model and processes a batch of data, finally it uses all-reduce to sum up gradients over different workers. In DDP the model weights and optimizer states are replicated across all workers. FSDP is a type of data parallelism that shards model parameters, optimizer states and gradients across DDP ranks. + +When training with FSDP, the GPU memory footprint is smaller than when training with DDP across all workers. This makes the training of some very large models feasible by allowing larger models or batch sizes to fit on device. This comes with the cost of increased communication volume. The communication overhead is reduced by internal optimizations like overlapping communication and computation. + +.. figure:: /_static/img/distributed/fsdp_workflow.png + :width: 100% + :align: center + :alt: FSDP workflow + + FSDP Workflow + +At a high level FSDP works as follow: + +*In constructor* + +* Shard model parameters and each rank only keeps its own shard + +*In forward path* + +* Run all_gather to collect all shards from all ranks to recover the full parameter in this FSDP unit +* Run forward computation +* Discard parameter shards it has just collected + +*In backward path* + +* Run all_gather to collect all shards from all ranks to recover the full parameter in this FSDP unit +* Run backward computation +* Run reduce_scatter to sync gradients +* Discard parameters. + +One way to view FSDP's sharding is to decompose the DDP gradient all-reduce into reduce-scatter and all-gather. Specifically, during the backward pass, FSDP reduces and scatters gradients, ensuring that each rank possesses a shard of the gradients. Then it updates the corresponding shard of the parameters in the optimizer step. Finally, in the subsequent forward pass, it performs an all-gather operation to collect and combine the updated parameter shards. + +.. figure:: /_static/img/distributed/fsdp_sharding.png + :width: 100% + :align: center + :alt: FSDP allreduce + + FSDP Allreduce + +How to use FSDP +--------------- +Here we use a toy model to run training on the MNIST dataset for demonstration purposes. The APIs and logic can be applied to training larger models as well. 
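Before walking through the full script, here is a minimal sketch of the wrapping pattern the rest of this tutorial builds out. It is only a preview: the tiny ``nn.Linear`` model is a placeholder for the ``Net`` defined later, and the real process-group setup, data loading, and training loop are covered step by step in the sections below.

.. code-block:: python

    # Minimal preview, assuming one process per GPU (for example launched with
    # torchrun, which sets RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT for us).
    import torch
    import torch.distributed as dist
    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

    dist.init_process_group("nccl")
    rank = dist.get_rank()
    torch.cuda.set_device(rank)

    model = torch.nn.Linear(16, 16).to(rank)   # stand-in for the Net defined in 2.1
    model = FSDP(model)                        # parameters are now sharded across ranks

    optim = torch.optim.Adadelta(model.parameters(), lr=1.0)
    loss = model(torch.randn(8, 16, device=rank)).sum()
    loss.backward()                            # FSDP all-gathers params and reduce-scatters grads
    optim.step()

    dist.destroy_process_group()
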
+ +*Setup* + +1.1 Install PyTorch along with Torchvision + +See the `Get Started guide `__ for information on installation. + +We add the following code snippets to a python script “FSDP_mnist.py”. + +1.2 Import necessary packages + +.. note:: + This tutorial is intended for PyTorch versions 1.12 and later. If you are using an earlier version, replace all instances of `size_based_auto_wrap_policy` with `default_auto_wrap_policy` and `fsdp_auto_wrap_policy` with `auto_wrap_policy`. + +.. code-block:: python + + # Based on: https://github.com/pytorch/examples/blob/master/mnist/main.py + import os + import argparse + import functools + import torch + import torch.nn as nn + import torch.nn.functional as F + import torch.optim as optim + from torchvision import datasets, transforms + + + from torch.optim.lr_scheduler import StepLR + + import torch.distributed as dist + import torch.multiprocessing as mp + from torch.nn.parallel import DistributedDataParallel as DDP + from torch.utils.data.distributed import DistributedSampler + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + from torch.distributed.fsdp.fully_sharded_data_parallel import ( + CPUOffload, + BackwardPrefetch, + ) + from torch.distributed.fsdp.wrap import ( + size_based_auto_wrap_policy, + enable_wrap, + wrap, + ) + +1.3 Distributed training setup. As we mentioned FSDP is a type of data parallelism which requires a distributed training environment, so here we use two helper functions to initialize the processes for distributed training and clean up. + +.. code-block:: python + + def setup(rank, world_size): + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12355' + + # initialize the process group + dist.init_process_group("nccl", rank=rank, world_size=world_size) + + def cleanup(): + dist.destroy_process_group() + +2.1 Define our toy model for handwritten digit classification. + +.. code-block:: python + + class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + +2.2 Define a train function + +.. code-block:: python + + def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None): + model.train() + ddp_loss = torch.zeros(2).to(rank) + if sampler: + sampler.set_epoch(epoch) + for batch_idx, (data, target) in enumerate(train_loader): + data, target = data.to(rank), target.to(rank) + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target, reduction='sum') + loss.backward() + optimizer.step() + ddp_loss[0] += loss.item() + ddp_loss[1] += len(data) + + dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM) + if rank == 0: + print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch, ddp_loss[0] / ddp_loss[1])) + +2.3 Define a validation function + +.. 
code-block:: python + + def test(model, rank, world_size, test_loader): + model.eval() + correct = 0 + ddp_loss = torch.zeros(3).to(rank) + with torch.no_grad(): + for data, target in test_loader: + data, target = data.to(rank), target.to(rank) + output = model(data) + ddp_loss[0] += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss + pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability + ddp_loss[1] += pred.eq(target.view_as(pred)).sum().item() + ddp_loss[2] += len(data) + + dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM) + + if rank == 0: + test_loss = ddp_loss[0] / ddp_loss[2] + print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format( + test_loss, int(ddp_loss[1]), int(ddp_loss[2]), + 100. * ddp_loss[1] / ddp_loss[2])) + +2.4 Define a distributed train function that wraps the model in FSDP + +**Note: to save the FSDP model, we need to call the state_dict on each rank then on Rank 0 save the overall states.** + +.. code-block:: python + + def fsdp_main(rank, world_size, args): + setup(rank, world_size) + + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ]) + + dataset1 = datasets.MNIST('../data', train=True, download=True, + transform=transform) + dataset2 = datasets.MNIST('../data', train=False, + transform=transform) + + sampler1 = DistributedSampler(dataset1, rank=rank, num_replicas=world_size, shuffle=True) + sampler2 = DistributedSampler(dataset2, rank=rank, num_replicas=world_size) + + train_kwargs = {'batch_size': args.batch_size, 'sampler': sampler1} + test_kwargs = {'batch_size': args.test_batch_size, 'sampler': sampler2} + cuda_kwargs = {'num_workers': 2, + 'pin_memory': True, + 'shuffle': False} + train_kwargs.update(cuda_kwargs) + test_kwargs.update(cuda_kwargs) + + train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) + test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) + my_auto_wrap_policy = functools.partial( + size_based_auto_wrap_policy, min_num_params=100 + ) + torch.cuda.set_device(rank) + + + init_start_event = torch.cuda.Event(enable_timing=True) + init_end_event = torch.cuda.Event(enable_timing=True) + + model = Net().to(rank) + + model = FSDP(model) + + optimizer = optim.Adadelta(model.parameters(), lr=args.lr) + + scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) + init_start_event.record() + for epoch in range(1, args.epochs + 1): + train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=sampler1) + test(model, rank, world_size, test_loader) + scheduler.step() + + init_end_event.record() + + if rank == 0: + init_end_event.synchronize() + print(f"CUDA event elapsed time: {init_start_event.elapsed_time(init_end_event) / 1000}sec") + print(f"{model}") + + if args.save_model: + # use a barrier to make sure training is done on all ranks + dist.barrier() + states = model.state_dict() + if rank == 0: + torch.save(states, "mnist_cnn.pt") + + cleanup() + + + +2.5 Finally, parse the arguments and set the main function + +.. 
code-block:: python + + if __name__ == '__main__': + # Training settings + parser = argparse.ArgumentParser(description='PyTorch MNIST Example') + parser.add_argument('--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') + parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + help='input batch size for testing (default: 1000)') + parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 14)') + parser.add_argument('--lr', type=float, default=1.0, metavar='LR', + help='learning rate (default: 1.0)') + parser.add_argument('--gamma', type=float, default=0.7, metavar='M', + help='Learning rate step gamma (default: 0.7)') + parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') + parser.add_argument('--seed', type=int, default=1, metavar='S', + help='random seed (default: 1)') + parser.add_argument('--save-model', action='store_true', default=False, + help='For Saving the current Model') + args = parser.parse_args() + + torch.manual_seed(args.seed) + + WORLD_SIZE = torch.cuda.device_count() + mp.spawn(fsdp_main, + args=(WORLD_SIZE, args), + nprocs=WORLD_SIZE, + join=True) + + +We have recorded cuda events to measure the time of FSDP model specifics. The CUDA event time was 110.85 seconds. + +.. code-block:: bash + + python FSDP_mnist.py + + CUDA event elapsed time on training loop 40.67462890625sec + +Wrapping the model with FSDP, the model will look as follows, we can see the model has been wrapped in one FSDP unit. +Alternatively, we will look at adding the auto_wrap_policy next and will discuss the differences. + +.. code-block:: bash + + FullyShardedDataParallel( + (_fsdp_wrapped_module): FlattenParamsWrapper( + (_fpw_module): Net( + (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1)) + (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1)) + (dropout1): Dropout(p=0.25, inplace=False) + (dropout2): Dropout(p=0.5, inplace=False) + (fc1): Linear(in_features=9216, out_features=128, bias=True) + (fc2): Linear(in_features=128, out_features=10, bias=True) + ) + ) + ) + +The following is the peak memory usage from FSDP MNIST training on g4dn.12.xlarge AWS EC2 instance with 4 GPUs captured from PyTorch Profiler. + + +.. figure:: /_static/img/distributed/FSDP_memory.gif + :width: 100% + :align: center + :alt: FSDP peak memory + + FSDP Peak Memory Usage + +Applying *auto_wrap_policy* in FSDP otherwise, FSDP will put the entire model in one FSDP unit, which will reduce computation efficiency and memory efficiency. +The way it works is that, suppose your model contains 100 Linear layers. If you do FSDP(model), there will only be one FSDP unit which wraps the entire model. +In that case, the allgather would collect the full parameters for all 100 linear layers, and hence won't save CUDA memory for parameter sharding. +Also, there is only one blocking allgather call for the all 100 linear layers, there will not be communication and computation overlapping between layers. + +To avoid that, you can pass in an auto_wrap_policy, which will seal the current FSDP unit and start a new one automatically when the specified condition is met (e.g., size limit). +In that way you will have multiple FSDP units, and only one FSDP unit needs to collect full parameters at a time. E.g., suppose you have 5 FSDP units, and each wraps 20 linear layers. 
+Then, in the forward, the 1st FSDP unit will allgather parameters for the first 20 linear layers, do computation, discard the parameters and then move on to the next 20 linear layers. So, at any point in time, each rank only materializes parameters/grads for 20 linear layers instead of 100. + + +To do so in 2.4 we define the auto_wrap_policy and pass it to FSDP wrapper, in the following example, my_auto_wrap_policy defines that a layer could be wrapped or sharded by FSDP if the number of parameters in this layer is larger than 100. +If the number of parameters in this layer is smaller than 100, it will be wrapped with other small layers together by FSDP. +Finding an optimal auto wrap policy is challenging, PyTorch will add auto tuning for this config in the future. Without an auto tuning tool, it is good to profile your workflow using different auto wrap policies experimentally and find the optimal one. + +.. code-block:: python + + my_auto_wrap_policy = functools.partial( + size_based_auto_wrap_policy, min_num_params=20000 + ) + torch.cuda.set_device(rank) + model = Net().to(rank) + + model = FSDP(model, + auto_wrap_policy=my_auto_wrap_policy) + +Applying the auto_wrap_policy, the model would be as follows: + +.. code-block:: bash + + FullyShardedDataParallel( + (_fsdp_wrapped_module): FlattenParamsWrapper( + (_fpw_module): Net( + (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1)) + (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1)) + (dropout1): Dropout(p=0.25, inplace=False) + (dropout2): Dropout(p=0.5, inplace=False) + (fc1): FullyShardedDataParallel( + (_fsdp_wrapped_module): FlattenParamsWrapper( + (_fpw_module): Linear(in_features=9216, out_features=128, bias=True) + ) + ) + (fc2): Linear(in_features=128, out_features=10, bias=True) + ) + ) + + +.. code-block:: bash + + python FSDP_mnist.py + + CUDA event elapsed time on training loop 41.89130859375sec + +The following is the peak memory usage from FSDP with auto_wrap policy of MNIST training on a g4dn.12.xlarge AWS EC2 instance with 4 GPUs captured from PyTorch Profiler. +It can be observed that the peak memory usage on each device is smaller compared to FSDP without auto wrap policy applied, from ~75 MB to 66 MB. + +.. figure:: /_static/img/distributed/FSDP_autowrap.gif + :width: 100% + :align: center + :alt: FSDP peak memory + + FSDP Peak Memory Usage using Auto_wrap policy + +*CPU Off-loading*: In case the model is very large that even with FSDP wouldn't fit into GPUs, then CPU offload can be helpful here. + +Currently, only parameter and gradient CPU offload is supported. It can be enabled via passing in cpu_offload=CPUOffload(offload_params=True). + +Note that this currently implicitly enables gradient offloading to CPU in order for params and grads to be on the same device to work with the optimizer. This API is subject to change. The default is None in which case there will be no offloading. + +Using this feature may slow down the training considerably, due to frequent copying of tensors from host to device, but it could help improve memory efficiency and train larger scale models. + +In 2.4 we just add it to the FSDP wrapper + + +.. code-block:: python + + model = FSDP(model, + auto_wrap_policy=my_auto_wrap_policy, + cpu_offload=CPUOffload(offload_params=True)) + + +Compare it with DDP, if in 2.4 we just normally wrap the model in DPP, saving the changes in “DDP_mnist.py”. + +.. code-block:: python + + model = Net().to(rank) + model = DDP(model) + + +.. 
code-block:: bash + + python DDP_mnist.py + + CUDA event elapsed time on training loop 39.77766015625sec + +The following is the peak memory usage from DDP MNIST training on g4dn.12.xlarge AWS EC2 instance with 4 GPUs captured from PyTorch profiler. + +.. figure:: /_static/img/distributed/DDP_memory.gif + :width: 100% + :align: center + :alt: FSDP peak memory + + DDP Peak Memory Usage using Auto_wrap policy + + +Considering the toy example and tiny MNIST model we defined here, we can observe the difference between peak memory usage of DDP and FSDP. +In DDP each process holds a replica of the model, so the memory footprint is higher compared to FSDP which shards the model parameters, optimizer states and gradients over DDP ranks. +The peak memory usage using FSDP with auto_wrap policy is the lowest followed by FSDP and DDP. + +Also, looking at timings, considering the small model and running the training on a single machine, FSDP with and without auto_wrap policy performed almost as fast as DDP. +This example does not represent most of the real applications, for detailed analysis and comparison between DDP and FSDP please refer to this `blog post `__ . diff --git a/intermediate_source/FSDP_adavnced_tutorial.rst b/intermediate_source/FSDP_advanced_tutorial.rst similarity index 93% rename from intermediate_source/FSDP_adavnced_tutorial.rst rename to intermediate_source/FSDP_advanced_tutorial.rst index 5a0cb5376da..bf22e6efb50 100644 --- a/intermediate_source/FSDP_adavnced_tutorial.rst +++ b/intermediate_source/FSDP_advanced_tutorial.rst @@ -6,6 +6,23 @@ Wright `__, `Rohan Varma `__, `Yanli Zhao `__ +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * PyTorch's Fully Sharded Data Parallel Module: A wrapper for sharding module parameters across + data parallel workers. + + + + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch 1.12 or later + * Read about the `FSDP API `__. + This tutorial introduces more advanced features of Fully Sharded Data Parallel (FSDP) as part of the PyTorch 1.12 release. To get familiar with FSDP, please @@ -13,18 +30,20 @@ refer to the `FSDP getting started tutorial `__. In this tutorial, we fine-tune a HuggingFace (HF) T5 model with FSDP for text -summarization as a working example. +summarization as a working example. The example uses Wikihow and for simplicity, we will showcase the training on a -single node, P4dn instance with 8 A100 GPUs. We will soon have a blog post on -large scale FSDP training on a multi-node cluster, please stay tuned for that on -the PyTorch medium channel. +single node, P4dn instance with 8 A100 GPUs. We now have several blog posts ( +`(link1), `__ +`(link2) `__) +and a `paper `__ on +large scale FSDP training on a multi-node cluster. FSDP is a production ready package with focus on ease of use, performance, and long-term support. One of the main benefits of FSDP is reducing the memory footprint on each GPU. This enables training of larger models with lower total memory vs DDP, and leverages the overlap of computation and communication to -train models efficiently. +train models efficiently. This reduced memory pressure can be leveraged to either train larger models or increase batch size, potentially helping overall training throughput. 
You can read more about PyTorch FSDP `here @@ -47,21 +66,21 @@ Recap on How FSDP Works At a high level FDSP works as follow: -*In constructor* +*In the constructor* * Shard model parameters and each rank only keeps its own shard -*In forward pass* +*In the forward pass* * Run `all_gather` to collect all shards from all ranks to recover the full - parameter for this FSDP unit Run forward computation -* Discard non-owned parameter shards it has just collected to free memory + parameter for this FSDP unit and run the forward computation +* Discard the non-owned parameter shards it has just collected to free memory -*In backward pass* +*In the backward pass* * Run `all_gather` to collect all shards from all ranks to recover the full - parameter in this FSDP unit Run backward computation -* Discard non-owned parameters to free memory. + parameter in this FSDP unit and run backward computation +* Discard non-owned parameters to free memory. * Run reduce_scatter to sync gradients @@ -80,15 +99,11 @@ examples *Setup* -1.1 Install PyTorch Nightlies - -We will install PyTorch nightlies, as some of the features such as activation -checkpointing is available in nightlies and will be added in next PyTorch -release after 1.12. +1.1 Install the latest PyTorch -.. code-block:: bash +.. code-block:: bash - pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html + pip3 install torch torchvision torchaudio 1.2 Dataset Setup @@ -154,7 +169,7 @@ Next, we add the following code snippets to a Python script “T5_training.py” import tqdm from datetime import datetime -1.4 Distributed training setup. +1.4 Distributed training setup. Here we use two helper functions to initialize the processes for distributed training, and then to clean up after training completion. In this tutorial, we are going to use torch elastic, using `torchrun @@ -191,13 +206,13 @@ metrics. date_of_run = datetime.now().strftime("%Y-%m-%d-%I:%M:%S_%p") print(f"--> current date and time of run = {date_of_run}") return date_of_run - + def format_metrics_to_gb(item): """quick function to format numbers to gigabyte and round to 4 digit precision""" metric_num = item / g_gigabyte metric_num = round(metric_num, ndigits=4) return metric_num - + 2.2 Define a train function: @@ -275,7 +290,7 @@ metrics. .. code-block:: python - + def fsdp_main(args): model, tokenizer = setup_model("t5-base") @@ -292,7 +307,7 @@ metrics. #wikihow(tokenizer, type_path, num_samples, input_length, output_length, print_text=False) - train_dataset = wikihow(tokenizer, 'train', 1500, 512, 150, False) + train_dataset = wikihow(tokenizer, 'train', 1500, 512, 150, False) val_dataset = wikihow(tokenizer, 'validation', 300, 512, 150, False) sampler1 = DistributedSampler(train_dataset, rank=rank, num_replicas=world_size, shuffle=True) @@ -430,7 +445,7 @@ metrics. .. code-block:: python - + if __name__ == '__main__': # Training settings parser = argparse.ArgumentParser(description='PyTorch T5 FSDP Example') @@ -463,7 +478,7 @@ metrics. To run the the training using torchrun: -.. code-block:: bash +.. code-block:: bash torchrun --nnodes 1 --nproc_per_node 4 T5_training.py @@ -487,7 +502,7 @@ communication efficient. In PyTorch 1.12, FSDP added this support and now we have a wrapping policy for transfomers. It can be created as follows, where the T5Block represents the T5 transformer -layer class (holding MHSA and FFN). +layer class (holding MHSA and FFN). .. 
code-block:: python @@ -499,10 +514,10 @@ layer class (holding MHSA and FFN). }, ) torch.cuda.set_device(local_rank) - + model = FSDP(model, - fsdp_auto_wrap_policy=t5_auto_wrap_policy) + auto_wrap_policy=t5_auto_wrap_policy) To see the wrapped model, you can easily print the model and visually inspect the sharding and FSDP units as well. @@ -513,22 +528,22 @@ Mixed Precision FSDP supports flexible mixed precision training allowing for arbitrary reduced precision types (such as fp16 or bfloat16). Currently BFloat16 is only available on Ampere GPUs, so you need to confirm native support before you use it. On -V100s for example, BFloat16 can still be run but due to it running non-natively, +V100s for example, BFloat16 can still be run but because it runs non-natively, it can result in significant slowdowns. To check if BFloat16 is natively supported, you can use the following : .. code-block:: python - + bf16_ready = ( torch.version.cuda - and torch.cuda.is_bf16_supported() + and torch.cuda.is_bf16_supported() and LooseVersion(torch.version.cuda) >= "11.0" and dist.is_nccl_available() and nccl.version() >= (2, 10) ) -One of the advantages of mixed percision in FSDP is providing granular control +One of the advantages of mixed precision in FSDP is providing granular control over different precision levels for parameters, gradients, and buffers as follows: @@ -571,7 +586,7 @@ with the following policy: .. code-block:: bash grad_bf16 = MixedPrecision(reduce_dtype=torch.bfloat16) - + In 2.4 we just add the relevant mixed precision policy to the FSDP wrapper: @@ -604,9 +619,9 @@ CPU-based initialization: auto_wrap_policy=t5_auto_wrap_policy, mixed_precision=bfSixteen, device_id=torch.cuda.current_device()) - - + + Sharding Strategy ----------------- FSDP sharding strategy by default is set to fully shard the model parameters, @@ -627,7 +642,7 @@ instead of "ShardingStrategy.FULL_SHARD" to the FSDP initialization as follows: sharding_strategy=ShardingStrategy.SHARD_GRAD_OP # ZERO2) This will reduce the communication overhead in FSDP, in this case, it holds full -parameters after forward and through the backwards pass. +parameters after forward and through the backwards pass. This saves an all_gather during backwards so there is less communication at the cost of a higher memory footprint. Note that full model params are freed at the @@ -652,12 +667,12 @@ wrapper in 2.4 as follows: mixed_precision=bfSixteen, device_id=torch.cuda.current_device(), backward_prefetch = BackwardPrefetch.BACKWARD_PRE) - + `backward_prefetch` has two modes, `BACKWARD_PRE` and `BACKWARD_POST`. `BACKWARD_POST` means that the next FSDP unit's params will not be requested until the current FSDP unit processing is complete, thus minimizing memory overhead. In some cases, using `BACKWARD_PRE` can increase model training speed -up to 2-10%, with even higher speed improvements noted for larger models. +up to 2-10%, with even higher speed improvements noted for larger models. Model Checkpoint Saving, by streaming to the Rank0 CPU ------------------------------------------------------ @@ -696,7 +711,7 @@ Pytorch 1.12 and used HF T5 as the running example. Using the proper wrapping policy especially for transformer models, along with mixed precision and backward prefetch should speed up your training runs. Also, features such as initializing the model on device, and checkpoint saving via streaming to CPU -should help to avoid OOM error in dealing with large models. +should help to avoid OOM error in dealing with large models. 
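Putting the pieces together, the sketch below combines the features discussed in this tutorial into a single FSDP constructor call. It is illustrative rather than prescriptive: ``t5_auto_wrap_policy`` and ``bfSixteen`` are the objects defined in the earlier sections, and the sharding strategy and prefetch mode should be chosen to match your model and hardware.

.. code-block:: python

    # Recap sketch: combine the wrapping policy, mixed precision, sharding strategy,
    # and backward prefetch covered above in one constructor call.
    from torch.distributed.fsdp import (
        FullyShardedDataParallel as FSDP,
        ShardingStrategy,
        BackwardPrefetch,
    )

    model = FSDP(
        model,
        auto_wrap_policy=t5_auto_wrap_policy,            # wrap each T5Block in its own FSDP unit
        mixed_precision=bfSixteen,                       # policy from the Mixed Precision section
        sharding_strategy=ShardingStrategy.FULL_SHARD,   # or SHARD_GRAD_OP for a ZeRO-2 style setup
        backward_prefetch=BackwardPrefetch.BACKWARD_PRE, # overlap the next all-gather with compute
        device_id=torch.cuda.current_device(),
    )
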
We are actively working to add new features to FSDP for the next release. If you have feedback, feature requests, questions or are encountering issues diff --git a/intermediate_source/FSDP_tutorial.rst b/intermediate_source/FSDP_tutorial.rst index 034225ec469..075480adbf0 100644 --- a/intermediate_source/FSDP_tutorial.rst +++ b/intermediate_source/FSDP_tutorial.rst @@ -1,447 +1,411 @@ -Getting Started with Fully Sharded Data Parallel(FSDP) +Getting Started with Fully Sharded Data Parallel (FSDP2) ====================================================== -**Author**: `Hamid Shojanazeri `__, `Yanli Zhao `__, `Shen Li `__ +**Author**: `Wei Feng `__, `Will Constable `__, `Yifan Mao `__ .. note:: - |edit| View and edit this tutorial in `github `__. + |edit| Check out the code in this tutorial from `pytorch/examples `_. FSDP1 is deprecated. FSDP1 tutorials are archived in `[1] `_ and `[2] `_ -Training AI models at a large scale is a challenging task that requires a lot of compute power and resources. -It also comes with considerable engineering complexity to handle the training of these very large models. -`PyTorch FSDP `__, released in PyTorch 1.11 makes this easier. - -In this tutorial, we show how to use `FSDP APIs `__, for simple MNIST models that can be extended to other larger models such as `HuggingFace BERT models `__, -`GPT 3 models up to 1T parameters `__ . The sample DDP MNIST code has been borrowed from `here `__. - - -How FSDP works +How FSDP2 works -------------- -In `DistributedDataParallel `__, (DDP) training, each process/ worker owns a replica of the model and processes a batch of data, finally it uses all-reduce to sum up gradients over different workers. In DDP the model weights and optimizer states are replicated across all workers. FSDP is a type of data parallelism that shards model parameters, optimizer states and gradients across DDP ranks. +In `DistributedDataParallel `__ (DDP) training, each rank owns a model replica and processes a batch of data, finally it uses all-reduce to sync gradients across ranks. -When training with FSDP, the GPU memory footprint is smaller than when training with DDP across all workers. This makes the training of some very large models feasible by allowing larger models or batch sizes to fit on device. This comes with the cost of increased communication volume. The communication overhead is reduced by internal optimizations like overlapping communication and computation. +Comparing with DDP, FSDP reduces GPU memory footprint by sharding model parameters, gradients, and optimizer states. It makes it feasible to train models that cannot fit on a single GPU. As shown below in the picture, + +* Outside of forward and backward computation, parameters are fully sharded +* Before forward and backward, sharded parameters are all-gathered into unsharded parameters +* Inside backward, local unsharded gradients are reduce-scatterred into sharded gradients +* Optimizer updates sharded parameters with sharded gradients, resulting in sharded optimizer states .. figure:: /_static/img/distributed/fsdp_workflow.png :width: 100% :align: center :alt: FSDP workflow - FSDP Workflow - -At a high level FSDP works as follow: -*In constructor* +FSDP can be considered a decomposition of DDP's all-reduce into reduce-scatter and all-gather operations -* Shard model parameters and each rank only keeps its own shard +.. 
figure:: /_static/img/distributed/fsdp_sharding.png + :width: 100% + :align: center + :alt: FSDP all-gather and reduce-scatter -*In forward path* -* Run all_gather to collect all shards from all ranks to recover the full parameter in this FSDP unit -* Run forward computation -* Discard parameter shards it has just collected +Comparing with `FSDP1 +`__, FSDP2 has following advantages: -*In backward path* +* Representing sharded parameters as `DTensor `_ sharded on dim-i, allowing for easy manipulation of individual parameters, communication-free sharded state dicts, and a simpler meta-device initialization flow. +* Improving memory management system that achieves lower and deterministic GPU memory by avoiding ``recordStream`` (`doc `_) and does so without any CPU synchronization. +* Offering a tensor subclass extension point to customize the all-gather, e.g. for float8 all-gather for float8 linears (`doc `_), and NF4 for QLoRA (`doc `_) +* Mixing frozen and non-frozen parameters can in the same communication group without using extra memory. -* Run all_gather to collect all shards from all ranks to recover the full parameter in this FSDP unit -* Run backward computation -* Run reduce_scatter to sync gradients -* Discard parameters. +How to use FSDP2 +--------------- -One way to view FSDP's sharding is to decompose the DDP gradient all-reduce into reduce-scatter and all-gather. Specifically, during the backward pass, FSDP reduces and scatters gradients, ensuring that each rank possesses a shard of the gradients. Then it updates the corresponding shard of the parameters in the optimizer step. Finally, in the subsequent forward pass, it performs an all-gather operation to collect and combine the updated parameter shards. +Model Initialization +~~~~~~~~~~~~~~~ -.. figure:: /_static/img/distributed/fsdp_sharding.png - :width: 100% - :align: center - :alt: FSDP allreduce +**Applying fully_shard on submodules**: Different from DDP, we should apply `fully_shard `_ on submodules as well as the root model. In the transformer example below, we applied ``fully_shard`` on each layer first, then the root model - FSDP Allreduce +* During forward computation of ``layers[i]``, the rest of the layers are sharded to reduce memory footprint +* Inside ``fully_shard(model)``, FSDP2 excludes parameters from ``model.layers`` and classify remaining parameters into a parameter group for performant all-gather and reduce-scatter +* ``fully_shard`` moves sharded model to actual training device (eg ``cuda``) -How to use FSDP ---------------- -Here we use a toy model to run training on the MNIST dataset for demonstration purposes. The APIs and logic can be applied to training larger models as well. -*Setup* +**Command**: ``torchrun --nproc_per_node 2 train.py`` -1.1 Install PyTorch along with Torchvision +.. code-block:: python -See the `Get Started guide `__ for information on installation. + from torch.distributed.fsdp import fully_shard, FSDPModule + model = Transformer() + for layer in model.layers: + fully_shard(layer) + fully_shard(model) -We add the following code snippets to a python script “FSDP_mnist.py”. + assert isinstance(model, Transformer) + assert isinstance(model, FSDPModule) + print(model) + # FSDPTransformer( + # (tok_embeddings): Embedding(...) + # ... + # (layers): 3 x FSDPTransformerBlock(...) + # (output): Linear(...) + # ) -1.2 Import necessary packages +We can inspect the nested wrapping with ``print(model)``. ``FSDPTransformer`` is a joint class of `Transformer `_ and `FSDPModule +`_. 
The same thing happens to `FSDPTransformerBlock `_. All FSDP2 public APIs are exposed through ``FSDPModule``. For example, users can call ``model.unshard()`` to manually control all-gather schedules. See "explicit prefetching" below for details. -.. note:: - This tutorial is intended for PyTorch versions 1.12 and later. If you are using an earlier version, replace all instances of `size_based_auto_wrap_policy` with `default_auto_wrap_policy`. +**model.parameters() as DTensor**: ``fully_shard`` shards parameters across ranks, and convert ``model.parameters()`` from plain ``torch.Tensor`` to DTensor to represent sharded parameters. FSDP2 shards on dim-0 by default so DTensor placements are `Shard(dim=0)`. Say we have N ranks and a parameter with N rows before sharding. After sharding, each rank will have 1 row of the parameter. We can inspect sharded parameters using ``param.to_local()``. .. code-block:: python - # Based on: https://github.com/pytorch/examples/blob/master/mnist/main.py - import os - import argparse - import functools - import torch - import torch.nn as nn - import torch.nn.functional as F - import torch.optim as optim - from torchvision import datasets, transforms + from torch.distributed.tensor import DTensor + for param in model.parameters(): + assert isinstance(param, DTensor) + assert param.placements == (Shard(0),) + # inspect sharded parameters with param.to_local() + optim = torch.optim.Adam(model.parameters(), lr=1e-2) - from torch.optim.lr_scheduler import StepLR +Note the optimizer is constructed after applying ``fully_shard``. Both model and optimizer state dicts are represented in DTensor. - import torch.distributed as dist - import torch.multiprocessing as mp - from torch.nn.parallel import DistributedDataParallel as DDP - from torch.utils.data.distributed import DistributedSampler - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - from torch.distributed.fsdp.fully_sharded_data_parallel import ( - CPUOffload, - BackwardPrefetch, - ) - from torch.distributed.fsdp.wrap import ( - size_based_auto_wrap_policy, - enable_wrap, - wrap, - ) +DTensor facilitates optimizer, gradient clipping and checkpointing -1.3 Distributed training setup. As we mentioned FSDP is a type of data parallelism which requires a distributed training environment, so here we use two helper functions to initialize the processes for distributed training and clean up. +* ``torch.optim.Adam`` and ``torch.nn.utils.clip_grad_norm_`` works out of the box for DTensor parameters. It makes the code consistent between single-device and distributed training +* we can use DTensor and DCP APIs to manipulate parameters to get full state dict, see "state dict" section below for details. For distributed state dicts, we can save/load checkpoints (`doc `_) without extra communication + + +Forward/Backward with Prefetching +~~~~~~~~~~~~~~~ + +**command**: ``torchrun --nproc_per_node 2 train.py`` .. code-block:: python - def setup(rank, world_size): - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '12355' + for _ in range(epochs): + x = torch.randint(0, vocab_size, (batch_size, seq_len), device=device) + loss = model(x).sum() + loss.backward() + optim.step() + optim.zero_grad() - # initialize the process group - dist.init_process_group("nccl", rank=rank, world_size=world_size) +``fully_shard`` registers forward/backward hooks to all-gather parameters before computation, and reshards parameters after computation. 
To overlap all-gathers with computation, FSDP2 offers **implicit prefetching** that works out of the box with the training loop above and **explicit prefetching** for advanced users to control all-gather schedules manually. - def cleanup(): - dist.destroy_process_group() +**Implicit Prefetching**: CPU thread issues all-gather i before layer i. All-gathers are queued into its own cuda stream while layer i computation happens in the default stream. For non-cpu-bound workload (eg Transformer with big batch size), all-gather i+1 can overlap with computation for layer i. Implicit prefetching works similarly in the backward, except all-gathers are issued in the reverse of post-forward order. -2.1 Define our toy model for handwritten digit classification. +.. figure:: /_static/img/distributed/fsdp_implicit.png + :width: 100% + :align: center + :alt: FSDP Implicit -.. code-block:: python +We recommend users to start with implicit prefetching to understand the performance out of the box. - class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 32, 3, 1) - self.conv2 = nn.Conv2d(32, 64, 3, 1) - self.dropout1 = nn.Dropout(0.25) - self.dropout2 = nn.Dropout(0.5) - self.fc1 = nn.Linear(9216, 128) - self.fc2 = nn.Linear(128, 10) - - def forward(self, x): - - x = self.conv1(x) - x = F.relu(x) - x = self.conv2(x) - x = F.relu(x) - x = F.max_pool2d(x, 2) - x = self.dropout1(x) - x = torch.flatten(x, 1) - x = self.fc1(x) - x = F.relu(x) - x = self.dropout2(x) - x = self.fc2(x) - output = F.log_softmax(x, dim=1) - return output - -2.2 Define a train function +**Explicit Prefetching**: Users can specify forward ordering with `set_modules_to_forward_prefetch `_, and backward ordering with `set_modules_to_backward_prefetch `_. As shown in the code below, CPU thread issue all-gather i + 1 and i + 2 at layer i -.. code-block:: python +Explicit prefetching works well in following situation: + +**CPU-bound workload**: If using implicit prefetching, CPU thread will be too slow to issue all-gather for layer i+1 when kernels from layer i get executed. We have to explicitly issue all-gather i+1 before running forward for layer i + +**Prefetching for 2+ layers**: Implicit prefetching only all-gathers next one layer at a time to keep memory footprint minimum. With explicit prefetching can all-gather multiple layers at a time to possibly for better perf with increased memory. See ``layers_to_prefetch`` in the code - def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None): - model.train() - ddp_loss = torch.zeros(2).to(rank) - if sampler: - sampler.set_epoch(epoch) - for batch_idx, (data, target) in enumerate(train_loader): - data, target = data.to(rank), target.to(rank) - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target, reduction='sum') - loss.backward() - optimizer.step() - ddp_loss[0] += loss.item() - ddp_loss[1] += len(data) - - dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM) - if rank == 0: - print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch, ddp_loss[0] / ddp_loss[1])) - -2.3 Define a validation function +**Issuing 1st all-gather earlier**: Implicit prefetching happens at the time of calling ``model(x)``. The 1st all-gather gets exposed. We can call `model.unshard() `_ explicitly earlier to issue 1st all-gather earlier + +**command**: ``torchrun --nproc_per_node 2 train.py --explicit-prefetching`` .. 
code-block:: python - def test(model, rank, world_size, test_loader): - model.eval() - correct = 0 - ddp_loss = torch.zeros(3).to(rank) - with torch.no_grad(): - for data, target in test_loader: - data, target = data.to(rank), target.to(rank) - output = model(data) - ddp_loss[0] += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss - pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability - ddp_loss[1] += pred.eq(target.view_as(pred)).sum().item() - ddp_loss[2] += len(data) + num_to_forward_prefetch = 2 + for i, layer in enumerate(model.layers): + if i >= len(model.layers) - num_to_forward_prefetch: + break + layers_to_prefetch = [ + model.layers[i + j] for j in range(1, num_to_forward_prefetch + 1) + ] + layer.set_modules_to_forward_prefetch(layers_to_prefetch) + + num_to_backward_prefetch = 2 + for i, layer in enumerate(model.layers): + if i < num_to_backward_prefetch: + continue + layers_to_prefetch = [ + model.layers[i - j] for j in range(1, num_to_backward_prefetch + 1) + ] + layer.set_modules_to_backward_prefetch(layers_to_prefetch) + + for _ in range(epochs): + # trigger 1st all-gather earlier + # this overlaps all-gather with any computation before model(x) + model.unshard() + x = torch.randint(0, vocab_size, (batch_size, seq_len), device=device) + loss = model(x).sum() + loss.backward() + optim.step() + optim.zero_grad() + + +Enabling Mixed Precision +~~~~~~~~~~~~~~~~~~~~~~~~ + +FSDP2 offers a flexible `mixed precision policy `_ to speed up training. One typical use case is: + +* Casting float32 parameters to bfloat16 for forward/backward computation; see ``param_dtype=torch.bfloat16`` +* Upcasting gradients to float32 for reduce-scatter to preserve accuracy; see ``reduce_dtype=torch.float32`` + Compared with `torch.amp `_, FSDP2 mixed precision has the following advantages: - dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM) +* **Performant and flexible parameter casting**: All the parameters inside an ``FSDPModule`` are cast together at the module boundary (before and after forward/backward). We can set different mixed precision policies for each layer. For example, the first few layers can be in float32 while the remaining layers can be in bfloat16. - if rank == 0: - test_loss = ddp_loss[0] / ddp_loss[2] - print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format( - test_loss, int(ddp_loss[1]), int(ddp_loss[2]), - 100. * ddp_loss[1] / ddp_loss[2])) +* **float32 gradient reduction (reduce-scatter)**: Gradients might vary a lot from rank to rank. Reducing gradients in float32 can be critical for numerics. -2.4 Define a distributed train function that wraps the model in FSDP -**Note: to save the FSDP model, we need to call the state_dict on each rank then on Rank 0 save the overall states.** + +**command**: ``torchrun --nproc_per_node 2 train.py --mixed-precision`` ..
code-block:: python - def fsdp_main(rank, world_size, args): - setup(rank, world_size) - - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ]) - - dataset1 = datasets.MNIST('../data', train=True, download=True, - transform=transform) - dataset2 = datasets.MNIST('../data', train=False, - transform=transform) - - sampler1 = DistributedSampler(dataset1, rank=rank, num_replicas=world_size, shuffle=True) - sampler2 = DistributedSampler(dataset2, rank=rank, num_replicas=world_size) - - train_kwargs = {'batch_size': args.batch_size, 'sampler': sampler1} - test_kwargs = {'batch_size': args.test_batch_size, 'sampler': sampler2} - cuda_kwargs = {'num_workers': 2, - 'pin_memory': True, - 'shuffle': False} - train_kwargs.update(cuda_kwargs) - test_kwargs.update(cuda_kwargs) - - train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) - test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) - my_auto_wrap_policy = functools.partial( - size_based_auto_wrap_policy, min_num_params=100 + model = Transformer(model_args) + fsdp_kwargs = { + "mp_policy": MixedPrecisionPolicy( + param_dtype=torch.bfloat16, + reduce_dtype=torch.float32, ) - torch.cuda.set_device(rank) - - - init_start_event = torch.cuda.Event(enable_timing=True) - init_end_event = torch.cuda.Event(enable_timing=True) + } + for layer in model.layers: + fully_shard(layer, **fsdp_kwargs) + fully_shard(model, **fsdp_kwargs) + + # sharded parameters are float32 + for param in model.parameters(): + assert param.dtype == torch.float32 + + # unsharded parameters are bfloat16 + model.unshard() + for param in model.parameters(recurse=False): + assert param.dtype == torch.bfloat16 + model.reshard() - model = Net().to(rank) + # optimizer states are in float32 + optim = torch.optim.Adam(model.parameters(), lr=1e-2) - model = FSDP(model) + # training loop + # ... - optimizer = optim.Adadelta(model.parameters(), lr=args.lr) - scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) - init_start_event.record() - for epoch in range(1, args.epochs + 1): - train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=sampler1) - test(model, rank, world_size, test_loader) - scheduler.step() - init_end_event.record() +Gradient Clipping and Optimizer with DTensor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +**command**: ``torchrun --nproc_per_node 2 train.py`` + +.. code-block:: python - if rank == 0: - print(f"CUDA event elapsed time: {init_start_event.elapsed_time(init_end_event) / 1000}sec") - print(f"{model}") + # optim is constructed base on DTensor model parameters + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + for _ in range(epochs): + x = torch.randint(0, vocab_size, (batch_size, seq_len), device=device) + loss = model(x).sum() + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm) + optim.step() + optim.zero_grad() - if args.save_model: - # use a barrier to make sure training is done on all ranks - dist.barrier() - states = model.state_dict() - if rank == 0: - torch.save(states, "mnist_cnn.pt") - - cleanup() +Optimizer is initialized after applying ``fully_shard`` on the model, and holds reference to DTensor ``model.parameters()``. For gradient clipping, ``torch.nn.utils.clip_grad_norm_`` works for DTensor parameters. Tensor ops will be dispatched correctly inside DTensor to communicate partial tensors across ranks to preserve the single device semantic. 
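+As a quick sanity check of the DTensor dispatch described above, here is a minimal sketch. It reuses ``x`` and ``max_norm`` from the loop above and assumes this tutorial's FSDP2 setup; the gradient placements it prints are what dim-0 sharding is expected to produce.
+
+.. code-block:: python
+
+    # Minimal sketch: after backward, each parameter's gradient is also a DTensor,
+    # sharded like the parameter itself, so the usual clip_grad_norm_ call can
+    # compute a global norm across ranks.
+    loss = model(x).sum()
+    loss.backward()
+    for name, param in model.named_parameters():
+        if param.grad is not None:
+            print(f"{name}: param {param.placements}, grad {param.grad.placements}")
+    total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm)
+    print(f"total_norm = {total_norm}")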
+State Dicts with DTensor APIs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +We showcase how to convert a full state dict into a DTensor state dict for loading, and how to convert it back to full state dict for saving. -2.5 Finally, parse the arguments and set the main function +**command**: ``torchrun --nproc_per_node 2 train.py`` + +* For the 1st time, it creates checkpoints for the model and optimizer +* For the 2nd time, it loads from the previous checkpoint to resume training + +**Loading state dicts**: We initialize the model under meta device and call ``fully_shard`` to convert ``model.parameters()`` from plain ``torch.Tensor`` to DTensor. After reading the full state dict from torch.load, we can call `distribute_tensor `_ to convert plain ``torch.Tensor`` into DTensor, using the same placements and device mesh from ``model.state_dict()``. Finally we can call `model.load_state_dict `_ to load DTensor state dicts into the model. .. code-block:: python - if __name__ == '__main__': - # Training settings - parser = argparse.ArgumentParser(description='PyTorch MNIST Example') - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=10, metavar='N', - help='number of epochs to train (default: 14)') - parser.add_argument('--lr', type=float, default=1.0, metavar='LR', - help='learning rate (default: 1.0)') - parser.add_argument('--gamma', type=float, default=0.7, metavar='M', - help='Learning rate step gamma (default: 0.7)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--save-model', action='store_true', default=False, - help='For Saving the current Model') - args = parser.parse_args() - - torch.manual_seed(args.seed) - - WORLD_SIZE = torch.cuda.device_count() - mp.spawn(fsdp_main, - args=(WORLD_SIZE, args), - nprocs=WORLD_SIZE, - join=True) - - -We have recorded cuda events to measure the time of FSDP model specifics. The CUDA event time was 110.85 seconds. - -.. code-block:: bash - - python FSDP_mnist.py - - CUDA event elapsed time on training loop 40.67462890625sec - -Wrapping the model with FSDP, the model will look as follows, we can see the model has been wrapped in one FSDP unit. -Alternatively, we will look at adding the fsdp_auto_wrap_policy next and will discuss the differences. - -.. 
code-block:: bash - - FullyShardedDataParallel( - (_fsdp_wrapped_module): FlattenParamsWrapper( - (_fpw_module): Net( - (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1)) - (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1)) - (dropout1): Dropout(p=0.25, inplace=False) - (dropout2): Dropout(p=0.5, inplace=False) - (fc1): Linear(in_features=9216, out_features=128, bias=True) - (fc2): Linear(in_features=128, out_features=10, bias=True) - ) + from torch.distributed.tensor import distribute_tensor + + # mmap=True reduces CPU memory usage + full_sd = torch.load( + "checkpoints/model_state_dict.pt", + mmap=True, + weights_only=True, + map_location='cpu', ) - ) + meta_sharded_sd = model.state_dict() + sharded_sd = {} + for param_name, full_tensor in full_sd.items(): + sharded_meta_param = meta_sharded_sd.get(param_name) + sharded_tensor = distribute_tensor( + full_tensor, + sharded_meta_param.device_mesh, + sharded_meta_param.placements, + ) + sharded_sd[param_name] = nn.Parameter(sharded_tensor) + # `assign=True` since we cannot call `copy_` on meta tensor + model.load_state_dict(sharded_sd, assign=True) -The following is the peak memory usage from FSDP MNIST training on g4dn.12.xlarge AWS EC2 instance with 4 GPUs captured from PyTorch Profiler. +**Saving state dicts**: ``model.state_dict()`` returns a DTensor state dict. We can convert a DTensor into a plain ``torch.Tensor`` by calling `full_tensor() `_. Internally it issues an all-gather across ranks to get unsharded parameters in plain torch.Tensor. For rank 0, ``full_param.cpu()`` offloads the tensor to cpu one by one to avoid peaking GPU memory with unsharded parameters. +.. code-block:: python + + sharded_sd = model.state_dict() + cpu_state_dict = {} + for param_name, sharded_param in sharded_sd.items(): + full_param = sharded_param.full_tensor() + if torch.distributed.get_rank() == 0: + cpu_state_dict[param_name] = full_param.cpu() + else: + del full_param + torch.save(cpu_state_dict, "checkpoints/model_state_dict.pt") -.. figure:: /_static/img/distributed/FSDP_memory.gif - :width: 100% - :align: center - :alt: FSDP peak memory - FSDP Peak Memory Usage +Optimizer state dict works similarly (`code `_). Users can customize the above DTensor scripts to work with 3rd party checkpoints. -Applying *fsdp_auto_wrap_policy* in FSDP otherwise, FSDP will put the entire model in one FSDP unit, which will reduce computation efficiency and memory efficiency. -The way it works is that, suppose your model contains 100 Linear layers. If you do FSDP(model), there will only be one FSDP unit which wraps the entire model. -In that case, the allgather would collect the full parameters for all 100 linear layers, and hence won't save CUDA memory for parameter sharding. -Also, there is only one blocking allgather call for the all 100 linear layers, there will not be communication and computation overlapping between layers. +If there is no need for customization, we can use `DCP APIs `_ directly to support both single-node and multi-node training. -To avoid that, you can pass in an fsdp_auto_wrap_policy, which will seal the current FSDP unit and start a new one automatically when the specified condition is met (e.g., size limit). -In that way you will have multiple FSDP units, and only one FSDP unit needs to collect full parameters at a time. E.g., suppose you have 5 FSDP units, and each wraps 20 linear layers. 
-Then, in the forward, the 1st FSDP unit will allgather parameters for the first 20 linear layers, do computation, discard the parameters and then move on to the next 20 linear layers. So, at any point in time, each rank only materializes parameters/grads for 20 linear layers instead of 100. +State Dict with DCP APIs +~~~~~~~~~~~~~~~~~~~~~~~~ -To do so in 2.4 we define the auto_wrap_policy and pass it to FSDP wrapper, in the following example, my_auto_wrap_policy defines that a layer could be wrapped or sharded by FSDP if the number of parameters in this layer is larger than 100. -If the number of parameters in this layer is smaller than 100, it will be wrapped with other small layers together by FSDP. -Finding an optimal auto wrap policy is challenging, PyTorch will add auto tuning for this config in the future. Without an auto tuning tool, it is good to profile your workflow using different auto wrap policies experimentally and find the optimal one. +**command**: ``torchrun --nproc_per_node 2 train.py --dcp-api`` + +* For the 1st time, it creates checkpoints for the model and optimizer +* For the 2nd time, it loads from the previous checkpoint to resume training + +**Loading state dicts**: We can load a full state dict into a FSDP2 model with `set_model_state_dict `_. With ``broadcast_from_rank0=True``, we can load the full state dict only on rank 0 to avoid peaking CPU memory. DCP will shard tensors and broadcast them to other ranks. .. code-block:: python - my_auto_wrap_policy = functools.partial( - size_based_auto_wrap_policy, min_num_params=20000 - ) - torch.cuda.set_device(rank) - model = Net().to(rank) - - model = FSDP(model, - fsdp_auto_wrap_policy=my_auto_wrap_policy) - -Applying the fsdp_auto_wrap_policy, the model would be as follows: - -.. code-block:: bash - - FullyShardedDataParallel( - (_fsdp_wrapped_module): FlattenParamsWrapper( - (_fpw_module): Net( - (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1)) - (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1)) - (dropout1): Dropout(p=0.25, inplace=False) - (dropout2): Dropout(p=0.5, inplace=False) - (fc1): FullyShardedDataParallel( - (_fsdp_wrapped_module): FlattenParamsWrapper( - (_fpw_module): Linear(in_features=9216, out_features=128, bias=True) + from torch.distributed.checkpoint.state_dict import set_model_state_dict + set_model_state_dict( + model=model, + model_state_dict=full_sd, + options=StateDictOptions( + full_state_dict=True, + broadcast_from_rank0=True, + ), + ) + +**Saving state dicts**: `get_model_state_dict `_ with ``full_state_dict=True`` and ``cpu_offload=True`` all-gathers tensors and offload them to CPU. It works similarly to DTensor APIs. + +.. code-block:: python + + from torch.distributed.checkpoint.state_dict import get_model_state_dict + model_state_dict = get_model_state_dict( + model=model, + options=StateDictOptions( + full_state_dict=True, + cpu_offload=True, ) - ) - (fc2): Linear(in_features=128, out_features=10, bias=True) ) - ) + torch.save(model_state_dict, "model_state_dict.pt") -.. code-block:: bash +Refer to `pytorch/examples `__ for loading and saving optimizer state dicts with `set_optimizer_state_dict `_ and `get_optimizer_state_dict `_. - python FSDP_mnist.py - CUDA event elapsed time on training loop 41.89130859375sec +FSDP1-to-FSDP2 migration guide +--------------- -The following is the peak memory usage from FSDP with auto_wrap policy of MNIST training on a g4dn.12.xlarge AWS EC2 instance with 4 GPUs captured from PyTorch Profiler. 
-It can be observed that the peak memory usage on each device is smaller compared to FSDP without auto wrap policy applied, from ~75 MB to 66 MB. +Let’s look at an example of an `FSDP `_ usage and an equivalent `fully_shard `_ usage. We’ll highlight the key differences and suggest steps for migration. -.. figure:: /_static/img/distributed/FSDP_autowrap.gif - :width: 100% - :align: center - :alt: FSDP peak memory +Original FSDP() usage - FSDP Peak Memory Usage using Auto_wrap policy +.. code-block:: python -*CPU Off-loading*: In case the model is very large that even with FSDP wouldn't fit into GPUs, then CPU offload can be helpful here. + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + with torch.device("meta"): + model = Transformer() + policy = ModuleWrapPolicy({TransformerBlock}) + model = FSDP(model, auto_wrap_policy=policy) + def param_init_fn(module: nn.Module) -> None: ... + model = FSDP(model, auto_wrap_policy=policy, param_init_fn=param_init_fn) -Currently, only parameter and gradient CPU offload is supported. It can be enabled via passing in cpu_offload=CPUOffload(offload_params=True). +New fully_shard() usage -Note that this currently implicitly enables gradient offloading to CPU in order for params and grads to be on the same device to work with the optimizer. This API is subject to change. The default is None in which case there will be no offloading. +.. code-block:: python -Using this feature may slow down the training considerably, due to frequent copying of tensors from host to device, but it could help improve memory efficiency and train larger scale models. + with torch.device("meta"): + model = Transformer() + for module in model.modules(): + if isinstance(module, TransformerBlock): + fully_shard(module) + fully_shard(model) + for tensor in itertools.chain(model.parameters(), model.buffers()): + assert tensor.device == torch.device("meta") -In 2.4 we just add it to the FSDP wrapper + # Initialize the model after sharding + model.to_empty(device="cuda") + model.reset_parameters() -.. code-block:: python +Migration Steps - model = FSDP(model, - fsdp_auto_wrap_policy=my_auto_wrap_policy, - cpu_offload=CPUOffload(offload_params=True)) +* Replace the imports +* Implement your ‘policy’ directly (apply ``fully_shard`` to the desired sublayers) +* Wrap your root model with ``fully_shard`` instead of ``FSDP`` +* Get rid of ``param_init_fn`` and manually call ``model.reset_parameters()`` +* Replace other FSDP1 kwargs (see below) -Compare it with DDP, if in 2.4 we just normally wrap the model in DPP, saving the changes in “DDP_mnist.py”. +sharding_strategy -.. code-block:: python +* FULL_SHARD: ``reshard_after_forward=True`` +* SHARD_GRAD_OP: ``reshard_after_forward=False`` +* HYBRID_SHARD: ``reshard_after_forward=True`` with a 2D device mesh +* _HYBRID_SHARD_ZERO2: ``reshard_after_forward=False`` with a 2D device mesh - model = Net().to(rank) - model = DDP(model) +cpu_offload +* CPUOffload.offload_params=False: ``offload_policy=None`` +* CPUOffload.offload_params = True: ``offload_policy=CPUOffloadPolicy()`` -.. code-block:: bash +backward_prefetch - python DDP_mnist.py +* BACKWARD_PRE: always used +* BACKWARD_POST: not supported - CUDA event elapsed time on training loop 39.77766015625sec +mixed_precision -The following is the peak memory usage from DDP MNIST training on g4dn.12.xlarge AWS EC2 instance with 4 GPUs captured from PyTorch profiler. 
+* ``buffer_dtype`` is omitted because fully_shard does not shard buffers +* fully_shard’s ``cast_forward_inputs`` maps to both ``cast_forward_inputs`` and ``cast_root_forward_inputs`` in FSDP1 +* ``output_dtype`` is a new config for fully_shard -.. figure:: /_static/img/distributed/DDP_memory.gif - :width: 100% - :align: center - :alt: FSDP peak memory +device_id: Inferred from device_mesh’s device + +sync_module_states=True/False: Moved to DCP. User can broadcast state dicts from rank0 using `set_model_state_dict `_ with ``broadcast_from_rank0=True`` + +forward_prefetch: Manual control over prefetching is possible with + +* Manually call `fsdp_module.unshard() `_ +* Use these APIs to control automatic prefetching, `set_modules_to_forward_prefetch `_ and `set_modules_to_backward_prefetch `_ - DDP Peak Memory Usage using Auto_wrap policy +limit_all_gathers: No longer needed, because ``fully_shard`` removed cpu synchronization +use_orig_params: Original params are always used (no more flat parameter) -Considering the toy example and tiny MNIST model we defined here, we can observe the difference between peak memory usage of DDP and FSDP. -In DDP each process holds a replica of the model, so the memory footprint is higher compared to FSDP which shards the model parameters, optimizer states and gradients over DDP ranks. -The peak memory usage using FSDP with auto_wrap policy is the lowest followed by FSDP and DDP. +no_sync(): `set_requires_gradient_sync `_ -Also, looking at timings, considering the small model and running the training on a single machine, FSDP with and without auto_wrap policy performed almost as fast as DDP. -This example does not represent most of the real applications, for detailed analysis and comparison between DDP and FSDP please refer to this `blog post `__ . +ignored_params and ignored_states: `ignored_params `_ diff --git a/intermediate_source/README.txt b/intermediate_source/README.txt index 0307e89a1a9..ecc8eb74af4 100644 --- a/intermediate_source/README.txt +++ b/intermediate_source/README.txt @@ -29,10 +29,6 @@ Intermediate tutorials Spatial Transformer Networks Tutorial https://pytorch.org/tutorials/intermediate/spatial_transformer_tutorial.html -8. flask_rest_api_tutorial.py - Deploying PyTorch and Building a REST API using Flask - https://pytorch.org/tutorials/intermediate/flask_rest_api_tutorial.html - -9. nvfuser_intro_tutorial.py +8. nvfuser_intro_tutorial.py Introduction to nvFuser https://pytorch.org/tutorials/intermediate/nvfuser_intro_tutorial.html diff --git a/intermediate_source/TCPStore_libuv_backend.rst b/intermediate_source/TCPStore_libuv_backend.rst new file mode 100644 index 00000000000..1e285eba7c4 --- /dev/null +++ b/intermediate_source/TCPStore_libuv_backend.rst @@ -0,0 +1,286 @@ +Introduction to Libuv TCPStore Backend +====================================== +**Authors**: `Xilun Wu `_ + +.. note:: + |edit| View and edit this tutorial in `github `__. + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * What is the new TCPStore backend + * Compare the new libuv backend against the legacy backend + * How to enable to use the legacy backend + + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch 2.4 or later + * Read about the `TCPStore API `__. + + +Introduction +------------ + +Recently, we have rolled out a new TCPStore server backend using `libuv `__, a third-party library for asynchronous I/O. 
This new server backend aims to +address scalability and robustness challenges in large-scale distributed training jobs, such as those with more than 1024 ranks. We ran a series of +benchmarks to compare the libuv backend against the old one, and the experiment results demonstrated significant improvements in store initialization +time and maintained a comparable performance in store I/O operations. + +As a result of these findings, the libuv backend has been set as the default TCPStore server backend in PyTorch 2.4. This change is expected to enhance +the performance and scalability of distributed training jobs. + +This change introduces a slight incompatibility to store initialization. For users who wish to continue using the legacy backend, the tutorial will +provide guidance on how to specify to use the previous TCPStore server backend. + + +Performance Benchmark +--------------------- + +To better demonstrate the benefit of our new libuv TCPStore backend, we set up a benchmark over a wide range of job size, from 1024 (1K) to 98304 (96K) ranks. +We first measured the TCPStore initialization time using the code snippet below: + +.. code:: python + + import logging + import os + + from time import perf_counter + + import torch + import torch.distributed as dist + + logger: logging.Logger = logging.getLogger(__name__) + + # Env var are preset when launching the benchmark + env_rank = os.environ.get("RANK", 0) + env_world_size = os.environ.get("WORLD_SIZE", 1) + env_master_addr = os.environ.get("MASTER_ADDR", "localhost") + env_master_port = os.environ.get("MASTER_PORT", "23456") + + start = perf_counter() + tcp_store = dist.TCPStore( + env_master_addr, + int(env_master_port), + world_size=int(env_world_size), + is_master=(int(env_rank) == 0), + ) + end = perf_counter() + time_elapsed = end - start + logger.info( + f"Complete TCPStore init with rank={env_rank}, world_size={env_world_size} in {time_elapsed} seconds." + ) + +Since the execution of the TCPStore server thread will be blocked until all clients are successfully connected, we take the time measured on rank 0 as the total +TCPStore initialization runtime. The experiment numbers are reported in the figure below: + +.. figure:: /_static/img/distributed/tcpstore_init_time.png + :width: 100% + :align: center + :alt: TCPStore Initialization Runtime Benchmark Result + +Figure 1. shows some significant evidence that the libuv backend is superior to the legacy backend: + +- TCPStore with libuv backend always has a faster initialization than the legacy backend, especially at super-large scale +- The legacy backend would timeout at server-client connecting at 96K scale (for example, over 30 minutes) while the libuv backend completed the initialization in 100 seconds. + +The second benchmark we did is to measure the runtime of TCPStore ``store_based_barrier`` operation: + +.. 
code:: python + + import logging + import os + import time + + from datetime import timedelta + from time import perf_counter + + import torch + import torch.distributed as dist + + DistStoreError = torch._C._DistStoreError + logger: logging.Logger = logging.getLogger(__name__) + + # since dist._store_based_barrier is a private function and cannot be directly called, we need to write a function which does the same + def store_based_barrier( + rank, + store, + group_name, + rendezvous_count, + timeout=dist.constants.default_pg_timeout, + logging_interval=timedelta(seconds=10), + ): + store_key = f"store_based_barrier_key:{group_name}" + store.add(store_key, 1) + + world_size = rendezvous_count + worker_count = store.add(store_key, 0) + + last_worker_key = f"{store_key}:last_worker" + if worker_count == world_size: + store.set(last_worker_key, "1") + + start = time.time() + while True: + try: + # This will throw an exception after the logging_interval in which we print out + # the status of the group or time out officially, throwing runtime error + store.wait([last_worker_key], logging_interval) + break + except RuntimeError as e: + worker_count = store.add(store_key, 0) + # Print status periodically to keep track. + logger.info( + "Waiting in store based barrier to initialize process group for " + "rank: %s, key: %s (world_size=%s, num_workers_joined=%s, timeout=%s)" + "error: %s", + rank, + store_key, + world_size, + worker_count, + timeout, + e, + ) + + if timedelta(seconds=(time.time() - start)) > timeout: + raise DistStoreError( + "Timed out initializing process group in store based barrier on " + "rank {}, for key: {} (world_size={}, num_workers_joined={}, timeout={})".format( + rank, store_key, world_size, worker_count, timeout + ) + ) + + logger.info( + "Rank %s: Completed store-based barrier for key:%s with %s nodes.", + rank, + store_key, + world_size, + ) + + # Env var are preset when launching the benchmark + env_rank = os.environ.get("RANK", 0) + env_world_size = os.environ.get("WORLD_SIZE", 1) + env_master_addr = os.environ.get("MASTER_ADDR", "localhost") + env_master_port = os.environ.get("MASTER_PORT", "23456") + + tcp_store = dist.TCPStore( + env_master_addr, + int(env_master_port), + world_size=int(env_world_size), + is_master=(int(env_rank) == 0), + ) + + # sync workers + store_based_barrier(int(env_rank), tcp_store, "tcpstore_test", int(env_world_size)) + + number_runs = 10 + start = perf_counter() + for _ in range(number_runs): + store_based_barrier( + int(env_rank), tcp_store, "tcpstore_test", int(env_world_size) + ) + end = perf_counter() + time_elapsed = end - start + logger.info( + f"Complete {number_runs} TCPStore barrier runs with rank={env_rank}, world_size={env_world_size} in {time_elapsed} seconds." + ) + +We compute the average by dividing the runtime measured on rank 0 by ``number_runs`` and report it in the figure below: + +.. figure:: /_static/img/distributed/tcpstore_barrier_time.png + :width: 100% + :align: center + :alt: TCPStore Barrier Runtime Benchmark Result + +Figure 2. shows that the I/O performance of libuv backend is comparable to the legacy backend: + +- The libuv backend has a comparable performance over the whole spectrum in terms of the number of ranks +- The libuv backend runtime is more stable than the legacy backend as the number of ranks grows + + +Impact +------ + +One incompatibility that users may need to pay attention is, TCPStore currently does not support initialization with a ``listen_fd`` when using libuv backend. 
+If the user wants to keep using this initialization method, they can simply pass ``use_libuv=False`` to stay with the old TCPStore backend. + +.. code:: python + + import socket + + import torch + import torch.distributed as dist + + listen_sock: socket.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + listen_sock.bind(("localhost", 0)) + addr, port, *_ = listen_sock.getsockname() + listen_fd = listen_sock.detach() + + tcpstore = dist.TCPStore(addr, port, 1, True, master_listen_fd=listen_fd) # expect NotImplementedError + tcpstore = dist.TCPStore(addr, port, 1, True, master_listen_fd=listen_fd, use_libuv=False) # OK. Use legacy backend + + +Exit Route 1: Pass ``use_libuv=False`` to TCPStore Initialization +----------------------------------------------------------------- + +As the above code snippet shows, if the user calls the TCPStore init method to create a store, simply passing ``use_libuv=False`` allows the user to keep using the old +TCPStore backend. This override takes the highest priority over the other approaches that determine which backend the TCPStore server should choose. + + +Exit Route 2: Add ``use_libuv=0`` to ``init_method`` at ProcessGroup Initialization +----------------------------------------------------------------------------------- + +``ProcessGroup`` creates a TCPStore if the user does not explicitly pass one to its initialization. The user can add the query option ``use_libuv=0`` to ``init_method`` when +initializing the ``ProcessGroup``. This approach has lower priority than Exit Route 1. + +.. code:: python + + import torch + import torch.distributed as dist + + addr = "localhost" + port = 23456 + dist.init_process_group( + backend="cpu:gloo,cuda:nccl", + rank=0, + world_size=1, + init_method=f"tcp://{addr}:{port}?use_libuv=0", + ) + dist.destroy_process_group() + + +Exit Route 3: Set Environment Variable ``USE_LIBUV`` to ``0`` +------------------------------------------------------------- + +When ProcessGroup creates a TCPStore, it also checks the environment variable ``USE_LIBUV`` to determine which TCPStore backend to use. The user can set the environment +variable ``"USE_LIBUV"`` to ``"0"`` to specify the use of the old TCPStore backend. This approach has lower priority than Exit Route 2. For example, if the user sets the environment +variable ``USE_LIBUV`` to ``1`` and also passes ``use_libuv=0`` in ``init_method``, then the old store backend will be chosen. + +.. code:: python + + import os + + import torch + import torch.distributed as dist + + addr = "localhost" + port = 23456 + os.environ["USE_LIBUV"] = "0" + dist.init_process_group( + backend="cpu:gloo,cuda:nccl", + rank=0, + world_size=1, + init_method=f"tcp://{addr}:{port}", + ) + dist.destroy_process_group() + + +Conclusion +---------- +In PyTorch 2.4, we made the new libuv TCPStore backend the default. Although the new backend is incompatible with initialization from a ``listen_fd``, it +shows a significant performance improvement in store initialization at large scale and comparable performance in store I/O at small/medium/large scales, which +brings a major benefit to Distributed Training's control plane. This tutorial explains our motivation, goes through the performance benchmark, notifies users +of the potential impact, and introduces three exit routes for continuing to use the legacy backend. In the long term, we aim to eventually deprecate the legacy backend.
diff --git a/intermediate_source/TP_tutorial.rst b/intermediate_source/TP_tutorial.rst index 2d0193990d4..6d3e7b60c68 100644 --- a/intermediate_source/TP_tutorial.rst +++ b/intermediate_source/TP_tutorial.rst @@ -83,8 +83,6 @@ To see how to utilize DeviceMesh to set up multi-dimensional parallelisms, pleas .. code-block:: python - # run this via torchrun: torchrun --standalone --nproc_per_node=8 ./tp_tutorial.py - from torch.distributed.device_mesh import init_device_mesh tp_mesh = init_device_mesh("cuda", (8,)) @@ -130,9 +128,9 @@ q/k/v projection and row-wise sharding for the ``wo`` linear projection. So we c layer_tp_plan = { # by default ColwiseParallel input layouts is replicated # and RowwiseParallel output layouts is replicated - "attention.wq": ColwiseParallel(), - "attention.wk": ColwiseParallel(), - "attention.wv": ColwiseParallel(), + "attention.wq": ColwiseParallel(use_local_output=False), + "attention.wk": ColwiseParallel(use_local_output=False), + "attention.wv": ColwiseParallel(use_local_output=False), "attention.wo": RowwiseParallel(), "feed_forward.w1": ColwiseParallel(), "feed_forward.w2": RowwiseParallel(), @@ -143,7 +141,7 @@ q/k/v projection and row-wise sharding for the ``wo`` linear projection. So we c This is almost the ``layer_tp_plan`` we need to apply Tensor Parallelism to the ``TransformerBlock``. However, one thing we should be aware is that when sharding the linear layer column-wise, the output of the linear layers would become sharded on the last tensor dimension, and the row-wise sharding linear layer directly accepts an input that shards on the last dimension. If there are any more tensor operations (such as view operations) between the column-wise linear and the row-wise linear, we would need to adjust the relevant shape related ops to sharded shape. -For the Llama model, in the attention layer there are couple of view operations that are shape related. In particular, column-wise parallel for ``wq``/ ``wk``/ ``wv`` linear layers, the activation tensor is sharded on the ``num_heads`` dimension, so we would need to adjust the ``num_heads`` to local ``num_heads``. +For the Llama model, in the attention layer, there are several view operations related to shape. Specifically, for column-wise parallelism in the ``wq``/``wk``/``wv`` linear layers, the activation tensor is sharded on the ``num_heads`` dimension. To manage the difference between global and local ``num_heads``, we should set ``use_local_output=False`` to ensure the output is a DTensor. Unlike a regular tensor, a DTensor is aware of the parallelism plans and will automatically handle changes in the ``num_heads`` dimension. Finally, we need to call ``parallelize_module`` API to make the plan for each ``TransformerBlock`` effective. Under the hood, it distributes the model parameters inside ``Attention`` and ``FeedForward`` layers to DTensors, and registers communication hooks for model inputs and outputs (before and after each module respectively), if necessary: @@ -152,11 +150,6 @@ Finally, we need to call ``parallelize_module`` API to make the plan for each `` for layer_id, transformer_block in enumerate(model.layers): layer_tp_plan = {...} # i.e. 
the plan we just generated - # Adjust attention module to use the local number of heads - attn_layer = transformer_block.attention - attn_layer.n_heads = attn_layer.n_heads // tp_mesh.size() - attn_layer.n_kv_heads = attn_layer.n_kv_heads // tp_mesh.size() - parallelize_module( module=transformer_block, device_mesh=tp_mesh, @@ -221,12 +214,12 @@ Next let's adjust the ``layer_tp_plan`` to enable sequence parallel on the ``RMS # to represent the input/output tensors sharded on the sequence dimension "attention_norm": SequenceParallel(), "attention": PrepareModuleInput( - input_layouts=(Shard(1),), - desired_input_layouts=(Replicate(),), + input_layouts=(Shard(1), Replicate()), + desired_input_layouts=(Replicate(), Replicate()), ), - "attention.wq": ColwiseParallel(), - "attention.wk": ColwiseParallel(), - "attention.wv": ColwiseParallel(), + "attention.wq": ColwiseParallel(use_local_output=False), + "attention.wk": ColwiseParallel(use_local_output=False), + "attention.wv": ColwiseParallel(use_local_output=False), "attention.wo": RowwiseParallel(output_layouts=Shard(1)), "ffn_norm": SequenceParallel(), "feed_forward": PrepareModuleInput( @@ -335,7 +328,7 @@ This 2-D parallelism pattern can be easily expressed via a 2-D DeviceMesh, and w from torch.distributed.device_mesh import init_device_mesh from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, parallelize_module - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + from torch.distributed.fsdp import fully_shard # i.e. 2-D mesh is [dp, tp], training on 64 GPUs that performs 8 way DP and 8 way TP mesh_2d = init_device_mesh("cuda", (8, 8)) @@ -349,7 +342,7 @@ This 2-D parallelism pattern can be easily expressed via a 2-D DeviceMesh, and w # apply Tensor Parallel intra-host on tp_mesh model_tp = parallelize_module(model, tp_mesh, tp_plan) # apply FSDP inter-host on dp_mesh - model_2d = FSDP(model_tp, device_mesh=dp_mesh, use_orig_params=True, ...) + model_2d = fully_shard(model_tp, mesh=dp_mesh, ...) This would allow us to easily apply Tensor Parallel within each host (intra-host) and apply FSDP across hosts (inter-hosts), with **0-code changes** to the Llama model. @@ -360,4 +353,4 @@ Conclusion This tutorial demonstrates how to train a large Transformer-like model across hundreds to thousands of GPUs using Tensor Parallel in combination with Fully Sharded Data Parallel. It explains how to apply Tensor Parallel to different parts of the model, with **no code changes** to the model itself. Tensor Parallel is a efficient model parallelism technique for large scale training. -To see the complete end to end code example explained in this tutorial, please refer to the `Tensor Parallel examples `__ in the pytorch/examples repository. +To see the complete end-to-end code example explained in this tutorial, please refer to the `Tensor Parallel examples `__ in the pytorch/examples repository. 
diff --git a/intermediate_source/autograd_saved_tensors_hooks_tutorial.py b/intermediate_source/autograd_saved_tensors_hooks_tutorial.py index f16b170ee6a..ed581426c2e 100644 --- a/intermediate_source/autograd_saved_tensors_hooks_tutorial.py +++ b/intermediate_source/autograd_saved_tensors_hooks_tutorial.py @@ -397,7 +397,7 @@ def pack_hook(tensor): return name def unpack_hook(name): - return torch.load(name) + return torch.load(name, weights_only=True) ###################################################################### @@ -420,7 +420,7 @@ def pack_hook(tensor): return name def unpack_hook(name): - tensor = torch.load(name) + tensor = torch.load(name, weights_only=True) os.remove(name) return tensor @@ -462,7 +462,7 @@ def pack_hook(tensor): return temp_file def unpack_hook(temp_file): - return torch.load(temp_file.name) + return torch.load(temp_file.name, weights_only=True) ###################################################################### diff --git a/intermediate_source/ax_multiobjective_nas_tutorial.py b/intermediate_source/ax_multiobjective_nas_tutorial.py index 79b096b9e64..0f1ae21a556 100644 --- a/intermediate_source/ax_multiobjective_nas_tutorial.py +++ b/intermediate_source/ax_multiobjective_nas_tutorial.py @@ -232,21 +232,21 @@ def trainer( # we get the logic to read and parse the TensorBoard logs for free. # -from ax.metrics.tensorboard import TensorboardCurveMetric +from ax.metrics.tensorboard import TensorboardMetric +from tensorboard.backend.event_processing import plugin_event_multiplexer as event_multiplexer - -class MyTensorboardMetric(TensorboardCurveMetric): +class MyTensorboardMetric(TensorboardMetric): # NOTE: We need to tell the new TensorBoard metric how to get the id / # file handle for the TensorBoard logs from a trial. In this case # our convention is to just save a separate file per trial in # the prespecified log dir. - @classmethod - def get_ids_from_trials(cls, trials): - return { - trial.index: Path(log_dir).joinpath(str(trial.index)).as_posix() - for trial in trials - } + def _get_event_multiplexer_for_trial(self, trial): + mul = event_multiplexer.EventMultiplexer(max_reload_threads=20) + mul.AddRunsFromDirectory(Path(log_dir).joinpath(str(trial.index)).as_posix(), None) + mul.Reload() + + return mul # This indicates whether the metric is queryable while the trial is # still running. We don't use this in the current tutorial, but Ax @@ -266,12 +266,12 @@ def is_available_while_running(cls): val_acc = MyTensorboardMetric( name="val_acc", - curve_name="val_acc", + tag="val_acc", lower_is_better=False, ) model_num_params = MyTensorboardMetric( name="num_params", - curve_name="num_params", + tag="num_params", lower_is_better=True, ) diff --git a/intermediate_source/char_rnn_classification_tutorial.py b/intermediate_source/char_rnn_classification_tutorial.py index 8451f07b829..04cfb16f627 100644 --- a/intermediate_source/char_rnn_classification_tutorial.py +++ b/intermediate_source/char_rnn_classification_tutorial.py @@ -4,13 +4,18 @@ ************************************************************** **Author**: `Sean Robertson `_ +This tutorials is part of a three-part series: + +* `NLP From Scratch: Classifying Names with a Character-Level RNN `__ +* `NLP From Scratch: Generating Names with a Character-Level RNN `__ +* `NLP From Scratch: Translation with a Sequence to Sequence Network and Attention `__ + We will be building and training a basic character-level Recurrent Neural Network (RNN) to classify words. 
This tutorial, along with two other Natural Language Processing (NLP) "from scratch" tutorials :doc:`/intermediate/char_rnn_generation_tutorial` and :doc:`/intermediate/seq2seq_translation_tutorial`, show how to -preprocess data to model NLP. In particular these tutorials do not -use many of the convenience functions of `torchtext`, so you can see how +preprocess data to model NLP. In particular, these tutorials show how preprocessing to model NLP works at a low level. A character-level RNN reads words as a series of characters - @@ -20,20 +25,7 @@ Specifically, we'll train on a few thousand surnames from 18 languages of origin, and predict which language a name is from based on the -spelling: - -.. code-block:: sh - - $ python predict.py Hinton - (-0.47) Scottish - (-1.52) English - (-3.57) Irish - - $ python predict.py Schmidhuber - (-0.19) German - (-2.48) Czech - (-2.68) Dutch - +spelling. Recommended Preparation ======================= @@ -56,79 +48,63 @@ Networks `__ is about LSTMs specifically but also informative about RNNs in general +""" +###################################################################### +# Preparing Torch +# ========================== +# +# Set up torch to default to the right device use GPU acceleration depending on your hardware (CPU or CUDA). +# -Preparing the Data -================== - -.. note:: - Download the data from - `here `_ - and extract it to the current directory. - -Included in the ``data/names`` directory are 18 text files named as -``[Language].txt``. Each file contains a bunch of names, one name per -line, mostly romanized (but we still need to convert from Unicode to -ASCII). +import torch -We'll end up with a dictionary of lists of names per language, -``{language: [names ...]}``. The generic variables "category" and "line" -(for language and name in our case) are used for later extensibility. -""" -from io import open -import glob -import os +# Check if CUDA is available +device = torch.device('cpu') +if torch.cuda.is_available(): + device = torch.device('cuda') -def findFiles(path): return glob.glob(path) +torch.set_default_device(device) +print(f"Using device = {torch.get_default_device()}") -print(findFiles('data/names/*.txt')) +###################################################################### +# Preparing the Data +# ================== +# +# Download the data from `here `__ +# and extract it to the current directory. +# +# Included in the ``data/names`` directory are 18 text files named as +# ``[Language].txt``. Each file contains a bunch of names, one name per +# line, mostly romanized (but we still need to convert from Unicode to +# ASCII). +# +# The first step is to define and clean our data. Initially, we need to convert Unicode to plain ASCII to +# limit the RNN input layers. This is accomplished by converting Unicode strings to ASCII and allowing only a small set of allowed characters. 
-import unicodedata import string +import unicodedata -all_letters = string.ascii_letters + " .,;'" -n_letters = len(all_letters) +# We can use "_" to represent an out-of-vocabulary character, that is, any character we are not handling in our model +allowed_characters = string.ascii_letters + " .,;'" + "_" +n_letters = len(allowed_characters) # Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427 def unicodeToAscii(s): return ''.join( c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn' - and c in all_letters + and c in allowed_characters ) -print(unicodeToAscii('Ślusàrski')) - -# Build the category_lines dictionary, a list of names per language -category_lines = {} -all_categories = [] - -# Read a file and split into lines -def readLines(filename): - lines = open(filename, encoding='utf-8').read().strip().split('\n') - return [unicodeToAscii(line) for line in lines] - -for filename in findFiles('data/names/*.txt'): - category = os.path.splitext(os.path.basename(filename))[0] - all_categories.append(category) - lines = readLines(filename) - category_lines[category] = lines - -n_categories = len(all_categories) - - -###################################################################### -# Now we have ``category_lines``, a dictionary mapping each category -# (language) to a list of lines (names). We also kept track of -# ``all_categories`` (just a list of languages) and ``n_categories`` for -# later reference. +######################### +# Here's an example of converting a unicode alphabet name to plain ASCII. This simplifies the input layer # -print(category_lines['Italian'][:5]) - +print (f"converting 'Ślusàrski' to {unicodeToAscii('Ślusàrski')}") ###################################################################### # Turning Names into Tensors -# -------------------------- +# ========================== # # Now that we have all the names organized, we need to turn them into # Tensors to make any use of them. @@ -142,19 +118,14 @@ def readLines(filename): # # That extra 1 dimension is because PyTorch assumes everything is in # batches - we're just using a batch size of 1 here. -# - -import torch # Find letter index from all_letters, e.g. "a" = 0 def letterToIndex(letter): - return all_letters.find(letter) - -# Just for demonstration, turn a letter into a <1 x n_letters> Tensor -def letterToTensor(letter): - tensor = torch.zeros(1, n_letters) - tensor[0][letterToIndex(letter)] = 1 - return tensor + # return our out-of-vocabulary character if we encounter a letter unknown to our model + if letter not in allowed_characters: + return allowed_characters.find("_") + else: + return allowed_characters.find(letter) # Turn a line into a , # or an array of one-hot letter vectors @@ -164,9 +135,87 @@ def lineToTensor(line): tensor[li][0][letterToIndex(letter)] = 1 return tensor -print(letterToTensor('J')) +######################### +# Here are some examples of how to use ``lineToTensor()`` for a single and multiple character string. -print(lineToTensor('Jones').size()) +print (f"The letter 'a' becomes {lineToTensor('a')}") #notice that the first position in the tensor = 1 +print (f"The name 'Ahn' becomes {lineToTensor('Ahn')}") #notice 'A' sets the 27th index to 1 + +######################### +# Congratulations, you have built the foundational tensor objects for this learning task! You can use a similar approach +# for other RNN tasks with text. 
+# +# Next, we need to combine all our examples into a dataset so we can train, test and validate our models. For this, +# we will use the `Dataset and DataLoader `__ classes +# to hold our dataset. Each Dataset needs to implement three functions: ``__init__``, ``__len__``, and ``__getitem__``. +from io import open +import glob +import os +import time + +import torch +from torch.utils.data import Dataset + +class NamesDataset(Dataset): + + def __init__(self, data_dir): + self.data_dir = data_dir #for provenance of the dataset + self.load_time = time.localtime #for provenance of the dataset + labels_set = set() #set of all classes + + self.data = [] + self.data_tensors = [] + self.labels = [] + self.labels_tensors = [] + + #read all the ``.txt`` files in the specified directory + text_files = glob.glob(os.path.join(data_dir, '*.txt')) + for filename in text_files: + label = os.path.splitext(os.path.basename(filename))[0] + labels_set.add(label) + lines = open(filename, encoding='utf-8').read().strip().split('\n') + for name in lines: + self.data.append(name) + self.data_tensors.append(lineToTensor(name)) + self.labels.append(label) + + #Cache the tensor representation of the labels + self.labels_uniq = list(labels_set) + for idx in range(len(self.labels)): + temp_tensor = torch.tensor([self.labels_uniq.index(self.labels[idx])], dtype=torch.long) + self.labels_tensors.append(temp_tensor) + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + data_item = self.data[idx] + data_label = self.labels[idx] + data_tensor = self.data_tensors[idx] + label_tensor = self.labels_tensors[idx] + + return label_tensor, data_tensor, data_label, data_item + + +######################### +#Here we can load our example data into the ``NamesDataset`` + +alldata = NamesDataset("data/names") +print(f"loaded {len(alldata)} items of data") +print(f"example = {alldata[0]}") + +######################### +#Using the dataset object allows us to easily split the data into train and test sets. Here we create a 85/15 +# split but the ``torch.utils.data`` has more useful utilities. Here we specify a generator since we need to use the +#same device as PyTorch defaults to above. + +train_set, test_set = torch.utils.data.random_split(alldata, [.85, .15], generator=torch.Generator(device=device).manual_seed(2024)) + +print(f"train examples = {len(train_set)}, validation examples = {len(test_set)}") + +######################### +# Now we have a basic dataset containing **20074** examples where each example is a pairing of label and name. We have also +#split the dataset into training and testing so we can validate the model that we build. ###################################################################### @@ -179,113 +228,57 @@ def lineToTensor(line): # graph itself. This means you can implement a RNN in a very "pure" way, # as regular feed-forward layers. # -# This RNN module implements a "vanilla RNN" an is just 3 linear layers -# which operate on an input and hidden state, with a ``LogSoftmax`` layer -# after the output. +# This CharRNN class implements an RNN with three components. +# First, we use the `nn.RNN implementation `__. +# Next, we define a layer that maps the RNN hidden layers to our output. And finally, we apply a ``softmax`` function. Using ``nn.RNN`` +# leads to a significant improvement in performance, such as cuDNN-accelerated kernels, versus implementing +# each layer as a ``nn.Linear``. It also simplifies the implementation in ``forward()``. 
# import torch.nn as nn import torch.nn.functional as F -class RNN(nn.Module): +class CharRNN(nn.Module): def __init__(self, input_size, hidden_size, output_size): - super(RNN, self).__init__() - - self.hidden_size = hidden_size + super(CharRNN, self).__init__() - self.i2h = nn.Linear(input_size, hidden_size) - self.h2h = nn.Linear(hidden_size, hidden_size) + self.rnn = nn.RNN(input_size, hidden_size) self.h2o = nn.Linear(hidden_size, output_size) self.softmax = nn.LogSoftmax(dim=1) - def forward(self, input, hidden): - hidden = F.tanh(self.i2h(input) + self.h2h(hidden)) - output = self.h2o(hidden) + def forward(self, line_tensor): + rnn_out, hidden = self.rnn(line_tensor) + output = self.h2o(hidden[0]) output = self.softmax(output) - return output, hidden - - def initHidden(self): - return torch.zeros(1, self.hidden_size) - -n_hidden = 128 -rnn = RNN(n_letters, n_hidden, n_categories) + return output -###################################################################### -# To run a step of this network we need to pass an input (in our case, the -# Tensor for the current letter) and a previous hidden state (which we -# initialize as zeros at first). We'll get back the output (probability of -# each language) and a next hidden state (which we keep for the next -# step). -# - -input = letterToTensor('A') -hidden = torch.zeros(1, n_hidden) -output, next_hidden = rnn(input, hidden) +########################### +# We can then create an RNN with 58 input nodes, 128 hidden nodes, and 18 outputs: +n_hidden = 128 +rnn = CharRNN(n_letters, n_hidden, len(alldata.labels_uniq)) +print(rnn) ###################################################################### -# For the sake of efficiency we don't want to be creating a new Tensor for -# every step, so we will use ``lineToTensor`` instead of -# ``letterToTensor`` and use slices. This could be further optimized by -# precomputing batches of Tensors. -# +# After that we can pass our Tensor to the RNN to obtain a predicted output. Subsequently, +# we use a helper function, ``label_from_output``, to derive a text label for the class. -input = lineToTensor('Albert') -hidden = torch.zeros(1, n_hidden) +def label_from_output(output, output_labels): + top_n, top_i = output.topk(1) + label_i = top_i[0].item() + return output_labels[label_i], label_i -output, next_hidden = rnn(input[0], hidden) +input = lineToTensor('Albert') +output = rnn(input) #this is equivalent to ``output = rnn.forward(input)`` print(output) - - -###################################################################### -# As you can see the output is a ``<1 x n_categories>`` Tensor, where -# every item is the likelihood of that category (higher is more likely). -# - +print(label_from_output(output, alldata.labels_uniq)) ###################################################################### # # Training # ======== -# Preparing for Training -# ---------------------- -# -# Before going into training we should make a few helper functions. The -# first is to interpret the output of the network, which we know to be a -# likelihood of each category. 
We can use ``Tensor.topk`` to get the index -# of the greatest value: -# - -def categoryFromOutput(output): - top_n, top_i = output.topk(1) - category_i = top_i[0].item() - return all_categories[category_i], category_i - -print(categoryFromOutput(output)) - - -###################################################################### -# We will also want a quick way to get a training example (a name and its -# language): -# - -import random - -def randomChoice(l): - return l[random.randint(0, len(l) - 1)] - -def randomTrainingExample(): - category = randomChoice(all_categories) - line = randomChoice(category_lines[category]) - category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long) - line_tensor = lineToTensor(line) - return category, line, category_tensor, line_tensor - -for i in range(10): - category, line, category_tensor, line_tensor = randomTrainingExample() - print('category =', category, '/ line =', line) ###################################################################### @@ -295,93 +288,67 @@ def randomTrainingExample(): # Now all it takes to train this network is show it a bunch of examples, # have it make guesses, and tell it if it's wrong. # -# For the loss function ``nn.NLLLoss`` is appropriate, since the last -# layer of the RNN is ``nn.LogSoftmax``. -# - -criterion = nn.NLLLoss() - - -###################################################################### -# Each loop of training will: -# -# - Create input and target tensors -# - Create a zeroed initial hidden state -# - Read each letter in and -# -# - Keep hidden state for next letter -# -# - Compare final output to target -# - Back-propagate -# - Return the output and loss -# - -learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn - -def train(category_tensor, line_tensor): - hidden = rnn.initHidden() - - rnn.zero_grad() - - for i in range(line_tensor.size()[0]): - output, hidden = rnn(line_tensor[i], hidden) - - loss = criterion(output, category_tensor) - loss.backward() - - # Add parameters' gradients to their values, multiplied by learning rate - for p in rnn.parameters(): - p.data.add_(p.grad.data, alpha=-learning_rate) - - return output, loss.item() - - -###################################################################### -# Now we just have to run that with a bunch of examples. Since the -# ``train`` function returns both the output and loss we can print its -# guesses and also keep track of loss for plotting. Since there are 1000s -# of examples we print only every ``print_every`` examples, and take an -# average of the loss. -# - -import time -import math - -n_iters = 100000 -print_every = 5000 -plot_every = 1000 - +# We do this by defining a ``train()`` function which trains the model on a given dataset using minibatches. RNNs +# RNNs are trained similarly to other networks; therefore, for completeness, we include a batched training method here. +# The loop (``for i in batch``) computes the losses for each of the items in the batch before adjusting the +# weights. This operation is repeated until the number of epochs is reached. 
+import random +import numpy as np + +def train(rnn, training_data, n_epoch = 10, n_batch_size = 64, report_every = 50, learning_rate = 0.2, criterion = nn.NLLLoss()): + """ + Learn on a batch of training_data for a specified number of iterations and reporting thresholds + """ + # Keep track of losses for plotting + current_loss = 0 + all_losses = [] + rnn.train() + optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate) + + start = time.time() + print(f"training on data set with n = {len(training_data)}") + + for iter in range(1, n_epoch + 1): + rnn.zero_grad() # clear the gradients + + # create some minibatches + # we cannot use dataloaders because each of our names is a different length + batches = list(range(len(training_data))) + random.shuffle(batches) + batches = np.array_split(batches, len(batches) //n_batch_size ) + + for idx, batch in enumerate(batches): + batch_loss = 0 + for i in batch: #for each example in this batch + (label_tensor, text_tensor, label, text) = training_data[i] + output = rnn.forward(text_tensor) + loss = criterion(output, label_tensor) + batch_loss += loss + + # optimize parameters + batch_loss.backward() + nn.utils.clip_grad_norm_(rnn.parameters(), 3) + optimizer.step() + optimizer.zero_grad() + + current_loss += batch_loss.item() / len(batch) + + all_losses.append(current_loss / len(batches) ) + if iter % report_every == 0: + print(f"{iter} ({iter / n_epoch:.0%}): \t average batch loss = {all_losses[-1]}") + current_loss = 0 -# Keep track of losses for plotting -current_loss = 0 -all_losses = [] + return all_losses -def timeSince(since): - now = time.time() - s = now - since - m = math.floor(s / 60) - s -= m * 60 - return '%dm %ds' % (m, s) +########################################################################## +# We can now train a dataset with minibatches for a specified number of epochs. The number of epochs for this +# example is reduced to speed up the build. You can get better results with different parameters. start = time.time() - -for iter in range(1, n_iters + 1): - category, line, category_tensor, line_tensor = randomTrainingExample() - output, loss = train(category_tensor, line_tensor) - current_loss += loss - - # Print ``iter`` number, loss, name and guess - if iter % print_every == 0: - guess, guess_i = categoryFromOutput(output) - correct = '✓' if guess == category else '✗ (%s)' % category - print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, line, guess, correct)) - - # Add current loss avg to list of losses - if iter % plot_every == 0: - all_losses.append(current_loss / plot_every) - current_loss = 0 - +all_losses = train(rnn, train_set, n_epoch=27, learning_rate=0.15, report_every=5) +end = time.time() +print(f"training took {end-start}s") ###################################################################### # Plotting the Results @@ -396,7 +363,7 @@ def timeSince(since): plt.figure() plt.plot(all_losses) - +plt.show() ###################################################################### # Evaluating the Results @@ -409,47 +376,44 @@ def timeSince(since): # ``evaluate()``, which is the same as ``train()`` minus the backprop. 
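+#
+# As a small optional sketch (the tutorial's ``evaluate()`` below plots the confusion
+# matrix rather than returning it, so ``confusion`` here is an assumed
+# ``n_classes x n_classes`` count tensor), an overall accuracy figure can be read off
+# the matrix diagonal::
+#
+#    correct = confusion.diag().sum()        # correctly classified examples
+#    total = confusion.sum()                 # all evaluated examples
+#    print(f"overall accuracy: {(correct / total).item():.2%}")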
# -# Keep track of correct guesses in a confusion matrix -confusion = torch.zeros(n_categories, n_categories) -n_confusion = 10000 +def evaluate(rnn, testing_data, classes): + confusion = torch.zeros(len(classes), len(classes)) -# Just return an output given a line -def evaluate(line_tensor): - hidden = rnn.initHidden() + rnn.eval() #set to eval mode + with torch.no_grad(): # do not record the gradients during eval phase + for i in range(len(testing_data)): + (label_tensor, text_tensor, label, text) = testing_data[i] + output = rnn(text_tensor) + guess, guess_i = label_from_output(output, classes) + label_i = classes.index(label) + confusion[label_i][guess_i] += 1 - for i in range(line_tensor.size()[0]): - output, hidden = rnn(line_tensor[i], hidden) + # Normalize by dividing every row by its sum + for i in range(len(classes)): + denom = confusion[i].sum() + if denom > 0: + confusion[i] = confusion[i] / denom - return output + # Set up plot + fig = plt.figure() + ax = fig.add_subplot(111) + cax = ax.matshow(confusion.cpu().numpy()) #numpy uses cpu here so we need to use a cpu version + fig.colorbar(cax) -# Go through a bunch of examples and record which are correctly guessed -for i in range(n_confusion): - category, line, category_tensor, line_tensor = randomTrainingExample() - output = evaluate(line_tensor) - guess, guess_i = categoryFromOutput(output) - category_i = all_categories.index(category) - confusion[category_i][guess_i] += 1 + # Set up axes + ax.set_xticks(np.arange(len(classes)), labels=classes, rotation=90) + ax.set_yticks(np.arange(len(classes)), labels=classes) -# Normalize by dividing every row by its sum -for i in range(n_categories): - confusion[i] = confusion[i] / confusion[i].sum() + # Force label at every tick + ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) + ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) -# Set up plot -fig = plt.figure() -ax = fig.add_subplot(111) -cax = ax.matshow(confusion.numpy()) -fig.colorbar(cax) + # sphinx_gallery_thumbnail_number = 2 + plt.show() -# Set up axes -ax.set_xticklabels([''] + all_categories, rotation=90) -ax.set_yticklabels([''] + all_categories) -# Force label at every tick -ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) -ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) -# sphinx_gallery_thumbnail_number = 2 -plt.show() +evaluate(rnn, test_set, classes=alldata.labels_uniq) ###################################################################### @@ -460,72 +424,20 @@ def evaluate(line_tensor): # -###################################################################### -# Running on User Input -# --------------------- -# - -def predict(input_line, n_predictions=3): - print('\n> %s' % input_line) - with torch.no_grad(): - output = evaluate(lineToTensor(input_line)) - - # Get top N categories - topv, topi = output.topk(n_predictions, 1, True) - predictions = [] - - for i in range(n_predictions): - value = topv[0][i].item() - category_index = topi[0][i].item() - print('(%.2f) %s' % (value, all_categories[category_index])) - predictions.append([value, all_categories[category_index]]) - -predict('Dovesky') -predict('Jackson') -predict('Satoshi') - - -###################################################################### -# The final versions of the scripts `in the Practical PyTorch -# repo `__ -# split the above code into a few files: -# -# - ``data.py`` (loads files) -# - ``model.py`` (defines the RNN) -# - ``train.py`` (runs training) -# - ``predict.py`` (runs ``predict()`` with command line arguments) -# - 
``server.py`` (serve prediction as a JSON API with ``bottle.py``) -# -# Run ``train.py`` to train and save the network. -# -# Run ``predict.py`` with a name to view predictions: -# -# .. code-block:: sh -# -# $ python predict.py Hazaki -# (-0.42) Japanese -# (-1.39) Polish -# (-3.51) Czech -# -# Run ``server.py`` and visit http://localhost:5533/Yourname to get JSON -# output of predictions. -# - - ###################################################################### # Exercises # ========= # -# - Try with a different dataset of line -> category, for example: -# -# - Any word -> language -# - First name -> gender -# - Character name -> writer -# - Page title -> blog or subreddit -# # - Get better results with a bigger and/or better shaped network # -# - Add more linear layers +# - Adjust the hyperparameters to enhance performance, such as changing the number of epochs, batch size, and learning rate # - Try the ``nn.LSTM`` and ``nn.GRU`` layers +# - Modify the size of the layers, such as increasing or decreasing the number of hidden nodes or adding additional linear layers # - Combine multiple of these RNNs as a higher level network # +# - Try with a different dataset of line -> label, for example: +# +# - Any word -> language +# - First name -> gender +# - Character name -> writer +# - Page title -> blog or subreddit diff --git a/intermediate_source/char_rnn_generation_tutorial.py b/intermediate_source/char_rnn_generation_tutorial.py index f7db4769ed8..50a6afa11b7 100644 --- a/intermediate_source/char_rnn_generation_tutorial.py +++ b/intermediate_source/char_rnn_generation_tutorial.py @@ -4,6 +4,12 @@ ************************************************************* **Author**: `Sean Robertson `_ +This tutorials is part of a three-part series: + +* `NLP From Scratch: Classifying Names with a Character-Level RNN `__ +* `NLP From Scratch: Generating Names with a Character-Level RNN `__ +* `NLP From Scratch: Translation with a Sequence to Sequence Network and Attention `__ + This is our second of three tutorials on "NLP From Scratch". In the `first tutorial `_ we used a RNN to classify names into their language of origin. This time diff --git a/intermediate_source/compiled_autograd_tutorial.rst b/intermediate_source/compiled_autograd_tutorial.rst new file mode 100644 index 00000000000..1091b19a49e --- /dev/null +++ b/intermediate_source/compiled_autograd_tutorial.rst @@ -0,0 +1,221 @@ +Compiled Autograd: Capturing a larger backward graph for ``torch.compile`` +========================================================================== +**Author:** `Simon Fan `_ + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How compiled autograd interacts with ``torch.compile`` + * How to use the compiled autograd API + * How to inspect logs using ``TORCH_LOGS`` + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch 2.4 + * Complete the `Introduction to torch.compile `_ + * Read through the TorchDynamo and AOTAutograd sections of `Get Started with PyTorch 2.x `_ + +Overview +-------- +Compiled Autograd is a ``torch.compile`` extension introduced in PyTorch 2.4 +that allows the capture of a larger backward graph. + +While ``torch.compile`` does capture the backward graph, it does so **partially**. 
The AOTAutograd component captures the backward graph ahead-of-time, with certain limitations: + +* Graph breaks in the forward lead to graph breaks in the backward +* `Backward hooks `_ are not captured + +Compiled Autograd addresses these limitations by directly integrating with the autograd engine, allowing +it to capture the full backward graph at runtime. Models with these two characteristics should try +Compiled Autograd, and potentially observe better performance. + +However, Compiled Autograd introduces its own limitations: + +* Added runtime overhead at the start of the backward for cache lookup +* More prone to recompiles and graph breaks in dynamo due to the larger capture + +.. note:: Compiled Autograd is under active development and is not yet compatible with all existing PyTorch features. For the latest status on a particular feature, refer to `Compiled Autograd Landing Page `_. + +Setup +----- +In this tutorial, we will base our examples on this simple neural network model. +It takes a 10-dimensional input vector, processes it through a single linear layer, and outputs another 10-dimensional vector. + +.. code:: python + + import torch + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(10, 10) + + def forward(self, x): + return self.linear(x) + +Basic usage +------------ +Before calling the ``torch.compile`` API, make sure to set ``torch._dynamo.config.compiled_autograd`` to ``True``: + +.. code:: python + + model = Model() + x = torch.randn(10) + + torch._dynamo.config.compiled_autograd = True + @torch.compile + def train(model, x): + loss = model(x).sum() + loss.backward() + + train(model, x) + +In the code above, we create an instance of the ``Model`` class and generate a random 10-dimensional tensor ``x`` by using ``torch.randn(10)``. +We define the training loop function ``train`` and decorate it with @torch.compile to optimize its execution. +When ``train(model, x)`` is called: + +* Python Interpreter calls Dynamo, since this call was decorated with ``@torch.compile``. +* Dynamo intercepts the Python bytecode, simulates their execution and records the operations into a graph. +* ``AOTDispatcher`` disables hooks and calls the autograd engine to compute gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph. Using ``torch.autograd.Function``, AOTDispatcher rewrites the forward and backward implementation of ``train``. +* Inductor generates a function corresponding to an optimized implementation of the AOTDispatcher forward and backward. +* Dynamo sets the optimized function to be evaluated next by Python Interpreter. +* Python Interpreter executes the optimized function, which executes ``loss = model(x).sum()``. +* Python Interpreter executes ``loss.backward()``, calling into the autograd engine, which routes to the Compiled Autograd engine since we set ``torch._dynamo.config.compiled_autograd = True``. +* Compiled Autograd computes the gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph, including any hooks it encounters. During this process, it will record the backward previously rewritten by AOTDispatcher. Compiled Autograd then generates a new function which corresponds to a fully-traced implementation of ``loss.backward()``, and executes it with ``torch.compile`` in inference mode. 
+* The same steps recursively apply to the Compiled Autograd graph, but this time AOTDispatcher will not need to partition the graph. + +Inspecting the compiled autograd logs +------------------------------------- +Run the script with the ``TORCH_LOGS`` environment variables: + +* To only print the compiled autograd graph, use ``TORCH_LOGS="compiled_autograd" python example.py`` +* To print the graph with more tensor metadata and recompile reasons, at the cost of performance, use ``TORCH_LOGS="compiled_autograd_verbose" python example.py`` + +Rerun the snippet above, the compiled autograd graph should now be logged to ``stderr``. Certain graph nodes will have names that are prefixed by ``aot0_``, +these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0, for example, ``aot0_view_2`` corresponds to ``view_2`` of the AOT backward graph with id=0. + +In the image below, the red box encapsulates the AOT backward graph that is captured by ``torch.compile`` without Compiled Autograd. + + +.. image:: ../_static/img/compiled_autograd/entire_verbose_log.png + +.. note:: This is the graph on which we will call ``torch.compile``, **NOT** the optimized graph. Compiled Autograd essentially generates some unoptimized Python code to represent the entire C++ autograd execution. + +Compiling the forward and backward pass using different flags +------------------------------------------------------------- +You can use different compiler configs for the two compilations, for example, the backward may be a fullgraph even if there are graph breaks in the forward. + +.. code:: python + + def train(model, x): + model = torch.compile(model) + loss = model(x).sum() + torch._dynamo.config.compiled_autograd = True + torch.compile(lambda: loss.backward(), fullgraph=True)() + +Or you can use the context manager, which will apply to all autograd calls within its scope. + +.. code:: python + + def train(model, x): + model = torch.compile(model) + loss = model(x).sum() + with torch._dynamo.compiled_autograd.enable(torch.compile(fullgraph=True)): + loss.backward() + + +Compiled Autograd addresses certain limitations of AOTAutograd +-------------------------------------------------------------- +1. Graph breaks in the forward pass no longer necessarily lead to graph breaks in the backward pass: + +.. code:: python + + @torch.compile(backend="aot_eager") + def fn(x): + # 1st graph + temp = x + 10 + torch._dynamo.graph_break() + # 2nd graph + temp = temp + 10 + torch._dynamo.graph_break() + # 3rd graph + return temp.sum() + + x = torch.randn(10, 10, requires_grad=True) + torch._dynamo.utils.counters.clear() + loss = fn(x) + + # 1. base torch.compile + loss.backward(retain_graph=True) + assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 3) + torch._dynamo.utils.counters.clear() + + # 2. torch.compile with compiled autograd + with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): + loss.backward() + + # single graph for the backward + assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 1) + + +In the first ``torch.compile`` case, we see that 3 backward graphs were produced due to the 2 graph breaks in the compiled function ``fn``. +Whereas in the second ``torch.compile`` with compiled autograd case, we see that a full backward graph was traced despite the graph breaks. + +.. note:: It is still possible for the Dynamo to graph break when tracing backward hooks captured by Compiled Autograd. + + +2. 
Backward hooks can now be captured + +.. code:: python + + @torch.compile(backend="aot_eager") + def fn(x): + return x.sum() + + x = torch.randn(10, 10, requires_grad=True) + x.register_hook(lambda grad: grad+10) + loss = fn(x) + + with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): + loss.backward() + +There should be a ``call_hook`` node in the graph, which dynamo will later inline into the following: + +.. image:: ../_static/img/compiled_autograd/call_hook_node.png + +Common recompilation reasons for Compiled Autograd +-------------------------------------------------- +1. Due to changes in the autograd structure of the loss value: + +.. code:: python + + torch._dynamo.config.compiled_autograd = True + x = torch.randn(10, requires_grad=True) + for op in [torch.add, torch.sub, torch.mul, torch.div]: + loss = op(x, x).sum() + torch.compile(lambda: loss.backward(), backend="eager")() + +In the example above, we call a different operator on each iteration, leading to ``loss`` tracking a different autograd history each time. You should see some recompile messages: **Cache miss due to new autograd node**. + +.. image:: ../_static/img/compiled_autograd/recompile_due_to_node.png + +2. Due to tensors changing shapes: + +.. code:: python + + torch._dynamo.config.compiled_autograd = True + for i in [10, 100, 10]: + x = torch.randn(i, i, requires_grad=True) + loss = x.sum() + torch.compile(lambda: loss.backward(), backend="eager")() + +In the example above, ``x`` changes shapes, and compiled autograd will mark ``x`` as a dynamic shape tensor after the first change. You should see recompiles messages: **Cache miss due to changed shapes**. + +.. image:: ../_static/img/compiled_autograd/recompile_due_to_dynamic.png + +Conclusion +---------- +In this tutorial, we went over the high-level ecosystem of ``torch.compile`` with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. Stay tuned for deep dives on `dev-discuss `_. diff --git a/intermediate_source/ddp_series_minGPT.rst b/intermediate_source/ddp_series_minGPT.rst index 259db3623c6..27e63996f75 100644 --- a/intermediate_source/ddp_series_minGPT.rst +++ b/intermediate_source/ddp_series_minGPT.rst @@ -6,7 +6,7 @@ training `__ \|\| **minGPT Training** Training “real-world” models with DDP ===================================== -Authors: `Suraj Subramanian `__ +Authors: `Suraj Subramanian `__ .. grid:: 2 @@ -26,10 +26,11 @@ Authors: `Suraj Subramanian `__ .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites :class-card: card-prerequisites - - Familiarity with `multi-GPU training <../beginner/ddp_series_multigpu.html>`__ and `torchrun <../beginner/ddp_series_fault_tolerance.html>`__ - - [Optional] Familiarity with `multinode training `__ - - 2 or more TCP-reachable GPU machines (this tutorial uses AWS p3.2xlarge instances) - PyTorch `installed `__ with CUDA on all machines + - Familiarity with `multi-GPU training <../beginner/ddp_series_multigpu.html>`__ and `torchrun <../beginner/ddp_series_fault_tolerance.html>`__ + - [Optional] Familiarity with `multinode training `__ + - 2 or more TCP-reachable GPU machines for multi-node training (this tutorial uses AWS p3.2xlarge instances) + Follow along with the video below or on `youtube `__. @@ -63,25 +64,23 @@ from any node that has access to the cloud bucket. Using Mixed Precision ~~~~~~~~~~~~~~~~~~~~~~~~ -To speed things up, you might be able to use `Mixed Precision `__ to train your models. 
-In Mixed Precision, some parts of the training process are carried out in reduced precision, while other steps -that are more sensitive to precision drops are maintained in FP32 precision. +To speed things up, you might be able to use `Mixed Precision `__ to train your models. +In Mixed Precision, some parts of the training process are carried out in reduced precision, while other steps +that are more sensitive to precision drops are maintained in FP32 precision. When is DDP not enough? ~~~~~~~~~~~~~~~~~~~~~~~~ A typical training run's memory footprint consists of model weights, activations, gradients, the input batch, and the optimizer state. -Since DDP replicates the model on each GPU, it only works when GPUs have sufficient capacity to accomodate the full footprint. +Since DDP replicates the model on each GPU, it only works when GPUs have sufficient capacity to accomodate the full footprint. When models grow larger, more aggressive techniques might be useful: -- `activation checkpointing `__: Instead of saving intermediate activations during the forward pass, the activations are recomputed during the backward pass. In this approach, we run more compute but save on memory footprint. -- `Fully-Sharded Data Parallel `__: Here the model is not replicated but "sharded" across all the GPUs, and computation is overlapped with communication in the forward and backward passes. Read our `blog `__ to learn how we trained a 1 Trillion parameter model with FSDP. - +- `Activation checkpointing `__: Instead of saving intermediate activations during the forward pass, the activations are recomputed during the backward pass. In this approach, we run more compute but save on memory footprint. +- `Fully-Sharded Data Parallel `__: Here the model is not replicated but "sharded" across all the GPUs, and computation is overlapped with communication in the forward and backward passes. Read our `blog `__ to learn how we trained a 1 Trillion parameter model with FSDP. Further Reading --------------- - `Multi-Node training with DDP `__ (previous tutorial in this series) - `Mixed Precision training `__ -- `Fully-Sharded Data Parallel `__ +- `Fully-Sharded Data Parallel tutorial `__ - `Training a 1T parameter model with FSDP `__ -- `FSDP Video Tutorial Series `__ diff --git a/intermediate_source/ddp_series_multinode.rst b/intermediate_source/ddp_series_multinode.rst index 5717589bdaa..8746eb19bbd 100644 --- a/intermediate_source/ddp_series_multinode.rst +++ b/intermediate_source/ddp_series_multinode.rst @@ -6,7 +6,7 @@ training** \|\| `minGPT Training `__ Multinode Training ================== -Authors: `Suraj Subramanian `__ +Authors: `Suraj Subramanian `__ .. grid:: 2 diff --git a/intermediate_source/ddp_tutorial.rst b/intermediate_source/ddp_tutorial.rst index 13297fb2a12..c63321ad14c 100644 --- a/intermediate_source/ddp_tutorial.rst +++ b/intermediate_source/ddp_tutorial.rst @@ -2,7 +2,7 @@ Getting Started with Distributed Data Parallel ================================================= **Author**: `Shen Li `_ -**Edited by**: `Joe Zhu `_ +**Edited by**: `Joe Zhu `_, `Chirag Pandya `__ .. note:: |edit| View and edit this tutorial in `github `__. @@ -15,24 +15,30 @@ Prerequisites: `DistributedDataParallel `__ -(DDP) implements data parallelism at the module level which can run across -multiple machines. Applications using DDP should spawn multiple processes and -create a single DDP instance per process. 
DDP uses collective communications in the +(DDP) is a powerful module in PyTorch that allows you to parallelize your model across +multiple machines, making it perfect for large-scale deep learning applications. +To use DDP, you'll need to spawn multiple processes and create a single instance of DDP per process. + +But how does it work? DDP uses collective communications from the `torch.distributed `__ -package to synchronize gradients and buffers. More specifically, DDP registers -an autograd hook for each parameter given by ``model.parameters()`` and the -hook will fire when the corresponding gradient is computed in the backward -pass. Then DDP uses that signal to trigger gradient synchronization across -processes. Please refer to -`DDP design note `__ for more details. +package to synchronize gradients and buffers across all processes. This means that each process will have +its own copy of the model, but they'll all work together to train the model as if it were on a single machine. + +To make this happen, DDP registers an autograd hook for each parameter in the model. +When the backward pass is run, this hook fires and triggers gradient synchronization across all processes. +This ensures that each process has the same gradients, which are then used to update the model. + +For more information on how DDP works and how to use it effectively, be sure to check out the +`DDP design note `__. +With DDP, you can train your models faster and more efficiently than ever before! +The recommended way to use DDP is to spawn one process for each model replica. The model replica can span +multiple devices. DDP processes can be placed on the same machine or across machines. Note that GPU devices +cannot be shared across DDP processes (i.e. one GPU for one DDP process). -The recommended way to use DDP is to spawn one process for each model replica, -where a model replica can span multiple devices. DDP processes can be -placed on the same machine or across machines, but GPU devices cannot be -shared across processes. This tutorial starts from a basic DDP use case and -then demonstrates more advanced use cases including checkpointing models and -combining DDP with model parallel. + +In this tutorial, we'll start with a basic DDP use case and then demonstrate more advanced use cases, +including checkpointing models and combining DDP with model parallel. .. note:: @@ -43,25 +49,22 @@ combining DDP with model parallel. Comparison between ``DataParallel`` and ``DistributedDataParallel`` ------------------------------------------------------------------- -Before we dive in, let's clarify why, despite the added complexity, you would -consider using ``DistributedDataParallel`` over ``DataParallel``: +Before we dive in, let's clarify why you would consider using ``DistributedDataParallel`` +over ``DataParallel``, despite its added complexity: -- First, ``DataParallel`` is single-process, multi-thread, and only works on a - single machine, while ``DistributedDataParallel`` is multi-process and works - for both single- and multi- machine training. ``DataParallel`` is usually - slower than ``DistributedDataParallel`` even on a single machine due to GIL - contention across threads, per-iteration replicated model, and additional - overhead introduced by scattering inputs and gathering outputs. +- First, ``DataParallel`` is single-process, multi-threaded, but it only works on a + single machine. In contrast, ``DistributedDataParallel`` is multi-process and supports + both single- and multi- machine training. 
+ Due to GIL contention across threads, per-iteration replicated model, and additional overhead introduced by + scattering inputs and gathering outputs, ``DataParallel`` is usually + slower than ``DistributedDataParallel`` even on a single machine. - Recall from the `prior tutorial `__ that if your model is too large to fit on a single GPU, you must use **model parallel** to split it across multiple GPUs. ``DistributedDataParallel`` works with - **model parallel**; ``DataParallel`` does not at this time. When DDP is combined + **model parallel**, while ``DataParallel`` does not at this time. When DDP is combined with model parallel, each DDP process would use model parallel, and all processes collectively would use data parallel. -- If your model needs to span multiple machines or if your use case does not fit - into data parallelism paradigm, please see `the RPC API `__ - for more generic distributed training support. Basic Use Case -------------- @@ -99,8 +102,12 @@ be found in os.environ['MASTER_ADDR'] = 'localhost' os.environ['MASTER_PORT'] = '12355' + # We want to be able to train our model on an `accelerator `__ + # such as CUDA, MPS, MTIA, or XPU. + acc = torch.accelerator.current_accelerator() + backend = torch.distributed.get_default_backend_for_device(acc) # initialize the process group - dist.init_process_group("gloo", rank=rank, world_size=world_size) + dist.init_process_group(backend, rank=rank, world_size=world_size) def cleanup(): dist.destroy_process_group() @@ -141,6 +148,7 @@ different DDP processes starting from different initial model parameter values. optimizer.step() cleanup() + print(f"Finished running basic DDP example on rank {rank}.") def run_demo(demo_fn, world_size): @@ -154,7 +162,7 @@ provides a clean API as if it were a local model. Gradient synchronization communications take place during the backward pass and overlap with the backward computation. When the ``backward()`` returns, ``param.grad`` already contains the synchronized gradient tensor. For basic use cases, DDP only -requires a few more LoCs to set up the process group. When applying DDP to more +requires a few more lines of code to set up the process group. When applying DDP to more advanced use cases, some caveats require caution. Skewed Processing Speeds @@ -179,13 +187,14 @@ It's common to use ``torch.save`` and ``torch.load`` to checkpoint modules during training and recover from checkpoints. See `SAVING AND LOADING MODELS `__ for more details. When using DDP, one optimization is to save the model in -only one process and then load it to all processes, reducing write overhead. -This is correct because all processes start from the same parameters and +only one process and then load it on all processes, reducing write overhead. +This works because all processes start from the same parameters and gradients are synchronized in backward passes, and hence optimizers should keep -setting parameters to the same values. If you use this optimization, make sure no process starts +setting parameters to the same values. +If you use this optimization (i.e. save on one process but restore on all), make sure no process starts loading before the saving is finished. Additionally, when loading the module, you need to provide an appropriate ``map_location`` -argument to prevent a process from stepping into others' devices. If ``map_location`` +argument to prevent processes from stepping into others' devices. 
If ``map_location`` is missing, ``torch.load`` will first load the module to CPU and then copy each parameter to where it was saved, which would result in all processes on the same machine using the same set of devices. For more advanced failure recovery @@ -211,14 +220,17 @@ and elasticity support, please refer to `TorchElastic `__ + # such as CUDA, MPS, MTIA, or XPU. + acc = torch.accelerator.current_accelerator() # configure map_location properly - map_location = {'cuda:%d' % 0: 'cuda:%d' % rank} + map_location = {f'{acc}:0': f'{acc}:{rank}'} ddp_model.load_state_dict( - torch.load(CHECKPOINT_PATH, map_location=map_location)) + torch.load(CHECKPOINT_PATH, map_location=map_location, weights_only=True)) loss_fn = nn.MSELoss() optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) - + optimizer.zero_grad() outputs = ddp_model(torch.randn(20, 10)) labels = torch.randn(20, 5).to(rank) @@ -234,6 +246,7 @@ and elasticity support, please refer to `TorchElastic = 2, f"Requires at least 2 GPUs to run, but got {n_gpus}" world_size = n_gpus run_demo(demo_basic, world_size) @@ -304,6 +318,7 @@ Let's still use the Toymodel example and create a file named ``elastic_ddp.py``. .. code:: python + import os import torch import torch.distributed as dist import torch.nn as nn @@ -323,15 +338,16 @@ Let's still use the Toymodel example and create a file named ``elastic_ddp.py``. def demo_basic(): - dist.init_process_group("nccl") + torch.accelerator.set_device_index(int(os.environ["LOCAL_RANK"])) + acc = torch.accelerator.current_accelerator() + backend = torch.distributed.get_default_backend_for_device(acc) + dist.init_process_group(backend) rank = dist.get_rank() print(f"Start running basic DDP example on rank {rank}.") - # create model and move it to GPU with id rank - device_id = rank % torch.cuda.device_count() + device_id = rank % torch.accelerator.device_count() model = ToyModel().to(device_id) ddp_model = DDP(model, device_ids=[device_id]) - loss_fn = nn.MSELoss() optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) @@ -341,22 +357,23 @@ Let's still use the Toymodel example and create a file named ``elastic_ddp.py``. loss_fn(outputs, labels).backward() optimizer.step() dist.destroy_process_group() - + print(f"Finished running basic DDP example on rank {rank}.") + if __name__ == "__main__": demo_basic() -One can then run a `torch elastic/torchrun `__ command +One can then run a `torch elastic/torchrun `__ command on all nodes to initialize the DDP job created above: .. code:: bash torchrun --nnodes=2 --nproc_per_node=8 --rdzv_id=100 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR:29400 elastic_ddp.py -We are running the DDP script on two hosts, and each host we run with 8 processes, aka, we -are running it on 16 GPUs. Note that ``$MASTER_ADDR`` must be the same across all nodes. +In the example above, we are running the DDP script on two hosts and we run with 8 processes on each host. That is, we +are running this job on 16 GPUs. Note that ``$MASTER_ADDR`` must be the same across all nodes. -Here torchrun will launch 8 process and invoke ``elastic_ddp.py`` -on each process on the node it is launched on, but user also needs to apply cluster +Here ``torchrun`` will launch 8 processes and invoke ``elastic_ddp.py`` +on each process on the node it is launched on, but user also needs to apply cluster management tools like slurm to actually run this command on 2 nodes. 
For example, on a SLURM enabled cluster, we can write a script to run the command above @@ -368,8 +385,8 @@ and set ``MASTER_ADDR`` as: Then we can just run this script using the SLURM command: ``srun --nodes=2 ./torchrun_script.sh``. -Of course, this is just an example; you can choose your own cluster scheduling tools -to initiate the torchrun job. -For more information about Elastic run, one can check this -`quick start document `__ to learn more. +This is just an example; you can choose your own cluster scheduling tools to initiate the ``torchrun`` job. + +For more information about Elastic run, please see the +`quick start document `__. diff --git a/intermediate_source/dist_pipeline_parallel_tutorial.rst b/intermediate_source/dist_pipeline_parallel_tutorial.rst new file mode 100644 index 00000000000..ec3e3cf304a --- /dev/null +++ b/intermediate_source/dist_pipeline_parallel_tutorial.rst @@ -0,0 +1,10 @@ +Distributed Pipeline Parallelism Using RPC +========================================== + +This tutorial has been deprecated. + +Redirecting to a newer tutorial in 3 seconds... + +.. raw:: html + + diff --git a/intermediate_source/dist_tuto.rst b/intermediate_source/dist_tuto.rst index 35f6341395f..cebc986a190 100644 --- a/intermediate_source/dist_tuto.rst +++ b/intermediate_source/dist_tuto.rst @@ -38,7 +38,7 @@ simultaneously. If you have access to compute cluster you should check with your local sysadmin or use your favorite coordination tool (e.g., `pdsh `__, `clustershell `__, or -`others `__). For the purpose of this +`slurm `__). For the purpose of this tutorial, we will use a single machine and spawn multiple processes using the following template. @@ -47,6 +47,7 @@ the following template. """run.py:""" #!/usr/bin/env python import os + import sys import torch import torch.distributed as dist import torch.multiprocessing as mp @@ -64,11 +65,15 @@ the following template. if __name__ == "__main__": - size = 2 + world_size = 2 processes = [] - mp.set_start_method("spawn") - for rank in range(size): - p = mp.Process(target=init_process, args=(rank, size, run)) + if "google.colab" in sys.modules: + print("Running in Google Colab") + mp.get_context("spawn") + else: + mp.set_start_method("spawn") + for rank in range(world_size): + p = mp.Process(target=init_process, args=(rank, world_size, run)) p.start() processes.append(p) @@ -125,7 +130,7 @@ process 0 increments the tensor and sends it to process 1 so that they both end up with 1.0. Notice that process 1 needs to allocate memory in order to store the data it will receive. -Also notice that ``send``/``recv`` are **blocking**: both processes stop +Also notice that ``send/recv`` are **blocking**: both processes block until the communication is completed. On the other hand immediates are **non-blocking**; the script continues its execution and the methods return a ``Work`` object upon which we can choose to @@ -156,7 +161,8 @@ we should not modify the sent tensor nor access the received tensor before ``req In other words, - writing to ``tensor`` after ``dist.isend()`` will result in undefined behaviour. -- reading from ``tensor`` after ``dist.irecv()`` will result in undefined behaviour. +- reading from ``tensor`` after ``dist.irecv()`` will result in undefined + behaviour, until ``req.wait()`` has been executed. 
However, after ``req.wait()`` has been executed we are guaranteed that the communication took place, @@ -219,16 +225,23 @@ to obtain the sum of all tensors on all processes, we can use the Since we want the sum of all tensors in the group, we use ``dist.ReduceOp.SUM`` as the reduce operator. Generally speaking, any commutative mathematical operation can be used as an operator. -Out-of-the-box, PyTorch comes with 4 such operators, all working at the +Out-of-the-box, PyTorch comes with many such operators, all working at the element-wise level: - ``dist.ReduceOp.SUM``, - ``dist.ReduceOp.PRODUCT``, - ``dist.ReduceOp.MAX``, -- ``dist.ReduceOp.MIN``. +- ``dist.ReduceOp.MIN``, +- ``dist.ReduceOp.BAND``, +- ``dist.ReduceOp.BOR``, +- ``dist.ReduceOp.BXOR``, +- ``dist.ReduceOp.PREMUL_SUM``. + +The full list of supported operators is +`here `__. -In addition to ``dist.all_reduce(tensor, op, group)``, there are a total -of 6 collectives currently implemented in PyTorch. +In addition to ``dist.all_reduce(tensor, op, group)``, there are many additional collectives currently implemented in +PyTorch. Here are a few supported collectives. - ``dist.broadcast(tensor, src, group)``: Copies ``tensor`` from ``src`` to all other processes. @@ -244,6 +257,12 @@ of 6 collectives currently implemented in PyTorch. - ``dist.all_gather(tensor_list, tensor, group)``: Copies ``tensor`` from all processes to ``tensor_list``, on all processes. - ``dist.barrier(group)``: Blocks all processes in `group` until each one has entered this function. +- ``dist.all_to_all(output_tensor_list, input_tensor_list, group)``: Scatters list of input tensors to all processes in + a group and return gathered list of tensors in output list. + +The full list of supported collectives can be found by looking at the latest documentation for PyTorch Distributed +`(link) `__. + Distributed Training -------------------- @@ -275,7 +294,7 @@ gradients of their model on their batch of data and then average their gradients. In order to ensure similar convergence results when changing the number of processes, we will first have to partition our dataset. (You could also use -`tnt.dataset.SplitDataset `__, +`torch.utils.data.random_split `__, instead of the snippet below.) .. code:: python @@ -389,7 +408,7 @@ could train any model on a large computer cluster. lot more tricks `__ required to implement a production-level implementation of synchronous SGD. Again, use what `has been tested and -optimized `__. +optimized `__. Our Own Ring-Allreduce ~~~~~~~~~~~~~~~~~~~~~~ @@ -451,8 +470,10 @@ Communication Backends One of the most elegant aspects of ``torch.distributed`` is its ability to abstract and build on top of different backends. As mentioned before, -there are currently three backends implemented in PyTorch: Gloo, NCCL, and -MPI. They each have different specifications and tradeoffs, depending +there are multiple backends implemented in PyTorch. These backends can be easily selected +using the `Accelerator API `__, +which provides a interface for working with different accelerator types. +Some of the most popular backends are Gloo, NCCL, and MPI. They each have different specifications and tradeoffs, depending on the desired use case. A comparative table of supported functions can be found `here `__. @@ -472,12 +493,13 @@ distributed SGD example does not work if you put ``model`` on the GPU. In order to use multiple GPUs, let us also make the following modifications: -1. Use ``device = torch.device("cuda:{}".format(rank))`` -2. 
``model = Net()`` :math:`\rightarrow` ``model = Net().to(device)`` -3. Use ``data, target = data.to(device), target.to(device)`` +1. Use Accelerator API ``device_type = torch.accelerator.current_accelerator()`` +2. Use ``torch.device(f"{device_type}:{rank}")`` +3. ``model = Net()`` :math:`\rightarrow` ``model = Net().to(device)`` +4. Use ``data, target = data.to(device), target.to(device)`` -With the above modifications, our model is now training on two GPUs and -you can monitor their utilization with ``watch nvidia-smi``. +With these modifications, your model will now train across two GPUs. +You can monitor GPU utilization using ``watch nvidia-smi`` if you are running on NVIDIA hardware. **MPI Backend** @@ -533,6 +555,7 @@ more `__) Doing so, you should obtain the same familiar output as with the other communication backends. + **NCCL Backend** The `NCCL backend `__ provides an @@ -541,18 +564,26 @@ tensors. If you only use CUDA tensors for your collective operations, consider using this backend for the best in class performance. The NCCL backend is included in the pre-built binaries with CUDA support. +**XCCL Backend** + +The `XCCL backend` offers an optimized implementation of collective operations for XPU tensors. +If your workload uses only XPU tensors for collective operations, +this backend provides best-in-class performance. +The XCCL backend is included in the pre-built binaries with XPU support. + + Initialization Methods ~~~~~~~~~~~~~~~~~~~~~~ -To finish this tutorial, let's talk about the very first function we -called: ``dist.init_process_group(backend, init_method)``. In -particular, we will go over the different initialization methods which -are responsible for the initial coordination step between each process. -Those methods allow you to define how this coordination is done. -Depending on your hardware setup, one of these methods should be -naturally more suitable than the others. In addition to the following -sections, you should also have a look at the `official -documentation `__. +To conclude this tutorial, let's examine the initial function we invoked: +``dist.init_process_group(backend, init_method)``. Specifically, we will discuss the various +initialization methods responsible for the preliminary coordination step between each process. +These methods enable you to define how this coordination is accomplished. + +The choice of initialization method depends on your hardware setup, and one method may be more +suitable than others. In addition to the following sections, please refer to the `official +documentation `__ for further information. + **Environment Variable** @@ -569,7 +600,7 @@ finally handshake with them. - ``WORLD_SIZE``: The total number of processes, so that the master knows how many workers to wait for. - ``RANK``: Rank of each process, so they will know whether it is the - master of a worker. + master or a worker. **Shared File System** diff --git a/intermediate_source/dqn_with_rnn_tutorial.py b/intermediate_source/dqn_with_rnn_tutorial.py index 991a0ff8bd6..bcc484f0a00 100644 --- a/intermediate_source/dqn_with_rnn_tutorial.py +++ b/intermediate_source/dqn_with_rnn_tutorial.py @@ -298,7 +298,7 @@ # either by passing a string or an action-spec. This allows us to use # Categorical (sometimes called "sparse") encoding or the one-hot version of it. # -qval = QValueModule(action_space=env.action_spec) +qval = QValueModule(spec=env.action_spec) ###################################################################### # .. 
note:: @@ -433,7 +433,7 @@ exploration_module.step(data.numel()) updater.step() - with set_exploration_type(ExplorationType.MODE), torch.no_grad(): + with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad(): rollout = env.rollout(10000, stoch_policy) traj_lens.append(rollout.get(("next", "step_count")).max().item()) diff --git a/intermediate_source/dynamic_quantization_bert_tutorial.rst b/intermediate_source/dynamic_quantization_bert_tutorial.rst deleted file mode 100644 index 1ea6ea46dd0..00000000000 --- a/intermediate_source/dynamic_quantization_bert_tutorial.rst +++ /dev/null @@ -1,568 +0,0 @@ -(beta) Dynamic Quantization on BERT -=========================================== - -.. tip:: - To get the most of this tutorial, we suggest using this - `Colab Version `_. This will allow you to experiment with the information presented below. - -**Author**: `Jianyu Huang `_ - -**Reviewed by**: `Raghuraman Krishnamoorthi `_ - -**Edited by**: `Jessica Lin `_ - - -Introduction ------------- - - -In this tutorial, we will apply the dynamic quantization on a BERT -model, closely following the BERT model from `the HuggingFace -Transformers examples `_. -With this step-by-step journey, we would like to demonstrate how to -convert a well-known state-of-the-art model like BERT into dynamic -quantized model. - -- BERT, or Bidirectional Embedding Representations from Transformers, - is a new method of pre-training language representations which - achieves the state-of-the-art accuracy results on many popular - Natural Language Processing (NLP) tasks, such as question answering, - text classification, and others. The original paper can be found - `here `_. - -- Dynamic quantization support in PyTorch converts a float model to a - quantized model with static int8 or float16 data types for the - weights and dynamic quantization for the activations. The activations - are quantized dynamically (per batch) to int8 when the weights are - quantized to int8. In PyTorch, we have `torch.quantization.quantize_dynamic API - `_, - which replaces specified modules with dynamic weight-only quantized - versions and output the quantized model. - -- We demonstrate the accuracy and inference performance results on the - `Microsoft Research Paraphrase Corpus (MRPC) task `_ - in the General Language Understanding Evaluation benchmark `(GLUE) - `_. The MRPC (Dolan and Brockett, 2005) is - a corpus of sentence pairs automatically extracted from online news - sources, with human annotations of whether the sentences in the pair - are semantically equivalent. As the classes are imbalanced (68% - positive, 32% negative), we follow the common practice and report - `F1 score `_. - MRPC is a common NLP task for language pair classification, as shown - below. - -.. image:: /_static/img/bert.png - - -1. Setup --------- - -1.1 Install PyTorch and HuggingFace Transformers -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To start this tutorial, let’s first follow the installation instructions -in PyTorch `here `_ and HuggingFace Github Repo `here `_. -In addition, we also install `scikit-learn `_ package, as we will reuse its -built-in F1 score calculation helper function. - -.. code:: shell - - pip install sklearn - pip install transformers==4.29.2 - - -Because we will be using the beta parts of the PyTorch, it is -recommended to install the latest version of torch and torchvision. You -can find the most recent instructions on local installation `here -`_. For example, to install on -Mac: - -.. 
code:: shell - - yes y | pip uninstall torch tochvision - yes y | pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html - - - - -1.2 Import the necessary modules -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -In this step we import the necessary Python modules for the tutorial. - -.. code:: python - - import logging - import numpy as np - import os - import random - import sys - import time - import torch - - from argparse import Namespace - from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) - from tqdm import tqdm - from transformers import (BertConfig, BertForSequenceClassification, BertTokenizer,) - from transformers import glue_compute_metrics as compute_metrics - from transformers import glue_output_modes as output_modes - from transformers import glue_processors as processors - from transformers import glue_convert_examples_to_features as convert_examples_to_features - - # Setup logging - logger = logging.getLogger(__name__) - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.WARN) - - logging.getLogger("transformers.modeling_utils").setLevel( - logging.WARN) # Reduce logging - - print(torch.__version__) - -We set the number of threads to compare the single thread performance between FP32 and INT8 performance. -In the end of the tutorial, the user can set other number of threads by building PyTorch with right parallel backend. - -.. code:: python - - torch.set_num_threads(1) - print(torch.__config__.parallel_info()) - - -1.3 Learn about helper functions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The helper functions are built-in in transformers library. We mainly use -the following helper functions: one for converting the text examples -into the feature vectors; The other one for measuring the F1 score of -the predicted result. - -The `glue_convert_examples_to_features `_ function converts the texts into input features: - -- Tokenize the input sequences; -- Insert [CLS] in the beginning; -- Insert [SEP] between the first sentence and the second sentence, and - in the end; -- Generate token type ids to indicate whether a token belongs to the - first sequence or the second sequence. - -The `glue_compute_metrics `_ function has the compute metrics with -the `F1 score `_, which -can be interpreted as a weighted average of the precision and recall, -where an F1 score reaches its best value at 1 and worst score at 0. The -relative contribution of precision and recall to the F1 score are equal. - -- The equation for the F1 score is: -.. math:: F1 = 2 * (\text{precision} * \text{recall}) / (\text{precision} + \text{recall}) - - -1.4 Download the dataset -^^^^^^^^^^^^^^^^^^^^^^^^ - -Before running MRPC tasks we download the `GLUE data -`_ by running `this script -`_ -and unpack it to a directory ``glue_data``. - - -.. code:: shell - - python download_glue_data.py --data_dir='glue_data' --tasks='MRPC' - - -2. Fine-tune the BERT model ---------------------------- - -The spirit of BERT is to pre-train the language representations and then -to fine-tune the deep bi-directional representations on a wide range of -tasks with minimal task-dependent parameters, and achieves -state-of-the-art results. In this tutorial, we will focus on fine-tuning -with the pre-trained BERT model to classify semantically equivalent -sentence pairs on MRPC task. 
- -To fine-tune the pre-trained BERT model (``bert-base-uncased`` model in -HuggingFace transformers) for the MRPC task, you can follow the command -in `examples `_: - -.. code:: python - - export GLUE_DIR=./glue_data - export TASK_NAME=MRPC - export OUT_DIR=./$TASK_NAME/ - python ./run_glue.py \ - --model_type bert \ - --model_name_or_path bert-base-uncased \ - --task_name $TASK_NAME \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir $GLUE_DIR/$TASK_NAME \ - --max_seq_length 128 \ - --per_gpu_eval_batch_size=8 \ - --per_gpu_train_batch_size=8 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --save_steps 100000 \ - --output_dir $OUT_DIR - -We provide the fined-tuned BERT model for MRPC task `here `_. -To save time, you can download the model file (~400 MB) directly into your local folder ``$OUT_DIR``. - -2.1 Set global configurations -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Here we set the global configurations for evaluating the fine-tuned BERT -model before and after the dynamic quantization. - -.. code:: python - - configs = Namespace() - - # The output directory for the fine-tuned model, $OUT_DIR. - configs.output_dir = "./MRPC/" - - # The data directory for the MRPC task in the GLUE benchmark, $GLUE_DIR/$TASK_NAME. - configs.data_dir = "./glue_data/MRPC" - - # The model name or path for the pre-trained model. - configs.model_name_or_path = "bert-base-uncased" - # The maximum length of an input sequence - configs.max_seq_length = 128 - - # Prepare GLUE task. - configs.task_name = "MRPC".lower() - configs.processor = processors[configs.task_name]() - configs.output_mode = output_modes[configs.task_name] - configs.label_list = configs.processor.get_labels() - configs.model_type = "bert".lower() - configs.do_lower_case = True - - # Set the device, batch size, topology, and caching flags. - configs.device = "cpu" - configs.per_gpu_eval_batch_size = 8 - configs.n_gpu = 0 - configs.local_rank = -1 - configs.overwrite_cache = False - - - # Set random seed for reproducibility. - def set_seed(seed): - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - set_seed(42) - - - -2.2 Load the fine-tuned BERT model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We load the tokenizer and fine-tuned BERT sequence classifier model -(FP32) from the ``configs.output_dir``. - -.. code:: python - - tokenizer = BertTokenizer.from_pretrained( - configs.output_dir, do_lower_case=configs.do_lower_case) - - model = BertForSequenceClassification.from_pretrained(configs.output_dir) - model.to(configs.device) - - -2.3 Define the tokenize and evaluation function -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We reuse the tokenize and evaluation function from `Huggingface `_. - -.. code:: python - - # coding=utf-8 - # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. 
- - def evaluate(args, model, tokenizer, prefix=""): - # Loop to handle MNLI double evaluation (matched, mis-matched) - eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) - eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,) - - results = {} - for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs): - eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True) - - if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: - os.makedirs(eval_output_dir) - - args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) - # Note that DistributedSampler samples randomly - eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) - - # multi-gpu eval - if args.n_gpu > 1: - model = torch.nn.DataParallel(model) - - # Eval! - logger.info("***** Running evaluation {} *****".format(prefix)) - logger.info(" Num examples = %d", len(eval_dataset)) - logger.info(" Batch size = %d", args.eval_batch_size) - eval_loss = 0.0 - nb_eval_steps = 0 - preds = None - out_label_ids = None - for batch in tqdm(eval_dataloader, desc="Evaluating"): - model.eval() - batch = tuple(t.to(args.device) for t in batch) - - with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'labels': batch[3]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids - outputs = model(**inputs) - tmp_eval_loss, logits = outputs[:2] - - eval_loss += tmp_eval_loss.mean().item() - nb_eval_steps += 1 - if preds is None: - preds = logits.detach().cpu().numpy() - out_label_ids = inputs['labels'].detach().cpu().numpy() - else: - preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) - - eval_loss = eval_loss / nb_eval_steps - if args.output_mode == "classification": - preds = np.argmax(preds, axis=1) - elif args.output_mode == "regression": - preds = np.squeeze(preds) - result = compute_metrics(eval_task, preds, out_label_ids) - results.update(result) - - output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results {} *****".format(prefix)) - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - - return results - - - def load_and_cache_examples(args, task, tokenizer, evaluate=False): - if args.local_rank not in [-1, 0] and not evaluate: - torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - - processor = processors[task]() - output_mode = output_modes[task] - # Load data features from cache or dataset file - cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format( - 'dev' if evaluate else 'train', - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length), - str(task))) - if os.path.exists(cached_features_file) and not args.overwrite_cache: - logger.info("Loading features from cached file %s", cached_features_file) - features = torch.load(cached_features_file) - 
else: - logger.info("Creating features from dataset file at %s", args.data_dir) - label_list = processor.get_labels() - if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']: - # HACK(label indices are swapped in RoBERTa pretrained model) - label_list[1], label_list[2] = label_list[2], label_list[1] - examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) - features = convert_examples_to_features(examples, - tokenizer, - label_list=label_list, - max_length=args.max_seq_length, - output_mode=output_mode, - pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, - ) - if args.local_rank in [-1, 0]: - logger.info("Saving features into cached file %s", cached_features_file) - torch.save(features, cached_features_file) - - if args.local_rank == 0 and not evaluate: - torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - - # Convert to Tensors and build dataset - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) - all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) - if output_mode == "classification": - all_labels = torch.tensor([f.label for f in features], dtype=torch.long) - elif output_mode == "regression": - all_labels = torch.tensor([f.label for f in features], dtype=torch.float) - - dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) - return dataset - - -3. Apply the dynamic quantization ---------------------------------- - -We call ``torch.quantization.quantize_dynamic`` on the model to apply -the dynamic quantization on the HuggingFace BERT model. Specifically, - -- We specify that we want the torch.nn.Linear modules in our model to - be quantized; -- We specify that we want weights to be converted to quantized int8 - values. - -.. code:: python - - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - - -3.1 Check the model size -^^^^^^^^^^^^^^^^^^^^^^^^ - -Let’s first check the model size. We can observe a significant reduction -in model size (FP32 total size: 438 MB; INT8 total size: 181 MB): - -.. code:: python - - def print_size_of_model(model): - torch.save(model.state_dict(), "temp.p") - print('Size (MB):', os.path.getsize("temp.p")/1e6) - os.remove('temp.p') - - print_size_of_model(model) - print_size_of_model(quantized_model) - - -The BERT model used in this tutorial (``bert-base-uncased``) has a -vocabulary size V of 30522. With the embedding size of 768, the total -size of the word embedding table is ~ 4 (Bytes/FP32) \* 30522 \* 768 = -90 MB. So with the help of quantization, the model size of the -non-embedding table part is reduced from 350 MB (FP32 model) to 90 MB -(INT8 model). - - -3.2 Evaluate the inference accuracy and time -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Next, let’s compare the inference time as well as the evaluation -accuracy between the original FP32 model and the INT8 model after the -dynamic quantization. - -.. 
code:: python - - def time_model_evaluation(model, configs, tokenizer): - eval_start_time = time.time() - result = evaluate(configs, model, tokenizer, prefix="") - eval_end_time = time.time() - eval_duration_time = eval_end_time - eval_start_time - print(result) - print("Evaluate total time (seconds): {0:.1f}".format(eval_duration_time)) - - # Evaluate the original FP32 BERT model - time_model_evaluation(model, configs, tokenizer) - - # Evaluate the INT8 BERT model after the dynamic quantization - time_model_evaluation(quantized_model, configs, tokenizer) - - -Running this locally on a MacBook Pro, without quantization, inference -(for all 408 examples in MRPC dataset) takes about 160 seconds, and with -quantization it takes just about 90 seconds. We summarize the results -for running the quantized BERT model inference on a Macbook Pro as the -follows: - -.. code:: - - | Prec | F1 score | Model Size | 1 thread | 4 threads | - | FP32 | 0.9019 | 438 MB | 160 sec | 85 sec | - | INT8 | 0.902 | 181 MB | 90 sec | 46 sec | - -We have 0.6% lower F1 score accuracy after applying the post-training dynamic -quantization on the fine-tuned BERT model on the MRPC task. As a -comparison, in a `recent paper `_ (Table 1), -it achieved 0.8788 by -applying the post-training dynamic quantization and 0.8956 by applying -the quantization-aware training. The main difference is that we support the -asymmetric quantization in PyTorch while that paper supports the -symmetric quantization only. - -Note that we set the number of threads to 1 for the single-thread -comparison in this tutorial. We also support the intra-op -parallelization for these quantized INT8 operators. The users can now -set multi-thread by ``torch.set_num_threads(N)`` (``N`` is the number of -intra-op parallelization threads). One preliminary requirement to enable -the intra-op parallelization support is to build PyTorch with the right -`backend `_ -such as OpenMP, Native or TBB. -You can use ``torch.__config__.parallel_info()`` to check the -parallelization settings. On the same MacBook Pro using PyTorch with -Native backend for parallelization, we can get about 46 seconds for -processing the evaluation of MRPC dataset. - - -3.3 Serialize the quantized model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We can serialize and save the quantized model for the future use using -`torch.jit.save` after tracing the model. - -.. code:: python - - def ids_tensor(shape, vocab_size): - # Creates a random int32 tensor of the shape within the vocab size - return torch.randint(0, vocab_size, shape=shape, dtype=torch.int, device='cpu') - - input_ids = ids_tensor([8, 128], 2) - token_type_ids = ids_tensor([8, 128], 2) - attention_mask = ids_tensor([8, 128], vocab_size=2) - dummy_input = (input_ids, attention_mask, token_type_ids) - traced_model = torch.jit.trace(quantized_model, dummy_input) - torch.jit.save(traced_model, "bert_traced_eager_quant.pt") - -To load the quantized model, we can use `torch.jit.load` - -.. code:: python - - loaded_quantized_model = torch.jit.load("bert_traced_eager_quant.pt") - -Conclusion ----------- - -In this tutorial, we demonstrated how to convert a -well-known state-of-the-art NLP model like BERT into dynamic quantized -model. Dynamic quantization can reduce the size of the model while only -having a limited implication on accuracy. - -Thanks for reading! As always, we welcome any feedback, so please create -an issue `here `_ if you have -any. - - - -References ------------ - -[1] J.Devlin, M. Chang, K. Lee and K. 
Toutanova, `BERT: Pre-training of -Deep Bidirectional Transformers for Language Understanding (2018) -`_. - -[2] `HuggingFace Transformers `_. - -[3] O. Zafrir, G. Boudoukh, P. Izsak, and M. Wasserblat (2019). `Q8BERT: -Quantized 8bit BERT `_. diff --git a/intermediate_source/ensembling.py b/intermediate_source/ensembling.py index 9199daf13a3..cb2f42df685 100644 --- a/intermediate_source/ensembling.py +++ b/intermediate_source/ensembling.py @@ -50,7 +50,7 @@ def forward(self, x): # minibatch of size 64. Furthermore, lets say we want to combine the predictions # from 10 different models. -device = 'cuda' +device = torch.accelerator.current_accelerator() num_models = 10 data = torch.randn(100, 64, 1, 28, 28, device=device) diff --git a/intermediate_source/flask_rest_api_tutorial.py b/intermediate_source/flask_rest_api_tutorial.py deleted file mode 100644 index 8b0162a9e84..00000000000 --- a/intermediate_source/flask_rest_api_tutorial.py +++ /dev/null @@ -1,335 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Deploying PyTorch in Python via a REST API with Flask -======================================================== -**Author**: `Avinash Sajjanshetty `_ - -In this tutorial, we will deploy a PyTorch model using Flask and expose a -REST API for model inference. In particular, we will deploy a pretrained -DenseNet 121 model which detects the image. - -.. tip:: All the code used here is released under MIT license and is available on `Github `_. - -This represents the first in a series of tutorials on deploying PyTorch models -in production. Using Flask in this way is by far the easiest way to start -serving your PyTorch models, but it will not work for a use case -with high performance requirements. For that: - - - If you're already familiar with TorchScript, you can jump straight into our - `Loading a TorchScript Model in C++ `_ tutorial. - - - If you first need a refresher on TorchScript, check out our - `Intro a TorchScript `_ tutorial. -""" - - -###################################################################### -# API Definition -# -------------- -# -# We will first define our API endpoints, the request and response types. Our -# API endpoint will be at ``/predict`` which takes HTTP POST requests with a -# ``file`` parameter which contains the image. The response will be of JSON -# response containing the prediction: -# -# .. code-block:: sh -# -# {"class_id": "n02124075", "class_name": "Egyptian_cat"} -# -# - -###################################################################### -# Dependencies -# ------------ -# -# Install the required dependencies by running the following command: -# -# .. code-block:: sh -# -# pip install Flask==2.0.1 torchvision==0.10.0 - - -###################################################################### -# Simple Web Server -# ----------------- -# -# Following is a simple web server, taken from Flask's documentation - - -from flask import Flask -app = Flask(__name__) - - -@app.route('/') -def hello(): - return 'Hello World!' - -############################################################################### -# We will also change the response type, so that it returns a JSON response -# containing ImageNet class id and name. 
The updated ``app.py`` file will -# be now: - -from flask import Flask, jsonify -app = Flask(__name__) - -@app.route('/predict', methods=['POST']) -def predict(): - return jsonify({'class_id': 'IMAGE_NET_XXX', 'class_name': 'Cat'}) - - -###################################################################### -# Inference -# ----------------- -# -# In the next sections we will focus on writing the inference code. This will -# involve two parts, one where we prepare the image so that it can be fed -# to DenseNet and next, we will write the code to get the actual prediction -# from the model. -# -# Preparing the image -# ~~~~~~~~~~~~~~~~~~~ -# -# DenseNet model requires the image to be of 3 channel RGB image of size -# 224 x 224. We will also normalize the image tensor with the required mean -# and standard deviation values. You can read more about it -# `here `_. -# -# We will use ``transforms`` from ``torchvision`` library and build a -# transform pipeline, which transforms our images as required. You -# can read more about transforms `here `_. - -import io - -import torchvision.transforms as transforms -from PIL import Image - -def transform_image(image_bytes): - my_transforms = transforms.Compose([transforms.Resize(255), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - [0.485, 0.456, 0.406], - [0.229, 0.224, 0.225])]) - image = Image.open(io.BytesIO(image_bytes)) - return my_transforms(image).unsqueeze(0) - -###################################################################### -# The above method takes image data in bytes, applies the series of transforms -# and returns a tensor. To test the above method, read an image file in -# bytes mode (first replacing `../_static/img/sample_file.jpeg` with the actual -# path to the file on your computer) and see if you get a tensor back: - -with open("../_static/img/sample_file.jpeg", 'rb') as f: - image_bytes = f.read() - tensor = transform_image(image_bytes=image_bytes) - print(tensor) - -###################################################################### -# Prediction -# ~~~~~~~~~~~~~~~~~~~ -# -# Now will use a pretrained DenseNet 121 model to predict the image class. We -# will use one from ``torchvision`` library, load the model and get an -# inference. While we'll be using a pretrained model in this example, you can -# use this same approach for your own models. See more about loading your -# models in this :doc:`tutorial `. - -from torchvision import models - -# Make sure to set `weights` as `'IMAGENET1K_V1'` to use the pretrained weights: -model = models.densenet121(weights='IMAGENET1K_V1') -# Since we are using our model only for inference, switch to `eval` mode: -model.eval() - - -def get_prediction(image_bytes): - tensor = transform_image(image_bytes=image_bytes) - outputs = model.forward(tensor) - _, y_hat = outputs.max(1) - return y_hat - -###################################################################### -# The tensor ``y_hat`` will contain the index of the predicted class id. -# However, we need a human readable class name. For that we need a class id -# to name mapping. Download -# `this file `_ -# as ``imagenet_class_index.json`` and remember where you saved it (or, if you -# are following the exact steps in this tutorial, save it in -# `tutorials/_static`). This file contains the mapping of ImageNet class id to -# ImageNet class name. We will load this JSON file and get the class name of -# the predicted index. 
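# As a rough illustration (the exact entries come from the downloaded file, not
# from this tutorial), ``imagenet_class_index.json`` maps each class index,
# stored as a string key, to a ``[WordNet ID, human-readable name]`` pair:
#
#     {"0": ["n01440764", "tench"], ..., "285": ["n02124075", "Egyptian_cat"], ...}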
- -import json - -imagenet_class_index = json.load(open('../_static/imagenet_class_index.json')) - -def get_prediction(image_bytes): - tensor = transform_image(image_bytes=image_bytes) - outputs = model.forward(tensor) - _, y_hat = outputs.max(1) - predicted_idx = str(y_hat.item()) - return imagenet_class_index[predicted_idx] - - -###################################################################### -# Before using ``imagenet_class_index`` dictionary, first we will convert -# tensor value to a string value, since the keys in the -# ``imagenet_class_index`` dictionary are strings. -# We will test our above method: - - -with open("../_static/img/sample_file.jpeg", 'rb') as f: - image_bytes = f.read() - print(get_prediction(image_bytes=image_bytes)) - -###################################################################### -# You should get a response like this: - -['n02124075', 'Egyptian_cat'] - -###################################################################### -# The first item in array is ImageNet class id and second item is the human -# readable name. -# - -###################################################################### -# Integrating the model in our API Server -# --------------------------------------- -# -# In this final part we will add our model to our Flask API server. Since -# our API server is supposed to take an image file, we will update our ``predict`` -# method to read files from the requests: -# -# .. code-block:: python -# -# from flask import request -# -# @app.route('/predict', methods=['POST']) -# def predict(): -# if request.method == 'POST': -# # we will get the file from the request -# file = request.files['file'] -# # convert that to bytes -# img_bytes = file.read() -# class_id, class_name = get_prediction(image_bytes=img_bytes) -# return jsonify({'class_id': class_id, 'class_name': class_name}) -# -# -###################################################################### -# The ``app.py`` file is now complete. Following is the full version; replace -# the paths with the paths where you saved your files and it should run: -# -# .. code-block:: python -# -# import io -# import json -# -# from torchvision import models -# import torchvision.transforms as transforms -# from PIL import Image -# from flask import Flask, jsonify, request -# -# -# app = Flask(__name__) -# imagenet_class_index = json.load(open('/imagenet_class_index.json')) -# model = models.densenet121(weights='IMAGENET1K_V1') -# model.eval() -# -# -# def transform_image(image_bytes): -# my_transforms = transforms.Compose([transforms.Resize(255), -# transforms.CenterCrop(224), -# transforms.ToTensor(), -# transforms.Normalize( -# [0.485, 0.456, 0.406], -# [0.229, 0.224, 0.225])]) -# image = Image.open(io.BytesIO(image_bytes)) -# return my_transforms(image).unsqueeze(0) -# -# -# def get_prediction(image_bytes): -# tensor = transform_image(image_bytes=image_bytes) -# outputs = model.forward(tensor) -# _, y_hat = outputs.max(1) -# predicted_idx = str(y_hat.item()) -# return imagenet_class_index[predicted_idx] -# -# -# @app.route('/predict', methods=['POST']) -# def predict(): -# if request.method == 'POST': -# file = request.files['file'] -# img_bytes = file.read() -# class_id, class_name = get_prediction(image_bytes=img_bytes) -# return jsonify({'class_id': class_id, 'class_name': class_name}) -# -# -# if __name__ == '__main__': -# app.run() -# -# -###################################################################### -# Let's test our web server! Run: -# -# .. 
code-block:: sh -# -# FLASK_ENV=development FLASK_APP=app.py flask run -# -####################################################################### -# We can use the -# `requests `_ -# library to send a POST request to our app: -# -# .. code-block:: python -# -# import requests -# -# resp = requests.post("http://localhost:5000/predict", -# files={"file": open('/cat.jpg','rb')}) -# - -####################################################################### -# Printing `resp.json()` will now show the following: -# -# .. code-block:: sh -# -# {"class_id": "n02124075", "class_name": "Egyptian_cat"} -# -###################################################################### -# Next steps -# -------------- -# -# The server we wrote is quite trivial and may not do everything -# you need for your production application. So, here are some things you -# can do to make it better: -# -# - The endpoint ``/predict`` assumes that always there will be a image file -# in the request. This may not hold true for all requests. Our user may -# send image with a different parameter or send no images at all. -# -# - The user may send non-image type files too. Since we are not handling -# errors, this will break our server. Adding an explicit error handing -# path that will throw an exception would allow us to better handle -# the bad inputs -# -# - Even though the model can recognize a large number of classes of images, -# it may not be able to recognize all images. Enhance the implementation -# to handle cases when the model does not recognize anything in the image. -# -# - We run the Flask server in the development mode, which is not suitable for -# deploying in production. You can check out `this tutorial `_ -# for deploying a Flask server in production. -# -# - You can also add a UI by creating a page with a form which takes the image and -# displays the prediction. Check out the `demo `_ -# of a similar project and its `source code `_. -# -# - In this tutorial, we only showed how to build a service that could return predictions for -# a single image at a time. We could modify our service to be able to return predictions for -# multiple images at once. In addition, the `service-streamer `_ -# library automatically queues requests to your service and samples them into mini-batches -# that can be fed into your model. You can check out `this tutorial `_. -# -# - Finally, we encourage you to check out our other tutorials on deploying PyTorch models -# linked-to at the top of the page. -# diff --git a/intermediate_source/fx_conv_bn_fuser.py b/intermediate_source/fx_conv_bn_fuser.py deleted file mode 100644 index 547f93fb7f1..00000000000 --- a/intermediate_source/fx_conv_bn_fuser.py +++ /dev/null @@ -1,262 +0,0 @@ -# -*- coding: utf-8 -*- -""" -(beta) Building a Convolution/Batch Norm fuser in FX -******************************************************* -**Author**: `Horace He `_ - -In this tutorial, we are going to use FX, a toolkit for composable function -transformations of PyTorch, to do the following: - -1) Find patterns of conv/batch norm in the data dependencies. -2) For the patterns found in 1), fold the batch norm statistics into the convolution weights. - -Note that this optimization only works for models in inference mode (i.e. 
`mode.eval()`) - -We will be building the fuser that exists here: -https://github.com/pytorch/pytorch/blob/orig/release/1.8/torch/fx/experimental/fuser.py - -""" - - -###################################################################### -# First, let's get some imports out of the way (we will be using all -# of these later in the code). - -from typing import Type, Dict, Any, Tuple, Iterable -import copy -import torch.fx as fx -import torch -import torch.nn as nn - -###################################################################### -# For this tutorial, we are going to create a model consisting of convolutions -# and batch norms. Note that this model has some tricky components - some of -# the conv/batch norm patterns are hidden within Sequentials and one of the -# ``BatchNorms`` is wrapped in another Module. - -class WrappedBatchNorm(nn.Module): - def __init__(self): - super().__init__() - self.mod = nn.BatchNorm2d(1) - def forward(self, x): - return self.mod(x) - -class M(nn.Module): - def __init__(self): - super().__init__() - self.conv1 = nn.Conv2d(1, 1, 1) - self.bn1 = nn.BatchNorm2d(1) - self.conv2 = nn.Conv2d(1, 1, 1) - self.nested = nn.Sequential( - nn.BatchNorm2d(1), - nn.Conv2d(1, 1, 1), - ) - self.wrapped = WrappedBatchNorm() - - def forward(self, x): - x = self.conv1(x) - x = self.bn1(x) - x = self.conv2(x) - x = self.nested(x) - x = self.wrapped(x) - return x - -model = M() - -model.eval() - -###################################################################### -# Fusing Convolution with Batch Norm -# ----------------------------------------- -# One of the primary challenges with trying to automatically fuse convolution -# and batch norm in PyTorch is that PyTorch does not provide an easy way of -# accessing the computational graph. FX resolves this problem by symbolically -# tracing the actual operations called, so that we can track the computations -# through the `forward` call, nested within Sequential modules, or wrapped in -# an user-defined module. - -traced_model = torch.fx.symbolic_trace(model) -print(traced_model.graph) - -###################################################################### -# This gives us a graph representation of our model. Note that both the modules -# hidden within the sequential as well as the wrapped Module have been inlined -# into the graph. This is the default level of abstraction, but it can be -# configured by the pass writer. More information can be found at the FX -# overview https://pytorch.org/docs/master/fx.html#module-torch.fx - - -#################################### -# Fusing Convolution with Batch Norm -# ---------------------------------- -# Unlike some other fusions, fusion of convolution with batch norm does not -# require any new operators. Instead, as batch norm during inference -# consists of a pointwise add and multiply, these operations can be "baked" -# into the preceding convolution's weights. This allows us to remove the batch -# norm entirely from our model! Read -# https://nenadmarkus.com/p/fusing-batchnorm-and-conv/ for further details. The -# code here is copied from -# https://github.com/pytorch/pytorch/blob/orig/release/1.8/torch/nn/utils/fusion.py -# clarity purposes. -def fuse_conv_bn_eval(conv, bn): - """ - Given a conv Module `A` and an batch_norm module `B`, returns a conv - module `C` such that C(x) == B(A(x)) in inference mode. - """ - assert(not (conv.training or bn.training)), "Fusion only for eval!" 
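    # Sketch of why this works: at inference time the batch norm computes
    #   y = (conv(x) - running_mean) / sqrt(running_var + eps) * gamma + beta
    # which can be folded into the convolution itself as
    #   W' = W * gamma / sqrt(running_var + eps)   (scaled per output channel)
    #   b' = (b - running_mean) * gamma / sqrt(running_var + eps) + beta
    # so the returned conv alone reproduces C(x) == B(A(x)).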
- fused_conv = copy.deepcopy(conv) - - fused_conv.weight, fused_conv.bias = \ - fuse_conv_bn_weights(fused_conv.weight, fused_conv.bias, - bn.running_mean, bn.running_var, bn.eps, bn.weight, bn.bias) - - return fused_conv - -def fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b): - if conv_b is None: - conv_b = torch.zeros_like(bn_rm) - if bn_w is None: - bn_w = torch.ones_like(bn_rm) - if bn_b is None: - bn_b = torch.zeros_like(bn_rm) - bn_var_rsqrt = torch.rsqrt(bn_rv + bn_eps) - - conv_w = conv_w * (bn_w * bn_var_rsqrt).reshape([-1] + [1] * (len(conv_w.shape) - 1)) - conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b - - return torch.nn.Parameter(conv_w), torch.nn.Parameter(conv_b) - - -#################################### -# FX Fusion Pass -# ---------------------------------- -# Now that we have our computational graph as well as a method for fusing -# convolution and batch norm, all that remains is to iterate over the FX graph -# and apply the desired fusions. - - -def _parent_name(target : str) -> Tuple[str, str]: - """ - Splits a ``qualname`` into parent path and last atom. - For example, `foo.bar.baz` -> (`foo.bar`, `baz`) - """ - *parent, name = target.rsplit('.', 1) - return parent[0] if parent else '', name - -def replace_node_module(node: fx.Node, modules: Dict[str, Any], new_module: torch.nn.Module): - assert(isinstance(node.target, str)) - parent_name, name = _parent_name(node.target) - setattr(modules[parent_name], name, new_module) - - -def fuse(model: torch.nn.Module) -> torch.nn.Module: - model = copy.deepcopy(model) - # The first step of most FX passes is to symbolically trace our model to - # obtain a `GraphModule`. This is a representation of our original model - # that is functionally identical to our original model, except that we now - # also have a graph representation of our forward pass. - fx_model: fx.GraphModule = fx.symbolic_trace(model) - modules = dict(fx_model.named_modules()) - - # The primary representation for working with FX are the `Graph` and the - # `Node`. Each `GraphModule` has a `Graph` associated with it - this - # `Graph` is also what generates `GraphModule.code`. - # The `Graph` itself is represented as a list of `Node` objects. Thus, to - # iterate through all of the operations in our graph, we iterate over each - # `Node` in our `Graph`. - for node in fx_model.graph.nodes: - # The FX IR contains several types of nodes, which generally represent - # call sites to modules, functions, or methods. The type of node is - # determined by `Node.op`. - if node.op != 'call_module': # If our current node isn't calling a Module then we can ignore it. - continue - # For call sites, `Node.target` represents the module/function/method - # that's being called. Here, we check `Node.target` to see if it's a - # batch norm module, and then check `Node.args[0].target` to see if the - # input `Node` is a convolution. - if type(modules[node.target]) is nn.BatchNorm2d and type(modules[node.args[0].target]) is nn.Conv2d: - if len(node.args[0].users) > 1: # Output of conv is used by other nodes - continue - conv = modules[node.args[0].target] - bn = modules[node.target] - fused_conv = fuse_conv_bn_eval(conv, bn) - replace_node_module(node.args[0], modules, fused_conv) - # As we've folded the batch nor into the conv, we need to replace all uses - # of the batch norm with the conv. - node.replace_all_uses_with(node.args[0]) - # Now that all uses of the batch norm have been replaced, we can - # safely remove the batch norm. 
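            # (``Graph.erase_node`` refuses to delete a node that still has users,
            # which is why we rerouted all uses to the conv node first.)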
- fx_model.graph.erase_node(node) - fx_model.graph.lint() - # After we've modified our graph, we need to recompile our graph in order - # to keep the generated code in sync. - fx_model.recompile() - return fx_model - - -###################################################################### -# .. note:: -# We make some simplifications here for demonstration purposes, such as only -# matching 2D convolutions. View -# https://github.com/pytorch/pytorch/blob/master/torch/fx/experimental/fuser.py -# for a more usable pass. - -###################################################################### -# Testing out our Fusion Pass -# ----------------------------------------- -# We can now run this fusion pass on our initial toy model and verify that our -# results are identical. In addition, we can print out the code for our fused -# model and verify that there are no more batch norms. - - -fused_model = fuse(model) -print(fused_model.code) -inp = torch.randn(5, 1, 1, 1) -torch.testing.assert_allclose(fused_model(inp), model(inp)) - - -###################################################################### -# Benchmarking our Fusion on ResNet18 -# ----------------------------------- -# We can test our fusion pass on a larger model like ResNet18 and see how much -# this pass improves inference performance. -import torchvision.models as models -import time - -rn18 = models.resnet18() -rn18.eval() - -inp = torch.randn(10, 3, 224, 224) -output = rn18(inp) - -def benchmark(model, iters=20): - for _ in range(10): - model(inp) - begin = time.time() - for _ in range(iters): - model(inp) - return str(time.time()-begin) - -fused_rn18 = fuse(rn18) -print("Unfused time: ", benchmark(rn18)) -print("Fused time: ", benchmark(fused_rn18)) -###################################################################### -# As we previously saw, the output of our FX transformation is -# ("torchscriptable") PyTorch code, we can easily ``jit.script`` the output to try -# and increase our performance even more. In this way, our FX model -# transformation composes with TorchScript with no issues. -jit_rn18 = torch.jit.script(fused_rn18) -print("jit time: ", benchmark(jit_rn18)) - - -############ -# Conclusion -# ---------- -# As we can see, using FX we can easily write static graph transformations on -# PyTorch code. -# -# Since FX is still in beta, we would be happy to hear any -# feedback you have about using it. Please feel free to use the -# PyTorch Forums (https://discuss.pytorch.org/) and the issue tracker -# (https://github.com/pytorch/pytorch/issues) to provide any feedback -# you might have. diff --git a/intermediate_source/fx_profiling_tutorial.py b/intermediate_source/fx_profiling_tutorial.py index 8caaf7be39b..7f31338d002 100644 --- a/intermediate_source/fx_profiling_tutorial.py +++ b/intermediate_source/fx_profiling_tutorial.py @@ -216,9 +216,6 @@ def summary(self, should_sort : bool = False) -> str: # # * ``MaxPool2d`` takes up the most time. This is a known issue: # https://github.com/pytorch/pytorch/issues/51393 -# * BatchNorm2d also takes up significant time. We can continue this -# line of thinking and optimize this in the Conv-BN Fusion with FX -# `tutorial `_. 
# # # Conclusion diff --git a/intermediate_source/inductor_debug_cpu.py b/intermediate_source/inductor_debug_cpu.py index 370180d968d..4b6d62c0b0d 100644 --- a/intermediate_source/inductor_debug_cpu.py +++ b/intermediate_source/inductor_debug_cpu.py @@ -19,8 +19,8 @@ # # Meanwhile, you may also find related tutorials about ``torch.compile`` # around `basic usage `_, -# comprehensive `troubleshooting `_ -# and GPU-specific knowledge like `GPU performance profiling `_. +# comprehensive `troubleshooting `_ +# and GPU-specific knowledge like `GPU performance profiling `_. # # We will start debugging with a motivating example that triggers compilation issues and accuracy problems # by demonstrating the process of debugging to pinpoint the problems. @@ -110,7 +110,8 @@ def forward1(self, arg0_1, arg1_1): # C++ kernel in ``output_code``: # -from torch._inductor.codecache import AsyncCompile +import torch +from torch._inductor.async_compile import AsyncCompile async_compile = AsyncCompile() cpp_fused_cat_maximum_neg_0 = async_compile.cpp(''' @@ -342,7 +343,7 @@ def forward2(self, arg0_1): return (neg,) ###################################################################### -# For more usage details about Minifier, please refer to `Troubleshooting `_. +# For more usage details about Minifier, please refer to `Troubleshooting `_. ###################################################################### diff --git a/intermediate_source/memory_format_tutorial.py b/intermediate_source/memory_format_tutorial.py index 26bc5c9d53c..b3e3c04cfe2 100644 --- a/intermediate_source/memory_format_tutorial.py +++ b/intermediate_source/memory_format_tutorial.py @@ -1,13 +1,28 @@ # -*- coding: utf-8 -*- """ -(beta) Channels Last Memory Format in PyTorch +Channels Last Memory Format in PyTorch ******************************************************* **Author**: `Vitaly Fedyunin `_ -What is Channels Last ---------------------- +.. grid:: 2 -Channels last memory format is an alternative way of ordering NCHW tensors in memory preserving dimensions ordering. Channels last tensors ordered in such a way that channels become the densest dimension (aka storing images pixel-per-pixel). + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * What is the channels last memory format in PyTorch? + * How can it be used to improve performance on certain operators? + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch v1.5.0 + * A CUDA-capable GPU + +######################################################################### +# Overview - What is channels last? +# --------------------------------- + +The channels last memory format is an alternative way of ordering NCHW tensors in memory preserving dimensions ordering. Channels last tensors ordered in such a way that channels become the densest dimension (aka storing images pixel-per-pixel). For example, classic (contiguous) storage of NCHW tensor (in our case it is two 4x4 images with 3 color channels) look like this: @@ -19,7 +34,7 @@ .. figure:: /_static/img/channels_last_memory_format.png :alt: channels_last_memory_format -Pytorch supports memory formats (and provides back compatibility with existing models including eager, JIT, and TorchScript) by utilizing existing strides structure. +Pytorch supports memory formats by utilizing the existing strides structure. For example, 10x3x16x16 batch in Channels last format will have strides equal to (768, 1, 48, 3). 
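If you want to see this concretely, here is a minimal sketch (not part of the original
tutorial) that converts a tensor to channels last and prints the strides quoted above:

.. code-block:: python

    import torch

    x = torch.empty(10, 3, 16, 16)                  # contiguous NCHW storage
    x_cl = x.to(memory_format=torch.channels_last)  # same shape, channels-last storage

    print(x.stride())     # (768, 256, 16, 1)
    print(x_cl.stride())  # (768, 1, 48, 3)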
""" @@ -387,3 +402,12 @@ def attribute(m): # # If you have feedback and/or suggestions for improvement, please let us # know by creating `an issue `_. + +###################################################################### +# Conclusion +# ---------- +# +# This tutorial introduced the "channels last" memory format and demonstrated +# how to use it for performance gains. For a practical example of accelerating +# vision models using channels last, see the post +# `here `_. diff --git a/intermediate_source/model_parallel_tutorial.py b/intermediate_source/model_parallel_tutorial.py deleted file mode 100644 index 562064614b9..00000000000 --- a/intermediate_source/model_parallel_tutorial.py +++ /dev/null @@ -1,357 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Single-Machine Model Parallel Best Practices -============================================ -**Author**: `Shen Li `_ - -Model parallel is widely-used in distributed training -techniques. Previous posts have explained how to use -`DataParallel `_ -to train a neural network on multiple GPUs; this feature replicates the -same model to all GPUs, where each GPU consumes a different partition of the -input data. Although it can significantly accelerate the training process, it -does not work for some use cases where the model is too large to fit into a -single GPU. This post shows how to solve that problem by using **model parallel**, -which, in contrast to ``DataParallel``, splits a single model onto different GPUs, -rather than replicating the entire model on each GPU (to be concrete, say a model -``m`` contains 10 layers: when using ``DataParallel``, each GPU will have a -replica of each of these 10 layers, whereas when using model parallel on two GPUs, -each GPU could host 5 layers). - -The high-level idea of model parallel is to place different sub-networks of a -model onto different devices, and implement the ``forward`` method accordingly -to move intermediate outputs across devices. As only part of a model operates -on any individual device, a set of devices can collectively serve a larger -model. In this post, we will not try to construct huge models and squeeze them -into a limited number of GPUs. Instead, this post focuses on showing the idea -of model parallel. It is up to the readers to apply the ideas to real-world -applications. - -.. note:: - - For distributed model parallel training where a model spans multiple - servers, please refer to - `Getting Started With Distributed RPC Framework `__ - for examples and details. - -Basic Usage ------------ -""" - -###################################################################### -# Let us start with a toy model that contains two linear layers. To run this -# model on two GPUs, simply put each linear layer on a different GPU, and move -# inputs and intermediate outputs to match the layer devices accordingly. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -class ToyModel(nn.Module): - def __init__(self): - super(ToyModel, self).__init__() - self.net1 = torch.nn.Linear(10, 10).to('cuda:0') - self.relu = torch.nn.ReLU() - self.net2 = torch.nn.Linear(10, 5).to('cuda:1') - - def forward(self, x): - x = self.relu(self.net1(x.to('cuda:0'))) - return self.net2(x.to('cuda:1')) - -###################################################################### -# Note that, the above ``ToyModel`` looks very similar to how one would -# implement it on a single GPU, except the four ``to(device)`` calls which -# place linear layers and tensors on proper devices. 
That is the only place in -# the model that requires changes. The ``backward()`` and ``torch.optim`` will -# automatically take care of gradients as if the model is on one GPU. You only -# need to make sure that the labels are on the same device as the outputs when -# calling the loss function. - - -model = ToyModel() -loss_fn = nn.MSELoss() -optimizer = optim.SGD(model.parameters(), lr=0.001) - -optimizer.zero_grad() -outputs = model(torch.randn(20, 10)) -labels = torch.randn(20, 5).to('cuda:1') -loss_fn(outputs, labels).backward() -optimizer.step() - -###################################################################### -# Apply Model Parallel to Existing Modules -# ---------------------------------------- -# -# It is also possible to run an existing single-GPU module on multiple GPUs -# with just a few lines of changes. The code below shows how to decompose -# ``torchvision.models.resnet50()`` to two GPUs. The idea is to inherit from -# the existing ``ResNet`` module, and split the layers to two GPUs during -# construction. Then, override the ``forward`` method to stitch two -# sub-networks by moving the intermediate outputs accordingly. - - -from torchvision.models.resnet import ResNet, Bottleneck - -num_classes = 1000 - - -class ModelParallelResNet50(ResNet): - def __init__(self, *args, **kwargs): - super(ModelParallelResNet50, self).__init__( - Bottleneck, [3, 4, 6, 3], num_classes=num_classes, *args, **kwargs) - - self.seq1 = nn.Sequential( - self.conv1, - self.bn1, - self.relu, - self.maxpool, - - self.layer1, - self.layer2 - ).to('cuda:0') - - self.seq2 = nn.Sequential( - self.layer3, - self.layer4, - self.avgpool, - ).to('cuda:1') - - self.fc.to('cuda:1') - - def forward(self, x): - x = self.seq2(self.seq1(x).to('cuda:1')) - return self.fc(x.view(x.size(0), -1)) - - -###################################################################### -# The above implementation solves the problem for cases where the model is too -# large to fit into a single GPU. However, you might have already noticed that -# it will be slower than running it on a single GPU if your model fits. It is -# because, at any point in time, only one of the two GPUs are working, while -# the other one is sitting there doing nothing. The performance further -# deteriorates as the intermediate outputs need to be copied from ``cuda:0`` to -# ``cuda:1`` between ``layer2`` and ``layer3``. -# -# Let us run an experiment to get a more quantitative view of the execution -# time. In this experiment, we train ``ModelParallelResNet50`` and the existing -# ``torchvision.models.resnet50()`` by running random inputs and labels through -# them. After the training, the models will not produce any useful predictions, -# but we can get a reasonable understanding of the execution times. 
- - -import torchvision.models as models - -num_batches = 3 -batch_size = 120 -image_w = 128 -image_h = 128 - - -def train(model): - model.train(True) - loss_fn = nn.MSELoss() - optimizer = optim.SGD(model.parameters(), lr=0.001) - - one_hot_indices = torch.LongTensor(batch_size) \ - .random_(0, num_classes) \ - .view(batch_size, 1) - - for _ in range(num_batches): - # generate random inputs and labels - inputs = torch.randn(batch_size, 3, image_w, image_h) - labels = torch.zeros(batch_size, num_classes) \ - .scatter_(1, one_hot_indices, 1) - - # run forward pass - optimizer.zero_grad() - outputs = model(inputs.to('cuda:0')) - - # run backward pass - labels = labels.to(outputs.device) - loss_fn(outputs, labels).backward() - optimizer.step() - - -###################################################################### -# The ``train(model)`` method above uses ``nn.MSELoss`` as the loss function, -# and ``optim.SGD`` as the optimizer. It mimics training on ``128 X 128`` -# images which are organized into 3 batches where each batch contains 120 -# images. Then, we use ``timeit`` to run the ``train(model)`` method 10 times -# and plot the execution times with standard deviations. - - -import matplotlib.pyplot as plt -plt.switch_backend('Agg') -import numpy as np -import timeit - -num_repeat = 10 - -stmt = "train(model)" - -setup = "model = ModelParallelResNet50()" -mp_run_times = timeit.repeat( - stmt, setup, number=1, repeat=num_repeat, globals=globals()) -mp_mean, mp_std = np.mean(mp_run_times), np.std(mp_run_times) - -setup = "import torchvision.models as models;" + \ - "model = models.resnet50(num_classes=num_classes).to('cuda:0')" -rn_run_times = timeit.repeat( - stmt, setup, number=1, repeat=num_repeat, globals=globals()) -rn_mean, rn_std = np.mean(rn_run_times), np.std(rn_run_times) - - -def plot(means, stds, labels, fig_name): - fig, ax = plt.subplots() - ax.bar(np.arange(len(means)), means, yerr=stds, - align='center', alpha=0.5, ecolor='red', capsize=10, width=0.6) - ax.set_ylabel('ResNet50 Execution Time (Second)') - ax.set_xticks(np.arange(len(means))) - ax.set_xticklabels(labels) - ax.yaxis.grid(True) - plt.tight_layout() - plt.savefig(fig_name) - plt.close(fig) - - -plot([mp_mean, rn_mean], - [mp_std, rn_std], - ['Model Parallel', 'Single GPU'], - 'mp_vs_rn.png') - - -###################################################################### -# -# .. figure:: /_static/img/model-parallel-images/mp_vs_rn.png -# :alt: -# -# The result shows that the execution time of model parallel implementation is -# ``4.02/3.75-1=7%`` longer than the existing single-GPU implementation. So we -# can conclude there is roughly 7% overhead in copying tensors back and forth -# across the GPUs. There are rooms for improvements, as we know one of the two -# GPUs is sitting idle throughout the execution. One option is to further -# divide each batch into a pipeline of splits, such that when one split reaches -# the second sub-network, the following split can be fed into the first -# sub-network. In this way, two consecutive splits can run concurrently on two -# GPUs. - -###################################################################### -# Speed Up by Pipelining Inputs -# ----------------------------- -# -# In the following experiments, we further divide each 120-image batch into -# 20-image splits. As PyTorch launches CUDA operations asynchronously, the -# implementation does not need to spawn multiple threads to achieve -# concurrency. 
- - -class PipelineParallelResNet50(ModelParallelResNet50): - def __init__(self, split_size=20, *args, **kwargs): - super(PipelineParallelResNet50, self).__init__(*args, **kwargs) - self.split_size = split_size - - def forward(self, x): - splits = iter(x.split(self.split_size, dim=0)) - s_next = next(splits) - s_prev = self.seq1(s_next).to('cuda:1') - ret = [] - - for s_next in splits: - # A. ``s_prev`` runs on ``cuda:1`` - s_prev = self.seq2(s_prev) - ret.append(self.fc(s_prev.view(s_prev.size(0), -1))) - - # B. ``s_next`` runs on ``cuda:0``, which can run concurrently with A - s_prev = self.seq1(s_next).to('cuda:1') - - s_prev = self.seq2(s_prev) - ret.append(self.fc(s_prev.view(s_prev.size(0), -1))) - - return torch.cat(ret) - - -setup = "model = PipelineParallelResNet50()" -pp_run_times = timeit.repeat( - stmt, setup, number=1, repeat=num_repeat, globals=globals()) -pp_mean, pp_std = np.mean(pp_run_times), np.std(pp_run_times) - -plot([mp_mean, rn_mean, pp_mean], - [mp_std, rn_std, pp_std], - ['Model Parallel', 'Single GPU', 'Pipelining Model Parallel'], - 'mp_vs_rn_vs_pp.png') - -###################################################################### -# Please note, device-to-device tensor copy operations are synchronized on -# current streams on the source and the destination devices. If you create -# multiple streams, you have to make sure that copy operations are properly -# synchronized. Writing the source tensor or reading/writing the destination -# tensor before finishing the copy operation can lead to undefined behavior. -# The above implementation only uses default streams on both source and -# destination devices, hence it is not necessary to enforce additional -# synchronizations. -# -# .. figure:: /_static/img/model-parallel-images/mp_vs_rn_vs_pp.png -# :alt: -# -# The experiment result shows that, pipelining inputs to model parallel -# ResNet50 speeds up the training process by roughly ``3.75/2.51-1=49%``. It is -# still quite far away from the ideal 100% speedup. As we have introduced a new -# parameter ``split_sizes`` in our pipeline parallel implementation, it is -# unclear how the new parameter affects the overall training time. Intuitively -# speaking, using small ``split_size`` leads to many tiny CUDA kernel launch, -# while using large ``split_size`` results to relatively long idle times during -# the first and last splits. Neither are optimal. There might be an optimal -# ``split_size`` configuration for this specific experiment. Let us try to find -# it by running experiments using several different ``split_size`` values. - - -means = [] -stds = [] -split_sizes = [1, 3, 5, 8, 10, 12, 20, 40, 60] - -for split_size in split_sizes: - setup = "model = PipelineParallelResNet50(split_size=%d)" % split_size - pp_run_times = timeit.repeat( - stmt, setup, number=1, repeat=num_repeat, globals=globals()) - means.append(np.mean(pp_run_times)) - stds.append(np.std(pp_run_times)) - -fig, ax = plt.subplots() -ax.plot(split_sizes, means) -ax.errorbar(split_sizes, means, yerr=stds, ecolor='red', fmt='ro') -ax.set_ylabel('ResNet50 Execution Time (Second)') -ax.set_xlabel('Pipeline Split Size') -ax.set_xticks(split_sizes) -ax.yaxis.grid(True) -plt.tight_layout() -plt.savefig("split_size_tradeoff.png") -plt.close(fig) - -###################################################################### -# -# .. 
figure:: /_static/img/model-parallel-images/split_size_tradeoff.png -# :alt: -# -# The result shows that setting ``split_size`` to 12 achieves the fastest -# training speed, which leads to ``3.75/2.43-1=54%`` speedup. There are -# still opportunities to further accelerate the training process. For example, -# all operations on ``cuda:0`` is placed on its default stream. It means that -# computations on the next split cannot overlap with the copy operation of the -# ``prev`` split. However, as ``prev`` and next splits are different tensors, there is -# no problem to overlap one's computation with the other one's copy. The -# implementation need to use multiple streams on both GPUs, and different -# sub-network structures require different stream management strategies. As no -# general multi-stream solution works for all model parallel use cases, we will -# not discuss it in this tutorial. -# -# **Note:** -# -# This post shows several performance measurements. You might see different -# numbers when running the same code on your own machine, because the result -# depends on the underlying hardware and software. To get the best performance -# for your environment, a proper approach is to first generate the curve to -# figure out the best split size, and then use that split size to pipeline -# inputs. -# diff --git a/intermediate_source/model_parallel_tutorial.rst b/intermediate_source/model_parallel_tutorial.rst new file mode 100644 index 00000000000..d687caf4634 --- /dev/null +++ b/intermediate_source/model_parallel_tutorial.rst @@ -0,0 +1,10 @@ +Single-Machine Model Parallel Best Practices +============================================ + +This tutorial has been deprecated. + +Redirecting to latest parallelism APIs in 3 seconds... + +.. raw:: html + + diff --git a/intermediate_source/monarch_distributed_tutorial.rst b/intermediate_source/monarch_distributed_tutorial.rst new file mode 100644 index 00000000000..33909d2baad --- /dev/null +++ b/intermediate_source/monarch_distributed_tutorial.rst @@ -0,0 +1,449 @@ +========================================================== +Interactive Distributed Applications with Monarch +========================================================== + +**Author**: `Amir Afzali `_ + +Introduction +------------ + +As deep learning models continue to grow in size and complexity, training them efficiently requires coordinating computation across multiple GPUs and nodes. +In this tutorial, you will learn how to easily set up and run large-scale distributed workflows using Monarch's actor framework together with TorchTitan, on a SLURM-managed cluster. +Monarch will allow us to drive a large cluster of machines (organized into a mesh), as if developing on a single host, single process environment. + +What is Monarch? +^^^^^^^^^^^^^^^^ + +Monarch is an actor framework designed to streamline the development of distributed applications. At its core, Monarch provides: + +- **Actor-based programming model**: Encapsulate stateful computations in actors that can run on remote processes and machines +- **Process mesh abstractions**: Easily manage and coordinate distributed processes across your cluster, with scalable Actor messaging +- **Fault tolerance**: Actors and processes form a tree and failures propagate up the tree, providing good default error behavior and enabling fine-grained fault recovery. 
+- **Flexible resource management**: Support for multiple cluster schedulers including SLURM, Kubernetes, custom host management, and local processes +- **Integrated monitoring**: Stream logs from remote processes back to your client for easy debugging and aggregation + +For more details, see the `Monarch documentation `_. + +Why Use Monarch? +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +TorchTitan is a PyTorch native library for pre-training at scale. +While TorchTitan provides excellent primitives for distributed training, launching and managing these jobs across clusters can slow down iteration. Monarch addresses this with: + +1. **Simplified cluster interaction**: Reserve and manage compute resources with simple async Python calls instead of writing bash scripts +2. **Interactive development**: Modify and re-run training code on existing allocations without waiting for new resources +3. **Unified workflow**: Seamlessly move between local testing and cluster execution with the same code + +Prerequisites +------------- + +We rely on a nightly build of Titan for this tutorial, so please ensure that other Torch libraries are tracking nightly builds: + +1. **Monarch nightly installed:** + `Install script `_ +2. **TorchTitan nightly installed:** + `TorchTitan install instructions `_ +3. **A valid Titan model config** and **tokenizer** in your working directory (e.g., ``debug_model.toml`` from `TorchTitan configs `_). +4. **SLURM cluster access:** + + - Sufficient permissions to reserve nodes and launch jobs. + - CUDA environment configured for distributed GPU training. + + +Now let's implement this step by step! + +Step 1: Reserve Machine Resources +--------------------------------- + +First, we'll define a function to programmatically reserve a machine allocation. + +**Monarch Highlight**: Instead of submitting an SBATCH script, you can reserve and manage resources interactively from Python. +The JobTrait design pattern allows for interfacing with custom schedulers, such as SLURM and Kubernetes, through a consistent API. + +.. code-block:: python + + from monarch.job import SlurmJob, JobTrait + + + def create_slurm_job( + mesh_name: str, + num_nodes: int, + gpus_per_node: int, + time_limit: str = "06:00:00" + ) -> SlurmJob: + """ + Args: + mesh_name: Name assigned to the primary mesh for this example. + A JobTrait can consist of multiple meshes, and + Monarch allows for re-attaching to ongoing jobs. + num_nodes: Number of nodes allocated per mesh + gpus_per_node: Number of GPUs per node in the mesh + + Note: SlurmJob is just one instance of a Monarch scheduler interface. + Consult the JobTrait documentation to find one that's right for your usecase. + """ + default_job_name = "monarch_titan" + return SlurmJob( + meshes={mesh_name: num_nodes}, + job_name=default_job_name, + time_limit=time_limit, + gpus_per_nodes=gpus_per_node, + # ... additional args can be passed here + ) + +Step 2: Define the Trainer Actor +-------------------------------- + +Now we create a Monarch Actor that wraps TorchTitan's Trainer. This is the +key abstraction that allows TorchTitan to run in Monarch's distributed +environment. + +**Monarch Highlight**: The Actor pattern provides several benefits: + +1. **Remote execution**: Methods marked with @endpoint can be called remotely +2. **Lifecycle management**: Monarch handles initialization, execution, and cleanup +3. **Error handling**: Exceptions are properly propagated back to the client, enabling progressive error handling + +.. 
code-block:: python

    import torch
    from monarch.actor import Actor, current_rank, endpoint
    from monarch.utils import setup_env_for_distributed
    from torchtitan.tools.logging import init_logger, logger
    from torchtitan.train import Trainer


    class TrainerActor(Actor):
        """
        Monarch Actor wrapper for TorchTitan's Trainer.

        This actor encapsulates a complete TorchTitan training process, handling
        initialization, training loop execution, and cleanup. Each instance runs
        on a single GPU in the distributed training job.

        The actor's lifetime:

        1. __init__: initialize with the job configuration
        2. start_training: execute the training loop, then destroy the process
           group and release resources

        Attributes:
            job_config: TorchTitan configuration for this trainer
            rank: This actor's rank in the process mesh
            uid: Unique identifier for logging (includes the rank)
        """

        def __init__(self, job_config: "JobConfig") -> None:
            """
            Initialize the trainer actor.

            Args:
                job_config: TorchTitan JobConfig with training parameters
            """
            self.job_config = job_config

            # current_rank() provides access to this actor's rank in the process mesh
            self.rank = current_rank().rank
            self.uid = f"[trainer_{self.rank}]"

        @endpoint
        async def ping_rank(self) -> None:
            """
            A dummy logging endpoint we will use for demonstration purposes.
            """
            logger.info(f"{self.uid} Ping!")

        @endpoint
        async def start_training(self) -> None:
            """
            Execute the TorchTitan training loop.

            This remote endpoint:

            1. Initializes TorchTitan's logger
            2. Creates a Trainer instance with the job configuration
            3. Runs the training loop
            4. Handles cleanup and error conditions

            The @endpoint decorator makes this method callable from the Monarch
            client, even though it runs on a remote GPU node.

            Raises:
                Exception: Any exception from TorchTitan training is propagated
                    back to the client
            """
            init_logger()
            trainer: Trainer | None = None
            try:
                # Initialize the TorchTitan trainer
                trainer = Trainer(self.job_config)
                logger.info(f"{self.uid} initialized successfully and starting training")

                # Run the training loop
                trainer.train()

            except Exception as e:
                logger.error(f"{self.uid} training failed: {e}")
                if trainer:
                    trainer.close()
                # Note: the error is propagated back to the controller
                raise e

            else:
                # Training completed successfully
                trainer.close()
                logger.info(f"{self.uid} training completed successfully")

            finally:
                # Clean up the distributed process group
                torch.distributed.destroy_process_group()
                logger.info(f"{self.uid} trainer cleaned up")

Actor endpoints can be invoked in a variety of patterns. We'll explore a concrete example
in `Step 4: Execute the Training Workflow`_, but here is some pseudocode with common usages:

.. code-block:: python

    try:
        # where mesh0 is made of N nodes, each node having 8 GPUs
        proc_mesh = mesh0.spawn_procs({"gpus": 8})
        trainer_actors = proc_mesh.spawn("trainers", TrainerActor, ...)

        # Call on all ranks
        await trainer_actors.ping_rank.call()

        # Call-and-forget on all ranks
        trainer_actors.ping_rank.broadcast()

        # Call on ONE random rank
        await trainer_actors.ping_rank.choose()

        # Call on the first 3 ranks of node 0
        await trainer_actors.slice(hosts=0, gpus=slice(0, 3)).ping_rank.call()

    except Exception as e:
        # handle SupervisionEvents from remote actor failures
        pass

Remote actor endpoints can also utilize Python native breakpoints, enabling interactive debugging sessions.
+For a complete deep-dive into Monarch debuggers, please `refer to the documentation `_. + +.. code-block:: python + + @endpoint + async def ping_debuggable_rank(self) -> None: + logger.info(f"{self.uid} Ping!") + if self.rank == 0: + breakpoint() + logger.info(f"{self.uid} Pong!") + + +Step 3: Define Training Parameters +----------------------------------- + +Next, we define some common parameters for our training job and cluster resources. +This configuration determines both the scale of training (number of nodes and GPUs), +and some of the training hyperparameters. + +.. code-block:: python + + from dataclasses import dataclass + + + @dataclass + class RunParams: + """ + Configuration for cluster resources and training parameters. + + Attributes: + training_steps: Number of training iterations to run + model_config: Path to TorchTitan model configuration file + tokenizer: Path to tokenizer directory + dataset: Dataset to use for training (e.g., 'c4', 'c4_test') + num_nodes: Number of compute nodes to request + gpus_per_node: Number of GPUs per node + + Adjust these values based on your model size and available resources. + """ + + training_steps: int = 50 + model_config: str = "debug_model.toml" + tokenizer: str = "tokenizer" + dataset: str = "c4" + num_nodes: int = 2 + gpus_per_node: int = 8 + +TorchTitan uses a JobConfig object to control all aspects of training. +Here we create a function that parses this configuration from our RunParams. + +.. code-block:: python + + import os + from torchtitan.config import ConfigManager, JobConfig + + + def make_job_config() -> JobConfig: + """ + Create a TorchTitan JobConfig from RunParams. + + This function constructs the complete training configuration, including + parallelism settings, model architecture, and dataset paths + """ + # Calculate total parallelism based on cluster size + data_parallel_shard_degree = RunParams.num_nodes * RunParams.gpus_per_node + output_path = "./outputs" + # Construct paths relative to script directory + script_dir = os.getcwd() + + # Build argument list for TorchTitan's ConfigManager + # These override defaults from the model config file + default_args = [ + "--job.config_file", + os.path.join(script_dir, RunParams.model_config), + "--model.tokenizer_path", + os.path.join(script_dir, RunParams.tokenizer), + "--parallelism.data_parallel_shard_degree", + str(data_parallel_shard_degree), + "--training.steps", + str(RunParams.training_steps), + "--training.dataset", + RunParams.dataset, + "--job.dump_folder", + output_path, + # continue to configure as needed + ] + config_manager = ConfigManager() + job_config = config_manager.parse_args(default_args) + return job_config + +Step 4: Execute the Training Workflow +-------------------------------------- + +With all components defined, we now orchestrate the complete workflow. +This is where Monarch's power becomes most apparent. + +**Monarch Highlights**: + +1. **Interactive iteration**: After reserving the machine allocation, you can adjust your logic + and re-spawn actors, without requesting new resources. SLURM's shared filesystem ensures + that framework/workspace changes are synchronized across workers. +2. **Transparent logging**: All logs from remote workers stream back to your + client in real-time, making debugging feel like local execution + +**Workflow**: + + Reserve Machines → Create Proc Mesh → Configure Logging → Spawn Actors → Train → Cleanup + +.. 
code-block:: python + + async def execute_training() -> None: + """ + Execute the complete distributed training workflow. + """ + job_config = make_job_config() + slurm_job = None + mesh_name = "mesh0" + try: + # 1. Create a SLURM job with N nodes + # This leverages Monarch to reserve a persistent machine allocation + slurm_job = create_slurm_job(mesh_name, RunParams.num_nodes, RunParams.gpus_per_node) + job_state = slurm_job.state() + + # 2. Create a process mesh on the machine allocation + # This creates one process per GPU across all allocated nodes + logger.info("Creating process mesh...") + proc_mesh = job_state.mesh0.spawn_procs({"gpus": RunParams.gpus_per_node}) + + # 3. Configure remote logging behavior + # - stream_to_client: Forward all remote logs to your local console + # - aggregate_window_sec: Batch logs for efficiency + logger.info("Configuring logging...") + await proc_mesh.logging_option( + stream_to_client=True, + # aggregate_window_sec=None # Uncomment to disable log batching + ) + + # 4. Setup environment for torch.distributed + # This configures torch.distributed across all processes in the mesh + logger.info("Setting up distributed environment...") + await setup_env_for_distributed(proc_mesh) + + # 5. Spawn TrainerActor on each GPU + # Each process in the mesh creates its own TrainerActor instance + logger.info("Spawning trainer actors...") + trainer = proc_mesh.spawn( + "trainer_actor", # Name for the actor group + TrainerActor, # Actor class to instantiate + job_config, # Arguments to __init__ + ) + + # 6. Execute the training job across all actors + # The .call() method invokes start_training() on all actors in parallel + logger.info("Starting distributed training...") + await trainer.start_training.call() + + logger.info("Training completed successfully!") + + except Exception as e: + logger.error(f"Training workflow failed: {e}") + + finally: + # Always clean up the machine allocation + if slurm_job: + await cleanup_job(slurm_job) + +Step 5: Clean Up Resources +-------------------------- + +After training completes (or if you're done experimenting), it's important +to free up cluster resources by terminating the SLURM job. + +**Monarch Highlight**: While you can keep allocations alive for multiple +training runs during development, always remember to release cluster resources. + +.. code-block:: python + + async def cleanup_job(job: JobTrait) -> None: + """ + This function cancels the SLURM job, releasing all reserved nodes back + to the cluster for other users. + + Args: + job: A JobTrait, like the one returned from create_slurm_job() + + Note: + The job will also terminate automatically when the configured TTL + is exceeded, but explicit cleanup is recommended for long-running + notebooks or scripts. + """ + job.kill() + logger.info("Job terminated successfully") + +Step 6: Run the Complete Pipeline +--------------------------------- + +Finally, we tie everything together in a main function that kicks off the workflow + +.. code-block:: python + + import asyncio + + + if __name__ == "__main__": + """ + Run the complete workflow: reserve resources, train, and cleanup. + """ + logger.info("Starting Monarch + TorchTitan Distributed Training") + + asyncio.run(execute_training()) + + logger.info("Workflow completed!") + +Conclusion +----------- + +Congrats! In this tutorial, you learned how to apply Monarch's actor framework with +TorchTitan for scalable distributed training. 
+ +**Further Reading** + +- Monarch also integrates with TorchFT to provide per-step fault-tolerance across replicated workers. You can find a comprehensive `proof of concept `_ of this integration in the TorchFT repo. +- For an interactive notebook covering similar topics to this tutorial, please consult `this Monarch example `_. \ No newline at end of file diff --git a/intermediate_source/neural_tangent_kernels.py b/intermediate_source/neural_tangent_kernels.py index 62a49794af5..d70d5c5dca3 100644 --- a/intermediate_source/neural_tangent_kernels.py +++ b/intermediate_source/neural_tangent_kernels.py @@ -13,7 +13,7 @@ .. note:: - This tutorial requires PyTorch 2.0.0 or later. + This tutorial requires PyTorch 2.6.0 or later. Setup ----- @@ -24,7 +24,12 @@ import torch import torch.nn as nn from torch.func import functional_call, vmap, vjp, jvp, jacrev -device = 'cuda' if torch.cuda.device_count() > 0 else 'cpu' + +if torch.accelerator.is_available() and torch.accelerator.device_count() > 0: + device = torch.accelerator.current_accelerator() +else: + device = torch.device("cpu") + class CNN(nn.Module): def __init__(self): diff --git a/intermediate_source/nlp_from_scratch_index.rst b/intermediate_source/nlp_from_scratch_index.rst new file mode 100644 index 00000000000..95f70746cbc --- /dev/null +++ b/intermediate_source/nlp_from_scratch_index.rst @@ -0,0 +1,48 @@ +NLP from Scratch +================ + +In these three-part series you will build and train +a basic character-level Recurrent Neural Network (RNN) to classify words. + +You will learn: + +* How to construct Recurrent Neural Networks from scratch +* Essential data handling techniques for NLP +* How to train an RNN to identify the language origin of words. + +Before you begin, we recommend that you review the following: + +* `PyTorch Learn the Basics series `__ +* `How to install PyTorch `__ + +.. grid:: 3 + + .. grid-item-card:: :octicon:`file-code;1em` + NLP From Scratch - Part 1: Classifying Names with a Character-Level RNN + :link: https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html + :link-type: url + + Learn how to use an RNN to classify names into their language of origin. + +++ + :octicon:`code;1em` Code + + .. grid-item-card:: :octicon:`file-code;1em` + NLP From Scratch - Part 2: Generating Names with a Character-Level RNN + :link: https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html + :link-type: url + + Expand the RNN we created in Part 1 to generate names from languages. + +++ + :octicon:`code;1em` Code + + .. grid-item-card:: :octicon:`file-code;1em` + NLP From Scratch - Part 3: Translation with a Sequence to Sequence Network and Attention + :link: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html + :link-type: url + + Create a sequence-to-sequence model that can translate your text from French + to English. 
+ +++ + :octicon:`code;1em` Code + + diff --git a/intermediate_source/per_sample_grads.py b/intermediate_source/per_sample_grads.py index ece80d3f94f..a5ece9303fc 100644 --- a/intermediate_source/per_sample_grads.py +++ b/intermediate_source/per_sample_grads.py @@ -169,7 +169,7 @@ def compute_loss(params, buffers, sample, target): # results of hand processing each one individually: for per_sample_grad, ft_per_sample_grad in zip(per_sample_grads, ft_per_sample_grads.values()): - assert torch.allclose(per_sample_grad, ft_per_sample_grad, atol=3e-3, rtol=1e-5) + assert torch.allclose(per_sample_grad, ft_per_sample_grad, atol=1.2e-1, rtol=1e-5) ###################################################################### # A quick note: there are limitations around what types of functions can be diff --git a/intermediate_source/pinmem_nonblock.py b/intermediate_source/pinmem_nonblock.py new file mode 100644 index 00000000000..4d82a06a989 --- /dev/null +++ b/intermediate_source/pinmem_nonblock.py @@ -0,0 +1,769 @@ +# -*- coding: utf-8 -*- +""" +A guide on good usage of ``non_blocking`` and ``pin_memory()`` in PyTorch +========================================================================= + +**Author**: `Vincent Moens `_ + +Introduction +------------ + +Transferring data from the CPU to the GPU is fundamental in many PyTorch applications. +It's crucial for users to understand the most effective tools and options available for moving data between devices. +This tutorial examines two key methods for device-to-device data transfer in PyTorch: +:meth:`~torch.Tensor.pin_memory` and :meth:`~torch.Tensor.to` with the ``non_blocking=True`` option. + +What you will learn +~~~~~~~~~~~~~~~~~~~ + +Optimizing the transfer of tensors from the CPU to the GPU can be achieved through asynchronous transfers and memory +pinning. However, there are important considerations: + +- Using ``tensor.pin_memory().to(device, non_blocking=True)`` can be up to twice as slow as a straightforward ``tensor.to(device)``. +- Generally, ``tensor.to(device, non_blocking=True)`` is an effective choice for enhancing transfer speed. +- While ``cpu_tensor.to("cuda", non_blocking=True).mean()`` executes correctly, attempting + ``cuda_tensor.to("cpu", non_blocking=True).mean()`` will result in erroneous outputs. + +Preamble +~~~~~~~~ + +The performance reported in this tutorial are conditioned on the system used to build the tutorial. +Although the conclusions are applicable across different systems, the specific observations may vary slightly +depending on the hardware available, especially on older hardware. +The primary objective of this tutorial is to offer a theoretical framework for understanding CPU to GPU data transfers. +However, any design decisions should be tailored to individual cases and guided by benchmarked throughput measurements, +as well as the specific requirements of the task at hand. + +""" + +import torch + +assert torch.cuda.is_available(), "A cuda device is required to run this tutorial" + + +###################################################################### +# +# This tutorial requires tensordict to be installed. If you don't have tensordict in your environment yet, install it +# by running the following command in a separate cell: +# +# .. code-block:: bash +# +# # Install tensordict with the following command +# !pip3 install tensordict +# +# We start by outlining the theory surrounding these concepts, and then move to concrete test examples of the features. +# +# +# Background +# ---------- +# +# .. 
_pinned_memory_background: +# +# Memory management basics +# ~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. _pinned_memory_memory: +# +# When one creates a CPU tensor in PyTorch, the content of this tensor needs to be placed +# in memory. The memory we talk about here is a rather complex concept worth looking at carefully. +# We distinguish two types of memory that are handled by the Memory Management Unit: the RAM (for simplicity) +# and the swap space on disk (which may or may not be the hard drive). Together, the available space in disk and RAM (physical memory) +# make up the virtual memory, which is an abstraction of the total resources available. +# In short, the virtual memory makes it so that the available space is larger than what can be found on RAM in isolation +# and creates the illusion that the main memory is larger than it actually is. +# +# In normal circumstances, a regular CPU tensor is pageable which means that it is divided in blocks called pages that +# can live anywhere in the virtual memory (both in RAM or on disk). As mentioned earlier, this has the advantage that +# the memory seems larger than what the main memory actually is. +# +# Typically, when a program accesses a page that is not in RAM, a "page fault" occurs and the operating system (OS) then brings +# back this page into RAM ("swap in" or "page in"). +# In turn, the OS may have to swap out (or "page out") another page to make room for the new page. +# +# In contrast to pageable memory, a pinned (or page-locked or non-pageable) memory is a type of memory that cannot +# be swapped out to disk. +# It allows for faster and more predictable access times, but has the downside that it is more limited than the +# pageable memory (aka the main memory). +# +# .. figure:: /_static/img/pinmem/pinmem.png +# :alt: +# +# CUDA and (non-)pageable memory +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. _pinned_memory_cuda_pageable_memory: +# +# To understand how CUDA copies a tensor from CPU to CUDA, let's consider the two scenarios above: +# +# - If the memory is page-locked, the device can access the memory directly in the main memory. The memory addresses are well +# defined and functions that need to read these data can be significantly accelerated. +# - If the memory is pageable, all the pages will have to be brought to the main memory before being sent to the GPU. +# This operation may take time and is less predictable than when executed on page-locked tensors. +# +# More precisely, when CUDA sends pageable data from CPU to GPU, it must first create a page-locked copy of that data +# before making the transfer. +# +# Asynchronous vs. Synchronous Operations with ``non_blocking=True`` (CUDA ``cudaMemcpyAsync``) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. _pinned_memory_async_sync: +# +# When executing a copy from a host (such as, CPU) to a device (such as, GPU), the CUDA toolkit offers modalities to do these +# operations synchronously or asynchronously with respect to the host. +# +# In practice, when calling :meth:`~torch.Tensor.to`, PyTorch always makes a call to +# `cudaMemcpyAsync `_. +# If ``non_blocking=False`` (default), a ``cudaStreamSynchronize`` will be called after each and every ``cudaMemcpyAsync``, making +# the call to :meth:`~torch.Tensor.to` blocking in the main thread. +# If ``non_blocking=True``, no synchronization is triggered, and the main thread on the host is not blocked. 
+# Therefore, from the host perspective, multiple tensors can be sent to the device simultaneously, +# as the thread does not need to wait for one transfer to be completed to initiate the other. +# +# .. note:: In general, the transfer is blocking on the device side (even if it isn't on the host side): +# the copy on the device cannot occur while another operation is being executed. +# However, in some advanced scenarios, a copy and a kernel execution can be done simultaneously on the GPU side. +# As the following example will show, three requirements must be met to enable this: +# +# 1. The device must have at least one free DMA (Direct Memory Access) engine. Modern GPU architectures such as Volterra, +# Tesla, or H100 devices have more than one DMA engine. +# +# 2. The transfer must be done on a separate, non-default cuda stream. In PyTorch, cuda streams can be handles using +# :class:`~torch.cuda.Stream`. +# +# 3. The source data must be in pinned memory. +# +# We demonstrate this by running profiles on the following script. +# + +import contextlib + +from torch.cuda import Stream + + +s = Stream() + +torch.manual_seed(42) +t1_cpu_pinned = torch.randn(1024**2 * 5, pin_memory=True) +t2_cpu_paged = torch.randn(1024**2 * 5, pin_memory=False) +t3_cuda = torch.randn(1024**2 * 5, device="cuda:0") + +assert torch.cuda.is_available() +device = torch.device("cuda", torch.cuda.current_device()) + + +# The function we want to profile +def inner(pinned: bool, streamed: bool): + with torch.cuda.stream(s) if streamed else contextlib.nullcontext(): + if pinned: + t1_cuda = t1_cpu_pinned.to(device, non_blocking=True) + else: + t2_cuda = t2_cpu_paged.to(device, non_blocking=True) + t_star_cuda_h2d_event = s.record_event() + # This operation can be executed during the CPU to GPU copy if and only if the tensor is pinned and the copy is + # done in the other stream + t3_cuda_mul = t3_cuda * t3_cuda * t3_cuda + t3_cuda_h2d_event = torch.cuda.current_stream().record_event() + t_star_cuda_h2d_event.synchronize() + t3_cuda_h2d_event.synchronize() + + +# Our profiler: profiles the `inner` function and stores the results in a .json file +def benchmark_with_profiler( + pinned, + streamed, +) -> None: + torch._C._profiler._set_cuda_sync_enabled_val(True) + wait, warmup, active = 1, 1, 2 + num_steps = wait + warmup + active + rank = 0 + with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + schedule=torch.profiler.schedule( + wait=wait, warmup=warmup, active=active, repeat=1, skip_first=1 + ), + ) as prof: + for step_idx in range(1, num_steps + 1): + inner(streamed=streamed, pinned=pinned) + if rank is None or rank == 0: + prof.step() + prof.export_chrome_trace(f"trace_streamed{int(streamed)}_pinned{int(pinned)}.json") + + +###################################################################### +# Loading these profile traces in chrome (``chrome://tracing``) shows the following results: first, let's see +# what happens if both the arithmetic operation on ``t3_cuda`` is executed after the pageable tensor is sent to GPU +# in the main stream: +# + +benchmark_with_profiler(streamed=False, pinned=False) + +###################################################################### +# .. 
figure:: /_static/img/pinmem/trace_streamed0_pinned0.png
+#    :alt:
+#
+# Using a pinned tensor doesn't change the trace much; both operations are still executed consecutively:

+benchmark_with_profiler(streamed=False, pinned=True)
+
+######################################################################
+#
+# .. figure:: /_static/img/pinmem/trace_streamed0_pinned1.png
+#    :alt:
+#
+# Sending a pageable tensor to GPU on a separate stream is also a blocking operation:
+
+benchmark_with_profiler(streamed=True, pinned=False)
+
+######################################################################
+#
+# .. figure:: /_static/img/pinmem/trace_streamed1_pinned0.png
+#    :alt:
+#
+# Only copies of pinned tensors to the GPU on a separate stream overlap with another CUDA kernel executed
+# on the main stream:
+
+benchmark_with_profiler(streamed=True, pinned=True)
+
+######################################################################
+#
+# .. figure:: /_static/img/pinmem/trace_streamed1_pinned1.png
+#    :alt:
+#
+# A PyTorch perspective
+# ---------------------
+#
+# .. _pinned_memory_pt_perspective:
+#
+# ``pin_memory()``
+# ~~~~~~~~~~~~~~~~
+#
+# .. _pinned_memory_pinned:
+#
+# PyTorch offers the possibility to create and send tensors to page-locked memory through the
+# :meth:`~torch.Tensor.pin_memory` method and constructor arguments.
+# CPU tensors on a machine where CUDA is initialized can be cast to pinned memory through the :meth:`~torch.Tensor.pin_memory`
+# method. Importantly, ``pin_memory`` is blocking on the main thread of the host: it will wait for the tensor to be copied to
+# page-locked memory before executing the next operation.
+# New tensors can be directly created in pinned memory with functions like :func:`~torch.zeros`, :func:`~torch.ones` and other
+# constructors.
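+#
+# For example, here is a minimal sketch (the shapes are arbitrary) showing the two
+# ways mentioned above to obtain a page-locked tensor:
+#
+# .. code-block:: python
+#
+#    import torch
+#
+#    # Allocate the tensor directly in pinned memory via the constructor argument
+#    pinned_a = torch.zeros(1024, 1024, pin_memory=True)
+#
+#    # Pin an existing pageable tensor; this call blocks the host until the
+#    # copy to page-locked memory has completed
+#    pinned_b = torch.randn(1024, 1024).pin_memory()
+#
+#    assert pinned_a.is_pinned() and pinned_b.is_pinned()
+#
+# Allocating directly in pinned memory skips the intermediate pageable allocation
+# that an explicit ``pin_memory()`` call would otherwise have to copy from.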
+# +# Let us check the speed of pinning memory and sending tensors to CUDA: + + +import torch +import gc +from torch.utils.benchmark import Timer +import matplotlib.pyplot as plt + + +def timer(cmd): + median = ( + Timer(cmd, globals=globals()) + .adaptive_autorange(min_run_time=1.0, max_run_time=20.0) + .median + * 1000 + ) + print(f"{cmd}: {median: 4.4f} ms") + return median + + +# A tensor in pageable memory +pageable_tensor = torch.randn(1_000_000) + +# A tensor in page-locked (pinned) memory +pinned_tensor = torch.randn(1_000_000, pin_memory=True) + +# Runtimes: +pageable_to_device = timer("pageable_tensor.to('cuda:0')") +pinned_to_device = timer("pinned_tensor.to('cuda:0')") +pin_mem = timer("pageable_tensor.pin_memory()") +pin_mem_to_device = timer("pageable_tensor.pin_memory().to('cuda:0')") + +# Ratios: +r1 = pinned_to_device / pageable_to_device +r2 = pin_mem_to_device / pageable_to_device + +# Create a figure with the results +fig, ax = plt.subplots() + +xlabels = [0, 1, 2] +bar_labels = [ + "pageable_tensor.to(device) (1x)", + f"pinned_tensor.to(device) ({r1:4.2f}x)", + f"pageable_tensor.pin_memory().to(device) ({r2:4.2f}x)" + f"\npin_memory()={100*pin_mem/pin_mem_to_device:.2f}% of runtime.", +] +values = [pageable_to_device, pinned_to_device, pin_mem_to_device] +colors = ["tab:blue", "tab:red", "tab:orange"] +ax.bar(xlabels, values, label=bar_labels, color=colors) + +ax.set_ylabel("Runtime (ms)") +ax.set_title("Device casting runtime (pin-memory)") +ax.set_xticks([]) +ax.legend() + +plt.show() + +# Clear tensors +del pageable_tensor, pinned_tensor +_ = gc.collect() + +###################################################################### +# +# We can observe that casting a pinned-memory tensor to GPU is indeed much faster than a pageable tensor, because under +# the hood, a pageable tensor must be copied to pinned memory before being sent to GPU. +# +# However, contrary to a somewhat common belief, calling :meth:`~torch.Tensor.pin_memory()` on a pageable tensor before +# casting it to GPU should not bring any significant speed-up, on the contrary this call is usually slower than just +# executing the transfer. This makes sense, since we're actually asking Python to execute an operation that CUDA will +# perform anyway before copying the data from host to device. +# +# .. note:: The PyTorch implementation of +# `pin_memory `_ +# which relies on creating a brand new storage in pinned memory through `cudaHostAlloc `_ +# could be, in rare cases, faster than transitioning data in chunks as ``cudaMemcpy`` does. +# Here too, the observation may vary depending on the available hardware, the size of the tensors being sent or +# the amount of available RAM. +# +# ``non_blocking=True`` +# ~~~~~~~~~~~~~~~~~~~~~ +# +# .. _pinned_memory_non_blocking: +# +# As mentioned earlier, many PyTorch operations have the option of being executed asynchronously with respect to the host +# through the ``non_blocking`` argument. +# +# Here, to account accurately of the benefits of using ``non_blocking``, we will design a slightly more complex +# experiment since we want to assess how fast it is to send multiple tensors to GPU with and without calling +# ``non_blocking``. 
+# + + +# A simple loop that copies all tensors to cuda +def copy_to_device(*tensors): + result = [] + for tensor in tensors: + result.append(tensor.to("cuda:0")) + return result + + +# A loop that copies all tensors to cuda asynchronously +def copy_to_device_nonblocking(*tensors): + result = [] + for tensor in tensors: + result.append(tensor.to("cuda:0", non_blocking=True)) + # We need to synchronize + torch.cuda.synchronize() + return result + + +# Create a list of tensors +tensors = [torch.randn(1000) for _ in range(1000)] +to_device = timer("copy_to_device(*tensors)") +to_device_nonblocking = timer("copy_to_device_nonblocking(*tensors)") + +# Ratio +r1 = to_device_nonblocking / to_device + +# Plot the results +fig, ax = plt.subplots() + +xlabels = [0, 1] +bar_labels = [f"to(device) (1x)", f"to(device, non_blocking=True) ({r1:4.2f}x)"] +colors = ["tab:blue", "tab:red"] +values = [to_device, to_device_nonblocking] + +ax.bar(xlabels, values, label=bar_labels, color=colors) + +ax.set_ylabel("Runtime (ms)") +ax.set_title("Device casting runtime (non-blocking)") +ax.set_xticks([]) +ax.legend() + +plt.show() + + +###################################################################### +# To get a better sense of what is happening here, let us profile these two functions: + + +from torch.profiler import profile, ProfilerActivity + + +def profile_mem(cmd): + with profile(activities=[ProfilerActivity.CPU]) as prof: + exec(cmd) + print(cmd) + print(prof.key_averages().table(row_limit=10)) + + +###################################################################### +# Let's see the call stack with a regular ``to(device)`` first: +# + +print("Call to `to(device)`", profile_mem("copy_to_device(*tensors)")) + +###################################################################### +# and now the ``non_blocking`` version: +# + +print( + "Call to `to(device, non_blocking=True)`", + profile_mem("copy_to_device_nonblocking(*tensors)"), +) + + +###################################################################### +# The results are without any doubt better when using ``non_blocking=True``, as all transfers are initiated simultaneously +# on the host side and only one synchronization is done. +# +# The benefit will vary depending on the number and the size of the tensors as well as depending on the hardware being +# used. +# +# .. note:: Interestingly, the blocking ``to("cuda")`` actually performs the same asynchronous device casting operation +# (``cudaMemcpyAsync``) as the one with ``non_blocking=True`` with a synchronization point after each copy. +# +# Synergies +# ~~~~~~~~~ +# +# .. _pinned_memory_synergies: +# +# Now that we have made the point that data transfer of tensors already in pinned memory to GPU is faster than from +# pageable memory, and that we know that doing these transfers asynchronously is also faster than synchronously, we can +# benchmark combinations of these approaches. 
First, let's write a couple of new functions that will call ``pin_memory`` +# and ``to(device)`` on each tensor: +# + + +def pin_copy_to_device(*tensors): + result = [] + for tensor in tensors: + result.append(tensor.pin_memory().to("cuda:0")) + return result + + +def pin_copy_to_device_nonblocking(*tensors): + result = [] + for tensor in tensors: + result.append(tensor.pin_memory().to("cuda:0", non_blocking=True)) + # We need to synchronize + torch.cuda.synchronize() + return result + + +###################################################################### +# The benefits of using :meth:`~torch.Tensor.pin_memory` are more pronounced for +# somewhat large batches of large tensors: +# + +tensors = [torch.randn(1_000_000) for _ in range(1000)] +page_copy = timer("copy_to_device(*tensors)") +page_copy_nb = timer("copy_to_device_nonblocking(*tensors)") + +tensors_pinned = [torch.randn(1_000_000, pin_memory=True) for _ in range(1000)] +pinned_copy = timer("copy_to_device(*tensors_pinned)") +pinned_copy_nb = timer("copy_to_device_nonblocking(*tensors_pinned)") + +pin_and_copy = timer("pin_copy_to_device(*tensors)") +pin_and_copy_nb = timer("pin_copy_to_device_nonblocking(*tensors)") + +# Plot +strategies = ("pageable copy", "pinned copy", "pin and copy") +blocking = { + "blocking": [page_copy, pinned_copy, pin_and_copy], + "non-blocking": [page_copy_nb, pinned_copy_nb, pin_and_copy_nb], +} + +x = torch.arange(3) +width = 0.25 +multiplier = 0 + + +fig, ax = plt.subplots(layout="constrained") + +for attribute, runtimes in blocking.items(): + offset = width * multiplier + rects = ax.bar(x + offset, runtimes, width, label=attribute) + ax.bar_label(rects, padding=3, fmt="%.2f") + multiplier += 1 + +# Add some text for labels, title and custom x-axis tick labels, etc. +ax.set_ylabel("Runtime (ms)") +ax.set_title("Runtime (pin-mem and non-blocking)") +ax.set_xticks([0, 1, 2]) +ax.set_xticklabels(strategies) +plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") +ax.legend(loc="upper left", ncols=3) + +plt.show() + +del tensors, tensors_pinned +_ = gc.collect() + + +###################################################################### +# Other copy directions (GPU -> CPU, CPU -> MPS) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. _pinned_memory_other_direction: +# +# Until now, we have operated under the assumption that asynchronous copies from the CPU to the GPU are safe. +# This is generally true because CUDA automatically handles synchronization to ensure that the data being accessed is +# valid at read time __whenever the tensor is in pageable memory__. +# +# However, in other cases we cannot make the same assumption: when a tensor is placed in pinned memory, mutating the +# original copy after calling the host-to-device transfer may corrupt the data received on GPU. +# Similarly, when a transfer is achieved in the opposite direction, from GPU to CPU, or from any device that is not CPU +# or GPU to any device that is not a CUDA-handled GPU (such as, MPS), there is no guarantee that the data read on GPU is +# valid without explicit synchronization. +# +# In these scenarios, these transfers offer no assurance that the copy will be complete at the time of +# data access. Consequently, the data on the host might be incomplete or incorrect, effectively rendering it garbage. 
+# +# Let's first demonstrate this with a pinned-memory tensor: +DELAY = 100000000 +try: + i = -1 + for i in range(100): + # Create a tensor in pin-memory + cpu_tensor = torch.ones(1024, 1024, pin_memory=True) + torch.cuda.synchronize() + # Send the tensor to CUDA + cuda_tensor = cpu_tensor.to("cuda", non_blocking=True) + torch.cuda._sleep(DELAY) + # Corrupt the original tensor + cpu_tensor.zero_() + assert (cuda_tensor == 1).all() + print("No test failed with non_blocking and pinned tensor") +except AssertionError: + print(f"{i}th test failed with non_blocking and pinned tensor. Skipping remaining tests") + +###################################################################### +# Using a pageable tensor always works: +# + +i = -1 +for i in range(100): + # Create a tensor in pageable memory + cpu_tensor = torch.ones(1024, 1024) + torch.cuda.synchronize() + # Send the tensor to CUDA + cuda_tensor = cpu_tensor.to("cuda", non_blocking=True) + torch.cuda._sleep(DELAY) + # Corrupt the original tensor + cpu_tensor.zero_() + assert (cuda_tensor == 1).all() +print("No test failed with non_blocking and pageable tensor") + +###################################################################### +# Now let's demonstrate that CUDA to CPU also fails to produce reliable outputs without synchronization: + +tensor = ( + torch.arange(1, 1_000_000, dtype=torch.double, device="cuda") + .expand(100, 999999) + .clone() +) +torch.testing.assert_close( + tensor.mean(), torch.tensor(500_000, dtype=torch.double, device="cuda") +), tensor.mean() +try: + i = -1 + for i in range(100): + cpu_tensor = tensor.to("cpu", non_blocking=True) + torch.testing.assert_close( + cpu_tensor.mean(), torch.tensor(500_000, dtype=torch.double) + ) + print("No test failed with non_blocking") +except AssertionError: + print(f"{i}th test failed with non_blocking. Skipping remaining tests") +try: + i = -1 + for i in range(100): + cpu_tensor = tensor.to("cpu", non_blocking=True) + torch.cuda.synchronize() + torch.testing.assert_close( + cpu_tensor.mean(), torch.tensor(500_000, dtype=torch.double) + ) + print("No test failed with synchronize") +except AssertionError: + print(f"One test failed with synchronize: {i}th assertion!") + + +###################################################################### +# Generally, asynchronous copies to a device are safe without explicit synchronization only when the target is a +# CUDA-enabled device and the original tensor is in pageable memory. +# +# In summary, copying data from CPU to GPU is safe when using ``non_blocking=True``, but for any other direction, +# ``non_blocking=True`` can still be used but the user must make sure that a device synchronization is executed before +# the data is accessed. +# +# Practical recommendations +# ------------------------- +# +# .. _pinned_memory_recommendations: +# +# We can now wrap up some early recommendations based on our observations: +# +# In general, ``non_blocking=True`` will provide good throughput, regardless of whether the original tensor is or +# isn't in pinned memory. +# If the tensor is already in pinned memory, the transfer can be accelerated, but sending it to +# pin memory manually from python main thread is a blocking operation on the host, and hence will annihilate much of +# the benefit of using ``non_blocking=True`` (as CUDA does the `pin_memory` transfer anyway). +# +# One might now legitimately ask what use there is for the :meth:`~torch.Tensor.pin_memory` method. 
+# In the following section, we will explore further how this can be used to accelerate the data transfer even more. +# +# Additional considerations +# ------------------------- +# +# .. _pinned_memory_considerations: +# +# PyTorch notoriously provides a :class:`~torch.utils.data.DataLoader` class whose constructor accepts a +# ``pin_memory`` argument. +# Considering our previous discussion on ``pin_memory``, you might wonder how the ``DataLoader`` manages to +# accelerate data transfers if memory pinning is inherently blocking. +# +# The key lies in the DataLoader's use of a separate thread to handle the transfer of data from pageable to pinned +# memory, thus preventing any blockage in the main thread. +# +# To illustrate this, we will use the TensorDict primitive from the homonymous library. +# When invoking :meth:`~tensordict.TensorDict.to`, the default behavior is to send tensors to the device asynchronously, +# followed by a single call to ``torch.device.synchronize()`` afterwards. +# +# Additionally, ``TensorDict.to()`` includes a ``non_blocking_pin`` option which initiates multiple threads to execute +# ``pin_memory()`` before proceeding with to ``to(device)``. +# This approach can further accelerate data transfers, as demonstrated in the following example. +# +# + +from tensordict import TensorDict +import torch +from torch.utils.benchmark import Timer +import matplotlib.pyplot as plt + +# Create the dataset +td = TensorDict({str(i): torch.randn(1_000_000) for i in range(1000)}) + +# Runtimes +copy_blocking = timer("td.to('cuda:0', non_blocking=False)") +copy_non_blocking = timer("td.to('cuda:0')") +copy_pin_nb = timer("td.to('cuda:0', non_blocking_pin=True, num_threads=0)") +copy_pin_multithread_nb = timer("td.to('cuda:0', non_blocking_pin=True, num_threads=4)") + +# Rations +r1 = copy_non_blocking / copy_blocking +r2 = copy_pin_nb / copy_blocking +r3 = copy_pin_multithread_nb / copy_blocking + +# Figure +fig, ax = plt.subplots() + +xlabels = [0, 1, 2, 3] +bar_labels = [ + "Blocking copy (1x)", + f"Non-blocking copy ({r1:4.2f}x)", + f"Blocking pin, non-blocking copy ({r2:4.2f}x)", + f"Non-blocking pin, non-blocking copy ({r3:4.2f}x)", +] +values = [copy_blocking, copy_non_blocking, copy_pin_nb, copy_pin_multithread_nb] +colors = ["tab:blue", "tab:red", "tab:orange", "tab:green"] + +ax.bar(xlabels, values, label=bar_labels, color=colors) + +ax.set_ylabel("Runtime (ms)") +ax.set_title("Device casting runtime") +ax.set_xticks([]) +ax.legend() + +plt.show() + +###################################################################### +# In this example, we are transferring many large tensors from the CPU to the GPU. +# This scenario is ideal for utilizing multithreaded ``pin_memory()``, which can significantly enhance performance. +# However, if the tensors are small, the overhead associated with multithreading may outweigh the benefits. +# Similarly, if there are only a few tensors, the advantages of pinning tensors on separate threads become limited. +# +# As an additional note, while it might seem advantageous to create permanent buffers in pinned memory to shuttle +# tensors from pageable memory before transferring them to the GPU, this strategy does not necessarily expedite +# computation. The inherent bottleneck caused by copying data into pinned memory remains a limiting factor. +# +# Moreover, transferring data that resides on disk (whether in shared memory or files) to the GPU typically requires an +# intermediate step of copying the data into pinned memory (located in RAM). 
+# Utilizing non_blocking for large data transfers in this context can significantly increase RAM consumption, +# potentially leading to adverse effects. +# +# In practice, there is no one-size-fits-all solution. +# The effectiveness of using multithreaded ``pin_memory`` combined with ``non_blocking`` transfers depends on a +# variety of factors, including the specific system, operating system, hardware, and the nature of the tasks +# being executed. +# Here is a list of factors to check when trying to speed-up data transfers between CPU and GPU, or comparing +# throughput's across scenarios: +# +# - **Number of available cores** +# +# How many CPU cores are available? Is the system shared with other users or processes that might compete for +# resources? +# +# - **Core utilization** +# +# Are the CPU cores heavily utilized by other processes? Does the application perform other CPU-intensive tasks +# concurrently with data transfers? +# +# - **Memory utilization** +# +# How much pageable and page-locked memory is currently being used? Is there sufficient free memory to allocate +# additional pinned memory without affecting system performance? Remember that nothing comes for free, for instance +# ``pin_memory`` will consume RAM and may impact other tasks. +# +# - **CUDA Device Capabilities** +# +# Does the GPU support multiple DMA engines for concurrent data transfers? What are the specific capabilities and +# limitations of the CUDA device being used? +# +# - **Number of tensors to be sent** +# +# How many tensors are transferred in a typical operation? +# +# - **Size of the tensors to be sent** +# +# What is the size of the tensors being transferred? A few large tensors or many small tensors may not benefit from +# the same transfer program. +# +# - **System Architecture** +# +# How is the system's architecture influencing data transfer speeds (for example, bus speeds, network latency)? +# +# Additionally, allocating a large number of tensors or sizable tensors in pinned memory can monopolize a substantial +# portion of RAM. +# This reduces the available memory for other critical operations, such as paging, which can negatively impact the +# overall performance of an algorithm. +# +# Conclusion +# ---------- +# +# .. _pinned_memory_conclusion: +# +# Throughout this tutorial, we have explored several critical factors that influence transfer speeds and memory +# management when sending tensors from the host to the device. We've learned that using ``non_blocking=True`` generally +# accelerates data transfers, and that :meth:`~torch.Tensor.pin_memory` can also enhance performance if implemented +# correctly. However, these techniques require careful design and calibration to be effective. +# +# Remember that profiling your code and keeping an eye on the memory consumption are essential to optimize resource +# usage and achieve the best possible performance. +# +# Additional resources +# -------------------- +# +# .. _pinned_memory_resources: +# +# If you are dealing with issues with memory copies when using CUDA devices or want to learn more about +# what was discussed in this tutorial, check the following references: +# +# - `CUDA toolkit memory management doc `_; +# - `CUDA pin-memory note `_; +# - `How to Optimize Data Transfers in CUDA C/C++ `_; +# - `tensordict doc `_ and `repo `_. 
+# diff --git a/intermediate_source/pipeline_tutorial.rst b/intermediate_source/pipeline_tutorial.rst new file mode 100644 index 00000000000..06f10a4a884 --- /dev/null +++ b/intermediate_source/pipeline_tutorial.rst @@ -0,0 +1,11 @@ +Training Transformer models using Pipeline Parallelism +====================================================== + +This tutorial has been deprecated. + +Redirecting to the latest parallelism APIs in 3 seconds... + +.. raw:: html + + + diff --git a/intermediate_source/pipelining_tutorial.rst b/intermediate_source/pipelining_tutorial.rst index 3d6533cef2b..63170e6064d 100644 --- a/intermediate_source/pipelining_tutorial.rst +++ b/intermediate_source/pipelining_tutorial.rst @@ -12,6 +12,7 @@ APIs. .. grid:: 2 .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites * How to use ``torch.distributed.pipelining`` APIs * How to apply pipeline parallelism to a transformer model @@ -19,6 +20,7 @@ APIs. .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites * Familiarity with `basic distributed training `__ in PyTorch @@ -65,7 +67,7 @@ chunks. First, let us define the model: h = layer(h, h) h = self.norm(h) if self.norm else h - output = self.output(h).float() if self.output else h + output = self.output(h).clone() if self.output else h return output Then, we need to import the necessary libraries in our script and initialize the distributed training process. In this case, we are defining some global variables to use @@ -107,32 +109,29 @@ Step 1: Partition the Transformer Model There are two different ways of partitioning the model: First is the manual mode in which we can manually create two instances of the model by deleting portions of -attributes of the model. In this example for a 2 stage (2 ranks) the model is cut in half. +attributes of the model. In this example for two stages (2 ranks), the model is cut in half. .. code:: python - def manual_model_split(model, example_input_microbatch, model_args) -> PipelineStage: + def manual_model_split(model) -> PipelineStage: if stage_index == 0: # prepare the first stage model for i in range(4, 8): del model.layers[str(i)] model.norm = None model.output = None - stage_input_microbatch = example_input_microbatch elif stage_index == 1: # prepare the second stage model for i in range(4): del model.layers[str(i)] model.tok_embeddings = None - stage_input_microbatch = torch.randn(example_input_microbatch.shape[0], example_input_microbatch.shape[1], model_args.dim) stage = PipelineStage( model, stage_index, num_stages, device, - input_args=stage_input_microbatch, ) return stage @@ -146,6 +145,7 @@ we are splitting before the before 4th transformer decoder layer, mirroring the we can retrieve a ``PipelineStage`` by calling ``build_stage`` after this splitting is done. .. 
code:: python + def tracer_model_split(model, example_input_microbatch) -> PipelineStage: pipe = pipeline( module=model, @@ -179,18 +179,19 @@ as well as multiple-stage-per-rank schedules such as ``Interleaved1F1B`` and ``L example_input_microbatch = x.chunk(num_microbatches)[0] # Option 1: Manual model splitting - stage = manual_model_split(model, example_input_microbatch, model_args) + stage = manual_model_split(model) # Option 2: Tracer model splitting # stage = tracer_model_split(model, example_input_microbatch) + model.to(device) x = x.to(device) y = y.to(device) def tokenwise_loss_fn(outputs, targets): loss_fn = nn.CrossEntropyLoss() - outputs = outputs.view(-1, model_args.vocab_size) - targets = targets.view(-1) + outputs = outputs.reshape(-1, model_args.vocab_size) + targets = targets.reshape(-1) return loss_fn(outputs, targets) schedule = ScheduleGPipe(stage, n_microbatches=num_microbatches, loss_fn=tokenwise_loss_fn) @@ -200,6 +201,7 @@ as well as multiple-stage-per-rank schedules such as ``Interleaved1F1B`` and ``L elif rank == 1: losses = [] output = schedule.step(target=y, losses=losses) + print(f"losses: {losses}") dist.destroy_process_group() In the example above, we are using the manual method to split the model, but the code can be uncommented to also try the @@ -230,5 +232,10 @@ We discussed two methods of model partitioning, manual and tracer-based, and dem micro-batches across different stages. Finally, we covered the execution of the pipeline schedule and the launch of distributed processes using ``torchrun``. -For a production ready usage of pipeline parallelism as well as composition with other distributed techniques, see also +Additional Resources +-------------------- + +We have successfully integrated ``torch.distributed.pipelining`` into the `torchtitan repository `__. TorchTitan is a clean, minimal code base for +large-scale LLM training using native PyTorch. For a production ready usage of pipeline +parallelism as well as composition with other distributed techniques, see `TorchTitan end to end example of 3D parallelism `__. diff --git a/intermediate_source/process_group_cpp_extension_tutorial.rst b/intermediate_source/process_group_cpp_extension_tutorial.rst index 47379bf8818..3c72a9e319b 100644 --- a/intermediate_source/process_group_cpp_extension_tutorial.rst +++ b/intermediate_source/process_group_cpp_extension_tutorial.rst @@ -25,9 +25,8 @@ Basics PyTorch collective communications power several widely adopted distributed training features, including -`DistributedDataParallel `__, -`ZeroRedundancyOptimizer `__, -`FullyShardedDataParallel `__. +`DistributedDataParallel `__ and +`ZeroRedundancyOptimizer `__. In order to make the same collective communication API work with different communication backends, the distributed package abstracts collective communication operations into a diff --git a/intermediate_source/quantized_transfer_learning_tutorial.rst b/intermediate_source/quantized_transfer_learning_tutorial.rst deleted file mode 100644 index 9ba5e92d197..00000000000 --- a/intermediate_source/quantized_transfer_learning_tutorial.rst +++ /dev/null @@ -1,516 +0,0 @@ -(beta) Quantized Transfer Learning for Computer Vision Tutorial -======================================================================== - -.. tip:: - To get the most of this tutorial, we suggest using this - `Colab Version `_. - This will allow you to experiment with the information presented below. 
- -**Author**: `Zafar Takhirov `_ - -**Reviewed by**: `Raghuraman Krishnamoorthi `_ - -**Edited by**: `Jessica Lin `_ - -This tutorial builds on the original `PyTorch Transfer Learning `_ -tutorial, written by `Sasank Chilamkurthy `_. - -Transfer learning refers to techniques that make use of a pretrained model for -application on a different data-set. -There are two main ways the transfer learning is used: - -1. **ConvNet as a fixed feature extractor**: Here, you `“freeze” `_ - the weights of all the parameters in the network except that of the final - several layers (aka “the head”, usually fully connected layers). - These last layers are replaced with new ones initialized with random - weights and only these layers are trained. -2. **Finetuning the ConvNet**: Instead of random initializaion, the model is - initialized using a pretrained network, after which the training proceeds as - usual but with a different dataset. - Usually the head (or part of it) is also replaced in the network in - case there is a different number of outputs. - It is common in this method to set the learning rate to a smaller number. - This is done because the network is already trained, and only minor changes - are required to "finetune" it to a new dataset. - -You can also combine the above two methods: -First you can freeze the feature extractor, and train the head. After -that, you can unfreeze the feature extractor (or part of it), set the -learning rate to something smaller, and continue training. - -In this part you will use the first method – extracting the features -using a quantized model. - - -Part 0. Prerequisites ---------------------- - -Before diving into the transfer learning, let us review the "prerequisites", -such as installations and data loading/visualizations. - -.. code:: python - - # Imports - import copy - import matplotlib.pyplot as plt - import numpy as np - import os - import time - - plt.ion() - -Installing the Nightly Build -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Because you will be using the beta parts of the PyTorch, it is -recommended to install the latest version of ``torch`` and -``torchvision``. You can find the most recent instructions on local -installation `here `_. -For example, to install without GPU support: - -.. code:: shell - - pip install numpy - pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - # For CUDA support use https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html - - -Load Data -~~~~~~~~~ - -.. note :: This section is identical to the original transfer learning tutorial. -We will use ``torchvision`` and ``torch.utils.data`` packages to load -the data. - -The problem you are going to solve today is classifying **ants** and -**bees** from images. The dataset contains about 120 training images -each for ants and bees. There are 75 validation images for each class. -This is considered a very small dataset to generalize on. However, since -we are using transfer learning, we should be able to generalize -reasonably well. - -*This dataset is a very small subset of imagenet.* - -.. note :: Download the data from `here `_ - and extract it to the ``data`` directory. - -.. 
code:: python - - import torch - from torchvision import transforms, datasets - - # Data augmentation and normalization for training - # Just normalization for validation - data_transforms = { - 'train': transforms.Compose([ - transforms.Resize(224), - transforms.RandomCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) - ]), - 'val': transforms.Compose([ - transforms.Resize(224), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) - ]), - } - - data_dir = 'data/hymenoptera_data' - image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), - data_transforms[x]) - for x in ['train', 'val']} - dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=16, - shuffle=True, num_workers=8) - for x in ['train', 'val']} - dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']} - class_names = image_datasets['train'].classes - - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - - -Visualize a few images -~~~~~~~~~~~~~~~~~~~~~~ - -Let’s visualize a few training images so as to understand the data -augmentations. - -.. code:: python - - import torchvision - - def imshow(inp, title=None, ax=None, figsize=(5, 5)): - """Imshow for Tensor.""" - inp = inp.numpy().transpose((1, 2, 0)) - mean = np.array([0.485, 0.456, 0.406]) - std = np.array([0.229, 0.224, 0.225]) - inp = std * inp + mean - inp = np.clip(inp, 0, 1) - if ax is None: - fig, ax = plt.subplots(1, figsize=figsize) - ax.imshow(inp) - ax.set_xticks([]) - ax.set_yticks([]) - if title is not None: - ax.set_title(title) - - # Get a batch of training data - inputs, classes = next(iter(dataloaders['train'])) - - # Make a grid from batch - out = torchvision.utils.make_grid(inputs, nrow=4) - - fig, ax = plt.subplots(1, figsize=(10, 10)) - imshow(out, title=[class_names[x] for x in classes], ax=ax) - - -Support Function for Model Training -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Below is a generic function for model training. -This function also - -- Schedules the learning rate -- Saves the best model - -.. code:: python - - def train_model(model, criterion, optimizer, scheduler, num_epochs=25, device='cpu'): - """ - Support function for model training. - - Args: - model: Model to be trained - criterion: Optimization criterion (loss) - optimizer: Optimizer to use for training - scheduler: Instance of ``torch.optim.lr_scheduler`` - num_epochs: Number of epochs - device: Device to run the training on. Must be 'cpu' or 'cuda' - """ - since = time.time() - - best_model_wts = copy.deepcopy(model.state_dict()) - best_acc = 0.0 - - for epoch in range(num_epochs): - print('Epoch {}/{}'.format(epoch, num_epochs - 1)) - print('-' * 10) - - # Each epoch has a training and validation phase - for phase in ['train', 'val']: - if phase == 'train': - model.train() # Set model to training mode - else: - model.eval() # Set model to evaluate mode - - running_loss = 0.0 - running_corrects = 0 - - # Iterate over data. 
- for inputs, labels in dataloaders[phase]: - inputs = inputs.to(device) - labels = labels.to(device) - - # zero the parameter gradients - optimizer.zero_grad() - - # forward - # track history if only in train - with torch.set_grad_enabled(phase == 'train'): - outputs = model(inputs) - _, preds = torch.max(outputs, 1) - loss = criterion(outputs, labels) - - # backward + optimize only if in training phase - if phase == 'train': - loss.backward() - optimizer.step() - - # statistics - running_loss += loss.item() * inputs.size(0) - running_corrects += torch.sum(preds == labels.data) - if phase == 'train': - scheduler.step() - - epoch_loss = running_loss / dataset_sizes[phase] - epoch_acc = running_corrects.double() / dataset_sizes[phase] - - print('{} Loss: {:.4f} Acc: {:.4f}'.format( - phase, epoch_loss, epoch_acc)) - - # deep copy the model - if phase == 'val' and epoch_acc > best_acc: - best_acc = epoch_acc - best_model_wts = copy.deepcopy(model.state_dict()) - - print() - - time_elapsed = time.time() - since - print('Training complete in {:.0f}m {:.0f}s'.format( - time_elapsed // 60, time_elapsed % 60)) - print('Best val Acc: {:4f}'.format(best_acc)) - - # load best model weights - model.load_state_dict(best_model_wts) - return model - - -Support Function for Visualizing the Model Predictions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Generic function to display predictions for a few images - -.. code:: python - - def visualize_model(model, rows=3, cols=3): - was_training = model.training - model.eval() - current_row = current_col = 0 - fig, ax = plt.subplots(rows, cols, figsize=(cols*2, rows*2)) - - with torch.no_grad(): - for idx, (imgs, lbls) in enumerate(dataloaders['val']): - imgs = imgs.cpu() - lbls = lbls.cpu() - - outputs = model(imgs) - _, preds = torch.max(outputs, 1) - - for jdx in range(imgs.size()[0]): - imshow(imgs.data[jdx], ax=ax[current_row, current_col]) - ax[current_row, current_col].axis('off') - ax[current_row, current_col].set_title('predicted: {}'.format(class_names[preds[jdx]])) - - current_col += 1 - if current_col >= cols: - current_row += 1 - current_col = 0 - if current_row >= rows: - model.train(mode=was_training) - return - model.train(mode=was_training) - - -Part 1. Training a Custom Classifier based on a Quantized Feature Extractor ---------------------------------------------------------------------------- - -In this section you will use a “frozen” quantized feature extractor, and -train a custom classifier head on top of it. Unlike floating point -models, you don’t need to set requires_grad=False for the quantized -model, as it has no trainable parameters. Please, refer to the -`documentation `_ for -more details. - -Load a pretrained model: for this exercise you will be using -`ResNet-18 `_. - -.. code:: python - - import torchvision.models.quantization as models - - # You will need the number of filters in the `fc` for future use. - # Here the size of each output sample is set to 2. - # Alternatively, it can be generalized to nn.Linear(num_ftrs, len(class_names)). - model_fe = models.resnet18(pretrained=True, progress=True, quantize=True) - num_ftrs = model_fe.fc.in_features - - -At this point you need to modify the pretrained model. The model -has the quantize/dequantize blocks in the beginning and the end. However, -because you will only use the feature extractor, the dequantization layer has -to move right before the linear layer (the head). The easiest way to do that -is to wrap the model in the ``nn.Sequential`` module. 
- -The first step is to isolate the feature extractor in the ResNet -model. Although in this example you are tasked to use all layers except -``fc`` as the feature extractor, in reality, you can take as many parts -as you need. This would be useful in case you would like to replace some -of the convolutional layers as well. - - -.. note:: When separating the feature extractor from the rest of a quantized - model, you have to manually place the quantizer/dequantized in the - beginning and the end of the parts you want to keep quantized. - -The function below creates a model with a custom head. - -.. code:: python - - from torch import nn - - def create_combined_model(model_fe): - # Step 1. Isolate the feature extractor. - model_fe_features = nn.Sequential( - model_fe.quant, # Quantize the input - model_fe.conv1, - model_fe.bn1, - model_fe.relu, - model_fe.maxpool, - model_fe.layer1, - model_fe.layer2, - model_fe.layer3, - model_fe.layer4, - model_fe.avgpool, - model_fe.dequant, # Dequantize the output - ) - - # Step 2. Create a new "head" - new_head = nn.Sequential( - nn.Dropout(p=0.5), - nn.Linear(num_ftrs, 2), - ) - - # Step 3. Combine, and don't forget the quant stubs. - new_model = nn.Sequential( - model_fe_features, - nn.Flatten(1), - new_head, - ) - return new_model - -.. warning:: Currently the quantized models can only be run on CPU. - However, it is possible to send the non-quantized parts of the model to a GPU. - -.. code:: python - - import torch.optim as optim - new_model = create_combined_model(model_fe) - new_model = new_model.to('cpu') - - criterion = nn.CrossEntropyLoss() - - # Note that we are only training the head. - optimizer_ft = optim.SGD(new_model.parameters(), lr=0.01, momentum=0.9) - - # Decay LR by a factor of 0.1 every 7 epochs - exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1) - - -Train and evaluate -~~~~~~~~~~~~~~~~~~ - -This step takes around 15-25 min on CPU. Because the quantized model can -only run on the CPU, you cannot run the training on GPU. - -.. code:: python - - new_model = train_model(new_model, criterion, optimizer_ft, exp_lr_scheduler, - num_epochs=25, device='cpu') - - visualize_model(new_model) - plt.tight_layout() - - -Part 2. Finetuning the Quantizable Model ----------------------------------------- - -In this part, we fine tune the feature extractor used for transfer -learning, and quantize the feature extractor. Note that in both part 1 -and 2, the feature extractor is quantized. The difference is that in -part 1, we use a pretrained quantized model. In this part, we create a -quantized feature extractor after fine tuning on the data-set of -interest, so this is a way to get better accuracy with transfer learning -while having the benefits of quantization. Note that in our specific -example, the training set is really small (120 images) so the benefits -of fine tuning the entire model is not apparent. However, the procedure -shown here will improve accuracy for transfer learning with larger -datasets. - -The pretrained feature extractor must be quantizable. -To make sure it is quantizable, perform the following steps: - - 1. Fuse ``(Conv, BN, ReLU)``, ``(Conv, BN)``, and ``(Conv, ReLU)`` using - ``torch.quantization.fuse_modules``. - 2. Connect the feature extractor with a custom head. - This requires dequantizing the output of the feature extractor. - 3. Insert fake-quantization modules at appropriate locations - in the feature extractor to mimic quantization during training. 
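For orientation, the sketch below shows what steps 1 and 3 can look like when done
by hand on a small custom model. Everything in it is hypothetical (the
``TinyBackbone`` module and its ``conv``/``bn``/``relu`` submodule names exist only
to illustrate the API calls), and for simplicity the batch norm is folded in eval
mode before the QAT preparation. In this tutorial the quantizable torchvision
model performs the fusion for you, as described next.

.. code:: python

    import torch
    from torch import nn

    # Hypothetical custom backbone, used only to illustrate manual fusion.
    class TinyBackbone(nn.Module):
        def __init__(self):
            super().__init__()
            self.quant = torch.quantization.QuantStub()
            self.conv = nn.Conv2d(3, 8, 3)
            self.bn = nn.BatchNorm2d(8)
            self.relu = nn.ReLU()
            self.dequant = torch.quantization.DeQuantStub()

        def forward(self, x):
            x = self.quant(x)
            x = self.relu(self.bn(self.conv(x)))
            return self.dequant(x)

    backbone = TinyBackbone()

    # Step 1: fuse (Conv, BN, ReLU) by submodule name.
    backbone.eval()
    backbone = torch.quantization.fuse_modules(backbone, [["conv", "bn", "relu"]])

    # Step 3: attach a QAT qconfig and insert fake-quantization modules.
    backbone.train()
    backbone.qconfig = torch.quantization.get_default_qat_qconfig("fbgemm")
    backbone = torch.quantization.prepare_qat(backbone)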
- -For step (1), we use models from ``torchvision/models/quantization``, which -have a member method ``fuse_model``. This function fuses all the ``conv``, -``bn``, and ``relu`` modules. For custom models, this would require calling -the ``torch.quantization.fuse_modules`` API with the list of modules to fuse -manually. - -Step (2) is performed by the ``create_combined_model`` function -used in the previous section. - -Step (3) is achieved by using ``torch.quantization.prepare_qat``, which -inserts fake-quantization modules. - - -As step (4), you can start "finetuning" the model, and after that convert -it to a fully quantized version (Step 5). - -To convert the fine tuned model into a quantized model you can call the -``torch.quantization.convert`` function (in our case only -the feature extractor is quantized). - -.. note:: Because of the random initialization your results might differ from - the results shown in this tutorial. - -.. code:: python - - # notice `quantize=False` - model = models.resnet18(pretrained=True, progress=True, quantize=False) - num_ftrs = model.fc.in_features - - # Step 1 - model.train() - model.fuse_model() - # Step 2 - model_ft = create_combined_model(model) - model_ft[0].qconfig = torch.quantization.default_qat_qconfig # Use default QAT configuration - # Step 3 - model_ft = torch.quantization.prepare_qat(model_ft, inplace=True) - - -Finetuning the model -~~~~~~~~~~~~~~~~~~~~ - -In the current tutorial the whole model is fine tuned. In -general, this will lead to higher accuracy. However, due to the small -training set used here, we end up overfitting to the training set. - - -Step 4. Fine tune the model - -.. code:: python - - for param in model_ft.parameters(): - param.requires_grad = True - - model_ft.to(device) # We can fine-tune on GPU if available - - criterion = nn.CrossEntropyLoss() - - # Note that we are training everything, so the learning rate is lower - # Notice the smaller learning rate - optimizer_ft = optim.SGD(model_ft.parameters(), lr=1e-3, momentum=0.9, weight_decay=0.1) - - # Decay LR by a factor of 0.3 every several epochs - exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer_ft, step_size=5, gamma=0.3) - - model_ft_tuned = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, - num_epochs=25, device=device) - -Step 5. Convert to quantized model - -.. code:: python - - from torch.quantization import convert - model_ft_tuned.cpu() - - model_quantized_and_trained = convert(model_ft_tuned, inplace=False) - - -Lets see how the quantized model performs on a few images - -.. code:: python - - visualize_model(model_quantized_and_trained) - - plt.ioff() - plt.tight_layout() - plt.show() diff --git a/intermediate_source/realtime_rpi.rst b/intermediate_source/realtime_rpi.rst index bb1a576a2c2..2e00b68d2f1 100644 --- a/intermediate_source/realtime_rpi.rst +++ b/intermediate_source/realtime_rpi.rst @@ -1,10 +1,10 @@ -Real Time Inference on Raspberry Pi 4 (30 fps!) +Real Time Inference on Raspberry Pi 4 and 5 (40 fps!) ================================================= **Author**: `Tristan Rice `_ -PyTorch has out of the box support for Raspberry Pi 4. This tutorial will guide -you on how to setup a Raspberry Pi 4 for running PyTorch and run a MobileNet v2 -classification model in real time (30 fps+) on the CPU. +PyTorch has out of the box support for Raspberry Pi 4 and 5. This tutorial will guide +you on how to setup a Raspberry Pi for running PyTorch and run a MobileNet v2 +classification model in real time (30-40 fps) on the CPU. 
This was all tested with Raspberry Pi 4 Model B 4GB but should work with the 2GB variant as well as on the 3B with reduced performance. @@ -12,9 +12,9 @@ variant as well as on the 3B with reduced performance. .. image:: https://user-images.githubusercontent.com/909104/153093710-bc736b6f-69d9-4a50-a3e8-9f2b2c9e04fd.gif Prerequisites -~~~~~~~~~~~~~~~~ +--------------- -To follow this tutorial you'll need a Raspberry Pi 4, a camera for it and all +To follow this tutorial you'll need a Raspberry Pi 4 or 5, a camera for it and all the other standard accessories. * `Raspberry Pi 4 Model B 2GB+ `_ @@ -25,12 +25,12 @@ the other standard accessories. * SD card read/writer -Raspberry Pi 4 Setup -~~~~~~~~~~~~~~~~~~~~~~~ +Raspberry Pi Setup +---------------------- PyTorch only provides pip packages for Arm 64bit (aarch64) so you'll need to install a 64 bit version of the OS on your Raspberry Pi -You can download the latest arm64 Raspberry Pi OS from https://downloads.raspberrypi.org/raspios_arm64/images/ and install it via rpi-imager. +You'll need to install the `official rpi-imager `_ to install Rasbperry Pi OS. **32-bit Raspberry Pi OS will not work.** @@ -45,7 +45,12 @@ Time to put your sdcard in your Raspberry Pi, connect the camera and boot it up. .. image:: https://user-images.githubusercontent.com/909104/152869862-c239c980-b089-4bd5-84eb-0a1e5cf22df2.png -Once that boots and you complete the initial setup you'll need to edit the ``/boot/config.txt`` file to enable the camera. +Raspberry Pi 4 Config +~~~~~~~~~~~~~~~~~~~~~~~~ + +If you're using a Raspberry Pi 4, you'll need some additional config changes. These changes are not required on Raspberry Pi 5. + +Once the OS boots and you complete the initial setup you'll need to edit the ``/boot/config.txt`` file to enable the camera. .. code:: toml @@ -55,21 +60,17 @@ Once that boots and you complete the initial setup you'll need to edit the ``/bo # This needs to be at least 128M for the camera processing, if it's bigger you can just leave it as is. gpu_mem=128 - # You need to commment/remove the existing camera_auto_detect line since this causes issues with OpenCV/V4L2 capture. - #camera_auto_detect=1 - -And then reboot. After you reboot the video4linux2 device ``/dev/video0`` should exist. +And then reboot. -Installing PyTorch and OpenCV -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Installing PyTorch and picamera2 +------------------------------- PyTorch and all the other libraries we need have ARM 64-bit/aarch64 variants so you can just install them via pip and have it work like any other Linux system. .. code:: shell - $ pip install torch torchvision torchaudio - $ pip install opencv-python - $ pip install numpy --upgrade + $ sudo apt install -y python3-picamera2 python3-libcamera + $ pip install torch torchvision --break-system-packages .. image:: https://user-images.githubusercontent.com/909104/152874260-95a7a8bd-0f9b-438a-9c0b-5b67729e233f.png @@ -84,41 +85,49 @@ We can now check that everything installed correctly: Video Capture -~~~~~~~~~~~~~~ +------------------- -For video capture we're going to be using OpenCV to stream the video frames -instead of the more common ``picamera``. `picamera` isn't available on 64-bit -Raspberry Pi OS and it's much slower than OpenCV. OpenCV directly accesses the -``/dev/video0`` device to grab frames. +Test the camera is working first, by running ``libcamera-hello`` in a terminal. + +For video capture we're going to be using picamera2 to capture the video frames. 
The model we're using (MobileNetV2) takes in image sizes of ``224x224`` so we -can request that directly from OpenCV at 36fps. We're targeting 30fps for the +can request that directly from picamera2 at 36fps. We're targeting 30fps for the model but we request a slightly higher framerate than that so there's always enough frames. .. code:: python - import cv2 - from PIL import Image + from picamera2 import Picamera2 + + picam2 = Picamera2() + + # print available sensor modes + print(picam2.sensor_modes) - cap = cv2.VideoCapture(0) - cap.set(cv2.CAP_PROP_FRAME_WIDTH, 224) - cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 224) - cap.set(cv2.CAP_PROP_FPS, 36) + config = picam2.create_still_configuration(main={ + "size": (224, 224), + "format": "BGR888", + }, display="main") + picam2.configure(config) + picam2.set_controls({"FrameRate": 36}) + picam2.start() -OpenCV returns a ``numpy`` array in BGR so we need to read and do a bit of -shuffling to get it into the expected RGB format. +To capture the frames we can call ``capture_image`` to return a ``PIL.Image`` +object that we can use with PyTorch. .. code:: python - ret, image = cap.read() - # convert opencv output from BGR to RGB - image = image[:, :, [2, 1, 0]] + # read frame + image = picam2.capture_image("main") + + # show frame for testing + image.show() This data reading and processing takes about ``3.5 ms``. Image Preprocessing -~~~~~~~~~~~~~~~~~~~~ +---------------------- We need to take the frames and transform them into the format the model expects. This is the same processing as you would do on any machine with the standard torchvision transforms. @@ -139,7 +148,7 @@ We need to take the frames and transform them into the format the model expects. input_batch = input_tensor.unsqueeze(0) Model Choices -~~~~~~~~~~~~~~~ +---------------- There's a number of models you can choose from to use with different performance characteristics. Not all models provide a ``qnnpack`` pretrained variant so for @@ -178,7 +187,7 @@ Raspberry Pi 4 Benchmark Results: +--------------------+------+-----------------------+-----------------------+--------------------+ MobileNetV2: Quantization and JIT -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------- For optimal performance we want a model that's quantized and fused. Quantized means that it does the computation using int8 which is much more performant than @@ -208,7 +217,7 @@ We then want to jit the model to reduce Python overhead and fuse any ops. 
Jit gi net = torch.jit.script(net) Putting It Together -~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------ We can now put all the pieces together and run it: @@ -217,18 +226,23 @@ We can now put all the pieces together and run it: import time import torch - import numpy as np from torchvision import models, transforms - - import cv2 - from PIL import Image + from picamera2 import Picamera2 torch.backends.quantized.engine = 'qnnpack' - cap = cv2.VideoCapture(0, cv2.CAP_V4L2) - cap.set(cv2.CAP_PROP_FRAME_WIDTH, 224) - cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 224) - cap.set(cv2.CAP_PROP_FPS, 36) + picam2 = Picamera2() + + # print available sensor modes + print(picam2.sensor_modes) + + config = picam2.create_still_configuration(main={ + "size": (224, 224), + "format": "BGR888", + }, display="main") + picam2.configure(config) + picam2.set_controls({"FrameRate": 36}) + picam2.start() preprocess = transforms.Compose([ transforms.ToTensor(), @@ -246,13 +260,8 @@ We can now put all the pieces together and run it: with torch.no_grad(): while True: # read frame - ret, image = cap.read() - if not ret: - raise RuntimeError("failed to read frame") + image = picam2.capture_image("main") - # convert opencv output from BGR to RGB - image = image[:, :, [2, 1, 0]] - permuted = image # preprocess input_tensor = preprocess(image) @@ -263,6 +272,7 @@ We can now put all the pieces together and run it: # run model output = net(input_batch) # do something with output ... + print(output.argmax()) # log model performance frame_count += 1 @@ -272,7 +282,8 @@ We can now put all the pieces together and run it: last_logged = now frame_count = 0 -Running it shows that we're hovering at ~30 fps. + +Running it shows that we're hovering at ~30 fps on a Raspberry Pi 4 and ~41 fps on a Raspberry Pi 5. .. image:: https://user-images.githubusercontent.com/909104/152892609-7d115705-3ec9-4f8d-beed-a51711503a32.png @@ -312,7 +323,7 @@ Detecting a mug: Troubleshooting: Performance -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-------------------------------- PyTorch by default will use all of the cores available. If you have anything running in the background on the Raspberry Pi it may cause contention with the @@ -329,7 +340,7 @@ increases best case latency to ``72 ms`` from ``60 ms`` but eliminates the latency spikes of ``128 ms``. Next Steps -~~~~~~~~~~~~~ +------------ You can create your own model or fine tune an existing one. If you fine tune on one of the models from diff --git a/intermediate_source/reinforcement_ppo.py b/intermediate_source/reinforcement_ppo.py index 30216ff880c..c707c0ebb19 100644 --- a/intermediate_source/reinforcement_ppo.py +++ b/intermediate_source/reinforcement_ppo.py @@ -25,12 +25,12 @@ We will cover six crucial components of TorchRL: -* `environments `__ -* `transforms `__ -* `models (policy and value function) `__ -* `loss modules `__ -* `data collectors `__ -* `replay buffers `__ +* `environments `__ +* `transforms `__ +* `models (policy and value function) `__ +* `loss modules `__ +* `data collectors `__ +* `replay buffers `__ """ @@ -419,8 +419,8 @@ in_keys=["loc", "scale"], distribution_class=TanhNormal, distribution_kwargs={ - "min": env.action_spec.space.low, - "max": env.action_spec.space.high, + "low": env.action_spec.space.low, + "high": env.action_spec.space.high, }, return_log_prob=True, # we'll need the log-prob for the numerator of the importance weights @@ -466,7 +466,7 @@ # Data collector # -------------- # -# TorchRL provides a set of `DataCollector classes `__. 
+# TorchRL provides a set of `DataCollector classes `__. # Briefly, these classes execute three operations: reset an environment, # compute an action given the latest observation, execute a step in the environment, # and repeat the last two steps until the environment signals a stop (or reaches @@ -551,7 +551,7 @@ # advantage_module = GAE( - gamma=gamma, lmbda=lmbda, value_network=value_module, average_gae=True + gamma=gamma, lmbda=lmbda, value_network=value_module, average_gae=True, device=device, ) loss_module = ClipPPOLoss( @@ -639,7 +639,7 @@ # number of steps (1000, which is our ``env`` horizon). # The ``rollout`` method of the ``env`` can take a policy as argument: # it will then execute this policy at each step. - with set_exploration_type(ExplorationType.MEAN), torch.no_grad(): + with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad(): # execute a rollout with the trained policy eval_rollout = env.rollout(1000, policy_module) logs["eval reward"].append(eval_rollout["next", "reward"].mean().item()) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 0ae3ea9a90c..1e50fcb3673 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -92,6 +92,24 @@ ) +# To ensure reproducibility during training, you can fix the random seeds +# by uncommenting the lines below. This makes the results consistent across +# runs, which is helpful for debugging or comparing different approaches. +# +# That said, allowing randomness can be beneficial in practice, as it lets +# the model explore different training trajectories. + + +# seed = 42 +# random.seed(seed) +# torch.manual_seed(seed) +# env.reset(seed=seed) +# env.action_space.seed(seed) +# env.observation_space.seed(seed) +# if torch.cuda.is_available(): +# torch.cuda.manual_seed(seed) + + ###################################################################### # Replay Memory # ------------- @@ -253,13 +271,15 @@ def forward(self, x): # EPS_DECAY controls the rate of exponential decay of epsilon, higher means a slower decay # TAU is the update rate of the target network # LR is the learning rate of the ``AdamW`` optimizer + BATCH_SIZE = 128 GAMMA = 0.99 EPS_START = 0.9 -EPS_END = 0.05 -EPS_DECAY = 1000 +EPS_END = 0.01 +EPS_DECAY = 2500 TAU = 0.005 -LR = 1e-4 +LR = 3e-4 + # Get number of actions from gym action space n_actions = env.action_space.n diff --git a/intermediate_source/rpc_async_execution.rst b/intermediate_source/rpc_async_execution.rst index cf4716179e4..4c7739104cc 100644 --- a/intermediate_source/rpc_async_execution.rst +++ b/intermediate_source/rpc_async_execution.rst @@ -15,8 +15,7 @@ Prerequisites: This tutorial demonstrates how to build batch-processing RPC applications with the `@rpc.functions.async_execution `__ decorator, which helps to speed up training by reducing the number of blocked -RPC threads and consolidating CUDA operations on the callee. This shares the -same idea as `Batch Inference with TorchServe `__. +RPC threads and consolidating CUDA operations on the callee. .. note:: This tutorial requires PyTorch v1.6.0 or above. @@ -199,7 +198,7 @@ speed. Batch-Processing CartPole Solver -------------------------------- -This section uses CartPole-v1 from `OpenAI Gym `__ as +This section uses CartPole-v1 from OpenAI Gym as an example to show the performance impact of batch processing RPC. 
Please note that since the goal is to demonstrate the usage of `@rpc.functions.async_execution `__ diff --git a/intermediate_source/rpc_tutorial.rst b/intermediate_source/rpc_tutorial.rst index 835e6f0649f..791ecf86d35 100644 --- a/intermediate_source/rpc_tutorial.rst +++ b/intermediate_source/rpc_tutorial.rst @@ -19,7 +19,7 @@ Source code of the two examples can be found in Previous tutorials, `Getting Started With Distributed Data Parallel `__ and `Writing Distributed Applications With PyTorch `__, -described `DistributedDataParallel `__ +described `DistributedDataParallel `__ which supports a specific training paradigm where the model is replicated across multiple processes and each process handles a split of the input data. Sometimes, you might run into scenarios that require different training @@ -59,7 +59,7 @@ Distributed Reinforcement Learning using RPC and RRef ----------------------------------------------------- This section describes steps to build a toy distributed reinforcement learning -model using RPC to solve CartPole-v1 from `OpenAI Gym `__. +model using RPC to solve CartPole-v1 from `OpenAI Gym `__. The policy code is mostly borrowed from the existing single-thread `example `__ as shown below. We will skip details of the ``Policy`` design, and focus on RPC @@ -156,7 +156,7 @@ send commands. Applications don't need to worry about the lifetime of ``RRefs``. The owner of each ``RRef`` maintains a reference counting map to track its lifetime, and guarantees the remote data object will not be deleted as long as there is any live user of that ``RRef``. Please refer to the ``RRef`` -`design doc `__ for details. +`design doc `__ for details. .. code:: python @@ -531,7 +531,7 @@ the given arguments (i.e., ``lr=0.05``). In the training loop, it first creates a distributed autograd context, which will help the distributed autograd engine to find gradients and involved RPC send/recv functions. The design details of the distributed autograd engine can -be found in its `design note `__. +be found in its `design note `__. Then, it kicks off the forward pass as if it is a local model, and run the distributed backward pass. For the distributed backward, you only need to specify a list of roots, in this case, it is the loss ``Tensor``. diff --git a/intermediate_source/scaled_dot_product_attention_tutorial.py b/intermediate_source/scaled_dot_product_attention_tutorial.py index 666d240ece1..35b1ba7be4e 100644 --- a/intermediate_source/scaled_dot_product_attention_tutorial.py +++ b/intermediate_source/scaled_dot_product_attention_tutorial.py @@ -244,7 +244,7 @@ def generate_rand_batch( ###################################################################### # Using SDPA with ``torch.compile`` -# ================================= +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # With the release of PyTorch 2.0, a new feature called # ``torch.compile()`` has been introduced, which can provide @@ -324,9 +324,9 @@ def generate_rand_batch( # ###################################################################### -# Using SDPA with attn_bias subclasses` -# ========================================== -# +# Using SDPA with attn_bias subclasses +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # As of PyTorch 2.3, we have added a new submodule that contains tensor subclasses. # Designed to be used with ``torch.nn.functional.scaled_dot_product_attention``. 
# The module is named ``torch.nn.attention.bias`` and contains the following two @@ -394,7 +394,7 @@ def generate_rand_batch( ###################################################################### # Conclusion -# ========== +# ~~~~~~~~~~~ # # In this tutorial, we have demonstrated the basic usage of # ``torch.nn.functional.scaled_dot_product_attention``. We have shown how diff --git a/intermediate_source/seq2seq_translation_tutorial.py b/intermediate_source/seq2seq_translation_tutorial.py index c9e360d7518..5de4bb4ca3e 100755 --- a/intermediate_source/seq2seq_translation_tutorial.py +++ b/intermediate_source/seq2seq_translation_tutorial.py @@ -4,11 +4,15 @@ ******************************************************************************* **Author**: `Sean Robertson `_ -This is the third and final tutorial on doing "NLP From Scratch", where we +This tutorials is part of a three-part series: + +* `NLP From Scratch: Classifying Names with a Character-Level RNN `__ +* `NLP From Scratch: Generating Names with a Character-Level RNN `__ +* `NLP From Scratch: Translation with a Sequence to Sequence Network and Attention `__ + +This is the third and final tutorial on doing **NLP From Scratch**, where we write our own classes and functions to preprocess the data to do our NLP -modeling tasks. We hope after you complete this tutorial that you'll proceed to -learn how `torchtext` can handle much of this preprocessing for you in the -three tutorials immediately following this one. +modeling tasks. In this project we will be teaching a neural network to translate from French to English. diff --git a/intermediate_source/tensorboard_profiler_tutorial.py b/intermediate_source/tensorboard_profiler_tutorial.py index 00bdcfbf079..3782ced18d9 100644 --- a/intermediate_source/tensorboard_profiler_tutorial.py +++ b/intermediate_source/tensorboard_profiler_tutorial.py @@ -4,6 +4,14 @@ This tutorial demonstrates how to use TensorBoard plugin with PyTorch Profiler to detect performance bottlenecks of the model. +.. warning:: + The TensorBoard integration with the PyTorch profiler is now + deprecated. Instead, use Perfetto or the Chrome trace to + view ``trace.json`` files. After + `generating a trace `__, + simply drag the ``trace.json`` into `Perfetto UI `__ + or ``chrome://tracing`` to visualize your profile. + Introduction ------------ PyTorch 1.8 includes an updated profiler API capable of diff --git a/intermediate_source/tensorboard_tutorial.rst b/intermediate_source/tensorboard_tutorial.rst index d62a12ba0e2..d599dd098c5 100644 --- a/intermediate_source/tensorboard_tutorial.rst +++ b/intermediate_source/tensorboard_tutorial.rst @@ -60,12 +60,9 @@ We'll begin with similar boilerplate code as in the `CIFAR-10 tutorial `_ should show the following. diff --git a/intermediate_source/tiatoolbox_tutorial.rst b/intermediate_source/tiatoolbox_tutorial.rst deleted file mode 100644 index dbaf3cdc464..00000000000 --- a/intermediate_source/tiatoolbox_tutorial.rst +++ /dev/null @@ -1,994 +0,0 @@ -Whole Slide Image Classification Using PyTorch and TIAToolbox -============================================================= - -.. tip:: - To get the most of this tutorial, we suggest using this - `Colab Version `_. This will allow you to experiment with the information presented below. - - -Introduction ------------- - -In this tutorial, we will show how to classify Whole Slide Images (WSIs) -using PyTorch deep learning models with help from TIAToolbox. 
A WSI -is an image of a sample of human tissue taken through a surgery or biopsy and -scanned using specialized scanners. They are used by pathologists and -computational pathology researchers to `study diseases such as cancer at the microscopic -level `__ in -order to understand for example tumor growth and help improve treatment -for patients. - -What makes WSIs challenging to process is their enormous size. For -example, a typical slide image has in the order of `100,000x100,000 -pixels `__ where each pixel can -correspond to about 0.25x0.25 microns on the slide. This introduces -challenges in loading and processing such images, not to mention -hundreds or even thousands of WSIs in a single study (larger studies -produce better results)! - -Conventional image processing pipelines are not suitable for WSI -processing so we need better tools. This is where -`TIAToolbox `__ can -help as it brings a set of useful tools to import and process tissue -slides in a fast and computationally efficient manner. Typically, WSIs -are saved in a pyramid structure with multiple copies of the same image -at various magnification levels optimized for visualization. The level 0 -(or the bottom level) of the pyramid contains the image at the highest -magnification or zoom level, whereas the higher levels in the pyramid -have a lower resolution copy of the base image. The pyramid structure is -sketched below. - -|WSI pyramid stack| *WSI pyramid stack -(*\ `source `__\ *)* - -TIAToolbox allows us to automate common downstream analysis tasks such -as `tissue -classification `__. In this -tutorial we show how you can: 1. Load WSI images using -TIAToolbox; and 2. Use different PyTorch models to classify slides at -the patch-level. In this tutorial, we will provide an example of using -TorchVision ``ResNet18`` model and custom -`HistoEncoder` `__ model. - -Let’s get started! - -.. |WSI pyramid stack| image:: ../_static/img/tiatoolbox_tutorial/read_bounds_tissue.webp - - -Setting up the environment --------------------------- - -To run the examples provided in this tutorial, the following packages -are required as prerequisites. - -1. OpenJpeg -2. OpenSlide -3. Pixman -4. TIAToolbox -5. HistoEncoder (for a custom model example) - -Please run the following command in your terminal to install these -packages: - - -`apt-get -y -qq install libopenjp2-7-dev libopenjp2-tools openslide-tools libpixman-1-dev` -`pip install -q 'tiatoolbox<1.5' histoencoder && echo "Installation is done."` - - -Alternatively, you can run ``brew install openjpeg openslide`` to -install the prerequisite packages on MacOS instead of ``apt-get``. -Further information on installation can be `found -here `__. - - - -Importing related libraries -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code-block:: python - - - """Import modules required to run the Jupyter notebook.""" - from __future__ import annotations - - # Configure logging - import logging - import warnings - if logging.getLogger().hasHandlers(): - logging.getLogger().handlers.clear() - warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*") - - # Downloading data and files - import shutil - from pathlib import Path - from zipfile import ZipFile - - # Data processing and visualization - import matplotlib as mpl - import matplotlib.pyplot as plt - import numpy as np - import pandas as pd - from matplotlib import cm - import PIL - import contextlib - import io - from sklearn.metrics import accuracy_score, confusion_matrix - - # TIAToolbox for WSI loading and processing - from tiatoolbox import logger - from tiatoolbox.models.architecture import vanilla - from tiatoolbox.models.engine.patch_predictor import ( - IOPatchPredictorConfig, - PatchPredictor, - ) - from tiatoolbox.utils.misc import download_data, grab_files_from_dir - from tiatoolbox.utils.visualization import overlay_prediction_mask - from tiatoolbox.wsicore.wsireader import WSIReader - - # Torch-related - import torch - from torchvision import transforms - - # Configure plotting - mpl.rcParams["figure.dpi"] = 160 # for high resolution figure in notebook - mpl.rcParams["figure.facecolor"] = "white" # To make sure text is visible in dark mode - - # If you are not using GPU, change ON_GPU to False - ON_GPU = True - - # Function to suppress console output for overly verbose code blocks - def suppress_console_output(): - return contextlib.redirect_stderr(io.StringIO()) - - - -Clean-up before a run -~~~~~~~~~~~~~~~~~~~~~ - -To ensure proper clean-up (for example in abnormal termination), all -files downloaded or created in this run are saved in a single directory -``global_save_dir``, which we set equal to “./tmp/”. To simplify -maintenance, the name of the directory occurs only at this one place, so -that it can easily be changed, if desired. - - - -.. code-block:: python - - - warnings.filterwarnings("ignore") - global_save_dir = Path("./tmp/") - - - def rmdir(dir_path: str | Path) -> None: - """Helper function to delete directory.""" - if Path(dir_path).is_dir(): - shutil.rmtree(dir_path) - logger.info("Removing directory %s", dir_path) - - - rmdir(global_save_dir) # remove directory if it exists from previous runs - global_save_dir.mkdir() - logger.info("Creating new directory %s", global_save_dir) - - - -Downloading the data -~~~~~~~~~~~~~~~~~~~~ - -For our sample data, we will use one whole-slide image, and patches from -the validation subset of `Kather -100k `__ dataset. - - - -.. code-block:: python - - - wsi_path = global_save_dir / "sample_wsi.svs" - patches_path = global_save_dir / "kather100k-validation-sample.zip" - weights_path = global_save_dir / "resnet18-kather100k.pth" - - logger.info("Download has started. 
Please wait...") - - # Downloading and unzip a sample whole-slide image - download_data( - "https://tiatoolbox.dcs.warwick.ac.uk/sample_wsis/TCGA-3L-AA1B-01Z-00-DX1.8923A151-A690-40B7-9E5A-FCBEDFC2394F.svs", - wsi_path, - ) - - # Download and unzip a sample of the validation set used to train the Kather 100K dataset - download_data( - "https://tiatoolbox.dcs.warwick.ac.uk/datasets/kather100k-validation-sample.zip", - patches_path, - ) - with ZipFile(patches_path, "r") as zipfile: - zipfile.extractall(path=global_save_dir) - - # Download pretrained model weights for WSI classification using ResNet18 architecture - download_data( - "https://tiatoolbox.dcs.warwick.ac.uk/models/pc/resnet18-kather100k.pth", - weights_path, - ) - - logger.info("Download is complete.") - - - -Reading the data ----------------- - -We create a list of patches and a list of corresponding labels. For -example, the first label in ``label_list`` will indicate the class of -the first image patch in ``patch_list``. - - - -.. code-block:: python - - - # Read the patch data and create a list of patches and a list of corresponding labels - dataset_path = global_save_dir / "kather100k-validation-sample" - - # Set the path to the dataset - image_ext = ".tif" # file extension of each image - - # Obtain the mapping between the label ID and the class name - label_dict = { - "BACK": 0, # Background (empty glass region) - "NORM": 1, # Normal colon mucosa - "DEB": 2, # Debris - "TUM": 3, # Colorectal adenocarcinoma epithelium - "ADI": 4, # Adipose - "MUC": 5, # Mucus - "MUS": 6, # Smooth muscle - "STR": 7, # Cancer-associated stroma - "LYM": 8, # Lymphocytes - } - - class_names = list(label_dict.keys()) - class_labels = list(label_dict.values()) - - # Generate a list of patches and generate the label from the filename - patch_list = [] - label_list = [] - for class_name, label in label_dict.items(): - dataset_class_path = dataset_path / class_name - patch_list_single_class = grab_files_from_dir( - dataset_class_path, - file_types="*" + image_ext, - ) - patch_list.extend(patch_list_single_class) - label_list.extend([label] * len(patch_list_single_class)) - - # Show some dataset statistics - plt.bar(class_names, [label_list.count(label) for label in class_labels]) - plt.xlabel("Patch types") - plt.ylabel("Number of patches") - - # Count the number of examples per class - for class_name, label in label_dict.items(): - logger.info( - "Class ID: %d -- Class Name: %s -- Number of images: %d", - label, - class_name, - label_list.count(label), - ) - - # Overall dataset statistics - logger.info("Total number of patches: %d", (len(patch_list))) - - - - - -.. image-sg:: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_001.png - :alt: tiatoolbox tutorial - :srcset: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_001.png - :class: sphx-glr-single-img - - -.. rst-class:: sphx-glr-script-out - - .. 
code-block:: none - - |2023-11-14|13:15:59.299| [INFO] Class ID: 0 -- Class Name: BACK -- Number of images: 211 - |2023-11-14|13:15:59.299| [INFO] Class ID: 1 -- Class Name: NORM -- Number of images: 176 - |2023-11-14|13:15:59.299| [INFO] Class ID: 2 -- Class Name: DEB -- Number of images: 230 - |2023-11-14|13:15:59.299| [INFO] Class ID: 3 -- Class Name: TUM -- Number of images: 286 - |2023-11-14|13:15:59.299| [INFO] Class ID: 4 -- Class Name: ADI -- Number of images: 208 - |2023-11-14|13:15:59.299| [INFO] Class ID: 5 -- Class Name: MUC -- Number of images: 178 - |2023-11-14|13:15:59.299| [INFO] Class ID: 6 -- Class Name: MUS -- Number of images: 270 - |2023-11-14|13:15:59.299| [INFO] Class ID: 7 -- Class Name: STR -- Number of images: 209 - |2023-11-14|13:15:59.299| [INFO] Class ID: 8 -- Class Name: LYM -- Number of images: 232 - |2023-11-14|13:15:59.299| [INFO] Total number of patches: 2000 - - - -As you can see for this patch dataset, we have 9 classes/labels with IDs -0-8 and associated class names. describing the dominant tissue type in -the patch: - -- BACK ⟶ Background (empty glass region) -- LYM ⟶ Lymphocytes -- NORM ⟶ Normal colon mucosa -- DEB ⟶ Debris -- MUS ⟶ Smooth muscle -- STR ⟶ Cancer-associated stroma -- ADI ⟶ Adipose -- MUC ⟶ Mucus -- TUM ⟶ Colorectal adenocarcinoma epithelium - - - -Classify image patches ----------------------- - -We demonstrate how to obtain a prediction for each patch within a -digital slide first with the ``patch`` mode and then with a large slide -using ``wsi`` mode. - - -Define ``PatchPredictor`` model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The PatchPredictor class runs a CNN-based classifier written in PyTorch. - -- ``model`` can be any trained PyTorch model with the constraint that - it should follow the - ``tiatoolbox.models.abc.ModelABC`` `(docs)` `__ - class structure. For more information on this matter, please refer to - `our example notebook on advanced model - techniques `__. - In order to load a custom model, you need to write a small - preprocessing function, as in ``preproc_func(img)``, which makes sure - the input tensors are in the right format for the loaded network. -- Alternatively, you can pass ``pretrained_model`` as a string - argument. This specifies the CNN model that performs the prediction, - and it must be one of the models listed - `here `__. - The command will look like this: - ``predictor = PatchPredictor(pretrained_model='resnet18-kather100k', pretrained_weights=weights_path, batch_size=32)``. -- ``pretrained_weights``: When using a ``pretrained_model``, the - corresponding pretrained weights will also be downloaded by default. - You can override the default with your own set of weights via the - ``pretrained_weight`` argument. -- ``batch_size``: Number of images fed into the model each time. Higher - values for this parameter require a larger (GPU) memory capacity. - - - -.. 
code-block:: python - - - # Importing a pretrained PyTorch model from TIAToolbox - predictor = PatchPredictor(pretrained_model='resnet18-kather100k', batch_size=32) - - # Users can load any PyTorch model architecture instead using the following script - model = vanilla.CNNModel(backbone="resnet18", num_classes=9) # Importing model from torchvision.models.resnet18 - model.load_state_dict(torch.load(weights_path, map_location="cpu"), strict=True) - def preproc_func(img): - img = PIL.Image.fromarray(img) - img = transforms.ToTensor()(img) - return img.permute(1, 2, 0) - model.preproc_func = preproc_func - predictor = PatchPredictor(model=model, batch_size=32) - - - -Predict patch labels -~~~~~~~~~~~~~~~~~~~~ - -We create a predictor object and then call the ``predict`` method using -the ``patch`` mode. We then compute the classification accuracy and -confusion matrix. - - - -.. code-block:: python - - - with suppress_console_output(): - output = predictor.predict(imgs=patch_list, mode="patch", on_gpu=ON_GPU) - - acc = accuracy_score(label_list, output["predictions"]) - logger.info("Classification accuracy: %f", acc) - - # Creating and visualizing the confusion matrix for patch classification results - conf = confusion_matrix(label_list, output["predictions"], normalize="true") - df_cm = pd.DataFrame(conf, index=class_names, columns=class_names) - df_cm - - - - - - -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - |2023-11-14|13:16:03.215| [INFO] Classification accuracy: 0.993000 - - -.. raw:: html - -
-
-    Normalized confusion matrix for the patch classification results
-    (``df_cm``; rows are true classes, columns are predicted classes):
-
-              BACK      NORM      DEB       TUM       ADI       MUC       MUS       STR       LYM
-    BACK      1.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.00000
-    NORM      0.000000  0.988636  0.000000  0.011364  0.000000  0.000000  0.000000  0.000000  0.00000
-    DEB       0.000000  0.000000  0.991304  0.000000  0.000000  0.000000  0.000000  0.008696  0.00000
-    TUM       0.000000  0.000000  0.000000  0.996503  0.000000  0.003497  0.000000  0.000000  0.00000
-    ADI       0.004808  0.000000  0.000000  0.000000  0.990385  0.000000  0.004808  0.000000  0.00000
-    MUC       0.000000  0.000000  0.000000  0.000000  0.000000  0.988764  0.000000  0.011236  0.00000
-    MUS       0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.996296  0.003704  0.00000
-    STR       0.000000  0.000000  0.004785  0.000000  0.000000  0.004785  0.004785  0.985646  0.00000
-    LYM       0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.004310  0.99569
-
- - -Predict patch labels for a whole slide -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -We now introduce ``IOPatchPredictorConfig``, a class that specifies the -configuration of image reading and prediction writing for the model -prediction engine. This is required to inform the classifier which level -of the WSI pyramid the classifier should read, process data and generate -output. - -Parameters of ``IOPatchPredictorConfig`` are defined as: - -- ``input_resolutions``: A list, in the form of a dictionary, - specifying the resolution of each input. List elements must be in the - same order as in the target ``model.forward()``. If your model - accepts only one input, you just need to put one dictionary - specifying ``'units'`` and ``'resolution'``. Note that TIAToolbox - supports a model with more than one input. For more information on - units and resolution, please see `TIAToolbox - documentation `__. -- ``patch_input_shape``: Shape of the largest input in (height, width) - format. -- ``stride_shape``: The size of a stride (steps) between two - consecutive patches, used in the patch extraction process. If the - user sets ``stride_shape`` equal to ``patch_input_shape``, patches - will be extracted and processed without any overlap. - - - -.. code-block:: python - - - wsi_ioconfig = IOPatchPredictorConfig( - input_resolutions=[{"units": "mpp", "resolution": 0.5}], - patch_input_shape=[224, 224], - stride_shape=[224, 224], - ) - - - -The ``predict`` method applies the CNN on the input patches and get the -results. Here are the arguments and their descriptions: - -- ``mode``: Type of input to be processed. Choose from ``patch``, - ``tile`` or ``wsi`` according to your application. -- ``imgs``: List of inputs, which should be a list of paths to the - input tiles or WSIs. -- ``return_probabilities``: Set to **True** to get per class - probabilities alongside predicted labels of input patches. If you - wish to merge the predictions to generate prediction maps for - ``tile`` or ``wsi`` modes, you can set ``return_probabilities=True``. -- ``ioconfig``: set the IO configuration information using the - ``IOPatchPredictorConfig`` class. -- ``resolution`` and ``unit`` (not shown below): These arguments - specify the level or micron-per-pixel resolution of the WSI levels - from which we plan to extract patches and can be used instead of - ``ioconfig``. Here we specify the WSI level as ``'baseline'``, - which is equivalent to level 0. In general, this is the level of - greatest resolution. In this particular case, the image has only one - level. More information can be found in the - `documentation `__. -- ``masks``: A list of paths corresponding to the masks of WSIs in the - ``imgs`` list. These masks specify the regions in the original WSIs - from which we want to extract patches. If the mask of a particular - WSI is specified as ``None``, then the labels for all patches of that - WSI (even background regions) would be predicted. This could cause - unnecessary computation. -- ``merge_predictions``: You can set this parameter to ``True`` if it’s - required to generate a 2D map of patch classification results. - However, for large WSIs this will require large available memory. An - alternative (default) solution is to set ``merge_predictions=False``, - and then generate the 2D prediction maps using the - ``merge_predictions`` function as you will see later on. 
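The ``resolution`` and ``units`` route mentioned in the list above is not used in
this tutorial, but a call relying on it could look roughly like the sketch below.
This is only an illustration: it assumes ``predict`` accepts ``resolution``,
``units``, ``patch_input_shape``, and ``stride_shape`` keyword arguments in place
of an ``ioconfig``, and the values simply mirror the ``IOPatchPredictorConfig``
defined earlier (the ``wsi_predictions_alt`` save directory is likewise made up
for this example).

.. code-block:: python

    # Hypothetical alternative to passing ``ioconfig``: describe the input
    # resolution and patch geometry directly (values mirror ``wsi_ioconfig``).
    wsi_output_alt = predictor.predict(
        imgs=[wsi_path],
        masks=None,
        mode="wsi",
        resolution=0.5,
        units="mpp",
        patch_input_shape=[224, 224],
        stride_shape=[224, 224],
        return_probabilities=True,
        save_dir=global_save_dir / "wsi_predictions_alt",
        on_gpu=ON_GPU,
    )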
- -Since we are using a large WSI the patch extraction and prediction -processes may take some time (make sure to set the ``ON_GPU=True`` if -you have access to Cuda enabled GPU and PyTorch+Cuda). - - - -.. code-block:: python - - - with suppress_console_output(): - wsi_output = predictor.predict( - imgs=[wsi_path], - masks=None, - mode="wsi", - merge_predictions=False, - ioconfig=wsi_ioconfig, - return_probabilities=True, - save_dir=global_save_dir / "wsi_predictions", - on_gpu=ON_GPU, - ) - - - - -We see how the prediction model works on our whole-slide images by -visualizing the ``wsi_output``. We first need to merge patch prediction -outputs and then visualize them as an overlay on the original image. As -before, the ``merge_predictions`` method is used to merge the patch -predictions. Here we set the parameters -``resolution=1.25, units='power'`` to generate the prediction map at -1.25x magnification. If you would like to have higher/lower resolution -(bigger/smaller) prediction maps, you need to change these parameters -accordingly. When the predictions are merged, use the -``overlay_patch_prediction`` function to overlay the prediction map on -the WSI thumbnail, which should be extracted at the resolution used for -prediction merging. - - -.. code-block:: python - - - overview_resolution = ( - 4 # the resolution in which we desire to merge and visualize the patch predictions - ) - # the unit of the `resolution` parameter. Can be "power", "level", "mpp", or "baseline" - overview_unit = "mpp" - wsi = WSIReader.open(wsi_path) - wsi_overview = wsi.slide_thumbnail(resolution=overview_resolution, units=overview_unit) - plt.figure(), plt.imshow(wsi_overview) - plt.axis("off") - - - - - -.. image-sg:: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_002.png - :alt: tiatoolbox tutorial - :srcset: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_002.png - :class: sphx-glr-single-img - - - -Overlaying the prediction map on this image as below gives: - - - -.. code-block:: python - - - # Visualization of whole-slide image patch-level prediction - # first set up a label to color mapping - label_color_dict = {} - label_color_dict[0] = ("empty", (0, 0, 0)) - colors = cm.get_cmap("Set1").colors - for class_name, label in label_dict.items(): - label_color_dict[label + 1] = (class_name, 255 * np.array(colors[label])) - - pred_map = predictor.merge_predictions( - wsi_path, - wsi_output[0], - resolution=overview_resolution, - units=overview_unit, - ) - overlay = overlay_prediction_mask( - wsi_overview, - pred_map, - alpha=0.5, - label_info=label_color_dict, - return_ax=True, - ) - plt.show() - - - - - -.. image-sg:: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_003.png - :alt: tiatoolbox tutorial - :srcset: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_003.png - :class: sphx-glr-single-img - - - -Feature extraction with a pathology-specific model --------------------------------------------------- - -In this section, we will show how to extract features from a pretrained -PyTorch model that exists outside TIAToolbox, using the WSI inference -engines provided by TIAToolbox. To illustrate this we will use -HistoEncoder, a computational-pathology specific model that has been -trained in a self-supervised fashion to extract features from histology -images. The model has been made available here: - -‘HistoEncoder: Foundation models for digital pathology’ -(https://github.com/jopo666/HistoEncoder) by Pohjonen, Joona and team at -the University of Helsinki. 
- -We will plot a umap reduction into 3D (RGB) of the feature map to -visualize how the features capture the differences between some of the -above mentioned tissue types. - - - -.. code-block:: python - - - # Import some extra modules - import histoencoder.functional as F - import torch.nn as nn - - from tiatoolbox.models.engine.semantic_segmentor import DeepFeatureExtractor, IOSegmentorConfig - from tiatoolbox.models.models_abc import ModelABC - import umap - - - -TIAToolbox defines a ModelABC which is a class inheriting PyTorch -`nn.Module `__ -and specifies how a model should look in order to be used in the -TIAToolbox inference engines. The histoencoder model doesn’t follow this -structure, so we need to wrap it in a class whose output and methods are -those that the TIAToolbox engine expects. - - - -.. code-block:: python - - - class HistoEncWrapper(ModelABC): - """Wrapper for HistoEnc model that conforms to tiatoolbox ModelABC interface.""" - - def __init__(self: HistoEncWrapper, encoder) -> None: - super().__init__() - self.feat_extract = encoder - - def forward(self: HistoEncWrapper, imgs: torch.Tensor) -> torch.Tensor: - """Pass input data through the model. - - Args: - imgs (torch.Tensor): - Model input. - - """ - out = F.extract_features(self.feat_extract, imgs, num_blocks=2, avg_pool=True) - return out - - @staticmethod - def infer_batch( - model: nn.Module, - batch_data: torch.Tensor, - *, - on_gpu: bool, - ) -> list[np.ndarray]: - """Run inference on an input batch. - - Contains logic for forward operation as well as i/o aggregation. - - Args: - model (nn.Module): - PyTorch defined model. - batch_data (torch.Tensor): - A batch of data generated by - `torch.utils.data.DataLoader`. - on_gpu (bool): - Whether to run inference on a GPU. - - """ - img_patches_device = batch_data.to('cuda') if on_gpu else batch_data - model.eval() - # Do not compute the gradient (not training) - with torch.inference_mode(): - output = model(img_patches_device) - return [output.cpu().numpy()] - - - - -Now that we have our wrapper, we will create our feature extraction -model and instantiate a -`DeepFeatureExtractor `__ -to allow us to use this model over a WSI. We will use the same WSI as -above, but this time we will extract features from the patches of the -WSI using the HistoEncoder model, rather than predicting some label for -each patch. - - - -.. code-block:: python - - - # create the model - encoder = F.create_encoder("prostate_medium") - model = HistoEncWrapper(encoder) - - # set the pre-processing function - norm=transforms.Normalize(mean=[0.662, 0.446, 0.605],std=[0.169, 0.190, 0.155]) - trans = [ - transforms.ToTensor(), - norm, - ] - model.preproc_func = transforms.Compose(trans) - - wsi_ioconfig = IOSegmentorConfig( - input_resolutions=[{"units": "mpp", "resolution": 0.5}], - patch_input_shape=[224, 224], - output_resolutions=[{"units": "mpp", "resolution": 0.5}], - patch_output_shape=[224, 224], - stride_shape=[224, 224], - ) - - - -When we create the ``DeepFeatureExtractor``, we will pass the -``auto_generate_mask=True`` argument. This will automatically create a -mask of the tissue region using otsu thresholding, so that the extractor -processes only those patches containing tissue. - - - -.. 
code-block:: python - - - # create the feature extractor and run it on the WSI - extractor = DeepFeatureExtractor(model=model, auto_generate_mask=True, batch_size=32, num_loader_workers=4, num_postproc_workers=4) - with suppress_console_output(): - out = extractor.predict(imgs=[wsi_path], mode="wsi", ioconfig=wsi_ioconfig, save_dir=global_save_dir / "wsi_features",) - - - - -These features could be used to train a downstream model, but here in -order to get some intuition for what the features represent, we will use -a UMAP reduction to visualize the features in RGB space. The points -labeled in a similar color should have similar features, so we can check -if the features naturally separate out into the different tissue regions -when we overlay the UMAP reduction on the WSI thumbnail. We will plot it -along with the patch-level prediction map from above to see how the -features compare to the patch-level predictions in the following cells. - - - -.. code-block:: python - - - # First we define a function to calculate the umap reduction - def umap_reducer(x, dims=3, nns=10): - """UMAP reduction of the input data.""" - reducer = umap.UMAP(n_neighbors=nns, n_components=dims, metric="manhattan", spread=0.5, random_state=2) - reduced = reducer.fit_transform(x) - reduced -= reduced.min(axis=0) - reduced /= reduced.max(axis=0) - return reduced - - # load the features output by our feature extractor - pos = np.load(global_save_dir / "wsi_features" / "0.position.npy") - feats = np.load(global_save_dir / "wsi_features" / "0.features.0.npy") - pos = pos / 8 # as we extracted at 0.5mpp, and we are overlaying on a thumbnail at 4mpp - - # reduce the features into 3 dimensional (rgb) space - reduced = umap_reducer(feats) - - # plot the prediction map the classifier again - overlay = overlay_prediction_mask( - wsi_overview, - pred_map, - alpha=0.5, - label_info=label_color_dict, - return_ax=True, - ) - - # plot the feature map reduction - plt.figure() - plt.imshow(wsi_overview) - plt.scatter(pos[:,0], pos[:,1], c=reduced, s=1, alpha=0.5) - plt.axis("off") - plt.title("UMAP reduction of HistoEnc features") - plt.show() - - - - - -.. rst-class:: sphx-glr-horizontal - - - * - - .. image-sg:: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_004.png - :alt: tiatoolbox tutorial - :srcset: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_004.png - :class: sphx-glr-multi-img - - * - - .. image-sg:: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_005.png - :alt: UMAP reduction of HistoEnc features - :srcset: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_005.png - :class: sphx-glr-multi-img - - - - -We see that the prediction map from our patch-level predictor, and the -feature map from our self-supervised feature encoder, capture similar -information about the tissue types in the WSI. This is a good sanity -check that our models are working as expected. It also shows that the -features extracted by the HistoEncoder model are capturing the -differences between the tissue types, and so that they are encoding -histologically relevant information. - - -Where to Go From Here ---------------------- - -In this notebook, we show how we can use the ``PatchPredictor`` and -``DeepFeatureExtractor`` classes and their ``predict`` method to predict -the label, or extract features, for patches of big tiles and WSIs. 
We -introduce ``merge_predictions`` and ``overlay_prediction_mask`` helper -functions that merge the patch prediction outputs and visualize the -resulting prediction map as an overlay on the input image/WSI. - -All the processes take place within TIAToolbox and we can easily put the -pieces together, following our example code. Please make sure to set -inputs and options correctly. We encourage you to further investigate -the effect on the prediction output of changing ``predict`` function -parameters. We have demonstrated how to use your own pretrained model or -one provided by the research community for a specific task in the -TIAToolbox framework to do inference on large WSIs even if the model -structure is not defined in the TIAToolbox model class. - -You can learn more through the following resources: - -- `Advanced model handling with PyTorch and - TIAToolbox `__ -- `Creating slide graphs for WSI with a custom PyTorch graph neural - network `__ - diff --git a/intermediate_source/torch_compile_conv_bn_fuser.py b/intermediate_source/torch_compile_conv_bn_fuser.py new file mode 100644 index 00000000000..e057d145499 --- /dev/null +++ b/intermediate_source/torch_compile_conv_bn_fuser.py @@ -0,0 +1,292 @@ +# -*- coding: utf-8 -*- +""" +Building a Convolution/Batch Norm fuser with torch.compile +=========================================================== + +**Author:** `Horace He `_, `Will Feng `_ + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How to register custom fusion patterns with torch.compile's pattern matcher + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch v2.7.0 + +.. note:: + This optimization only works for models in inference mode (i.e. ``model.eval()``). + However, torch.compile's pattern matching system works for both training and inference. + +""" + + +###################################################################### +# First, let's get some imports out of the way (we will be using all +# of these later in the code). + +from typing import Type, Dict, Any, Tuple, Iterable +import copy +import torch +import torch.nn as nn + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +###################################################################### +# For this tutorial, we are going to create a model consisting of convolutions +# and batch norms. Note that this model has some tricky components - some of +# the conv/batch norm patterns are hidden within Sequentials and one of the +# ``BatchNorms`` is wrapped in another Module. 
+
+class WrappedBatchNorm(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.mod = nn.BatchNorm2d(1)
+    def forward(self, x):
+        return self.mod(x)
+
+class M(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv1 = nn.Conv2d(1, 1, 1)
+        self.bn1 = nn.BatchNorm2d(1)
+        self.conv2 = nn.Conv2d(1, 1, 1)
+        self.nested = nn.Sequential(
+            nn.BatchNorm2d(1),
+            nn.Conv2d(1, 1, 1),
+        )
+        self.wrapped = WrappedBatchNorm()
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.conv2(x)
+        x = self.nested(x)
+        x = self.wrapped(x)
+        return x
+
+model = M().to(device)
+model.eval()
+
+######################################################################
+# Fusing Convolution with Batch Norm
+# ----------------------------------
+# One of the primary challenges with trying to automatically fuse convolution
+# and batch norm in PyTorch is that PyTorch does not provide an easy way of
+# accessing the computational graph. torch.compile resolves this problem by
+# capturing the computational graph during compilation, allowing us to apply
+# pattern-based optimizations across the entire model, including operations
+# nested within Sequential modules or wrapped in custom modules.
+import torch._inductor.pattern_matcher as pm
+from torch._inductor.pattern_matcher import register_replacement
+
+######################################################################
+# torch.compile will capture a graph representation of our model. During
+# compilation, modules hidden within Sequential containers and wrapped
+# modules are all inlined into the graph, making them available for
+# pattern matching and optimization.
+
+
+####################################
+# Fusing Convolution with Batch Norm
+# ----------------------------------
+# Unlike some other fusions, fusion of convolution with batch norm does not
+# require any new operators. Instead, as batch norm during inference
+# consists of a pointwise add and multiply, these operations can be "baked"
+# into the preceding convolution's weights. This allows us to remove the batch
+# norm entirely from our model! Read
+# https://nenadmarkus.com/p/fusing-batchnorm-and-conv/ for further details. The
+# code here is copied from
+# https://github.com/pytorch/pytorch/blob/orig/release/1.8/torch/nn/utils/fusion.py
+# for clarity purposes.
+def fuse_conv_bn_eval(conv, bn):
+    """
+    Given a conv Module `A` and a batch_norm module `B`, returns a conv
+    module `C` such that C(x) == B(A(x)) in inference mode.
+    """
+    assert(not (conv.training or bn.training)), "Fusion only for eval!"
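+    # Batch norm in eval mode applies the affine map
+    #   y = (x - running_mean) / sqrt(running_var + eps) * gamma + beta,
+    # so it can be folded into the preceding conv by rescaling the conv's
+    # weights and shifting its bias, as done in fuse_conv_bn_weights below.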
+ fused_conv = copy.deepcopy(conv) + + fused_conv.weight, fused_conv.bias = \ + fuse_conv_bn_weights(fused_conv.weight, fused_conv.bias, + bn.running_mean, bn.running_var, bn.eps, bn.weight, bn.bias) + + return fused_conv + +def fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b): + if conv_b is None: + conv_b = torch.zeros_like(bn_rm) + if bn_w is None: + bn_w = torch.ones_like(bn_rm) + if bn_b is None: + bn_b = torch.zeros_like(bn_rm) + bn_var_rsqrt = torch.rsqrt(bn_rv + bn_eps) + + conv_w = conv_w * (bn_w * bn_var_rsqrt).reshape([-1] + [1] * (len(conv_w.shape) - 1)) + conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b + + return torch.nn.Parameter(conv_w), torch.nn.Parameter(conv_b) + + +#################################### +# Pattern Matching with torch.compile +# ------------------------------------ +# Now that we have our fusion logic, we need to register a pattern that +# torch.compile's pattern matcher will recognize and replace during +# compilation. + +# Define the pattern we want to match: conv2d followed by batch_norm +def conv_bn_pattern(x, conv_weight, conv_bias, bn_mean, bn_var, bn_weight, bn_bias): + conv_out = torch.nn.functional.conv2d(x, conv_weight, conv_bias) + bn_out = torch.nn.functional.batch_norm( + conv_out, bn_mean, bn_var, bn_weight, bn_bias, + training=False, eps=1e-5 + ) + return bn_out + +def conv_bn_replacement(x, conv_weight, conv_bias, bn_mean, bn_var, bn_weight, bn_bias): + fused_weight, fused_bias = fuse_conv_bn_weights( + conv_weight, conv_bias, bn_mean, bn_var, 1e-5, bn_weight, bn_bias + ) + return torch.nn.functional.conv2d(x, fused_weight, fused_bias) + +# Example inputs are needed to trace the pattern functions. +# The inputs should match the function signatures of conv_bn_pattern and conv_bn_replacement. +# These are used to trace the pattern functions to create the match template. +# IMPORTANT: The pattern matcher is shape-agnostic! The specific shapes you use here +# don't limit what shapes will be matched - any valid conv2d->batch_norm sequence +# will be matched regardless of channels, kernel size, or spatial dimensions. +# - x: input tensor (batch_size, channels, height, width) +# - conv_weight: (out_channels, in_channels, kernel_h, kernel_w) +# - conv_bias: (out_channels,) +# - bn_mean, bn_var, bn_weight, bn_bias: all have shape (num_features,) matching out_channels +example_inputs = [ + torch.randn(1, 1, 4, 4).to(device), # x: input tensor + torch.randn(1, 1, 1, 1).to(device), # conv_weight: 1 output channel, 1 input channel, 1x1 kernel + torch.randn(1).to(device), # conv_bias: 1 output channel + torch.randn(1).to(device), # bn_mean: batch norm running mean + torch.randn(1).to(device), # bn_var: batch norm running variance + torch.randn(1).to(device), # bn_weight: batch norm weight (gamma) + torch.randn(1).to(device), # bn_bias: batch norm bias (beta) +] + +from torch._inductor.pattern_matcher import PatternMatcherPass +from torch._inductor import config + +# Create a pattern matcher pass and register our pattern +patterns = PatternMatcherPass() + +register_replacement( + conv_bn_pattern, + conv_bn_replacement, + example_inputs, + pm.fwd_only, + patterns, +) + +# Create a custom pass function that applies our patterns +def conv_bn_fusion_pass(graph): + return patterns.apply(graph) + +# Set our custom pass in the config +config.post_grad_custom_post_pass = conv_bn_fusion_pass + + +###################################################################### +# .. 
note:: +# We make some simplifications here for demonstration purposes, such as only +# matching 2D convolutions. The pattern matcher in torch.compile +# can handle more complex patterns. + +###################################################################### +# Testing out our Fusion Pass +# ----------------------------------------- +# We can now run this fusion pass on our initial toy model and verify that our +# results are identical. In addition, we can print out the code for our fused +# model and verify that there are no more batch norms. + +from torch._dynamo.utils import counters + +# Clear the counters before compilation +counters.clear() + +# Ensure pattern matcher is enabled +config.pattern_matcher = True + +fused_model = torch.compile(model, backend="inductor") +inp = torch.randn(5, 1, 1, 1).to(device) + +# Run the model to trigger compilation and pattern matching +with torch.no_grad(): + output = fused_model(inp) + expected = model(inp) + torch.testing.assert_close(output, expected) + +# Check how many patterns were matched +assert counters['inductor']['pattern_matcher_count'] == 3, "Expected 3 conv-bn patterns to be matched" + +# Create a model with different shapes than our example_inputs +test_model_diff_shape = nn.Sequential( + nn.Conv2d(3, 16, 5), + nn.BatchNorm2d(16), + nn.ReLU(), + nn.Conv2d(16, 32, 7), + nn.BatchNorm2d(32), +).to(device).eval() + +counters.clear() +compiled_diff_shape = torch.compile(test_model_diff_shape, backend="inductor") +test_input_diff_shape = torch.randn(1, 3, 28, 28).to(device) +with torch.no_grad(): + compiled_diff_shape(test_input_diff_shape) + +# Check how many patterns were matched +assert counters['inductor']['pattern_matcher_count'] == 2, "Expected 2 conv-bn patterns to be matched" + + +###################################################################### +# Benchmarking our Fusion on ResNet18 +# ----------------------------------- +# We can test our fusion pass on a larger model like ResNet18 and see how much +# this pass improves inference performance. +import torchvision.models as models +import time + +rn18 = models.resnet18().to(device) +rn18.eval() + +inp = torch.randn(10, 3, 224, 224).to(device) +output = rn18(inp) + +def benchmark(model, iters=20): + with torch.no_grad(): + for _ in range(10): + model(inp) + begin = time.time() + for _ in range(iters): + model(inp) + return str(time.time()-begin) + +# Benchmark original model +print("Original model time: ", benchmark(rn18)) + +# Compile with our custom pattern +compiled_with_pattern_matching = torch.compile(rn18, backend="inductor") + +# Benchmark compiled model +print("\ntorch.compile (with conv-bn pattern matching and other fusions): ", benchmark(compiled_with_pattern_matching)) + + +############ +# Conclusion +# ---------- +# As we can see, torch.compile provides a powerful way to implement +# graph transformations and optimizations through pattern matching. +# By registering custom patterns, we can extend torch.compile's +# optimization capabilities to handle domain-specific transformations. +# +# The conv-bn fusion demonstrated here is just one example of what's +# possible with torch.compile's pattern matching system. 
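+
+######################################################################
+# As a final sanity check (a small sketch, not required for the pass above),
+# we can verify numerically that ``fuse_conv_bn_eval`` matches running the
+# convolution followed by batch norm on a fresh pair of eval-mode modules.
+
+_conv = nn.Conv2d(3, 8, kernel_size=3).to(device).eval()
+_bn = nn.BatchNorm2d(8).to(device).eval()
+# Give the batch norm non-trivial running statistics so the check is meaningful.
+_bn.running_mean.uniform_(-1, 1)
+_bn.running_var.uniform_(0.5, 2.0)
+_x = torch.randn(2, 3, 16, 16, device=device)
+torch.testing.assert_close(
+    fuse_conv_bn_eval(_conv, _bn)(_x), _bn(_conv(_x)), rtol=1e-4, atol=1e-5
+)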
\ No newline at end of file diff --git a/intermediate_source/torch_compile_full_example.py b/intermediate_source/torch_compile_full_example.py new file mode 100644 index 00000000000..d1967b9d63c --- /dev/null +++ b/intermediate_source/torch_compile_full_example.py @@ -0,0 +1,240 @@ +# -*- coding: utf-8 -*- + +""" +``torch.compile`` End-to-End Tutorial +================================= +**Author:** William Wen +""" + +###################################################################### +# ``torch.compile`` is the new way to speed up your PyTorch code! +# ``torch.compile`` makes PyTorch code run faster by +# JIT-compiling PyTorch code into optimized kernels, +# while requiring minimal code changes. +# +# This tutorial covers an end-to-end example of training and evaluating a +# real model with ``torch.compile``. For a gentle introduction to ``torch.compile``, +# please check out `the introduction to torch.compile tutorial `__. +# +# **Required pip Dependencies** +# +# - ``torch >= 2.0`` +# - ``torchvision`` +# +# .. grid:: 2 +# +# .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn +# :class-card: card-prerequisites +# +# * How to apply ``torch.compile`` to a real model +# * ``torch.compile`` speedups on a real model +# * ``torch.compile``'s first few iterations are expected to be slower due to compilation overhead +# +# .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites +# :class-card: card-prerequisites +# +# * `Introduction to torch.compile `__ + +# NOTE: a modern NVIDIA GPU (H100, A100, or V100) is recommended for this tutorial in +# order to reproduce the speedup numbers shown below and documented elsewhere. + +import torch +import warnings + +gpu_ok = False +if torch.cuda.is_available(): + device_cap = torch.cuda.get_device_capability() + if device_cap in ((7, 0), (8, 0), (9, 0)): + gpu_ok = True + +if not gpu_ok: + warnings.warn( + "GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower " + "than expected." + ) + + +###################################################################### +# Let's demonstrate how using ``torch.compile`` can speed up a real model. +# We will compare standard eager mode and +# ``torch.compile`` by evaluating and training a ``torchvision`` model on random data. +# +# Before we start, we need to define some utility functions. + + +# Returns the result of running `fn()` and the time it took for `fn()` to run, +# in seconds. We use CUDA events and synchronization for the most accurate +# measurements. +def timed(fn): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + result = fn() + end.record() + torch.cuda.synchronize() + return result, start.elapsed_time(end) / 1000 + + +# Generates random input and targets data for the model, where `b` is +# batch size. +def generate_data(b): + return ( + torch.randn(b, 3, 128, 128).to().cuda(), + torch.randint(1000, (b,)).cuda(), + ) + + +N_ITERS = 10 + +from torchvision.models import densenet121 + + +def init_model(): + return densenet121().cuda() + + +###################################################################### +# First, let's compare inference. +# +# Note that in the call to ``torch.compile``, we have the additional +# ``mode`` argument, which we will discuss below. + +model = init_model() + +# Note that we generally recommend directly compiling a torch.nn.Module by calling +# its .compile() method. 
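+# Compiling the module in place keeps its original class and ``state_dict`` keys,
+# whereas ``torch.compile(model)`` returns an ``OptimizedModule`` wrapper
+# (whose ``state_dict`` keys gain an ``_orig_mod.`` prefix).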
+
+model_opt = init_model()
+model_opt.compile(mode="reduce-overhead")
+
+inp = generate_data(16)[0]
+with torch.no_grad():
+    print("eager:", timed(lambda: model(inp))[1])
+    print("compile:", timed(lambda: model_opt(inp))[1])
+
+######################################################################
+# Notice that ``torch.compile`` takes a lot longer to complete
+# compared to eager. This is because ``torch.compile`` compiles
+# the model into optimized kernels as it executes. In our example, the
+# structure of the model doesn't change, and so recompilation is not
+# needed. So if we run our optimized model several more times, we should
+# see a significant improvement compared to eager.
+
+eager_times = []
+for i in range(N_ITERS):
+    inp = generate_data(16)[0]
+    with torch.no_grad():
+        _, eager_time = timed(lambda: model(inp))
+    eager_times.append(eager_time)
+    print(f"eager eval time {i}: {eager_time}")
+
+print("~" * 10)
+
+compile_times = []
+for i in range(N_ITERS):
+    inp = generate_data(16)[0]
+    with torch.no_grad():
+        _, compile_time = timed(lambda: model_opt(inp))
+    compile_times.append(compile_time)
+    print(f"compile eval time {i}: {compile_time}")
+print("~" * 10)
+
+import numpy as np
+
+eager_med = np.median(eager_times)
+compile_med = np.median(compile_times)
+speedup = eager_med / compile_med
+assert speedup > 1
+print(
+    f"(eval) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x"
+)
+print("~" * 10)
+
+######################################################################
+# And indeed, we can see that running our model with ``torch.compile``
+# results in a significant speedup. Speedup mainly comes from reducing Python overhead and
+# GPU read/writes, and so the observed speedup may vary depending on factors such as model
+# architecture and batch size. For example, if a model's architecture is simple
+# and the amount of data is large, then the bottleneck would be
+# GPU compute and the observed speedup may be less significant.
+#
+# You may also see different speedup results depending on the chosen ``mode``
+# argument. The ``"reduce-overhead"`` mode uses CUDA graphs to further reduce
+# the overhead of Python. For your own models,
+# you may need to experiment with different modes to maximize speedup. You can
+# read more about modes `here `__.
+#
+# You might also notice that the second time we run our model with ``torch.compile`` is significantly
+# slower than the other runs, although it is much faster than the first run. This is because the ``"reduce-overhead"``
+# mode runs a few warm-up iterations for CUDA graphs.
+#
+# Now, let's consider comparing training.
+
+model = init_model()
+opt = torch.optim.Adam(model.parameters())
+
+
+def train(mod, data):
+    opt.zero_grad(True)
+    pred = mod(data[0])
+    loss = torch.nn.CrossEntropyLoss()(pred, data[1])
+    loss.backward()
+    opt.step()
+
+
+eager_times = []
+for i in range(N_ITERS):
+    inp = generate_data(16)
+    _, eager_time = timed(lambda: train(model, inp))
+    eager_times.append(eager_time)
+    print(f"eager train time {i}: {eager_time}")
+print("~" * 10)
+
+model = init_model()
+opt = torch.optim.Adam(model.parameters())
+
+# Note that because we are compiling a regular Python function, we do not
+# call any .compile() method.
+train_opt = torch.compile(train, mode="reduce-overhead")
+
+compile_times = []
+for i in range(N_ITERS):
+    inp = generate_data(16)
+    _, compile_time = timed(lambda: train_opt(model, inp))
+    compile_times.append(compile_time)
+    print(f"compile train time {i}: {compile_time}")
+print("~" * 10)
+
+eager_med = np.median(eager_times)
+compile_med = np.median(compile_times)
+speedup = eager_med / compile_med
+assert speedup > 1
+print(
+    f"(train) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x"
+)
+print("~" * 10)
+
+######################################################################
+# Again, we can see that ``torch.compile`` takes longer in the first
+# iteration, as it must compile the model, but in subsequent iterations, we see
+# significant speedups compared to eager.
+#
+# We remark that the speedup numbers presented in this tutorial are for
+# demonstration purposes only. Official speedup values can be seen at the
+# `TorchInductor performance dashboard `__.
+
+######################################################################
+# Conclusion
+# ------------
+#
+# In this tutorial, we applied ``torch.compile`` to training and inference on a real model,
+# demonstrating speedups.
+#
+# Importantly, we note that the first few iterations of a compiled model
+# are slower than eager mode due to compilation overhead, but subsequent iterations are expected to
+# have speedups.
+#
+# For a gentle introduction to ``torch.compile``, please check out `the introduction to torch.compile tutorial `__.
+#
+# To troubleshoot issues and to gain a deeper understanding of how to apply ``torch.compile`` to your code, check out `the torch.compile programming model `__.
+#
+# We hope that you will give ``torch.compile`` a try!
diff --git a/intermediate_source/torch_compile_tutorial.py b/intermediate_source/torch_compile_tutorial.py
index 5e7112f5b93..b600d242e50 100644
--- a/intermediate_source/torch_compile_tutorial.py
+++ b/intermediate_source/torch_compile_tutorial.py
@@ -7,104 +7,151 @@
"""
######################################################################
-# ``torch.compile`` is the latest method to speed up your PyTorch code!
+# ``torch.compile`` is the new way to speed up your PyTorch code!
# ``torch.compile`` makes PyTorch code run faster by
# JIT-compiling PyTorch code into optimized kernels,
-# all while requiring minimal code changes.
+# while requiring minimal code changes.
#
-# In this tutorial, we cover basic ``torch.compile`` usage,
-# and demonstrate the advantages of ``torch.compile`` over
-# previous PyTorch compiler solutions, such as
-# `TorchScript `__ and
-# `FX Tracing `__.
+# ``torch.compile`` accomplishes this by tracing through
+# your Python code, looking for PyTorch operations.
+# Code that is difficult to trace will result in a
+# **graph break**. Graph breaks are lost optimization opportunities, rather
+# than errors or silent incorrectness.
+#
+# ``torch.compile`` is available in PyTorch 2.0 and later.
+#
+# This introduction covers basic ``torch.compile`` usage
+# and demonstrates the advantages of ``torch.compile`` over
+# our previous PyTorch compiler solution,
+# `TorchScript `__.
+#
+# For an end-to-end example on a real model, check out our `end-to-end torch.compile tutorial `__.
+#
+# To troubleshoot issues and to gain a deeper understanding of how to apply ``torch.compile`` to your code, check out `the torch.compile programming model `__.
#
# **Contents**
#
# ..
contents:: # :local: # -# **Required pip Dependencies** +# **Required pip dependencies for this tutorial** # # - ``torch >= 2.0`` -# - ``torchvision`` # - ``numpy`` # - ``scipy`` -# - ``tabulate`` +# +# **System requirements** +# - A C++ compiler, such as ``g++`` +# - Python development package (``python-devel``/``python-dev``) ###################################################################### -# NOTE: a modern NVIDIA GPU (H100, A100, or V100) is recommended for this tutorial in -# order to reproduce the speedup numbers shown below and documented elsewhere. +# Basic Usage +# ------------ +# +# We turn on some logging to help us to see what ``torch.compile`` is doing +# under the hood in this tutorial. +# The following code will print out the PyTorch ops that ``torch.compile`` traced. import torch -import warnings -gpu_ok = False -if torch.cuda.is_available(): - device_cap = torch.cuda.get_device_capability() - if device_cap in ((7, 0), (8, 0), (9, 0)): - gpu_ok = True +# sphinx_gallery_start_ignore +# to clear torch logs format +import os +os.environ["TORCH_LOGS_FORMAT"] = "" +torch._logging._internal.DEFAULT_FORMATTER = ( + torch._logging._internal._default_formatter() +) +torch._logging._internal._init_logs() +# sphinx_gallery_end_ignore -if not gpu_ok: - warnings.warn( - "GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower " - "than expected." - ) +torch._logging.set_logs(graph_code=True) ###################################################################### -# Basic Usage -# ------------ -# -# ``torch.compile`` is included in the latest PyTorch. -# Running TorchInductor on GPU requires Triton, which is included with the PyTorch 2.0 nightly -# binary. If Triton is still missing, try installing ``torchtriton`` via pip -# (``pip install torchtriton --extra-index-url "https://download.pytorch.org/whl/nightly/cu117"`` -# for CUDA 11.7). -# -# Arbitrary Python functions can be optimized by passing the callable to -# ``torch.compile``. We can then call the returned optimized -# function in place of the original function. +# ``torch.compile`` is a decorator that takes an arbitrary Python function. + def foo(x, y): a = torch.sin(x) b = torch.cos(y) return a + b + + opt_foo1 = torch.compile(foo) -print(opt_foo1(torch.randn(10, 10), torch.randn(10, 10))) +print(opt_foo1(torch.randn(3, 3), torch.randn(3, 3))) -###################################################################### -# Alternatively, we can decorate the function. @torch.compile def opt_foo2(x, y): a = torch.sin(x) b = torch.cos(y) return a + b -print(opt_foo2(torch.randn(10, 10), torch.randn(10, 10))) + + +print(opt_foo2(torch.randn(3, 3), torch.randn(3, 3))) + +###################################################################### +# ``torch.compile`` is applied recursively, so nested function calls +# within the top-level compiled function will also be compiled. + + +def inner(x): + return torch.sin(x) + + +@torch.compile +def outer(x, y): + a = inner(x) + b = torch.cos(y) + return a + b + + +print(outer(torch.randn(3, 3), torch.randn(3, 3))) + ###################################################################### -# We can also optimize ``torch.nn.Module`` instances. +# We can also optimize ``torch.nn.Module`` instances by either calling +# its ``.compile()`` method or by directly ``torch.compile``-ing the module. +# This is equivalent to ``torch.compile``-ing the module's ``__call__`` method +# (which indirectly calls ``forward``). 
+
+t = torch.randn(10, 100)
+
class MyModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
-        self.lin = torch.nn.Linear(100, 10)
+        self.lin = torch.nn.Linear(3, 3)
    def forward(self, x):
        return torch.nn.functional.relu(self.lin(x))
-mod = MyModule()
-opt_mod = torch.compile(mod)
-print(opt_mod(torch.randn(10, 100)))
+
+mod1 = MyModule()
+mod1.compile()
+print(mod1(torch.randn(3, 3)))
+
+mod2 = MyModule()
+mod2 = torch.compile(mod2)
+print(mod2(torch.randn(3, 3)))
+
######################################################################
# Demonstrating Speedups
# -----------------------
#
-# Let's now demonstrate that using ``torch.compile`` can speed
-# up real models. We will compare standard eager mode and
-# ``torch.compile`` by evaluating and training a ``torchvision`` model on random data.
-#
-# Before we start, we need to define some utility functions.
+# Now let's demonstrate how ``torch.compile`` speeds up a simple PyTorch example.
+# For a demonstration on a more complex model, see our `end-to-end torch.compile tutorial `__.
+
+
+def foo3(x):
+    y = x + 1
+    z = torch.nn.functional.relu(y)
+    u = z * 2
+    return u
+
+
+opt_foo3 = torch.compile(foo3)
+
# Returns the result of running `fn()` and the time it took for `fn()` to run,
# in seconds. We use CUDA events and synchronization for the most accurate
@@ -116,74 +163,47 @@ def timed(fn):
    result = fn()
    end.record()
    torch.cuda.synchronize()
-    return result, start.elapsed_time(end) / 1000
+    return result, start.elapsed_time(end) / 1000
-# Generates random input and targets data for the model, where `b` is
-# batch size.
-def generate_data(b):
-    return (
-        torch.randn(b, 3, 128, 128).to(torch.float32).cuda(),
-        torch.randint(1000, (b,)).cuda(),
-    )
-N_ITERS = 10
-
-from torchvision.models import densenet121
-def init_model():
-    return densenet121().to(torch.float32).cuda()
+inp = torch.randn(4096, 4096).cuda()
+print("compile:", timed(lambda: opt_foo3(inp))[1])
+print("eager:", timed(lambda: foo3(inp))[1])
######################################################################
-# First, let's compare inference.
-#
-# Note that in the call to ``torch.compile``, we have the additional
-# ``mode`` argument, which we will discuss below.
-
-model = init_model()
-
-# Reset since we are using a different mode.
-import torch._dynamo
-torch._dynamo.reset()
-
-model_opt = torch.compile(model, mode="reduce-overhead")
-
-inp = generate_data(16)[0]
-with torch.no_grad():
-    print("eager:", timed(lambda: model(inp))[1])
-    print("compile:", timed(lambda: model_opt(inp))[1])
-
-######################################################################
-# Notice that ``torch.compile`` takes a lot longer to complete
-# compared to eager. This is because ``torch.compile`` compiles
-# the model into optimized kernels as it executes. In our example, the
-# structure of the model doesn't change, and so recompilation is not
-# needed. So if we run our optimized model several more times, we should
+# Notice that ``torch.compile`` appears to take a lot longer to complete
+# compared to eager. This is because ``torch.compile`` takes extra time to compile
+# the model on the first few executions.
+# ``torch.compile`` re-uses compiled code whenever possible,
+# so if we run our optimized model several more times, we should
# see a significant improvement compared to eager.
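+######################################################################
+# As an aside, for more careful benchmarking than our simple ``timed`` helper,
+# you can try ``torch.utils.benchmark``; a minimal sketch is shown below.
+# We stick with ``timed`` in the rest of this tutorial so that the compilation
+# latency of the first run stays visible.
+
+from torch.utils.benchmark import Timer
+
+print(Timer(stmt="opt_foo3(inp)", globals={"opt_foo3": opt_foo3, "inp": inp}).timeit(20))
+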
+# turn off logging for now to prevent spam +torch._logging.set_logs(graph_code=False) + eager_times = [] -for i in range(N_ITERS): - inp = generate_data(16)[0] - with torch.no_grad(): - _, eager_time = timed(lambda: model(inp)) +for i in range(10): + _, eager_time = timed(lambda: foo3(inp)) eager_times.append(eager_time) - print(f"eager eval time {i}: {eager_time}") - + print(f"eager time {i}: {eager_time}") print("~" * 10) compile_times = [] -for i in range(N_ITERS): - inp = generate_data(16)[0] - with torch.no_grad(): - _, compile_time = timed(lambda: model_opt(inp)) +for i in range(10): + _, compile_time = timed(lambda: opt_foo3(inp)) compile_times.append(compile_time) - print(f"compile eval time {i}: {compile_time}") + print(f"compile time {i}: {compile_time}") print("~" * 10) import numpy as np + eager_med = np.median(eager_times) compile_med = np.median(compile_times) speedup = eager_med / compile_med -assert(speedup > 1) -print(f"(eval) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x") +assert speedup > 1 +print( + f"(eval) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x" +) print("~" * 10) ###################################################################### @@ -194,151 +214,72 @@ def init_model(): # and the amount of data is large, then the bottleneck would be # GPU compute and the observed speedup may be less significant. # -# You may also see different speedup results depending on the chosen ``mode`` -# argument. The ``"reduce-overhead"`` mode uses CUDA graphs to further reduce -# the overhead of Python. For your own models, -# you may need to experiment with different modes to maximize speedup. You can -# read more about modes `here `__. -# -# You may might also notice that the second time we run our model with ``torch.compile`` is significantly -# slower than the other runs, although it is much faster than the first run. This is because the ``"reduce-overhead"`` -# mode runs a few warm-up iterations for CUDA graphs. -# -# For general PyTorch benchmarking, you can try using ``torch.utils.benchmark`` instead of the ``timed`` -# function we defined above. We wrote our own timing function in this tutorial to show -# ``torch.compile``'s compilation latency. -# -# Now, let's consider comparing training. 
- -model = init_model() -opt = torch.optim.Adam(model.parameters()) - -def train(mod, data): - opt.zero_grad(True) - pred = mod(data[0]) - loss = torch.nn.CrossEntropyLoss()(pred, data[1]) - loss.backward() - opt.step() - -eager_times = [] -for i in range(N_ITERS): - inp = generate_data(16) - _, eager_time = timed(lambda: train(model, inp)) - eager_times.append(eager_time) - print(f"eager train time {i}: {eager_time}") -print("~" * 10) - -model = init_model() -opt = torch.optim.Adam(model.parameters()) -train_opt = torch.compile(train, mode="reduce-overhead") - -compile_times = [] -for i in range(N_ITERS): - inp = generate_data(16) - _, compile_time = timed(lambda: train_opt(model, inp)) - compile_times.append(compile_time) - print(f"compile train time {i}: {compile_time}") -print("~" * 10) - -eager_med = np.median(eager_times) -compile_med = np.median(compile_times) -speedup = eager_med / compile_med -assert(speedup > 1) -print(f"(train) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x") -print("~" * 10) - -###################################################################### -# Again, we can see that ``torch.compile`` takes longer in the first -# iteration, as it must compile the model, but in subsequent iterations, we see -# significant speedups compared to eager. -# -# We remark that the speedup numbers presented in this tutorial are for -# demonstration purposes only. Official speedup values can be seen at the -# `TorchInductor performance dashboard `__. +# To see speedups on a real model, check out our `end-to-end torch.compile tutorial `__. ###################################################################### -# Comparison to TorchScript and FX Tracing -# ----------------------------------------- +# Benefits over TorchScript +# ------------------------- # -# We have seen that ``torch.compile`` can speed up PyTorch code. -# Why else should we use ``torch.compile`` over existing PyTorch -# compiler solutions, such as TorchScript or FX Tracing? Primarily, the +# Why should we use ``torch.compile`` over TorchScript? Primarily, the # advantage of ``torch.compile`` lies in its ability to handle # arbitrary Python code with minimal changes to existing code. # -# One case that ``torch.compile`` can handle that other compiler -# solutions struggle with is data-dependent control flow (the -# ``if x.sum() < 0:`` line below). +# Compare to TorchScript, which has a tracing mode (``torch.jit.trace``) and +# a scripting mode (``torch.jit.script``). Tracing mode is susceptible to +# silent incorrectness, while scripting mode requires significant code changes +# and will raise errors on unsupported Python code. +# +# For example, TorchScript tracing silently fails on data-dependent control flow +# (the ``if x.sum() < 0:`` line below) +# because only the actual control flow path is traced. +# In comparison, ``torch.compile`` is able to correctly handle it. + def f1(x, y): if x.sum() < 0: return -y return y -# Test that `fn1` and `fn2` return the same result, given -# the same arguments `args`. Typically, `fn1` will be an eager function -# while `fn2` will be a compiled function (torch.compile, TorchScript, or FX graph). + +# Test that `fn1` and `fn2` return the same result, given the same arguments `args`. 
def test_fns(fn1, fn2, args): out1 = fn1(*args) out2 = fn2(*args) return torch.allclose(out1, out2) + inp1 = torch.randn(5, 5) inp2 = torch.randn(5, 5) -###################################################################### -# TorchScript tracing ``f1`` results in -# silently incorrect results, since only the actual control flow path -# is traced. - traced_f1 = torch.jit.trace(f1, (inp1, inp2)) print("traced 1, 1:", test_fns(f1, traced_f1, (inp1, inp2))) print("traced 1, 2:", test_fns(f1, traced_f1, (-inp1, inp2))) -###################################################################### -# FX tracing ``f1`` results in an error due to the presence of -# data-dependent control flow. - -import traceback as tb -try: - torch.fx.symbolic_trace(f1) -except: - tb.print_exc() - -###################################################################### -# If we provide a value for ``x`` as we try to FX trace ``f1``, then -# we run into the same problem as TorchScript tracing, as the data-dependent -# control flow is removed in the traced function. - -fx_f1 = torch.fx.symbolic_trace(f1, concrete_args={"x": inp1}) -print("fx 1, 1:", test_fns(f1, fx_f1, (inp1, inp2))) -print("fx 1, 2:", test_fns(f1, fx_f1, (-inp1, inp2))) - -###################################################################### -# Now we can see that ``torch.compile`` correctly handles -# data-dependent control flow. - -# Reset since we are using a different mode. -torch._dynamo.reset() - compile_f1 = torch.compile(f1) print("compile 1, 1:", test_fns(f1, compile_f1, (inp1, inp2))) print("compile 1, 2:", test_fns(f1, compile_f1, (-inp1, inp2))) print("~" * 10) ###################################################################### -# TorchScript scripting can handle data-dependent control flow, but this -# solution comes with its own set of problems. Namely, TorchScript scripting -# can require major code changes and will raise errors when unsupported Python +# TorchScript scripting can handle data-dependent control flow, +# but it can require major code changes and will raise errors when unsupported Python # is used. # # In the example below, we forget TorchScript type annotations and we receive # a TorchScript error because the input type for argument ``y``, an ``int``, # does not match with the default argument type, ``torch.Tensor``. +# In comparison, ``torch.compile`` works without requiring any type annotations. + + +import traceback as tb + +torch._logging.set_logs(graph_code=True) + def f2(x, y): return x + y + inp1 = torch.randn(5, 5) inp2 = 3 @@ -348,86 +289,24 @@ def f2(x, y): except: tb.print_exc() -###################################################################### -# However, ``torch.compile`` is easily able to handle ``f2``. - compile_f2 = torch.compile(f2) print("compile 2:", test_fns(f2, compile_f2, (inp1, inp2))) print("~" * 10) ###################################################################### -# Another case that ``torch.compile`` handles well compared to -# previous compilers solutions is the usage of non-PyTorch functions. - -import scipy -def f3(x): - x = x * 2 - x = scipy.fft.dct(x.numpy()) - x = torch.from_numpy(x) - x = x * 2 - return x - -###################################################################### -# TorchScript tracing treats results from non-PyTorch function calls -# as constants, and so our results can be silently wrong. 
- -inp1 = torch.randn(5, 5) -inp2 = torch.randn(5, 5) -traced_f3 = torch.jit.trace(f3, (inp1,)) -print("traced 3:", test_fns(f3, traced_f3, (inp2,))) - -###################################################################### -# TorchScript scripting and FX tracing disallow non-PyTorch function calls. - -try: - torch.jit.script(f3) -except: - tb.print_exc() - -try: - torch.fx.symbolic_trace(f3) -except: - tb.print_exc() - -###################################################################### -# In comparison, ``torch.compile`` is easily able to handle -# the non-PyTorch function call. - -compile_f3 = torch.compile(f3) -print("compile 3:", test_fns(f3, compile_f3, (inp2,))) - -###################################################################### -# TorchDynamo and FX Graphs -# -------------------------- +# Graph Breaks +# ------------------------------------ +# The graph break is one of the most fundamental concepts within ``torch.compile``. +# It allows ``torch.compile`` to handle arbitrary Python code by interrupting +# compilation, running the unsupported code, then resuming compilation. +# The term "graph break" comes from the fact that ``torch.compile`` attempts +# to capture and optimize the PyTorch operation graph. When unsupported Python code is encountered, +# then this graph must be "broken". +# Graph breaks result in lost optimization opportunities, which may still be undesirable, +# but this is better than silent incorrectness or a hard crash. # -# One important component of ``torch.compile`` is TorchDynamo. -# TorchDynamo is responsible for JIT compiling arbitrary Python code into -# `FX graphs `__, which can -# then be further optimized. TorchDynamo extracts FX graphs by analyzing Python bytecode -# during runtime and detecting calls to PyTorch operations. -# -# Normally, TorchInductor, another component of ``torch.compile``, -# further compiles the FX graphs into optimized kernels, -# but TorchDynamo allows for different backends to be used. In order to inspect -# the FX graphs that TorchDynamo outputs, let us create a custom backend that -# outputs the FX graph and simply returns the graph's unoptimized forward method. - -from typing import List -def custom_backend(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]): - print("custom backend called with FX graph:") - gm.graph.print_tabular() - return gm.forward - -# Reset since we are using a different backend. -torch._dynamo.reset() +# Let's look at a data-dependent control flow example to better see how graph breaks work. -opt_model = torch.compile(init_model(), backend=custom_backend) -opt_model(generate_data(16)[0]) - -###################################################################### -# Using our custom backend, we can now see how TorchDynamo is able to handle -# data-dependent control flow. Consider the function below, where the line -# ``if b.sum() < 0`` is the source of data-dependent control flow. 
def bar(a, b): x = a / (torch.abs(a) + 1) @@ -435,23 +314,24 @@ def bar(a, b): b = b * -1 return x * b -opt_bar = torch.compile(bar, backend=custom_backend) -inp1 = torch.randn(10) -inp2 = torch.randn(10) + +opt_bar = torch.compile(bar) +inp1 = torch.ones(10) +inp2 = torch.ones(10) opt_bar(inp1, inp2) opt_bar(inp1, -inp2) ###################################################################### -# The output reveals that TorchDynamo extracted 3 different FX graphs -# corresponding the following code (order may differ from the output above): +# The first time we run ``bar``, we see that ``torch.compile`` traced 2 graphs +# corresponding to the following code (noting that ``b.sum() < 0`` is False): # -# 1. ``x = a / (torch.abs(a) + 1)`` -# 2. ``b = b * -1; return x * b`` -# 3. ``return x * b`` +# 1. ``x = a / (torch.abs(a) + 1); b.sum()`` +# 2. ``return x * b`` # -# When TorchDynamo encounters unsupported Python features, such as data-dependent -# control flow, it breaks the computation graph, lets the default Python -# interpreter handle the unsupported code, then resumes capturing the graph. +# The second time we run ``bar``, we take the other branch of the if statement +# and we get 1 traced graph corresponding to the code ``b = b * -1; return x * b``. +# We do not see a graph of ``x = a / (torch.abs(a) + 1)`` outputted the second time +# since ``torch.compile`` cached this graph from the first run and re-used it. # # Let's investigate by example how TorchDynamo would step through ``bar``. # If ``b.sum() < 0``, then TorchDynamo would run graph 1, let @@ -460,49 +340,83 @@ def bar(a, b): # would run graph 1, let Python determine the result of the conditional, then # run graph 3. # -# This highlights a major difference between TorchDynamo and previous PyTorch -# compiler solutions. When encountering unsupported Python features, -# previous solutions either raise an error or silently fail. -# TorchDynamo, on the other hand, will break the computation graph. -# -# We can see where TorchDynamo breaks the graph by using ``torch._dynamo.explain``: +# We can see all graph breaks by using ``torch._logging.set_logs(graph_breaks=True)``. -# Reset since we are using a different backend. +# Reset to clear the torch.compile cache torch._dynamo.reset() -explain_output = torch._dynamo.explain(bar)(torch.randn(10), torch.randn(10)) -print(explain_output) +opt_bar(inp1, inp2) +opt_bar(inp1, -inp2) ###################################################################### # In order to maximize speedup, graph breaks should be limited. # We can force TorchDynamo to raise an error upon the first graph # break encountered by using ``fullgraph=True``: -opt_bar = torch.compile(bar, fullgraph=True) +# Reset to clear the torch.compile cache +torch._dynamo.reset() + +opt_bar_fullgraph = torch.compile(bar, fullgraph=True) try: - opt_bar(torch.randn(10), torch.randn(10)) + opt_bar_fullgraph(torch.randn(10), torch.randn(10)) except: tb.print_exc() ###################################################################### -# And below, we demonstrate that TorchDynamo does not break the graph on -# the model we used above for demonstrating speedups. 
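+######################################################################
+# As an aside, ``torch._dynamo.explain`` offers another convenient way to count
+# the captured graphs and graph breaks of a function (a short sketch, reusing
+# ``bar`` and its inputs from above):
+
+explain_output = torch._dynamo.explain(bar)(inp1, inp2)
+print(explain_output)
+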
+# In our example above, we can work around this graph break by replacing +# the if statement with a ``torch.cond``: + +from functorch.experimental.control_flow import cond + + +@torch.compile(fullgraph=True) +def bar_fixed(a, b): + x = a / (torch.abs(a) + 1) + + def true_branch(y): + return y * -1 + + def false_branch(y): + # NOTE: torch.cond doesn't allow aliased outputs + return y.clone() + + x = cond(b.sum() < 0, true_branch, false_branch, (b,)) + return x * b + + +bar_fixed(inp1, inp2) +bar_fixed(inp1, -inp2) -opt_model = torch.compile(init_model(), fullgraph=True) -print(opt_model(generate_data(16)[0])) ###################################################################### -# We can use ``torch.export`` (from PyTorch 2.1+) to extract a single, exportable -# FX graph from the input PyTorch program. The exported graph is intended to be -# run on different (i.e. Python-less) environments. One important restriction -# is that the ``torch.export`` does not support graph breaks. Please check -# `this tutorial `__ +# In order to serialize graphs or to run graphs on different (i.e. Python-less) +# environments, consider using ``torch.export`` instead (from PyTorch 2.1+). +# One important restriction is that ``torch.export`` does not support graph breaks. Please check +# `the torch.export tutorial `__ # for more details on ``torch.export``. +# +# Check out our `section on graph breaks in the torch.compile programming model `__ +# for tips on how to work around graph breaks. + +###################################################################### +# Troubleshooting +# --------------- +# Is ``torch.compile`` failing to speed up your model? Is compile time unreasonably long? +# Is your code recompiling excessively? Are you having difficulties dealing with graph breaks? +# Are you looking for tips on how to best use ``torch.compile``? +# Or maybe you simply want to learn more about the inner workings of ``torch.compile``? +# +# Check out `the torch.compile programming model `__. ###################################################################### # Conclusion # ------------ # # In this tutorial, we introduced ``torch.compile`` by covering -# basic usage, demonstrating speedups over eager mode, comparing to previous -# PyTorch compiler solutions, and briefly investigating TorchDynamo and its interactions -# with FX graphs. We hope that you will give ``torch.compile`` a try! +# basic usage, demonstrating speedups over eager mode, comparing to TorchScript, +# and briefly describing graph breaks. +# +# For an end-to-end example on a real model, check out our `end-to-end torch.compile tutorial `__. +# +# To troubleshoot issues and to gain a deeper understanding of how to apply ``torch.compile`` to your code, check out `the torch.compile programming model `__. +# +# We hope that you will give ``torch.compile`` a try! diff --git a/intermediate_source/torch_export_nightly_tutorial.rst b/intermediate_source/torch_export_nightly_tutorial.rst index 78c710a3449..e7ef2e88153 100644 --- a/intermediate_source/torch_export_nightly_tutorial.rst +++ b/intermediate_source/torch_export_nightly_tutorial.rst @@ -1,858 +1,10 @@ torch.export Nightly Tutorial ============================= -**Author:** William Wen, Zhengxu Chen, Angela Yi +This tutorial has been moved to https://pytorch.org/tutorials/intermediate/torch_export_tutorial.html -.. warning:: +It will redirect in 3 seconds. - ``torch.export`` and its related features are in prototype status and are subject to backwards compatibility - breaking changes. +.. 
raw:: html -.. note:: - Outputs (e.g. from print statements) are only samples. - -:func:`torch.export` is the PyTorch 2.X way to export PyTorch models into -standardized model representations, intended -to be run on different (i.e. Python-less) environments. - -In this tutorial, you will learn how to use :func:`torch.export` to extract -``ExportedProgram``'s (i.e. single-graph representations) from PyTorch programs. -We also detail some considerations/modifications that you may need -to make in order to make your model compatible with ``torch.export``. - -**Contents** - -.. contents:: - :local: - -Basic Usage ------------ - -``torch.export`` extracts single-graph representations from PyTorch programs -by tracing the target function, given example inputs. -``torch.export.export()`` is the main entry point for ``torch.export``. - -In this tutorial, ``torch.export`` and ``torch.export.export()`` are practically synonymous, -though ``torch.export`` generally refers to the PyTorch 2.X export process, and ``torch.export.export()`` -generally refers to the actual function call. - -The signature of ``torch.export.export()`` is: - -.. code-block:: python - - export( - f: Callable, - args: Tuple[Any, ...], - kwargs: Optional[Dict[str, Any]] = None, - *, - dynamic_shapes: Optional[Dict[str, Dict[int, Dim]]] = None - ) -> ExportedProgram - -``torch.export.export()`` traces the tensor computation graph from calling ``f(*args, **kwargs)`` -and wraps it in an ``ExportedProgram``, which can be serialized or executed later with -different inputs. Note that while the output ``ExportedGraph`` is callable and can be -called in the same way as the original input callable, it is not a ``torch.nn.Module``. -We will detail the ``dynamic_shapes`` argument later in the tutorial. - -.. code-block:: python - - import torch - from torch.export import export - - class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.lin = torch.nn.Linear(100, 10) - - def forward(self, x, y): - return torch.nn.functional.relu(self.lin(x + y), inplace=True) - - mod = MyModule() - exported_mod = export(mod, (torch.randn(8, 100), torch.randn(8, 100))) - print(type(exported_mod)) - print(exported_mod(torch.randn(8, 100), torch.randn(8, 100))) - -.. code-block:: bash - - - tensor([[0.0000, 1.2178, 0.0000, 0.4397, 0.4774, 0.0000, 0.0000, 0.0943, 0.0000, - 0.4656], - [0.8333, 0.0000, 0.5912, 0.0000, 1.4689, 0.2122, 0.1996, 0.4628, 0.0000, - 0.7495], - [0.0000, 0.0000, 0.3900, 0.0000, 0.0000, 0.0000, 0.4515, 0.0000, 0.8187, - 0.8938], - [0.5753, 0.7709, 0.0000, 0.0000, 0.0000, 0.8081, 0.0000, 0.0000, 0.8002, - 0.9441], - [0.0000, 0.0000, 0.0000, 0.0000, 0.5711, 1.0921, 0.3438, 0.3268, 0.4640, - 0.0000], - [0.0000, 0.0000, 0.0000, 0.2434, 0.7253, 0.6886, 0.0000, 0.6982, 0.5100, - 0.0000], - [0.2279, 0.0000, 1.2951, 1.1055, 0.0000, 0.0000, 0.0000, 0.2088, 0.0000, - 0.5022], - [0.0000, 0.0000, 1.1468, 0.0000, 0.5220, 1.1592, 0.9096, 0.0000, 0.4248, - 1.2142]], grad_fn=) - -Let's review some attributes of ``ExportedProgram`` that are of interest. - -The ``graph`` attribute is an `FX graph `__ -traced from the function we exported, that is, the computation graph of all PyTorch operations. -The FX graph has some important properties: - -- The operations are "ATen-level" operations. -- The graph is "functionalized", meaning that no operations are mutations. - -The ``graph_module`` attribute is the ``GraphModule`` that wraps the ``graph`` attribute -so that it can be ran as a ``torch.nn.Module``. - -.. 
code-block:: python - - print(exported_mod) - print(exported_mod.graph_module) - -.. code-block:: bash - - ExportedProgram: - class GraphModule(torch.nn.Module): - def forward(self, arg0_1: f32[10, 100], arg1_1: f32[10], arg2_1: f32[8, 100], arg3_1: f32[8, 100]): - # File: torch_export_nightly_tutorial.py:69, code: return torch.nn.functional.relu(self.lin(x + y), inplace=True) - add: f32[8, 100] = torch.ops.aten.add.Tensor(arg2_1, arg3_1); arg2_1 = arg3_1 = None - t: f32[100, 10] = torch.ops.aten.t.default(arg0_1); arg0_1 = None - addmm: f32[8, 10] = torch.ops.aten.addmm.default(arg1_1, add, t); arg1_1 = add = t = None - relu: f32[8, 10] = torch.ops.aten.relu.default(addmm); addmm = None - return (relu,) - - Graph signature: ExportGraphSignature(input_specs=[InputSpec(kind=, arg=TensorArgument(name='arg0_1'), target='lin.weight'), InputSpec(kind=, arg=TensorArgument(name='arg1_1'), target='lin.bias'), InputSpec(kind=, arg=TensorArgument(name='arg2_1'), target=None), InputSpec(kind=, arg=TensorArgument(name='arg3_1'), target=None)], output_specs=[OutputSpec(kind=, arg=TensorArgument(name='relu'), target=None)]) - Range constraints: {} - Equality constraints: [] - - GraphModule() - - - - def forward(self, arg0_1, arg1_1, arg2_1, arg3_1): - add = torch.ops.aten.add.Tensor(arg2_1, arg3_1); arg2_1 = arg3_1 = None - t = torch.ops.aten.t.default(arg0_1); arg0_1 = None - addmm = torch.ops.aten.addmm.default(arg1_1, add, t); arg1_1 = add = t = None - relu = torch.ops.aten.relu.default(addmm); addmm = None - return (relu,) - -The printed code shows that FX graph only contains ATen-level ops (such as ``torch.ops.aten``) -and that mutations were removed. For example, the mutating op ``torch.nn.functional.relu(..., inplace=True)`` -is represented in the printed code by ``torch.ops.aten.relu.default``, which does not mutate. -Future uses of input to the original mutating ``relu`` op are replaced by the additional new output -of the replacement non-mutating ``relu`` op. - -Other attributes of interest in ``ExportedProgram`` include: - -- ``graph_signature`` -- the inputs, outputs, parameters, buffers, etc. of the exported graph. -- ``range_constraints`` and ``equality_constraints`` -- constraints, covered later - -.. code-block:: python - - print(exported_mod.graph_signature) - -.. code-block:: bash - - ExportGraphSignature(parameters=['lin.weight', 'lin.bias'], buffers=[], user_inputs=['arg2_1', 'arg3_1'], user_outputs=['relu'], inputs_to_parameters={'arg0_1': 'lin.weight', 'arg1_1': 'lin.bias'}, inputs_to_buffers={}, buffers_to_mutate={}, backward_signature=None, assertion_dep_token=None) - -See the ``torch.export`` `documentation `__ -for more details. - -Graph Breaks ------------- - -Although ``torch.export`` shares components with ``torch.compile``, -the key limitation of ``torch.export``, especially when compared to ``torch.compile``, is that it does not -support graph breaks. This is because handling graph breaks involves interpreting -the unsupported operation with default Python evaluation, which is incompatible -with the export use case. Therefore, in order to make your model code compatible -with ``torch.export``, you will need to modify your code to remove graph breaks. - -A graph break is necessary in cases such as: - -- data-dependent control flow - -.. code-block:: python - - def bad1(x): - if x.sum() > 0: - return torch.sin(x) - return torch.cos(x) - - import traceback as tb - try: - export(bad1, (torch.randn(3, 3),)) - except Exception: - tb.print_exc() - -.. 
code-block:: bash - - torch._dynamo.exc.UserError: Dynamic control flow is not supported at the moment. Please use functorch.experimental.control_flow.cond to explicitly capture the control flow - - from user code: - File "torch_export_nightly_tutorial.py", line 126, in bad1 - if x.sum() > 0: - -- accessing tensor data with ``.data`` - -.. code-block:: python - - def bad2(x): - x.data[0, 0] = 3 - return x - - try: - export(bad2, (torch.randn(3, 3),)) - except Exception: - tb.print_exc() - -.. code-block:: bash - - RuntimeError: - Found following user inputs located at [0] are mutated. This is currently banned in the aot_export workflow. - -- calling unsupported functions (such as many built-in functions) - -.. code-block:: python - - def bad3(x): - x = x + 1 - return x + id(x) - - try: - export(bad3, (torch.randn(3, 3),)) - except Exception: - tb.print_exc() - -.. code-block:: bash - - torch._dynamo.exc.Unsupported: call_id with args (TensorVariable(),) - - from user code: - File "torch_export_nightly_tutorial.py", line 155, in bad3 - return x + id(x) - -- unsupported Python language features (e.g. throwing exceptions, match statements) - -.. code-block:: python - - def bad4(x): - try: - x = x + 1 - raise RuntimeError("bad") - except: - x = x + 2 - return x - - try: - export(bad4, (torch.randn(3, 3),)) - except Exception: - tb.print_exc() - -.. code-block:: bash - - torch._dynamo.exc.Unsupported: call_function BuiltinVariable(RuntimeError) [ConstantVariable(str)] {} - - from user code: - File "torch_export_nightly_tutorial.py", line 168, in bad4 - raise RuntimeError("bad") - -The sections below demonstrate some ways you can modify your code -in order to remove graph breaks. - -Control Flow Ops ----------------- - -``torch.export`` actually does support data-dependent control flow. -But these need to be expressed using control flow ops. For example, -we can fix the control flow example above using the ``cond`` op, like so: - -.. code-block:: python - - from functorch.experimental.control_flow import cond - - def bad1_fixed(x): - def true_fn(x): - return torch.sin(x) - def false_fn(x): - return torch.cos(x) - return cond(x.sum() > 0, true_fn, false_fn, [x]) - - exported_bad1_fixed = export(bad1_fixed, (torch.randn(3, 3),)) - print(exported_bad1_fixed(torch.ones(3, 3))) - print(exported_bad1_fixed(-torch.ones(3, 3))) - -.. code-block:: bash - - tensor([[0.8415, 0.8415, 0.8415], - [0.8415, 0.8415, 0.8415], - [0.8415, 0.8415, 0.8415]]) - tensor([[0.5403, 0.5403, 0.5403], - [0.5403, 0.5403, 0.5403], - [0.5403, 0.5403, 0.5403]]) - -There are limitations to ``cond`` that one should be aware of: - -- The predicate (i.e. ``x.sum() > 0``) must result in a boolean or a single-element tensor. -- The operands (i.e. ``[x]``) must be tensors. -- The branch function (i.e. ``true_fn`` and ``false_fn``) signature must match with the - operands and they must both return a single tensor with the same metadata (for example, ``dtype``, ``shape``, etc.). -- Branch functions cannot mutate input or global variables. -- Branch functions cannot access closure variables, except for ``self`` if the function is - defined in the scope of a method. - -For more details about ``cond``, check out the `documentation `__. - -.. - [NOTE] map is not documented at the moment - We can also use ``map``, which applies a function across the first dimension - of the first tensor argument. 
- - from functorch.experimental.control_flow import map - - def map_example(xs): - def map_fn(x, const): - def true_fn(x): - return x + const - def false_fn(x): - return x - const - return control_flow.cond(x.sum() > 0, true_fn, false_fn, [x]) - return control_flow.map(map_fn, xs, torch.tensor([2.0])) - - exported_map_example= export(map_example, (torch.randn(4, 3),)) - inp = torch.cat((torch.ones(2, 3), -torch.ones(2, 3))) - print(exported_map_example(inp)) - -Constraints/Dynamic Shapes --------------------------- - -Ops can have different specializations/behaviors for different tensor shapes, so by default, -``torch.export`` requires inputs to ``ExportedProgram`` to have the same shape as the respective -example inputs given to the initial ``torch.export.export()`` call. -If we try to run the ``ExportedProgram`` in the example below with a tensor -with a different shape, we get an error: - -.. code-block:: python - - class MyModule2(torch.nn.Module): - def __init__(self): - super().__init__() - self.lin = torch.nn.Linear(100, 10) - - def forward(self, x, y): - return torch.nn.functional.relu(self.lin(x + y), inplace=True) - - mod2 = MyModule2() - exported_mod2 = export(mod2, (torch.randn(8, 100), torch.randn(8, 100))) - - try: - exported_mod2(torch.randn(10, 100), torch.randn(10, 100)) - except Exception: - tb.print_exc() - -.. code-block:: bash - - RuntimeError: Input arg3_1.shape[0] is specialized at 8 - -We can relax this constraint using the ``dynamic_shapes`` argument of -``torch.export.export()``, which allows us to specify, using ``torch.export.Dim`` -(`documentation `__), -which dimensions of the input tensors are dynamic. - -For each tensor argument of the input callable, we can specify a mapping from the dimension -to a ``torch.export.Dim``. -A ``torch.export.Dim`` is essentially a named symbolic integer with optional -minimum and maximum bounds. - -Then, the format of ``torch.export.export()``'s ``dynamic_shapes`` argument is a mapping -from the input callable's tensor argument names, to dimension --> dim mappings as described above. -If there is no ``torch.export.Dim`` given to a tensor argument's dimension, then that dimension is -assumed to be static. - -The first argument of ``torch.export.Dim`` is the name for the symbolic integer, used for debugging. -Then we can specify an optional minimum and maximum bound (inclusive). Below, we show example usage. - -In the example below, our input -``inp1`` has an unconstrained first dimension, but the size of the second -dimension must be in the interval [4, 18]. - -.. code-block:: python - - from torch.export import Dim - - inp1 = torch.randn(10, 10, 2) - - def dynamic_shapes_example1(x): - x = x[:, 2:] - return torch.relu(x) - - inp1_dim0 = Dim("inp1_dim0") - inp1_dim1 = Dim("inp1_dim1", min=4, max=18) - dynamic_shapes1 = { - "x": {0: inp1_dim0, 1: inp1_dim1}, - } - - exported_dynamic_shapes_example1 = export(dynamic_shapes_example1, (inp1,), dynamic_shapes=dynamic_shapes1) - - print(exported_dynamic_shapes_example1(torch.randn(5, 5, 2))) - - try: - exported_dynamic_shapes_example1(torch.randn(8, 1, 2)) - except Exception: - tb.print_exc() - - try: - exported_dynamic_shapes_example1(torch.randn(8, 20, 2)) - except Exception: - tb.print_exc() - - try: - exported_dynamic_shapes_example1(torch.randn(8, 8, 3)) - except Exception: - tb.print_exc() - -.. 
code-block:: bash - - tensor([[[0.0000, 0.0828], - [0.8190, 0.0000], - [0.0037, 0.0221]], - - [[0.0000, 2.0898], - [0.0000, 0.0000], - [0.8182, 2.9165]], - - [[1.3572, 0.7422], - [0.4423, 0.0000], - [0.0000, 0.0000]], - - [[0.0000, 0.2497], - [0.0000, 0.1912], - [0.0000, 0.0000]], - - [[0.0000, 1.0522], - [0.4442, 0.0000], - [1.4188, 0.8161]]]) - - RuntimeError: Input arg0_1.shape[1] is outside of specified dynamic range [4, 18] - - RuntimeError: Input arg0_1.shape[1] is outside of specified dynamic range [4, 18] - - RuntimeError: Input arg0_1.shape[2] is specialized at 2 - -Note that if our example inputs to ``torch.export`` do not satisfy the constraints -given by ``dynamic_shapes``, then we get an error. - -.. code-block:: python - - inp1_dim1_bad = Dim("inp1_dim1_bad", min=11, max=18) - dynamic_shapes1_bad = { - "x": {0: inp1_dim0, 1: inp1_dim1_bad}, - } - - try: - export(dynamic_shapes_example1, (inp1,), dynamic_shapes=dynamic_shapes1_bad) - except Exception: - tb.print_exc() - -.. code-block:: python - - torch._dynamo.exc.UserError: 10 not in range [11, 18] - -We can enforce that equalities between dimensions of different tensors -by using the same ``torch.export.Dim`` object, for example, in matrix multiplication: - -.. code-block:: python - - inp2 = torch.randn(4, 8) - inp3 = torch.randn(8, 2) - - def dynamic_shapes_example2(x, y): - return x @ y - - inp2_dim0 = Dim("inp2_dim0") - inner_dim = Dim("inner_dim") - inp3_dim1 = Dim("inp3_dim1") - - dynamic_shapes2 = { - "x": {0: inp2_dim0, 1: inner_dim}, - "y": {0: inner_dim, 1: inp3_dim1}, - } - - exported_dynamic_shapes_example2 = export(dynamic_shapes_example2, (inp2, inp3), dynamic_shapes=dynamic_shapes2) - - print(exported_dynamic_shapes_example2(torch.randn(2, 16), torch.randn(16, 4))) - - try: - exported_dynamic_shapes_example2(torch.randn(4, 8), torch.randn(4, 2)) - except Exception: - tb.print_exc() - -.. code-block:: bash - - tensor([[ 7.5352, -4.3836, -2.8961, 4.3412], - [ 2.3891, 4.9101, -7.4326, -0.1697]]) - - RuntimeError: Input arg0_1.shape[1] is not equal to input arg1_1.shape[0] - -We can actually use ``torch.export`` to guide us as to which ``dynamic_shapes`` constraints -are necessary. We can do this by relaxing all constraints (recall that if we -do not provide constraints for a dimension, the default behavior is to constrain -to the exact shape value of the example input) and letting ``torch.export`` -error out. - -.. code-block:: python - - inp4 = torch.randn(8, 16) - inp5 = torch.randn(16, 32) - - def dynamic_shapes_example3(x, y): - if x.shape[0] <= 16: - return x @ y[:, :16] - return y - - dynamic_shapes3 = { - "x": {i: Dim(f"inp4_dim{i}") for i in range(inp4.dim())}, - "y": {i: Dim(f"inp5_dim{i}") for i in range(inp5.dim())}, - } - - try: - export(dynamic_shapes_example3, (inp4, inp5), dynamic_shapes=dynamic_shapes3) - except Exception: - tb.print_exc() - -.. code-block:: bash - - torch._dynamo.exc.UserError: Constraints violated (inp4_dim0, inp5_dim0, inp5_dim1)! For more information, run with TORCH_LOGS=dynamic. - - The values of inp5_dim0 = L['y'].size()[0] and inp4_dim1 = L['x'].size()[1] must always be equal. - - Not all values of inp5_dim1 = L['y'].size()[1] in the specified range satisfy the generated guard Ne(L['y'].size()[1], 16). - - Not all values of inp4_dim0 = L['x'].size()[0] in the specified range satisfy the generated guard L['x'].size()[0] <= 16. - - Not all values of inp5_dim1 = L['y'].size()[1] in the specified range satisfy the generated guard L['y'].size()[1] >= 16. 
- - Suggested fixes: - inp4_dim0 = Dim('inp4_dim0', max=16) - inp5_dim1 = Dim('inp5_dim1', min=17) - inp5_dim0 = inp4_dim1 - -We can see that the error message gives us suggested fixes to our -dynamic shape constraints. Let us follow those suggestions (exact -suggestions may differ slightly): - -.. code-block:: python - - def suggested_fixes(): - inp4_dim1 = Dim('shared_dim') - # suggested fixes below - inp4_dim0 = Dim('inp4_dim0', max=16) - inp5_dim1 = Dim('inp5_dim1', min=17) - inp5_dim0 = inp4_dim1 - # end of suggested fixes - return { - "x": {0: inp4_dim0, 1: inp4_dim1}, - "y": {0: inp5_dim0, 1: inp5_dim1}, - } - - dynamic_shapes3_fixed = suggested_fixes() - exported_dynamic_shapes_example3 = export(dynamic_shapes_example3, (inp4, inp5), dynamic_shapes=dynamic_shapes3_fixed) - print(exported_dynamic_shapes_example3(torch.randn(4, 32), torch.randn(32, 64))) - -.. code-block:: python - - tensor([[ 4.1510, -4.1174, 3.4397, 1.5075, -4.3566, 4.2102, 7.2033, - 0.3611, -3.9041, 8.2987, -3.5751, -7.1508, 0.4470, 2.2460, - -0.9288, -8.1764], - [ -1.5879, -4.5107, -11.0845, -10.3962, -1.4359, 1.2877, -10.2839, - 7.3742, -0.5569, -2.0485, 3.1028, -2.4692, -1.3837, 6.8744, - -9.4191, -5.9387], - [ -3.4660, 2.8480, -2.9857, 11.7783, 0.2220, -5.5934, 1.9793, - 6.1118, 1.9817, -7.6156, 8.2070, -6.6976, -4.8177, -5.4002, - 9.3291, -7.0860], - [ -0.7406, -0.6509, 3.1847, -1.6311, 5.8144, 12.0439, 12.9141, - 8.8778, -9.5971, 4.1847, 5.8781, 0.1364, -7.3096, -4.0822, - -9.0587, 5.3681]]) - -Note that in the example above, because we constrained the value of ``x.shape[0]`` in -``dynamic_shapes_example3``, the exported program is sound even though there is a -raw ``if`` statement. - -If you want to see why ``torch.export`` generated these constraints, you can -re-run the script with the environment variable ``TORCH_LOGS=dynamic,dynamo``, -or use ``torch._logging.set_logs``. - -.. code-block:: python - - import logging - torch._logging.set_logs(dynamic=logging.INFO, dynamo=logging.INFO) - exported_dynamic_shapes_example3 = export(dynamic_shapes_example3, (inp4, inp5), dynamic_shapes=dynamic_shapes3_fixed) - - # reset to previous values - torch._logging.set_logs(dynamic=logging.WARNING, dynamo=logging.WARNING) - -.. 
code-block:: bash - - [2023-10-12 11:24:01,657] [12/0] torch._dynamo.symbolic_convert: [INFO] Step 1: torchdynamo start tracing dynamic_shapes_example3 torch_export_nightly_tutorial.py:374 - [2023-10-12 11:24:01,658] [12/0] torch.fx.experimental.symbolic_shapes: [INFO] create_env - [2023-10-12 11:24:01,663] [12/0] torch.fx.experimental.symbolic_shapes: [INFO] create_symbol s0 = 8 for L['x'].size()[0] [2, 16] - [2023-10-12 11:24:01,665] [12/0] torch.fx.experimental.symbolic_shapes: [INFO] create_symbol s1 = 16 for L['x'].size()[1] [2, 9223372036854775806] - [2023-10-12 11:24:01,677] [12/0] torch.fx.experimental.symbolic_shapes: [INFO] create_symbol s2 = 16 for L['y'].size()[0] [2, 9223372036854775806] - [2023-10-12 11:24:01,680] [12/0] torch.fx.experimental.symbolic_shapes: [INFO] create_symbol s3 = 32 for L['y'].size()[1] [17, 9223372036854775806] - [2023-10-12 11:24:01,734] [12/0] torch.fx.experimental.symbolic_shapes: [INFO] eval Eq(s1, s2) [guard added] at torch_export_nightly_tutorial.py:376 in dynamic_shapes_example3 (_meta_registrations.py:1891 in meta_mm) - [2023-10-12 11:24:01,738] [12/0] torch._dynamo.symbolic_convert: [INFO] Step 1: torchdynamo done tracing dynamic_shapes_example3 (RETURN_VALUE) - [2023-10-12 11:24:01,743] [12/0] torch._dynamo.output_graph: [INFO] Step 2: calling compiler function dynamo_normalization_capturing_compiler - [2023-10-12 11:24:01,743] [12/0] torch._dynamo.output_graph: [INFO] Step 2: done compiler function dynamo_normalization_capturing_compiler - [2023-10-12 11:24:01,747] [12/0] torch.fx.experimental.symbolic_shapes: [INFO] produce_guards - [2023-10-12 11:24:01,839] torch._dynamo.eval_frame: [INFO] Summary of dimension constraints: - [2023-10-12 11:24:01,839] torch._dynamo.eval_frame: [INFO] Suggested fixes: - [2023-10-12 11:24:01,839] torch._dynamo.eval_frame: [INFO] - [2023-10-12 11:24:01,847] torch.fx.experimental.symbolic_shapes: [INFO] create_env - -We can view an ``ExportedProgram``'s constraints using the ``range_constraints`` and -``equality_constraints`` attributes. The logging above reveals what the symbols ``s0, s1, ...`` -represent. - -.. code-block:: python - - print(exported_dynamic_shapes_example3.range_constraints) - print(exported_dynamic_shapes_example3.equality_constraints) - -.. code-block:: bash - - {s0: RangeConstraint(min_val=2, max_val=16), s1: RangeConstraint(min_val=2, max_val=9223372036854775806), s2: RangeConstraint(min_val=2, max_val=9223372036854775806), s3: RangeConstraint(min_val=17, max_val=9223372036854775806)} - [(InputDim(input_name='arg0_1', dim=1), InputDim(input_name='arg1_1', dim=0))] - -Custom Ops ----------- - -``torch.export`` can export PyTorch programs with custom operators. - -Currently, the steps to register a custom op for use by ``torch.export`` are: - -- Define the custom op using ``torch.library`` (`reference `__) - as with any other custom op - -.. code-block:: python - - from torch.library import Library, impl - - m = Library("my_custom_library", "DEF") - - m.define("custom_op(Tensor input) -> Tensor") - - @impl(m, "custom_op", "CompositeExplicitAutograd") - def custom_op(x): - print("custom_op called!") - return torch.relu(x) - -- Define a ``"Meta"`` implementation of the custom op that returns an empty - tensor with the same shape as the expected output - -.. code-block:: python - - @impl(m, "custom_op", "Meta") - def custom_op_meta(x): - return torch.empty_like(x) - -- Call the custom op from the code you want to export using ``torch.ops`` - -.. 
code-block:: python - - def custom_op_example(x): - x = torch.sin(x) - x = torch.ops.my_custom_library.custom_op(x) - x = torch.cos(x) - return x - -- Export the code as before - -.. code-block:: python - - exported_custom_op_example = export(custom_op_example, (torch.randn(3, 3),)) - exported_custom_op_example.graph_module.print_readable() - print(exported_custom_op_example(torch.randn(3, 3))) - -.. code-block:: bash - - custom_op called! - tensor([[0.5947, 0.8062, 0.6231], - [1.0000, 1.0000, 0.6615], - [0.5412, 1.0000, 1.0000]]) - -Note in the above outputs that the custom op is included in the exported graph. -And when we call the exported graph as a function, the original custom op is called, -as evidenced by the ``print`` call. - -If you have a custom operator implemented in C++, please refer to -`this document `__ -to make it compatible with ``torch.export``. - -Decompositions --------------- - -The graph produced by ``torch.export`` by default returns a graph containing -only functional ATen operators. This functional ATen operator set (or "opset") contains around 2000 -operators, all of which are functional, that is, they do not -mutate or alias inputs. You can find a list of all ATen operators -`here `__ -and you can inspect if an operator is functional by checking -``op._schema.is_mutable``, for example: - -.. code-block:: python - - print(torch.ops.aten.add.Tensor._schema.is_mutable) - print(torch.ops.aten.add_.Tensor._schema.is_mutable) - -.. code-block:: bash - - False - True - -By default, the environment in which you want to run the exported graph -should support all ~2000 of these operators. -However, you can use the following API on the exported program -if your specific environment is only able to support a subset of -the ~2000 operators. - -.. code-block:: python - - def run_decompositions( - self: ExportedProgram, - decomposition_table: Optional[Dict[torch._ops.OperatorBase, Callable]] - ) -> ExportedProgram - -``run_decompositions`` takes in a decomposition table, which is a mapping of -operators to a function specifying how to reduce, or decompose, that operator -into an equivalent sequence of other ATen operators. - -The default decomposition table for ``run_decompositions`` is the -`Core ATen decomposition table `__ -which will decompose the all ATen operators to the -`Core ATen Operator Set `__ -which consists of only ~180 operators. - -.. code-block:: python - - class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(3, 4) - - def forward(self, x): - return self.linear(x) - - ep = export(M(), (torch.randn(2, 3),)) - print(ep.graph) - - core_ir_ep = ep.run_decompositions() - print(core_ir_ep.graph) - -.. 
code-block:: bash - - graph(): - %arg0_1 : [num_users=1] = placeholder[target=arg0_1] - %arg1_1 : [num_users=1] = placeholder[target=arg1_1] - %arg2_1 : [num_users=1] = placeholder[target=arg2_1] - %t : [num_users=1] = call_function[target=torch.ops.aten.t.default](args = (%arg0_1,), kwargs = {}) - %addmm : [num_users=1] = call_function[target=torch.ops.aten.addmm.default](args = (%arg1_1, %arg2_1, %t), kwargs = {}) - return (addmm,) - graph(): - %arg0_1 : [num_users=1] = placeholder[target=arg0_1] - %arg1_1 : [num_users=1] = placeholder[target=arg1_1] - %arg2_1 : [num_users=1] = placeholder[target=arg2_1] - %permute : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg0_1, [1, 0]), kwargs = {}) - %addmm : [num_users=1] = call_function[target=torch.ops.aten.addmm.default](args = (%arg1_1, %arg2_1, %permute), kwargs = {}) - return (addmm,) - -Notice that after running ``run_decompositions`` the -``torch.ops.aten.t.default`` operator, which is not part of the Core ATen -Opset, has been replaced with ``torch.ops.aten.permute.default`` which is part -of the Core ATen Opset. - -Most ATen operators already have decompositions, which are located -`here `__. -If you would like to use some of these existing decomposition functions, -you can pass in a list of operators you would like to decompose to the -`get_decompositions `__ -function, which will return a decomposition table using existing -decomposition implementations. - -.. code-block:: python - - class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(3, 4) - - def forward(self, x): - return self.linear(x) - - ep = export(M(), (torch.randn(2, 3),)) - print(ep.graph) - - from torch._decomp import get_decompositions - decomp_table = get_decompositions([torch.ops.aten.t.default, torch.ops.aten.transpose.int]) - core_ir_ep = ep.run_decompositions(decomp_table) - print(core_ir_ep.graph) - -.. code-block:: bash - - graph(): - %arg0_1 : [num_users=1] = placeholder[target=arg0_1] - %arg1_1 : [num_users=1] = placeholder[target=arg1_1] - %arg2_1 : [num_users=1] = placeholder[target=arg2_1] - %t : [num_users=1] = call_function[target=torch.ops.aten.t.default](args = (%arg0_1,), kwargs = {}) - %addmm : [num_users=1] = call_function[target=torch.ops.aten.addmm.default](args = (%arg1_1, %arg2_1, %t), kwargs = {}) - return (addmm,) - graph(): - %arg0_1 : [num_users=1] = placeholder[target=arg0_1] - %arg1_1 : [num_users=1] = placeholder[target=arg1_1] - %arg2_1 : [num_users=1] = placeholder[target=arg2_1] - %permute : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg0_1, [1, 0]), kwargs = {}) - %addmm : [num_users=1] = call_function[target=torch.ops.aten.addmm.default](args = (%arg1_1, %arg2_1, %permute), kwargs = {}) - return (addmm,) - -If there is no existing decomposition function for an ATen operator that you would -like to decompose, feel free to send a pull request into PyTorch -implementing the decomposition! - -ExportDB --------- - -``torch.export`` will only ever export a single computation graph from a PyTorch program. Because of this requirement, -there will be Python or PyTorch features that are not compatible with ``torch.export``, which will require users to -rewrite parts of their model code. We have seen examples of this earlier in the tutorial -- for example, rewriting -if-statements using ``cond``. - -`ExportDB `__ is the standard reference that documents -supported and unsupported Python/PyTorch features for ``torch.export``. 
It is essentially a list a program samples, each -of which represents the usage of one particular Python/PyTorch feature and its interaction with ``torch.export``. -Examples are also tagged by category so that they can be more easily searched. - -For example, let's use ExportDB to get a better understanding of how the predicate works in the ``cond`` operator. -We can look at the example called ``cond_predicate``, which has a ``torch.cond`` tag. The example code looks like: - -.. code-block:: python - - def cond_predicate(x): - """ - The conditional statement (aka predicate) passed to ``cond()`` must be one of the following: - - torch.Tensor with a single element - - boolean expression - NOTE: If the `pred` is test on a dim with batch size < 2, it will be specialized. - """ - pred = x.dim() > 2 and x.shape[2] > 10 - return cond(pred, lambda x: x.cos(), lambda y: y.sin(), [x]) - -More generally, ExportDB can be used as a reference when one of the following occurs: - -1. Before attempting ``torch.export``, you know ahead of time that your model uses some tricky Python/PyTorch features - and you want to know if ``torch.export`` covers that feature. -2. When attempting ``torch.export``, there is a failure and it's unclear how to work around it. - -ExportDB is not exhaustive, but is intended to cover all use cases found in typical PyTorch code. Feel free to reach -out if there is an important Python/PyTorch feature that should be added to ExportDB or supported by ``torch.export``. - -Conclusion ----------- - -We introduced ``torch.export``, the new PyTorch 2.X way to export single computation -graphs from PyTorch programs. In particular, we demonstrate several code modifications -and considerations (control flow ops, constraints, etc.) that need to be made in order to export a graph. + diff --git a/intermediate_source/torch_export_tutorial.py b/intermediate_source/torch_export_tutorial.py index 98016833c4a..3ca6d09a52f 100644 --- a/intermediate_source/torch_export_tutorial.py +++ b/intermediate_source/torch_export_tutorial.py @@ -3,7 +3,7 @@ """ torch.export Tutorial =================================================== -**Author:** William Wen, Zhengxu Chen, Angela Yi +**Author:** William Wen, Zhengxu Chen, Angela Yi, Pian Pawakapan """ ###################################################################### @@ -11,7 +11,7 @@ # .. warning:: # # ``torch.export`` and its related features are in prototype status and are subject to backwards compatibility -# breaking changes. This tutorial provides a snapshot of ``torch.export`` usage as of PyTorch 2.3. +# breaking changes. This tutorial provides a snapshot of ``torch.export`` usage as of PyTorch 2.5. # # :func:`torch.export` is the PyTorch 2.X way to export PyTorch models into # standardized model representations, intended @@ -45,17 +45,18 @@ # .. code-block:: python # # export( -# f: Callable, +# mod: torch.nn.Module, # args: Tuple[Any, ...], # kwargs: Optional[Dict[str, Any]] = None, # *, # dynamic_shapes: Optional[Dict[str, Dict[int, Dim]]] = None # ) -> ExportedProgram # -# ``torch.export.export()`` traces the tensor computation graph from calling ``f(*args, **kwargs)`` +# ``torch.export.export()`` traces the tensor computation graph from calling ``mod(*args, **kwargs)`` # and wraps it in an ``ExportedProgram``, which can be serialized or executed later with -# different inputs. Note that while the output ``ExportedGraph`` is callable and can be -# called in the same way as the original input callable, it is not a ``torch.nn.Module``. 
+# different inputs. To execute the ``ExportedProgram`` we can call ``.module()`` +# on it to return a ``torch.nn.Module`` which is callable, just like the +# original program. # We will detail the ``dynamic_shapes`` argument later in the tutorial. import torch @@ -80,30 +81,15 @@ def forward(self, x, y): # # The ``graph`` attribute is an `FX graph `__ # traced from the function we exported, that is, the computation graph of all PyTorch operations. -# The FX graph has some important properties: +# The FX graph is in "ATen IR" meaning that it contains only "ATen-level" operations. # -# - The operations are "ATen-level" operations. -# - The graph is "functionalized", meaning that no operations are mutations. +# The ``graph_signature`` attribute gives a more detailed description of the +# input and output nodes in the exported graph, describing which ones are +# parameters, buffers, user inputs, or user outputs. # -# The ``graph_module`` attribute is the ``GraphModule`` that wraps the ``graph`` attribute -# so that it can be ran as a ``torch.nn.Module``. +# The ``range_constraints`` attributes will be covered later. print(exported_mod) -print(exported_mod.graph_module) - -###################################################################### -# The printed code shows that FX graph only contains ATen-level ops (such as ``torch.ops.aten``) -# and that mutations were removed. For example, the mutating op ``torch.nn.functional.relu(..., inplace=True)`` -# is represented in the printed code by ``torch.ops.aten.relu.default``, which does not mutate. -# Future uses of input to the original mutating ``relu`` op are replaced by the additional new output -# of the replacement non-mutating ``relu`` op. -# -# Other attributes of interest in ``ExportedProgram`` include: -# -# - ``graph_signature`` -- the inputs, outputs, parameters, buffers, etc. of the exported graph. -# - ``range_constraints`` -- constraints, covered later - -print(exported_mod.graph_signature) ###################################################################### # See the ``torch.export`` `documentation `__ @@ -163,32 +149,16 @@ def forward(self, x): except Exception: tb.print_exc() -###################################################################### -# - unsupported Python language features (e.g. throwing exceptions, match statements) - -class Bad4(torch.nn.Module): - def forward(self, x): - try: - x = x + 1 - raise RuntimeError("bad") - except: - x = x + 2 - return x - -try: - export(Bad4(), (torch.randn(3, 3),)) -except Exception: - tb.print_exc() ###################################################################### # Non-Strict Export # ----------------- # -# To trace the program, ``torch.export`` uses TorchDynamo, a byte code analysis -# engine, to symbolically analyze the Python code and build a graph based on the -# results. This analysis allows ``torch.export`` to provide stronger guarantees -# about safety, but not all Python code is supported, causing these graph -# breaks. +# To trace the program, ``torch.export`` uses TorchDynamo by default, a byte +# code analysis engine, to symbolically analyze the Python code and build a +# graph based on the results. This analysis allows ``torch.export`` to provide +# stronger guarantees about safety, but not all Python code is supported, +# causing these graph breaks. 
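+
+# (As an aside, one way to see where TorchDynamo would graph-break before attempting an
+# export is ``torch._dynamo.explain``. The snippet below is a minimal sketch using the
+# ``Bad3`` module defined above; the exact fields of the returned report may vary across
+# PyTorch versions.)
+
+import torch._dynamo
+explanation = torch._dynamo.explain(Bad3())(torch.randn(3, 3))
+print(explanation.graph_break_count, explanation.break_reasons)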
# # To address this issue, in PyTorch 2.3, we introduced a new mode of # exporting called non-strict mode, where we trace through the program using the @@ -197,16 +167,6 @@ def forward(self, x): # ``strict=False`` flag. # # Looking at some of the previous examples which resulted in graph breaks: -# -# - Accessing tensor data with ``.data`` now works correctly - -class Bad2(torch.nn.Module): - def forward(self, x): - x.data[0, 0] = 3 - return x - -bad2_nonstrict = export(Bad2(), (torch.randn(3, 3),), strict=False) -print(bad2_nonstrict.module()(torch.ones(3, 3))) ###################################################################### # - Calling unsupported functions (such as many built-in functions) traces @@ -223,22 +183,6 @@ def forward(self, x): print(bad3_nonstrict) print(bad3_nonstrict.module()(torch.ones(3, 3))) -###################################################################### -# - Unsupported Python language features (such as throwing exceptions, match -# statements) now also get traced through. - -class Bad4(torch.nn.Module): - def forward(self, x): - try: - x = x + 1 - raise RuntimeError("bad") - except: - x = x + 2 - return x - -bad4_nonstrict = export(Bad4(), (torch.randn(3, 3),), strict=False) -print(bad4_nonstrict.module()(torch.ones(3, 3))) - ###################################################################### # However, there are still some features that require rewrites to the original @@ -252,17 +196,16 @@ def forward(self, x): # But these need to be expressed using control flow ops. For example, # we can fix the control flow example above using the ``cond`` op, like so: -from functorch.experimental.control_flow import cond - class Bad1Fixed(torch.nn.Module): def forward(self, x): def true_fn(x): return torch.sin(x) def false_fn(x): return torch.cos(x) - return cond(x.sum() > 0, true_fn, false_fn, [x]) + return torch.cond(x.sum() > 0, true_fn, false_fn, [x]) exported_bad1_fixed = export(Bad1Fixed(), (torch.randn(3, 3),)) +print(exported_bad1_fixed) print(exported_bad1_fixed.module()(torch.ones(3, 3))) print(exported_bad1_fixed.module()(-torch.ones(3, 3))) @@ -280,294 +223,589 @@ def false_fn(x): # For more details about ``cond``, check out the `cond documentation `__. ###################################################################### -# .. -# [NOTE] map is not documented at the moment -# We can also use ``map``, which applies a function across the first dimension -# of the first tensor argument. -# -# from functorch.experimental.control_flow import map -# -# def map_example(xs): -# def map_fn(x, const): -# def true_fn(x): -# return x + const -# def false_fn(x): -# return x - const -# return control_flow.cond(x.sum() > 0, true_fn, false_fn, [x]) -# return control_flow.map(map_fn, xs, torch.tensor([2.0])) -# -# exported_map_example= export(map_example, (torch.randn(4, 3),)) -# inp = torch.cat((torch.ones(2, 3), -torch.ones(2, 3))) -# print(exported_map_example(inp)) +# We can also use ``map``, which applies a function across the first dimension +# of the first tensor argument. 
+ +from torch._higher_order_ops.map import map as torch_map + +class MapModule(torch.nn.Module): + def forward(self, xs, y, z): + def body(x, y, z): + return x + y + z + + return torch_map(body, xs, y, z) + +inps = (torch.ones(6, 4), torch.tensor(5), torch.tensor(4)) +exported_map_example = export(MapModule(), inps) +print(exported_map_example) +print(exported_map_example.module()(*inps)) + +###################################################################### +# Other control flow ops include ``while_loop``, ``associative_scan``, and +# ``scan``. For more documentation on each operator, please refer to +# `this page `__. ###################################################################### # Constraints/Dynamic Shapes # -------------------------- # -# Ops can have different specializations/behaviors for different tensor shapes, so by default, -# ``torch.export`` requires inputs to ``ExportedProgram`` to have the same shape as the respective -# example inputs given to the initial ``torch.export.export()`` call. -# If we try to run the ``ExportedProgram`` in the example below with a tensor -# with a different shape, we get an error: +# This section covers dynamic behavior and representation of exported programs. Dynamic behavior is +# subjective to the particular model being exported, so for the most part of this tutorial, we'll focus +# on this particular toy model (with the resulting tensor shapes annotated): -class MyModule2(torch.nn.Module): +class DynamicModel(torch.nn.Module): def __init__(self): super().__init__() - self.lin = torch.nn.Linear(100, 10) - - def forward(self, x, y): - return torch.nn.functional.relu(self.lin(x + y), inplace=True) - -mod2 = MyModule2() -exported_mod2 = export(mod2, (torch.randn(8, 100), torch.randn(8, 100))) - + self.l = torch.nn.Linear(5, 3) + + def forward( + self, + w: torch.Tensor, # [6, 5] + x: torch.Tensor, # [4] + y: torch.Tensor, # [8, 4] + z: torch.Tensor, # [32] + ): + x0 = x + y # [8, 4] + x1 = self.l(w) # [6, 3] + x2 = x0.flatten() # [32] + x3 = x2 + z # [32] + return x1, x3 + +###################################################################### +# By default, ``torch.export`` produces a static program. One consequence of this is that at runtime, +# the program won't work on inputs with different shapes, even if they're valid in eager mode. + +w = torch.randn(6, 5) +x = torch.randn(4) +y = torch.randn(8, 4) +z = torch.randn(32) +model = DynamicModel() +ep = export(model, (w, x, y, z)) +model(w, x, torch.randn(3, 4), torch.randn(12)) try: - exported_mod2.module()(torch.randn(10, 100), torch.randn(10, 100)) + ep.module()(w, x, torch.randn(3, 4), torch.randn(12)) except Exception: tb.print_exc() ###################################################################### -# We can relax this constraint using the ``dynamic_shapes`` argument of -# ``torch.export.export()``, which allows us to specify, using ``torch.export.Dim`` -# (`documentation `__), -# which dimensions of the input tensors are dynamic. +# Basic concepts: symbols and guards +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# To enable dynamism, ``export()`` provides a ``dynamic_shapes`` argument. The easiest way to work with +# dynamic shapes is using ``Dim.AUTO`` and looking at the program that's returned. 
Dynamic behavior is specified
+# at an input dimension-level; for each input we can specify a tuple of values:
+
+from torch.export.dynamic_shapes import Dim
+
+dynamic_shapes = {
+    "w": (Dim.AUTO, Dim.AUTO),
+    "x": (Dim.AUTO,),
+    "y": (Dim.AUTO, Dim.AUTO),
+    "z": (Dim.AUTO,),
+}
+ep = export(model, (w, x, y, z), dynamic_shapes=dynamic_shapes)
+
+######################################################################
+# Before we look at the program that's produced, let's understand what specifying ``dynamic_shapes`` entails,
+# and how that interacts with export. For every input dimension where a ``Dim`` object is specified, a symbol is
+# `allocated `_,
+# taking on a range of ``[2, inf]`` (why not ``[0, inf]`` or ``[1, inf]``? we'll explain later in the
+# 0/1 specialization section).
+#
+# Export then runs model tracing, looking at each operation that's performed by the model. Each individual operation can emit
+# what's called "guards"; basically boolean conditions that are required to be true for the program to be valid.
+# When guards involve symbols allocated for input dimensions, the program contains restrictions on what input shapes are valid;
+# i.e. the program's dynamic behavior. The symbolic shapes subsystem is the part responsible for taking in all the emitted guards
+# and producing a final program representation that adheres to all of these guards. Before we see this "final representation" in
+# an ``ExportedProgram``, let's look at the guards emitted by the toy model we're tracing.
+#
+# Here, each forward input tensor is annotated with the symbol allocated at the start of tracing:
+
+class DynamicModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.l = torch.nn.Linear(5, 3)
+
+    def forward(
+        self,
+        w: torch.Tensor, # [s0, s1]
+        x: torch.Tensor, # [s2]
+        y: torch.Tensor, # [s3, s4]
+        z: torch.Tensor, # [s5]
+    ):
+        x0 = x + y # guard: s2 == s4
+        x1 = self.l(w) # guard: s1 == 5
+        x2 = x0.flatten() # no guard added here
+        x3 = x2 + z # guard: s3 * s4 == s5
+        return x1, x3
+
+######################################################################
+# Let's understand each of the operations and the emitted guards:
 #
-# For each tensor argument of the input callable, we can specify a mapping from the dimension
-# to a ``torch.export.Dim``.
-# A ``torch.export.Dim`` is essentially a named symbolic integer with optional
-# minimum and maximum bounds.
+# - ``x0 = x + y``: This is an element-wise add with broadcasting, since ``x`` is a 1-d tensor and ``y`` a 2-d tensor. ``x`` is broadcasted along the last dimension of ``y``, emitting the guard ``s2 == s4``.
+# - ``x1 = self.l(w)``: Calling ``nn.Linear()`` performs a matrix multiplication with model parameters. In export, parameters, buffers, and constants are considered program state, which is treated as static, and so this is a matmul between a dynamic input (``w: [s0, s1]``), and a statically-shaped tensor. This emits the guard ``s1 == 5``.
+# - ``x2 = x0.flatten()``: This call actually doesn't emit any guards! (at least none relevant to input shapes)
+# - ``x3 = x2 + z``: ``x2`` has shape ``[s3*s4]`` after flattening, and this element-wise add emits ``s3 * s4 == s5``.
 #
-# Then, the format of ``torch.export.export()``'s ``dynamic_shapes`` argument is a mapping
-# from the input callable's tensor argument names, to dimension --> dim mappings as described above.
-# If there is no ``torch.export.Dim`` given to a tensor argument's dimension, then that dimension is
-# assumed to be static.
+# Writing all of these guards down and summarizing is almost like a mathematical proof, which is what the symbolic shapes +# subsystem tries to do! In summary, we can conclude that the program must have the following input shapes to be valid: # -# The first argument of ``torch.export.Dim`` is the name for the symbolic integer, used for debugging. -# Then we can specify an optional minimum and maximum bound (inclusive). Below, we show a usage example. +# - ``w: [s0, 5]`` +# - ``x: [s2]`` +# - ``y: [s3, s2]`` +# - ``z: [s2*s3]`` # -# In the example below, our input -# ``inp1`` has an unconstrained first dimension, but the size of the second -# dimension must be in the interval [4, 18]. +# And when we do finally print out the exported program to see our result, those shapes are what we see annotated on the +# corresponding inputs: -from torch.export import Dim +print(ep) -inp1 = torch.randn(10, 10, 2) +###################################################################### +# Another feature to notice is the range_constraints field above, which contains a valid range for each symbol. This isn't +# so interesting currently, since this export call doesn't emit any guards related to symbol bounds and each base symbol has +# a generic bound, but this will come up later. +# +# So far, because we've been exporting this toy model, this experience has not been representative of how hard +# it typically is to debug dynamic shapes guards & issues. In most cases it isn't obvious what guards are being emitted, +# and which operations and parts of user code are responsible. For this toy model we pinpoint the exact lines, and the guards +# are rather intuitive. +# +# In more complicated cases, a helpful first step is always to enable verbose logging. This can be done either with the environment +# variable ``TORCH_LOGS="+dynamic"``, or interactively with ``torch._logging.set_logs(dynamic=10)``: -class DynamicShapesExample1(torch.nn.Module): - def forward(self, x): - x = x[:, 2:] - return torch.relu(x) +torch._logging.set_logs(dynamic=10) +ep = export(model, (w, x, y, z), dynamic_shapes=dynamic_shapes) -inp1_dim0 = Dim("inp1_dim0") -inp1_dim1 = Dim("inp1_dim1", min=4, max=18) -dynamic_shapes1 = { - "x": {0: inp1_dim0, 1: inp1_dim1}, -} +###################################################################### +# This spits out quite a handful, even with this simple toy model. The log lines here have been cut short at front and end +# to ignore unnecessary info, but looking through the logs we can see the lines relevant to what we described above; +# e.g. 
the allocation of symbols: -exported_dynamic_shapes_example1 = export(DynamicShapesExample1(), (inp1,), dynamic_shapes=dynamic_shapes1) +""" +create_symbol s0 = 6 for L['w'].size()[0] [2, int_oo] (_dynamo/variables/builder.py:2841 in ) +create_symbol s1 = 5 for L['w'].size()[1] [2, int_oo] (_dynamo/variables/builder.py:2841 in ) +runtime_assert True == True [statically known] +create_symbol s2 = 4 for L['x'].size()[0] [2, int_oo] (_dynamo/variables/builder.py:2841 in ) +create_symbol s3 = 8 for L['y'].size()[0] [2, int_oo] (_dynamo/variables/builder.py:2841 in ) +create_symbol s4 = 4 for L['y'].size()[1] [2, int_oo] (_dynamo/variables/builder.py:2841 in ) +create_symbol s5 = 32 for L['z'].size()[0] [2, int_oo] (_dynamo/variables/builder.py:2841 in ) +""" -print(exported_dynamic_shapes_example1.module()(torch.randn(5, 5, 2))) +###################################################################### +# The lines with `create_symbol` show when a new symbol has been allocated, and the logs also identify the tensor variable names +# and dimensions they've been allocated for. In other lines we can also see the guards emitted: +""" +runtime_assert Eq(s2, s4) [guard added] x0 = x + y # output shape: [8, 4] # dynamic_shapes_tutorial.py:16 in forward (_subclasses/fake_impls.py:845 in infer_size), for more info run with TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="Eq(s2, s4)" +runtime_assert Eq(s1, 5) [guard added] x1 = self.l(w) # [6, 3] # dynamic_shapes_tutorial.py:17 in forward (_meta_registrations.py:2127 in meta_mm), for more info run with TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="Eq(s1, 5)" +runtime_assert Eq(s2*s3, s5) [guard added] x3 = x2 + z # [32] # dynamic_shapes_tutorial.py:19 in forward (_subclasses/fake_impls.py:845 in infer_size), for more info run with TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="Eq(s2*s3, s5)" +""" + +###################################################################### +# Next to the ``[guard added]`` messages, we also see the responsible user lines of code - luckily here the model is simple enough. +# In many real-world cases it's not so straightforward: high-level torch operations can have complicated fake-kernel implementations +# or operator decompositions that complicate where and what guards are emitted. In such cases the best way to dig deeper and investigate +# is to follow the logs' suggestion, and re-run with environment variable ``TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="..."``, to further +# attribute the guard of interest. +# +# ``Dim.AUTO`` is just one of the available options for interacting with ``dynamic_shapes``; as of writing this 2 other options are available: +# ``Dim.DYNAMIC``, and ``Dim.STATIC``. ``Dim.STATIC`` simply marks a dimension static, while ``Dim.DYNAMIC`` is similar to ``Dim.AUTO`` in all +# ways except one: it raises an error when specializing to a constant; this is designed to maintain dynamism. See for example what happens when a +# static guard is emitted on a dynamically-marked dimension: + +dynamic_shapes["w"] = (Dim.AUTO, Dim.DYNAMIC) try: - exported_dynamic_shapes_example1.module()(torch.randn(8, 1, 2)) + export(model, (w, x, y, z), dynamic_shapes=dynamic_shapes) except Exception: tb.print_exc() +###################################################################### +# Static guards also aren't always inherent to the model; they can also come from user specifications. In fact, a common pitfall leading to shape +# specializations is when the user specifies conflicting markers for equivalent dimensions; one dynamic and another static. 
The same error type is +# raised when this is the case for ``x.shape[0]`` and ``y.shape[1]``: + +dynamic_shapes["w"] = (Dim.AUTO, Dim.AUTO) +dynamic_shapes["x"] = (Dim.STATIC,) +dynamic_shapes["y"] = (Dim.AUTO, Dim.DYNAMIC) try: - exported_dynamic_shapes_example1.module()(torch.randn(8, 20, 2)) + export(model, (w, x, y, z), dynamic_shapes=dynamic_shapes) except Exception: tb.print_exc() +###################################################################### +# Here you might ask why export "specializes", i.e. why we resolve this static/dynamic conflict by going with the static route. The answer is because +# of the symbolic shapes system described above, of symbols and guards. When ``x.shape[0]`` is marked static, we don't allocate a symbol, and compile +# treating this shape as a concrete integer 4. A symbol is allocated for ``y.shape[1]``, and so we finally emit the guard ``s3 == 4``, leading to +# specialization. +# +# One feature of export is that during tracing, statements like asserts, ``torch._check()``, and ``if/else`` conditions will also emit guards. +# See what happens when we augment the existing model with such statements: + +class DynamicModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.l = torch.nn.Linear(5, 3) + + def forward(self, w, x, y, z): + assert w.shape[0] <= 512 + torch._check(x.shape[0] >= 4) + if w.shape[0] == x.shape[0] + 2: + x0 = x + y + x1 = self.l(w) + x2 = x0.flatten() + x3 = x2 + z + return x1, x3 + else: + return w + +dynamic_shapes = { + "w": (Dim.AUTO, Dim.AUTO), + "x": (Dim.AUTO,), + "y": (Dim.AUTO, Dim.AUTO), + "z": (Dim.AUTO,), +} try: - exported_dynamic_shapes_example1.module()(torch.randn(8, 8, 3)) + ep = export(DynamicModel(), (w, x, y, z), dynamic_shapes=dynamic_shapes) except Exception: tb.print_exc() ###################################################################### -# Note that if our example inputs to ``torch.export`` do not satisfy the constraints -# given by ``dynamic_shapes``, then we get an error. +# Each of these statements emits an additional guard, and the exported program shows the changes; ``s0`` is eliminated in favor of ``s2 + 2``, +# and ``s2`` now contains lower and upper bounds, reflected in ``range_constraints``. +# +# For the if/else condition, you might ask why the True branch was taken, and why it wasn't the ``w.shape[0] != x.shape[0] + 2`` guard that +# got emitted from tracing. The answer is that export is guided by the sample inputs provided by tracing, and specializes on the branches taken. +# If different sample input shapes were provided that fail the ``if`` condition, export would trace and emit guards corresponding to the ``else`` branch. +# Additionally, you might ask why we traced only the ``if`` branch, and if it's possible to maintain control-flow in your program and keep both branches +# alive. For that, refer to rewriting your model code following the ``Control Flow Ops`` section above. -inp1_dim1_bad = Dim("inp1_dim1_bad", min=11, max=18) -dynamic_shapes1_bad = { - "x": {0: inp1_dim0, 1: inp1_dim1_bad}, -} +###################################################################### +# 0/1 specialization +# ^^^^^^^^^^^^^^^^^^ +# +# Since we're talking about guards and specializations, it's a good time to talk about the 0/1 specialization issue we brought up earlier. +# The bottom line is that export will specialize on sample input dimensions with value 0 or 1, because these shapes have trace-time properties that +# don't generalize to other shapes. 
For example, size 1 tensors can broadcast while other sizes fail; and size 0 ... . This just means that you should +# specify 0/1 sample inputs when you'd like your program to hardcode them, and non-0/1 sample inputs when dynamic behavior is desirable. See what happens +# at runtime when we export this linear layer: +ep = export( + torch.nn.Linear(4, 3), + (torch.randn(1, 4),), + dynamic_shapes={ + "input": (Dim.AUTO, Dim.STATIC), + }, +) try: - export(DynamicShapesExample1(), (inp1,), dynamic_shapes=dynamic_shapes1_bad) + ep.module()(torch.randn(2, 4)) except Exception: tb.print_exc() ###################################################################### -# We can enforce that equalities between dimensions of different tensors -# by using the same ``torch.export.Dim`` object, for example, in matrix multiplication: - -inp2 = torch.randn(4, 8) -inp3 = torch.randn(8, 2) +# Named Dims +# ^^^^^^^^^^ +# +# So far we've only been talking about 3 ways to specify dynamic shapes: ``Dim.AUTO``, ``Dim.DYNAMIC``, and ``Dim.STATIC``. The attraction of these is the +# low-friction user experience; all the guards emitted during model tracing are adhered to, and dynamic behavior like min/max ranges, relations, and static/dynamic +# dimensions are automatically figured out underneath export. The dynamic shapes subsystem essentially acts as a "discovery" process, summarizing these guards +# and presenting what export believes is the overall dynamic behavior of the program. The drawback of this design appears once the user has stronger expectations or +# beliefs about the dynamic behavior of these models - maybe there is a strong desire on dynamism and specializations on particular dimensions are to be avoided at +# all costs, or maybe we just want to catch changes in dynamic behavior with changes to the original model code, or possibly underlying decompositions or meta-kernels. +# These changes won't be detected and the ``export()`` call will most likely succeed, unless tests are in place that check the resulting ``ExportedProgram`` representation. +# +# For such cases, our stance is to recommend the "traditional" way of specifying dynamic shapes, which longer-term users of export might be familiar with: named ``Dims``: -class DynamicShapesExample2(torch.nn.Module): - def forward(self, x, y): - return x @ y +dx = Dim("dx", min=4, max=256) +dh = Dim("dh", max=512) +dynamic_shapes = { + "x": (dx, None), + "y": (2 * dx, dh), +} -inp2_dim0 = Dim("inp2_dim0") -inner_dim = Dim("inner_dim") -inp3_dim1 = Dim("inp3_dim1") +###################################################################### +# This style of dynamic shapes allows the user to specify what symbols are allocated for input dimensions, min/max bounds on those symbols, and places restrictions on the +# dynamic behavior of the ``ExportedProgram`` produced; ``ConstraintViolation`` errors will be raised if model tracing emits guards that conflict with the relations or static/dynamic +# specifications given. For example, in the above specification, the following is asserted: +# +# - ``x.shape[0]`` is to have range ``[4, 256]``, and related to ``y.shape[0]`` by ``y.shape[0] == 2 * x.shape[0]``. +# - ``x.shape[1]`` is static. +# - ``y.shape[1]`` has range ``[2, 512]``, and is unrelated to any other dimension. +# +# In this design, we allow relations between dimensions to be specified with univariate linear expressions: ``A * dim + B`` can be specified for any dimension. 
This allows users +# to specify more complex constraints like integer divisibility for dynamic dimensions: -dynamic_shapes2 = { - "x": {0: inp2_dim0, 1: inner_dim}, - "y": {0: inner_dim, 1: inp3_dim1}, +dx = Dim("dx", min=4, max=512) +dynamic_shapes = { + "x": (4 * dx, None) # x.shape[0] has range [16, 2048], and is divisible by 4. } -exported_dynamic_shapes_example2 = export(DynamicShapesExample2(), (inp2, inp3), dynamic_shapes=dynamic_shapes2) +###################################################################### +# Constraint violations, suggested fixes +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# One common issue with this specification style (before ``Dim.AUTO`` was introduced), is that the specification would often be mismatched with what was produced by model tracing. +# That would lead to ``ConstraintViolation`` errors and export suggested fixes - see for example with this model & specification, where the model inherently requires equality between +# dimensions 0 of ``x`` and ``y``, and requires dimension 1 to be static. -print(exported_dynamic_shapes_example2.module()(torch.randn(2, 16), torch.randn(16, 4))) +class Foo(torch.nn.Module): + def forward(self, x, y): + w = x + y + return w + torch.ones(4) +dx, dy, d1 = torch.export.dims("dx", "dy", "d1") try: - exported_dynamic_shapes_example2.module()(torch.randn(4, 8), torch.randn(4, 2)) + ep = export( + Foo(), + (torch.randn(6, 4), torch.randn(6, 4)), + dynamic_shapes={ + "x": (dx, d1), + "y": (dy, d1), + }, + ) except Exception: tb.print_exc() ###################################################################### -# We can also describe one dimension in terms of other. There are some -# restrictions to how detailed we can specify one dimension in terms of another, -# but generally, those in the form of ``A * Dim + B`` should work. - -class DerivedDimExample1(torch.nn.Module): - def forward(self, x, y): - return x + y[1:] +# The expectation with suggested fixes is that the user can interactively copy-paste the changes into their dynamic shapes specification, and successfully export afterwards. +# +# Lastly, there's couple nice-to-knows about the options for specification: +# +# - ``None`` is a good option for static behavior: +# - ``dynamic_shapes=None`` (default) exports with the entire model being static. +# - specifying ``None`` at an input-level exports with all tensor dimensions static, and is also required for non-tensor inputs. +# - specifying ``None`` at a dimension-level specializes that dimension, though this is deprecated in favor of ``Dim.STATIC``. +# - specifying per-dimension integer values also produces static behavior, and will additionally check that the provided sample input matches the specification. +# +# These options are combined in the inputs & dynamic shapes spec below: -foo = DerivedDimExample1() +inputs = ( + torch.randn(4, 4), + torch.randn(3, 3), + 16, + False, +) +dynamic_shapes = { + "tensor_0": (Dim.AUTO, None), + "tensor_1": None, + "int_val": None, + "bool_val": None, +} -x, y = torch.randn(5), torch.randn(6) -dimx = torch.export.Dim("dimx", min=3, max=6) -dimy = dimx + 1 -derived_dynamic_shapes1 = ({0: dimx}, {0: dimy}) +###################################################################### +# Data-dependent errors +# --------------------- +# +# While trying to export models, you have may have encountered errors like "Could not guard on data-dependent expression", or Could not extract specialized integer from data-dependent expression". 
+# These errors exist because ``torch.export()`` compiles programs using FakeTensors, which symbolically represent their real tensor counterparts. While these have equivalent symbolic properties +# (e.g. sizes, strides, dtypes), they diverge in that FakeTensors do not contain any data values. While this avoids unnecessary memory usage and expensive computation, it does mean that export may be +# unable to out-of-the-box compile parts of user code where compilation relies on data values. In short, if the compiler requires a concrete, data-dependent value in order to proceed, it will error out, +# complaining that the value is not available. +# +# Data-dependent values appear in many places, and common sources are calls like ``item()``, ``tolist()``, or ``torch.unbind()`` that extract scalar values from tensors. +# How are these values represented in the exported program? In the `Constraints/Dynamic Shapes `_ +# section, we talked about allocating symbols to represent dynamic input dimensions. +# The same happens here: we allocate symbols for every data-dependent value that appears in the program. The important distinction is that these are "unbacked" symbols, +# in contrast to the "backed" symbols allocated for input dimensions. The `"backed/unbacked" `_ +# nomenclature refers to the presence/absence of a "hint" for the symbol: a concrete value backing the symbol, that can inform the compiler on how to proceed. +# +# In the input shape symbol case (backed symbols), these hints are simply the sample input shapes provided, which explains why control-flow branching is determined by the sample input properties. +# For data-dependent values, the symbols are taken from FakeTensor "data" during tracing, and so the compiler doesn't know the actual values (hints) that these symbols would take on. +# +# Let's see how these show up in exported programs: -derived_dim_example1 = export(foo, (x, y), dynamic_shapes=derived_dynamic_shapes1) +class Foo(torch.nn.Module): + def forward(self, x, y): + a = x.item() + b = y.tolist() + return b + [a] -print(derived_dim_example1.module()(torch.randn(4), torch.randn(5))) +inps = ( + torch.tensor(1), + torch.tensor([2, 3]), +) +ep = export(Foo(), inps) +print(ep) -try: - derived_dim_example1.module()(torch.randn(4), torch.randn(6)) -except Exception: - tb.print_exc() +###################################################################### +# The result is that 3 unbacked symbols (notice they're prefixed with "u", instead of the usual "s" for input shape/backed symbols) are allocated and returned: +# 1 for the ``item()`` call, and 1 for each of the elements of ``y`` with the ``tolist()`` call. +# Note from the range constraints field that these take on ranges of ``[-int_oo, int_oo]``, not the default ``[0, int_oo]`` range allocated to input shape symbols, +# since we have no information on what these values are - they don't represent sizes, so don't necessarily have positive values. +###################################################################### +# Guards, torch._check() +# ^^^^^^^^^^^^^^^^^^^^^^ +# +# But the case above is easy to export, because the concrete values of these symbols aren't used in any compiler decision-making; all that's relevant is that the return values are unbacked symbols. 
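+
+# As a quick illustration of that point, here is a minimal sketch that reuses the ``ep``
+# exported just above. The new inputs keep the same shapes and dtypes as ``inps`` and only
+# the values change, so the call should go through: nothing in the graph was specialized
+# on the concrete values behind the unbacked symbols.
+
+print(ep.module()(torch.tensor(7), torch.tensor([8, 9])))  # the new values 8, 9 and 7 flow through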
+# The data-dependent errors highlighted in this section are cases like the following, where `data-dependent guards `_ are encountered: -class DerivedDimExample2(torch.nn.Module): - def forward(self, z, y): - return z[1:] + y[1::3] +class Foo(torch.nn.Module): + def forward(self, x, y): + a = x.item() + if a // 2 >= 5: + return y + 2 + else: + return y * 5 -foo = DerivedDimExample2() +###################################################################### +# Here we actually need the "hint", or the concrete value of ``a`` for the compiler to decide whether to trace ``return y + 2`` or ``return y * 5`` as the output. +# Because we trace with FakeTensors, we don't know what ``a // 2 >= 5`` actually evaluates to, and export errors out with "Could not guard on data-dependent expression ``u0 // 2 >= 5 (unhinted)``". +# +# So how do we export this toy model? Unlike ``torch.compile()``, export requires full graph compilation, and we can't just graph break on this. Here are some basic options: +# +# 1. Manual specialization: we could intervene by selecting the branch to trace, either by removing the control-flow code to contain only the specialized branch, or using ``torch.compiler.is_compiling()`` to guard what's traced at compile-time. +# 2. ``torch.cond()``: we could rewrite the control-flow code to use ``torch.cond()`` so we don't specialize on a branch. +# +# While these options are valid, they have their pitfalls. Option 1 sometimes requires drastic, invasive rewrites of the model code to specialize, and ``torch.cond()`` is not a comprehensive system for handling data-dependent errors. +# As we will see, there are data-dependent errors that do not involve control-flow. +# +# The generally recommended approach is to start with ``torch._check()`` calls. While these give the impression of purely being assert statements, they are in fact a system of informing the compiler on properties of symbols. +# While a ``torch._check()`` call does act as an assertion at runtime, when traced at compile-time, the checked expression is sent to the symbolic shapes subsystem for reasoning, and any symbol properties that follow from the expression being true, +# are stored as symbol properties (provided it's smart enough to infer those properties). So even if unbacked symbols don't have hints, if we're able to communicate properties that are generally true for these symbols via +# ``torch._check()`` calls, we can potentially bypass data-dependent guards without rewriting the offending model code. +# +# For example in the model above, inserting ``torch._check(a >= 10)`` would tell the compiler that ``y + 2`` can always be returned, and ``torch._check(a == 4)`` tells it to return ``y * 5``. +# See what happens when we re-export this model. -z, y = torch.randn(4), torch.randn(10) -dx = torch.export.Dim("dx", min=3, max=6) -dz = dx + 1 -dy = dx * 3 + 1 -derived_dynamic_shapes2 = ({0: dz}, {0: dy}) +class Foo(torch.nn.Module): + def forward(self, x, y): + a = x.item() + torch._check(a >= 10) + torch._check(a <= 60) + if a // 2 >= 5: + return y + 2 + else: + return y * 5 -derived_dim_example2 = export(foo, (z, y), dynamic_shapes=derived_dynamic_shapes2) -print(derived_dim_example2.module()(torch.randn(7), torch.randn(19))) +inps = ( + torch.tensor(32), + torch.randn(4), +) +ep = export(Foo(), inps) +print(ep) ###################################################################### -# We can actually use ``torch.export`` to guide us as to which ``dynamic_shapes`` constraints -# are necessary. 
We can do this by relaxing all constraints (recall that if we -# do not provide constraints for a dimension, the default behavior is to constrain -# to the exact shape value of the example input) and letting ``torch.export`` -# error out. - -inp4 = torch.randn(8, 16) -inp5 = torch.randn(16, 32) +# Export succeeds, and note from the range constraints field that ``u0`` takes on a range of ``[10, 60]``. +# +# So what information do ``torch._check()`` calls actually communicate? This varies as the symbolic shapes subsystem gets smarter, but at a fundamental level, these are generally true: +# +# 1. Equality with non-data-dependent expressions: ``torch._check()`` calls that communicate equalities like ``u0 == s0 + 4`` or ``u0 == 5``. +# 2. Range refinement: calls that provide lower or upper bounds for symbols, like the above. +# 3. Some basic reasoning around more complicated expressions: inserting ``torch._check(a < 4)`` will typically tell the compiler that ``a >= 4`` is false. Checks on complex expressions like ``torch._check(a ** 2 - 3 * a <= 10)`` will typically get you past identical guards. +# +# As mentioned previously, ``torch._check()`` calls have applicability outside of data-dependent control flow. For example, here's a model where ``torch._check()`` insertion +# prevails while manual specialization & ``torch.cond()`` do not: -class DynamicShapesExample3(torch.nn.Module): +class Foo(torch.nn.Module): def forward(self, x, y): - if x.shape[0] <= 16: - return x @ y[:, :16] - return y - -dynamic_shapes3 = { - "x": {i: Dim(f"inp4_dim{i}") for i in range(inp4.dim())}, - "y": {i: Dim(f"inp5_dim{i}") for i in range(inp5.dim())}, -} + a = x.item() + return y[a] +inps = ( + torch.tensor(32), + torch.randn(60), +) try: - export(DynamicShapesExample3(), (inp4, inp5), dynamic_shapes=dynamic_shapes3) + export(Foo(), inps) except Exception: tb.print_exc() ###################################################################### -# We can see that the error message gives us suggested fixes to our -# dynamic shape constraints. Let us follow those suggestions (exact -# suggestions may differ slightly): +# Here is a scenario where ``torch._check()`` insertion is required simply to prevent an operation from failing. The export call will fail with +# "Could not guard on data-dependent expression ``-u0 > 60``", implying that the compiler doesn't know if this is a valid indexing operation - +# if the value of ``x`` is out-of-bounds for ``y`` or not. Here, manual specialization is too prohibitive, and ``torch.cond()`` has no place. 
+# Instead, informing the compiler of ``u0``'s range is sufficient: -def suggested_fixes(): - inp4_dim1 = Dim('shared_dim') - # suggested fixes below - inp4_dim0 = Dim('inp4_dim0', max=16) - inp5_dim1 = Dim('inp5_dim1', min=17) - inp5_dim0 = inp4_dim1 - # end of suggested fixes - return { - "x": {0: inp4_dim0, 1: inp4_dim1}, - "y": {0: inp5_dim0, 1: inp5_dim1}, - } +class Foo(torch.nn.Module): + def forward(self, x, y): + a = x.item() + torch._check(a >= 0) + torch._check(a < y.shape[0]) + return y[a] -dynamic_shapes3_fixed = suggested_fixes() -exported_dynamic_shapes_example3 = export(DynamicShapesExample3(), (inp4, inp5), dynamic_shapes=dynamic_shapes3_fixed) -print(exported_dynamic_shapes_example3.module()(torch.randn(4, 32), torch.randn(32, 64))) +inps = ( + torch.tensor(32), + torch.randn(60), +) +ep = export(Foo(), inps) +print(ep) ###################################################################### -# Note that in the example above, because we constrained the value of ``x.shape[0]`` in -# ``dynamic_shapes_example3``, the exported program is sound even though there is a -# raw ``if`` statement. +# Specialized values +# ^^^^^^^^^^^^^^^^^^ # -# If you want to see why ``torch.export`` generated these constraints, you can -# re-run the script with the environment variable ``TORCH_LOGS=dynamic,dynamo``, -# or use ``torch._logging.set_logs``. - -import logging -torch._logging.set_logs(dynamic=logging.INFO, dynamo=logging.INFO) -exported_dynamic_shapes_example3 = export(DynamicShapesExample3(), (inp4, inp5), dynamic_shapes=dynamic_shapes3_fixed) +# Another category of data-dependent error happens when the program attempts to extract a concrete data-dependent integer/float value +# while tracing. This looks something like "Could not extract specialized integer from data-dependent expression", and is analogous to +# the previous class of errors - if these occur when attempting to evaluate concrete integer/float values, data-dependent guard errors arise +# with evaluating concrete boolean values. +# +# This error typically occurs when there is an explicit or implicit ``int()`` cast on a data-dependent expression. For example, this list comprehension +# has a `range()` call that implicitly does an ``int()`` cast on the size of the list: -# reset to previous values -torch._logging.set_logs(dynamic=logging.WARNING, dynamo=logging.WARNING) +class Foo(torch.nn.Module): + def forward(self, x, y): + a = x.item() + b = torch.cat([y for y in range(a)], dim=0) + return b + int(a) + +inps = ( + torch.tensor(32), + torch.randn(60), +) +try: + export(Foo(), inps, strict=False) +except Exception: + tb.print_exc() ###################################################################### -# We can view an ``ExportedProgram``'s symbolic shape ranges using the -# ``range_constraints`` field. +# For these errors, some basic options you have are: +# +# 1. Avoid unnecessary ``int()`` cast calls, in this case the ``int(a)`` in the return statement. +# 2. Use ``torch._check()`` calls; unfortunately all you may be able to do in this case is specialize (with ``torch._check(a == 60)``). +# 3. Rewrite the offending code at a higher level. For example, the list comprehension is semantically a ``repeat()`` op, which doesn't involve an ``int()`` cast. 
The following rewrite avoids data-dependent errors: -print(exported_dynamic_shapes_example3.range_constraints) +class Foo(torch.nn.Module): + def forward(self, x, y): + a = x.item() + b = y.unsqueeze(0).repeat(a, 1) + return b + a + +inps = ( + torch.tensor(32), + torch.randn(60), +) +ep = export(Foo(), inps, strict=False) +print(ep) + +###################################################################### +# Data-dependent errors can be much more involved, and there are many more options in your toolkit to deal with them: ``torch._check_is_size()``, ``guard_size_oblivious()``, or real-tensor tracing, as starters. +# For more in-depth guides, please refer to the `Export Programming Model `_, +# or `Dealing with GuardOnDataDependentSymNode errors `_. ###################################################################### # Custom Ops # ---------- # -# ``torch.export`` can export PyTorch programs with custom operators. +# ``torch.export`` can export PyTorch programs with custom operators. Please +# refer to `this page `__ +# on how to author a custom operator in either C++ or Python. # -# Currently, the steps to register a custom op for use by ``torch.export`` are: -# -# - Define the custom op using ``torch.library`` (`reference `__) -# as with any other custom op - -from torch.library import Library, impl, impl_abstract +# The following is an example of registering a custom operator in python to be +# used by ``torch.export``. The important thing to note is that the custom op +# must have a `FakeTensor kernel `__. -m = Library("my_custom_library", "DEF") - -m.define("custom_op(Tensor input) -> Tensor") - -@impl(m, "custom_op", "CompositeExplicitAutograd") -def custom_op(x): +@torch.library.custom_op("my_custom_library::custom_op", mutates_args={}) +def custom_op(x: torch.Tensor) -> torch.Tensor: print("custom_op called!") return torch.relu(x) -###################################################################### -# - Define a ``"Meta"`` implementation of the custom op that returns an empty -# tensor with the same shape as the expected output - -@impl_abstract("my_custom_library::custom_op") +@custom_op.register_fake def custom_op_meta(x): + # Returns an empty tensor with the same shape as the expected output return torch.empty_like(x) ###################################################################### -# - Call the custom op from the code you want to export using ``torch.ops`` +# Here is an example of exporting a program with the custom op. class CustomOpExample(torch.nn.Module): def forward(self, x): @@ -576,30 +814,27 @@ def forward(self, x): x = torch.cos(x) return x -###################################################################### -# - Export the code as before - exported_custom_op_example = export(CustomOpExample(), (torch.randn(3, 3),)) -exported_custom_op_example.graph_module.print_readable() +print(exported_custom_op_example) print(exported_custom_op_example.module()(torch.randn(3, 3))) ###################################################################### -# Note in the above outputs that the custom op is included in the exported graph. -# And when we call the exported graph as a function, the original custom op is called, -# as evidenced by the ``print`` call. -# -# If you have a custom operator implemented in C++, please refer to -# `this document `__ -# to make it compatible with ``torch.export``. +# Note that in the ``ExportedProgram``, the custom operator is included in the graph. 
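+######################################################################
+# The FakeTensor kernel matters most when the output shape of a custom op
+# depends on the input, since export traces with FakeTensors that carry only
+# metadata. Below is a small illustrative sketch (a hypothetical
+# ``repeat_last_dim`` op added here purely as an example, registered into the
+# same example namespace): the fake implementation must compute the output
+# shape from the input's shape without access to real data.
+
+@torch.library.custom_op("my_custom_library::repeat_last_dim", mutates_args={})
+def repeat_last_dim(x: torch.Tensor) -> torch.Tensor:
+    # Real kernel: concatenate the input with itself along the last dimension
+    return torch.cat([x, x], dim=-1)
+
+@repeat_last_dim.register_fake
+def repeat_last_dim_meta(x):
+    # Fake kernel: compute only metadata (same dtype/device, last dim doubled)
+    return x.new_empty(*x.shape[:-1], x.shape[-1] * 2)
+
+class ShapeChangingCustomOpExample(torch.nn.Module):
+    def forward(self, x):
+        return repeat_last_dim(torch.sin(x))
+
+print(export(ShapeChangingCustomOpExample(), (torch.randn(3, 3),)))
+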
###################################################################### -# Decompositions -# -------------- +# IR/Decompositions +# ----------------- # -# The graph produced by ``torch.export`` by default returns a graph containing -# only functional ATen operators. This functional ATen operator set (or "opset") contains around 2000 -# operators, all of which are functional, that is, they do not -# mutate or alias inputs. You can find a list of all ATen operators +# The graph produced by ``torch.export`` returns a graph containing only +# `ATen operators `__, which are the +# basic unit of computation in PyTorch. As there are over 3000 ATen operators, +# export provides a way to narrow down the operator set used in the graph based +# on certain characteristics, creating different IRs. +# +# By default, export produces the most generic IR which contains all ATen +# operators, including both functional and non-functional operators. A functional +# operator is one that does not contain any mutations or aliasing of the inputs. +# You can find a list of all ATen operators # `here `__ # and you can inspect if an operator is functional by checking # ``op._schema.is_mutable``, for example: @@ -608,77 +843,78 @@ def forward(self, x): print(torch.ops.aten.add_.Tensor._schema.is_mutable) ###################################################################### -# By default, the environment in which you want to run the exported graph -# should support all ~2000 of these operators. -# However, you can use the following API on the exported program -# if your specific environment is only able to support a subset of -# the ~2000 operators. -# -# .. code-block:: python -# -# def run_decompositions( -# self: ExportedProgram, -# decomposition_table: Optional[Dict[torch._ops.OperatorBase, Callable]] -# ) -> ExportedProgram -# -# ``run_decompositions`` takes in a decomposition table, which is a mapping of -# operators to a function specifying how to reduce, or decompose, that operator -# into an equivalent sequence of other ATen operators. -# -# The default decomposition table for ``run_decompositions`` is the -# `Core ATen decomposition table `__ -# which will decompose the all ATen operators to the -# `Core ATen Operator Set `__ -# which consists of only ~180 operators. +# This generic IR can be used to train in eager PyTorch Autograd. This IR can be +# more explicitly reached through the API ``torch.export.export_for_training``, +# which was introduced in PyTorch 2.5, but calling ``torch.export.export`` +# should produce the same graph as of PyTorch 2.6. -class M(torch.nn.Module): - def __init__(self): +class DecompExample(torch.nn.Module): + def __init__(self) -> None: super().__init__() - self.linear = torch.nn.Linear(3, 4) + self.conv = torch.nn.Conv2d(1, 3, 1, 1) + self.bn = torch.nn.BatchNorm2d(3) def forward(self, x): - return self.linear(x) + x = self.conv(x) + x = self.bn(x) + return (x,) + +ep_for_training = torch.export.export_for_training(DecompExample(), (torch.randn(1, 1, 3, 3),)) +print(ep_for_training.graph) -ep = export(M(), (torch.randn(2, 3),)) -print(ep.graph) +###################################################################### +# We can then lower this exported program to an operator set which only contains +# functional ATen operators through the API ``run_decompositions``, which +# decomposes the ATen operators into the ones specified in the decomposition +# table, and functionalizes the graph. 
By specifying an empty set, we're only +# performing functionalization, and does not do any additional decompositions. +# This results in an IR which contains ~2000 operators (instead of the 3000 +# operators above), and is ideal for inference cases. -core_ir_ep = ep.run_decompositions() -print(core_ir_ep.graph) +ep_for_inference = ep_for_training.run_decompositions(decomp_table={}) +print(ep_for_inference.graph) ###################################################################### -# Notice that after running ``run_decompositions`` the -# ``torch.ops.aten.t.default`` operator, which is not part of the Core ATen -# Opset, has been replaced with ``torch.ops.aten.permute.default`` which is part -# of the Core ATen Opset. -# -# Most ATen operators already have decompositions, which are located -# `here `__. -# If you would like to use some of these existing decomposition functions, -# you can pass in a list of operators you would like to decompose to the -# `get_decompositions `__ -# function, which will return a decomposition table using existing -# decomposition implementations. +# As we can see, the previously mutable operator, +# ``torch.ops.aten.add_.default`` has now been replaced with +# ``torch.ops.aten.add.default``, a l operator. -class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(3, 4) +###################################################################### +# We can also further lower this exported program to an operator set which only +# contains the +# `Core ATen Operator Set `__, +# which is a collection of only ~180 operators. This IR is optimal for backends +# who do not want to reimplement all ATen operators. - def forward(self, x): - return self.linear(x) +from torch.export import default_decompositions + +core_aten_decomp_table = default_decompositions() +core_aten_ep = ep_for_training.run_decompositions(decomp_table=core_aten_decomp_table) +print(core_aten_ep.graph) + +###################################################################### +# We now see that ``torch.ops.aten.conv2d.default`` has been decomposed +# into ``torch.ops.aten.convolution.default``. This is because ``convolution`` +# is a more "core" operator, as operations like ``conv1d`` and ``conv2d`` can be +# implemented using the same op. + +###################################################################### +# We can also specify our own decomposition behaviors: + +my_decomp_table = torch.export.default_decompositions() -ep = export(M(), (torch.randn(2, 3),)) -print(ep.graph) +def my_awesome_custom_conv2d_function(x, weight, bias, stride=[1, 1], padding=[0, 0], dilation=[1, 1], groups=1): + return 2 * torch.ops.aten.convolution(x, weight, bias, stride, padding, dilation, False, [0, 0], groups) -from torch._decomp import get_decompositions -decomp_table = get_decompositions([torch.ops.aten.t.default, torch.ops.aten.transpose.int]) -core_ir_ep = ep.run_decompositions(decomp_table) -print(core_ir_ep.graph) +my_decomp_table[torch.ops.aten.conv2d.default] = my_awesome_custom_conv2d_function +my_ep = ep_for_training.run_decompositions(my_decomp_table) +print(my_ep.graph) ###################################################################### -# If there is no existing decomposition function for an ATen operator that you would -# like to decompose, feel free to send a pull request into PyTorch -# implementing the decomposition! 
+# Notice that instead of ``torch.ops.aten.conv2d.default`` being decomposed +# into ``torch.ops.aten.convolution.default``, it is now decomposed into +# ``torch.ops.aten.convolution.default`` and ``torch.ops.aten.mul.Tensor``, +# which matches our custom decomposition rule. ###################################################################### # ExportDB @@ -752,18 +988,18 @@ def forward(self, x): ###################################################################### # .. code-block:: python # -# import torch._export # import torch._inductor # # # Note: these APIs are subject to change -# # Compile the exported program to a .so using ``AOTInductor`` +# # Compile the exported program to a PT2 archive using ``AOTInductor`` # with torch.no_grad(): -# so_path = torch._inductor.aot_compile(ep.module(), [inp]) +# pt2_path = torch._inductor.aoti_compile_and_package(ep) # # # Load and run the .so file in Python. # # To load and run it in a C++ environment, see: # # https://pytorch.org/docs/main/torch.compiler_aot_inductor.html -# res = torch._export.aot_load(so_path, device="cuda")(inp) +# aoti_compiled = torch._inductor.aoti_load_package(pt2_path) +# res = aoti_compiled(inp) ###################################################################### # Conclusion diff --git a/intermediate_source/torchrec_intro_tutorial.py b/intermediate_source/torchrec_intro_tutorial.py new file mode 100644 index 00000000000..81b7663c110 --- /dev/null +++ b/intermediate_source/torchrec_intro_tutorial.py @@ -0,0 +1,950 @@ +""" +Introduction to TorchRec +================================== + +**TorchRec** is a PyTorch library tailored for building scalable and efficient recommendation systems using embeddings. +This tutorial guides you through the installation process, introduces the concept of embeddings, and highlights their importance in +recommendation systems. It offers practical demonstrations on implementing embeddings with PyTorch +and TorchRec, focusing on handling large embedding tables through distributed training and advanced optimizations. + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * Fundamentals of embeddings and their role in recommendation systems + * How to set up TorchRec to manage and implement embeddings in PyTorch environments + * Explore advanced techniques for distributing large embedding tables across multiple GPUs + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch v2.5 or later with CUDA 11.8 or later + * Python 3.9 or later + * `FBGEMM `__ + + +""" + +############################################### +# Install Dependencies +# ^^^^^^^^^^^^^^^^^^^^ +# +# Before running this tutorial in Google Colab, make sure to install the +# following dependencies: +# +# .. code-block:: sh +# +# !pip3 install --pre torch --index-url https://download.pytorch.org/whl/cu121 -U +# !pip3 install fbgemm_gpu --index-url https://download.pytorch.org/whl/cu121 +# !pip3 install torchmetrics==1.0.3 +# !pip3 install torchrec --index-url https://download.pytorch.org/whl/cu121 +# +# .. note:: +# If you are running this in Google Colab, make sure to switch to a GPU runtime type. +# For more information, +# see `Enabling CUDA `__ +# + + +###################################################################### +# Embeddings +# ~~~~~~~~~~ +# +# When building recommendation systems, categorical features typically +# have massive cardinality, posts, users, ads, and so on. 
+#
+# In order to represent these entities and model these relationships,
+# **embeddings** are used. In machine learning, **embeddings are vectors
+# of real numbers in a high-dimensional space used to represent meaning in
+# complex data like words, images, or users**.
+#
+# Embeddings in RecSys
+# ~~~~~~~~~~~~~~~~~~~~
+#
+# Now you might wonder, how are these embeddings generated in the first
+# place? Well, embeddings are represented as individual rows in an
+# **Embedding Table**, also referred to as embedding weights. The reason
+# for this is that embeddings or embedding table weights are trained just
+# like all of the other weights of the model via gradient descent!
+#
+# Embedding tables are simply large matrices for storing embeddings, each with
+# two dimensions (B, N), where:
+#
+# * B is the number of embeddings stored by the table
+# * N is the number of dimensions per embedding (N-dimensional embedding).
+#
+# The inputs to embedding tables represent embedding lookups to retrieve
+# the embedding for a specific index or row. In recommendation systems, such
+# as those used in many large systems, unique IDs are not only used for
+# specific users, but also across entities like posts and ads to serve as
+# lookup indices to respective embedding tables!
+#
+# Embeddings are trained in RecSys through the following process:
+#
+# * **Input/lookup indices are fed into the model, as unique IDs**. IDs are
+#   hashed to the total size of the embedding table to prevent issues when
+#   the ID > number of rows.
+#
+# * Embeddings are then retrieved and **pooled, such as taking the sum or
+#   mean of the embeddings**. This is required as there can be a variable number of
+#   embeddings per example while the model expects consistent shapes.
+#
+# * The **embeddings are used in conjunction with the rest of the model to
+#   produce a prediction**, such as `Click-Through Rate
+#   (CTR) `__
+#   for an ad.
+#
+# * The loss is calculated with the prediction and the label
+#   for an example, and **all weights of the model are updated through
+#   gradient descent and backpropagation, including the embedding weights**
+#   that were associated with the example.
+#
+# These embeddings are crucial for representing categorical features, such
+# as users, posts, and ads, in order to capture relationships and make
+# good recommendations. The `Deep learning recommendation
+# model `__ (DLRM) paper talks more
+# about the technical details of using embedding tables in RecSys.
+#
+# This tutorial introduces the concept of embeddings, showcases
+# TorchRec-specific modules and data types, and depicts how distributed training
+# works with TorchRec.
+#
+
+import torch
+
+
+######################################################################
+# Embeddings in PyTorch
+# ---------------------
+#
+# In PyTorch, we have the following types of embeddings:
+#
+# * :class:`torch.nn.Embedding`: An embedding table where forward pass returns the
+#   embeddings themselves as is.
+#
+# * :class:`torch.nn.EmbeddingBag`: Embedding table where forward pass returns
+#   embeddings that are then pooled, for example, sum or mean, otherwise known
+#   as **Pooled Embeddings**.
+#
+# In this section, we will go over a very brief introduction to performing
+# embedding lookups by passing in indices into the table.
+# + +num_embeddings, embedding_dim = 10, 4 + +# Initialize our embedding table +weights = torch.rand(num_embeddings, embedding_dim) +print("Weights:", weights) + +# Pass in pre-generated weights just for example, typically weights are randomly initialized +embedding_collection = torch.nn.Embedding( + num_embeddings, embedding_dim, _weight=weights +) +embedding_bag_collection = torch.nn.EmbeddingBag( + num_embeddings, embedding_dim, _weight=weights +) + +# Print out the tables, we should see the same weights as above +print("Embedding Collection Table: ", embedding_collection.weight) +print("Embedding Bag Collection Table: ", embedding_bag_collection.weight) + +# Lookup rows (ids for embedding ids) from the embedding tables +# 2D tensor with shape (batch_size, ids for each batch) +ids = torch.tensor([[1, 3]]) +print("Input row IDS: ", ids) + +embeddings = embedding_collection(ids) + +# Print out the embedding lookups +# You should see the specific embeddings be the same as the rows (ids) of the embedding tables above +print("Embedding Collection Results: ") +print(embeddings) +print("Shape: ", embeddings.shape) + +# ``nn.EmbeddingBag`` default pooling is mean, so should be mean of batch dimension of values above +pooled_embeddings = embedding_bag_collection(ids) + +print("Embedding Bag Collection Results: ") +print(pooled_embeddings) +print("Shape: ", pooled_embeddings.shape) + +# ``nn.EmbeddingBag`` is the same as ``nn.Embedding`` but just with pooling (mean, sum, and so on) +# We can see that the mean of the embeddings of embedding_collection is the same as the output of the embedding_bag_collection +print("Mean: ", torch.mean(embedding_collection(ids), dim=1)) + + +###################################################################### +# Congratulations! Now you have a basic understanding of how to use +# embedding tables --- one of the foundations of modern recommendation +# systems! These tables represent entities and their relationships. For +# example, the relationship between a given user and the pages and posts +# they have liked. +# + + +###################################################################### +# TorchRec Features Overview +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# In the section above we've learned how to use embedding tables, one of the foundations of +# modern recommendation systems! These tables represent entities and +# relationships, such as users, pages, posts, etc. Given that these +# entities are always increasing, a **hash** function is typically applied +# to make sure the IDs are within the bounds of a certain embedding table. +# However, in order to represent a vast amount of entities and reduce hash +# collisions, these tables can become quite massive (think about the number of ads +# for example). In fact, these tables can become so massive that they +# won't be able to fit on 1 GPU, even with 80G of memory. +# +# In order to train models with massive embedding tables, sharding these +# tables across GPUs is required, which then introduces a whole new set of +# problems and opportunities in parallelism and optimization. Luckily, we have +# the TorchRec library `__ that has encountered, consolidated, and addressed +# many of these concerns. TorchRec serves as a **library that provides +# primitives for large scale distributed embeddings**. +# +# Next, we will explore the major features of the TorchRec +# library. 
We will start with ``torch.nn.Embedding`` and will extend that to +# custom TorchRec modules, explore distributed training environment with +# generating a sharding plan for embeddings, look at inherent TorchRec +# optimizations, and extend the model to be ready for inference in C++. +# Below is a quick outline of what this section consists of: +# +# * TorchRec Modules and Data Types +# * Distributed Training, Sharding, and Optimizations +# +# Let's begin with importing TorchRec: + +import torchrec + + +###################################################################### +# TorchRec Modules and Data Types +# ---------------------------------- +# +# This section goes over TorchRec Modules and data types including such +# entities as ``EmbeddingCollection`` and ``EmbeddingBagCollection``, +# ``JaggedTensor``, ``KeyedJaggedTensor``, ``KeyedTensor`` and more. +# +# From ``EmbeddingBag`` to ``EmbeddingBagCollection`` +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# We have already explored :class:`torch.nn.Embedding` and :class:`torch.nn.EmbeddingBag`. +# TorchRec extends these modules by creating collections of embeddings, in +# other words modules that can have multiple embedding tables, with +# ``EmbeddingCollection`` and ``EmbeddingBagCollection`` +# We will use ``EmbeddingBagCollection`` to represent a group of +# embedding bags. +# +# In the example code below, we create an ``EmbeddingBagCollection`` (EBC) +# with two embedding bags, 1 representing **products** and 1 representing **users**. +# Each table, ``product_table`` and ``user_table``, is represented by a 64 dimension +# embedding of size 4096. +# + +ebc = torchrec.EmbeddingBagCollection( + device="cpu", + tables=[ + torchrec.EmbeddingBagConfig( + name="product_table", + embedding_dim=64, + num_embeddings=4096, + feature_names=["product"], + pooling=torchrec.PoolingType.SUM, + ), + torchrec.EmbeddingBagConfig( + name="user_table", + embedding_dim=64, + num_embeddings=4096, + feature_names=["user"], + pooling=torchrec.PoolingType.SUM, + ), + ], +) +print(ebc.embedding_bags) + + +###################################################################### +# Let’s inspect the forward method for ``EmbeddingBagCollection`` and the +# module’s inputs and outputs: +# + +import inspect + +# Let's look at the ``EmbeddingBagCollection`` forward method +# What is a ``KeyedJaggedTensor`` and ``KeyedTensor``? +print(inspect.getsource(ebc.forward)) + + +###################################################################### +# TorchRec Input/Output Data Types +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# TorchRec has distinct data types for input and output of its modules: +# ``JaggedTensor``, ``KeyedJaggedTensor``, and ``KeyedTensor``. Now you +# might ask, why create new data types to represent sparse features? To +# answer that question, we must understand how sparse features are +# represented in code. +# +# Sparse features are otherwise known as ``id_list_feature`` and +# ``id_score_list_feature``, and are the **IDs** that will be used as +# indices to an embedding table to retrieve the embedding for that ID. To +# give a very simple example, imagine a single sparse feature being Ads +# that a user interacted with. The input itself would be a set of Ad IDs +# that a user interacted with, and the embeddings retrieved would be a +# semantic representation of those Ads. The tricky part of representing +# these features in code is that in each input example, **the number of +# IDs is variable**. 
One day a user might have interacted with only one ad +# while the next day they interact with three. +# +# A simple representation is shown below, where we have a ``lengths`` +# tensor denoting how many indices are in an example for a batch and a +# ``values`` tensor containing the indices themselves. +# + +# Batch Size 2 +# 1 ID in example 1, 2 IDs in example 2 +id_list_feature_lengths = torch.tensor([1, 2]) + +# Values (IDs) tensor: ID 5 is in example 1, ID 7, 1 is in example 2 +id_list_feature_values = torch.tensor([5, 7, 1]) + + +###################################################################### +# Next, let's look at the offsets as well as what is contained in each batch +# + +# Lengths can be converted to offsets for easy indexing of values +id_list_feature_offsets = torch.cumsum(id_list_feature_lengths, dim=0) + +print("Offsets: ", id_list_feature_offsets) +print("First Batch: ", id_list_feature_values[: id_list_feature_offsets[0]]) +print( + "Second Batch: ", + id_list_feature_values[id_list_feature_offsets[0] : id_list_feature_offsets[1]], +) + +from torchrec import JaggedTensor + +# ``JaggedTensor`` is just a wrapper around lengths/offsets and values tensors! +jt = JaggedTensor(values=id_list_feature_values, lengths=id_list_feature_lengths) + +# Automatically compute offsets from lengths +print("Offsets: ", jt.offsets()) + +# Convert to list of values +print("List of Values: ", jt.to_dense()) + +# ``__str__`` representation +print(jt) + +from torchrec import KeyedJaggedTensor + +# ``JaggedTensor`` represents IDs for 1 feature, but we have multiple features in an ``EmbeddingBagCollection`` +# That's where ``KeyedJaggedTensor`` comes in! ``KeyedJaggedTensor`` is just multiple ``JaggedTensors`` for multiple id_list_feature_offsets +# From before, we have our two features "product" and "user". Let's create ``JaggedTensors`` for both! + +product_jt = JaggedTensor( + values=torch.tensor([1, 2, 1, 5]), lengths=torch.tensor([3, 1]) +) +user_jt = JaggedTensor(values=torch.tensor([2, 3, 4, 1]), lengths=torch.tensor([2, 2])) + +# Q1: How many batches are there, and which values are in the first batch for ``product_jt`` and ``user_jt``? +kjt = KeyedJaggedTensor.from_jt_dict({"product": product_jt, "user": user_jt}) + +# Look at our feature keys for the ``KeyedJaggedTensor`` +print("Keys: ", kjt.keys()) + +# Look at the overall lengths for the ``KeyedJaggedTensor`` +print("Lengths: ", kjt.lengths()) + +# Look at all values for ``KeyedJaggedTensor`` +print("Values: ", kjt.values()) + +# Can convert ``KeyedJaggedTensor`` to dictionary representation +print("to_dict: ", kjt.to_dict()) + +# ``KeyedJaggedTensor`` string representation +print(kjt) + +# Q2: What are the offsets for the ``KeyedJaggedTensor``? + +# Now we can run a forward pass on our ``EmbeddingBagCollection`` from before +result = ebc(kjt) +result + +# Result is a ``KeyedTensor``, which contains a list of the feature names and the embedding results +print(result.keys()) + +# The results shape is [2, 128], as batch size of 2. Reread previous section if you need a refresher on how the batch size is determined +# 128 for dimension of embedding. If you look at where we initialized the ``EmbeddingBagCollection``, we have two tables "product" and "user" of dimension 64 each +# meaning embeddings for both features are of size 64. 
64 + 64 = 128 +print(result.values().shape) + +# Nice to_dict method to determine the embeddings that belong to each feature +result_dict = result.to_dict() +for key, embedding in result_dict.items(): + print(key, embedding.shape) + + +###################################################################### +# Congrats! You now understand TorchRec modules and data types. +# Give yourself a pat on the back for making it this far. Next, we will +# learn about distributed training and sharding. +# + + +###################################################################### +# Distributed Training and Sharding +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Now that we have a grasp on TorchRec modules and data types, it's time +# to take it to the next level. +# +# Remember, the main purpose of TorchRec is to provide primitives for +# distributed embeddings. So far, we've only worked with embedding tables +# on a single device. This has been possible given how small the embedding tables +# have been, but in a production setting this isn't generally the case. +# Embedding tables often get massive, where one table can't fit on a single +# GPU, creating the requirement for multiple devices and a distributed +# environment. +# +# In this section, we will explore setting up a distributed environment, +# exactly how actual production training is done, and explore sharding +# embedding tables, all with TorchRec. +# +# **This section will also only use 1 GPU, though it will be treated in a +# distributed fashion. This is only a limitation for training, as training +# has a process per GPU. Inference does not run into this requirement** +# +# In the example code below, we set up our PyTorch distributed environment. +# +# .. warning:: +# If you are running this in Google Colab, you can only call this cell once, +# calling it again will cause an error as you can only initialize the process +# group once. + +import os + +import torch.distributed as dist + +# Set up environment variables for distributed training +# RANK is which GPU we are on, default 0 +os.environ["RANK"] = "0" +# How many devices in our "world", colab notebook can only handle 1 process +os.environ["WORLD_SIZE"] = "1" +# Localhost as we are training locally +os.environ["MASTER_ADDR"] = "localhost" +# Port for distributed training +os.environ["MASTER_PORT"] = "29500" + +# nccl backend is for GPUs, gloo is for CPUs +dist.init_process_group(backend="gloo") + +print(f"Distributed environment initialized: {dist}") + + +###################################################################### +# Distributed Embeddings +# ~~~~~~~~~~~~~~~~~~~~~~ +# +# We have already worked with the main TorchRec module: +# ``EmbeddingBagCollection``. We have examined how it works along with how +# data is represented in TorchRec. However, we have not yet explored one +# of the main parts of TorchRec, which is **distributed embeddings**. +# +# GPUs are the most popular choice for ML workloads by far today, as they +# are able to do magnitudes more floating point operations/s +# (`FLOPs `__) than CPU. However, +# GPUs come with the limitation of scarce fast memory (HBM which is +# analogous to RAM for CPU), typically, ~10s of GBs. +# +# A RecSys model can contain embedding tables that far exceed the memory +# limit for 1 GPU, hence the need for distribution of the embedding tables +# across multiple GPUs, otherwise known as **model parallel**. 
On the +# other hand, **data parallel** is where the entire model is replicated on +# each GPU, which each GPU taking in a distinct batch of data for +# training, syncing gradients on the backwards pass. +# +# Parts of the model that **require less compute but more memory +# (embeddings) are distributed with model parallel** while parts that +# **require more compute and less memory (dense layers, MLP, etc.) are +# distributed with data parallel**. +# +# Sharding +# ~~~~~~~~ +# +# In order to distribute an embedding table, we split up the embedding +# table into parts and place those parts onto different devices, also +# known as “sharding”. +# +# There are many ways to shard embedding tables. The most common ways are: +# +# * Table-Wise: the table is placed entirely onto one device +# * Column-Wise: columns of embedding tables are sharded +# * Row-Wise: rows of embedding tables are sharded +# +# Sharded Modules +# ~~~~~~~~~~~~~~~ +# +# While all of this seems like a lot to deal with and implement, you're in +# luck. **TorchRec provides all the primitives for easy distributed +# training and inference**! In fact, TorchRec modules have two corresponding +# classes for working with any TorchRec module in a distributed +# environment: +# +# * **The module sharder**: This class exposes a ``shard`` API +# that handles sharding a TorchRec Module, producing a sharded module. +# * For ``EmbeddingBagCollection``, the sharder is `EmbeddingBagCollectionSharder ` +# * **Sharded module**: This class is a sharded variant of a TorchRec module. +# It has the same input/output as a the regular TorchRec module, but much +# more optimized and works in a distributed environment. +# * For ``EmbeddingBagCollection``, the sharded variant is `ShardedEmbeddingBagCollection` +# +# Every TorchRec module has an unsharded and sharded variant. +# +# * The unsharded version is meant to be prototyped and experimented with. +# * The sharded version is meant to be used in a distributed environment for +# distributed training and inference. +# +# The sharded versions of TorchRec modules, for example +# ``EmbeddingBagCollection``, will handle everything that is needed for Model +# Parallelism, such as communication between GPUs for distributing +# embeddings to the correct GPUs. +# +# Refresher of our ``EmbeddingBagCollection`` module +ebc + +from torchrec.distributed.embeddingbag import EmbeddingBagCollectionSharder +from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology +from torchrec.distributed.types import ShardingEnv + +# Corresponding sharder for ``EmbeddingBagCollection`` module +sharder = EmbeddingBagCollectionSharder() + +# ``ProcessGroup`` from torch.distributed initialized 2 cells above +pg = dist.GroupMember.WORLD +assert pg is not None, "Process group is not initialized" + +print(f"Process Group: {pg}") + + +###################################################################### +# Planner +# ~~~~~~~ +# +# Before we can show how sharding works, we must know about the +# **planner**, which helps us determine the best sharding configuration. +# +# Given a number of embedding tables and a number of ranks, there are many +# different sharding configurations that are possible. 
For example, given +# 2 embedding tables and 2 GPUs, you can: +# +# * Place 1 table on each GPU +# * Place both tables on a single GPU and no tables on the other +# * Place certain rows and columns on each GPU +# +# Given all of these possibilities, we typically want a sharding +# configuration that is optimal for performance. +# +# That is where the planner comes in. The planner is able to determine +# given the number of embedding tables and the number of GPUs, what is the optimal +# configuration. Turns out, this is incredibly difficult to do manually, +# with tons of factors that engineers have to consider to ensure an +# optimal sharding plan. Luckily, TorchRec provides an auto planner when +# the planner is used. +# +# The TorchRec planner: +# +# * Assesses memory constraints of hardware +# * Estimates compute based on memory fetches as embedding lookups +# * Addresses data specific factors +# * Considers other hardware specifics like bandwidth to generate an optimal sharding plan +# +# In order to take into consideration all these variables, The TorchRec +# planner can take in `various amounts of data for embedding tables, +# constraints, hardware information, and +# topology `__ +# to aid in generating the optimal sharding plan for a model, which is +# routinely provided across stacks. +# +# To learn more about sharding, see our `sharding +# tutorial `__. +# + +# In our case, 1 GPU and compute on CUDA device +planner = EmbeddingShardingPlanner( + topology=Topology( + world_size=1, + compute_device="cuda", + ) +) + +# Run planner to get plan for sharding +plan = planner.collective_plan(ebc, [sharder], pg) + +print(f"Sharding Plan generated: {plan}") + + +###################################################################### +# Planner Result +# ~~~~~~~~~~~~~~ +# +# As you can see above, when running the planner there is quite a bit of output. +# We can see a lot of stats being calculated along with where our +# tables end up being placed. +# +# The result of running the planner is a static plan, which can be reused +# for sharding! This allows sharding to be static for production models +# instead of determining a new sharding plan everytime. Below, we use the +# sharding plan to finally generate our ``ShardedEmbeddingBagCollection``. +# + +# The static plan that was generated +plan + +env = ShardingEnv.from_process_group(pg) + +# Shard the ``EmbeddingBagCollection`` module using the ``EmbeddingBagCollectionSharder`` +sharded_ebc = sharder.shard(ebc, plan.plan[""], env, torch.device("cuda")) + +print(f"Sharded EBC Module: {sharded_ebc}") + + +###################################################################### +# GPU Training with ``LazyAwaitable`` +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Remember that TorchRec is a highly optimized library for distributed +# embeddings. A concept that TorchRec introduces to enable higher +# performance for training on GPU is a +# `LazyAwaitable `. +# You will see ``LazyAwaitable`` types as outputs of various sharded +# TorchRec modules. All a ``LazyAwaitable`` type does is delay calculating some +# result as long as possible, and it does it by acting like an async type. 
+# + +from typing import List + +from torchrec.distributed.types import LazyAwaitable + + +# Demonstrate a ``LazyAwaitable`` type: +class ExampleAwaitable(LazyAwaitable[torch.Tensor]): + def __init__(self, size: List[int]) -> None: + super().__init__() + self._size = size + + def _wait_impl(self) -> torch.Tensor: + return torch.ones(self._size) + + +awaitable = ExampleAwaitable([3, 2]) +awaitable.wait() + +kjt = kjt.to("cuda") +output = sharded_ebc(kjt) +# The output of our sharded ``EmbeddingBagCollection`` module is an `Awaitable`? +print(output) + +kt = output.wait() +# Now we have our ``KeyedTensor`` after calling ``.wait()`` +# If you are confused as to why we have a ``KeyedTensor ``output, +# give yourself a refresher on the unsharded ``EmbeddingBagCollection`` module +print(type(kt)) + +print(kt.keys()) + +print(kt.values().shape) + +# Same output format as unsharded ``EmbeddingBagCollection`` +result_dict = kt.to_dict() +for key, embedding in result_dict.items(): + print(key, embedding.shape) + + +###################################################################### +# Anatomy of Sharded TorchRec modules +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# We have now successfully sharded an ``EmbeddingBagCollection`` given a +# sharding plan that we generated! The sharded module has common APIs from +# TorchRec which abstract away distributed communication/compute amongst +# multiple GPUs. In fact, these APIs are highly optimized for performance +# in training and inference. **Below are the three common APIs for +# distributed training/inference** that are provided by TorchRec: +# +# * ``input_dist``: Handles distributing inputs from GPU to GPU. +# * ``lookups``: Does the actual embedding lookup in an optimized, +# batched manner using FBGEMM TBE (more on this later). +# * ``output_dist``: Handles distributing outputs from GPU to GPU. +# +# The distribution of inputs and outputs is done through `NCCL +# Collectives `__, +# namely +# `All-to-Alls `__, +# which is where all GPUs send and receive data to and from one another. +# TorchRec interfaces with PyTorch distributed for collectives and +# provides clean abstractions to the end users, removing the concern for +# the lower level details. +# +# The backwards pass does all of these collectives but in the reverse +# order for distribution of gradients. ``input_dist``, ``lookup``, and +# ``output_dist`` all depend on the sharding scheme. Since we sharded in a +# table-wise fashion, these APIs are modules that are constructed by +# `TwPooledEmbeddingSharding`. +# + +sharded_ebc + +# Distribute input KJTs to all other GPUs and receive KJTs +sharded_ebc._input_dists + +# Distribute output embeddings to all other GPUs and receive embeddings +sharded_ebc._output_dists + + +###################################################################### +# Optimizing Embedding Lookups +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# In performing lookups for a collection of embedding tables, a trivial +# solution would be to iterate through all the ``nn.EmbeddingBags`` and do +# a lookup per table. This is exactly what the standard, unsharded +# ``EmbeddingBagCollection`` does. However, while this solution +# is simple, it is extremely slow. +# +# `FBGEMM `__ is a +# library that provides GPU operators (otherwise known as kernels) that +# are very optimized. One of these operators is known as **Table Batched +# Embedding** (TBE), provides two major optimizations: +# +# - Table batching, which allows you to look up multiple embeddings with +# one kernel call. 
+# - Optimizer Fusion, which allows the module to update itself given the +# canonical pytorch optimizers and arguments. +# +# The ``ShardedEmbeddingBagCollection`` uses the FBGEMM TBE as the lookup +# instead of traditional ``nn.EmbeddingBags`` for optimized embedding +# lookups. +# + +sharded_ebc._lookups + + +###################################################################### +# ``DistributedModelParallel`` +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# We have now explored sharding a single ``EmbeddingBagCollection``! We were +# able to take the ``EmbeddingBagCollectionSharder`` and use the unsharded +# ``EmbeddingBagCollection`` to generate a +# ``ShardedEmbeddingBagCollection`` module. This workflow is fine, but +# typically when implementing model parallel, +# `DistributedModelParallel` +# (DMP) is used as the standard interface. When wrapping your model (in +# our case ``ebc``), with DMP, the following will occur: +# +# 1. Decide how to shard the model. DMP will collect the available +# sharders and come up with a plan of the optimal way to shard the +# embedding table(s) (for example, ``EmbeddingBagCollection``) +# 2. Actually shard the model. This includes allocating memory for each +# embedding table on the appropriate device(s). +# +# DMP takes in everything that we've just experimented with, like a static +# sharding plan, a list of sharders, etc. However, it also has some nice +# defaults to seamlessly shard a TorchRec model. In this toy example, +# since we have two embedding tables and one GPU, TorchRec will place both +# on the single GPU. +# + +ebc + +model = torchrec.distributed.DistributedModelParallel(ebc, device=torch.device("cuda")) + +out = model(kjt) +out.wait() + +model + + +from fbgemm_gpu.split_embedding_configs import EmbOptimType + +###################################################################### +# Sharding Best Practices +# ~~~~~~~~~~~~~~~~~~~~~~~ +# +# Currently, our configuration is only sharding on 1 GPU (or rank), which +# is trivial: just place all the tables on 1 GPUs memory. However, in real +# production use cases, embedding tables are **typically sharded on +# hundreds of GPUs**, with different sharding methods such as table-wise, +# row-wise, and column-wise. It is incredibly important to determine a +# proper sharding configuration (to prevent out of memory issues) while +# keeping it balanced not only in terms of memory but also compute for +# optimal performance. +# + + +###################################################################### +# Adding in the Optimizer +# ~~~~~~~~~~~~~~~~~~~~~~~ +# +# Remember that TorchRec modules are hyperoptimized for large scale +# distributed training. An important optimization is in regards to the +# optimizer. +# +# TorchRec modules provide a seamless API to fuse the +# backwards pass and optimize step in training, providing a significant +# optimization in performance and decreasing the memory used, alongside +# granularity in assigning distinct optimizers to distinct model +# parameters. +# +# Optimizer Classes +# ^^^^^^^^^^^^^^^^^ +# +# TorchRec uses ``CombinedOptimizer``, which contains a collection of +# ``KeyedOptimizers``. A ``CombinedOptimizer`` effectively makes it easy +# to handle multiple optimizers for various sub groups in the model. A +# ``KeyedOptimizer`` extends the ``torch.optim.Optimizer`` and is +# initialized through a dictionary of parameters exposes the parameters. 
Each ``TBE`` module in an ``EmbeddingBagCollection`` will have its own
+# ``KeyedOptimizer`` which combines into one ``CombinedOptimizer``.
+#
+# Fused optimizer in TorchRec
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# Using ``DistributedModelParallel``, the **optimizer is fused, which
+# means that the optimizer update is done in the backward**. This is an
+# optimization in TorchRec and FBGEMM, where the embedding gradients are not
+# materialized and the updates are instead applied directly to the parameters.
+# This brings significant memory savings, as embedding gradients are
+# typically the size of the parameters themselves.
+#
+# You can, however, choose to make the optimizer ``dense``, which does not
+# apply this optimization and lets you inspect the embedding gradients or
+# apply computations to them as you wish. A dense optimizer in this case
+# would be your `canonical PyTorch model training loop with
+# optimizer. `__
+#
+# Once the optimizer is created through ``DistributedModelParallel``, you
+# still need to manage an optimizer for the other parameters not
+# associated with TorchRec embedding modules. To find the other
+# parameters,
+# use ``in_backward_optimizer_filter(model.named_parameters())``.
+# Apply an optimizer to those parameters as you would a normal Torch
+# optimizer and combine this and the ``model.fused_optimizer`` into one
+# ``CombinedOptimizer`` that you can use in your training loop to
+# ``zero_grad`` and ``step`` through.
+#
+# Adding an Optimizer to ``EmbeddingBagCollection``
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# We will do this in two ways, which are equivalent, but give you options
+# depending on your preferences:
+#
+# 1. Passing optimizer kwargs through ``fused_params`` in sharder.
+# 2. Through ``apply_optimizer_in_backward``, which converts the optimizer
+#    parameters to ``fused_params`` to pass to the ``TBE`` in the ``EmbeddingBagCollection`` or ``EmbeddingCollection``.
+#
+
+# Option 1: Passing optimizer kwargs through fused parameters
+from torchrec.optim.optimizers import in_backward_optimizer_filter
+
+
+# We initialize the sharder with our fused parameters
+fused_params = {
+    "optimizer": EmbOptimType.EXACT_ROWWISE_ADAGRAD,
+    "learning_rate": 0.02,
+    "eps": 0.002,
+}
+
+# Initialize sharder with ``fused_params``
+sharder_with_fused_params = EmbeddingBagCollectionSharder(fused_params=fused_params)
+
+# We'll use the same plan and unsharded EBC as before, but this time with our new sharder
+sharded_ebc_fused_params = sharder_with_fused_params.shard(
+    ebc, plan.plan[""], env, torch.device("cuda")
+)
+
+# Looking at the optimizer of each, we can see that the learning rate changed, which indicates our optimizer has been applied correctly.
+# We can also look at the TBE logs of the cell, if they are shown, to see that our new optimizer is indeed being applied
+print(f"Original Sharded EBC fused optimizer: {sharded_ebc.fused_optimizer}")
+print(
+    f"Sharded EBC with fused parameters fused optimizer: {sharded_ebc_fused_params.fused_optimizer}"
+)
+
+print(f"Type of optimizer: {type(sharded_ebc_fused_params.fused_optimizer)}")
+
+import copy
+
+from torch.distributed.optim import (
+    _apply_optimizer_in_backward as apply_optimizer_in_backward,
+)
+
+# Option 2: Applying optimizer through apply_optimizer_in_backward
+# Note: we need to call apply_optimizer_in_backward on the unsharded model first and then shard it
+
+# We can achieve the same result as we did with the previous approach
+ebc_apply_opt = copy.deepcopy(ebc)
+optimizer_kwargs = {"lr": 0.5}
+
+for name, param in ebc_apply_opt.named_parameters():
+    print(f"{name=}")
+    apply_optimizer_in_backward(torch.optim.SGD, [param], optimizer_kwargs)
+
+sharded_ebc_apply_opt = sharder.shard(
+    ebc_apply_opt, plan.plan[""], env, torch.device("cuda")
+)
+
+# Now when we print the optimizer, we will see our new learning rate; you can verify momentum through the TBE logs as well if they are outputted
+print(sharded_ebc_apply_opt.fused_optimizer)
+print(type(sharded_ebc_apply_opt.fused_optimizer))
+
+# We can also use the filter to check for other parameters that aren't associated with the "fused" optimizer(s)
+# Practically, these are just non-TorchRec module parameters. Since our module is just a TorchRec EBC,
+# there are no other parameters that aren't associated with TorchRec
+print("Non Fused Model Parameters:")
+print(
+    dict(
+        in_backward_optimizer_filter(sharded_ebc_fused_params.named_parameters())
+    ).keys()
+)
+
+# Here we do a dummy backwards call and see that parameter updates for fused
+# optimizers happen as a result of the backward pass
+
+ebc_output = sharded_ebc_fused_params(kjt).wait().values()
+loss = torch.sum(torch.ones_like(ebc_output) - ebc_output)
+print(f"First Iteration Loss: {loss}")
+
+loss.backward()
+
+ebc_output = sharded_ebc_fused_params(kjt).wait().values()
+loss = torch.sum(torch.ones_like(ebc_output) - ebc_output)
+# We don't call an optimizer.step(), so if the loss has changed here, it means
+# that the parameters were updated during the backward pass, which is what the
+# fused optimizer automatically handles for us
+print(f"Second Iteration Loss: {loss}")
+
+
+######################################################################
+# Conclusion
+# ^^^^^^^^^^
+# In this tutorial, you have learned how to train a distributed RecSys model.
+# If you are interested in inference, the `TorchRec repo
+# `__ has a
+# full example of how to run TorchRec in inference mode.
+#
+
+
+######################################################################
+# See Also
+# --------------
+#
+# For more information, please see our
+# `dlrm `__
+# example, which includes multinode training on the Criteo 1TB
+# dataset using the methods described in `Deep Learning Recommendation Model
+# for Personalization and Recommendation Systems `__.
+#
diff --git a/intermediate_source/torchrec_tutorial.rst b/intermediate_source/torchrec_tutorial.rst
index 6a450b16591..883ca11087a 100644
--- a/intermediate_source/torchrec_tutorial.rst
+++ b/intermediate_source/torchrec_tutorial.rst
@@ -1,244 +1,10 @@
 Introduction to TorchRec
 ========================
-.. tip::
-   To get the most of this tutorial, we suggest using this
-   `Colab Version `__.
-   This will allow you to experiment with the information presented below.
- -Follow along with the video below or on `youtube `__. +There is a newer tutorial on this topic. -.. raw:: html - -
- -
- -When building recommendation systems, we frequently want to represent -entities like products or pages with embeddings. For example, see Meta -AI’s `Deep learning recommendation -model `__, or DLRM. As the number of -entities grow, the size of the embedding tables can exceed a single -GPU’s memory. A common practice is to shard the embedding table across -devices, a type of model parallelism. To that end, TorchRec introduces -its primary API -called |DistributedModelParallel|_, -or DMP. Like PyTorch’s DistributedDataParallel, DMP wraps a model to -enable distributed training. - -Installation ------------- - -Requirements: python >= 3.7 - -We highly recommend CUDA when using TorchRec (If using CUDA: cuda >= 11.0). - - -.. code:: shell - - # install pytorch with cudatoolkit 11.3 - conda install pytorch cudatoolkit=11.3 -c pytorch-nightly -y - # install TorchRec - pip3 install torchrec-nightly - - -Overview --------- - -This tutorial will cover three pieces of TorchRec: the ``nn.module`` |EmbeddingBagCollection|_, the |DistributedModelParallel|_ API, and -the datastructure |KeyedJaggedTensor|_. - - -Distributed Setup -~~~~~~~~~~~~~~~~~ - -We setup our environment with torch.distributed. For more info on -distributed, see this -`tutorial `__. - -Here, we use one rank (the colab process) corresponding to our 1 colab -GPU. - -.. code:: python - - import os - import torch - import torchrec - import torch.distributed as dist - - os.environ["RANK"] = "0" - os.environ["WORLD_SIZE"] = "1" - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = "29500" - - # Note - you will need a V100 or A100 to run tutorial as as! - # If using an older GPU (such as colab free K80), - # you will need to compile fbgemm with the appripriate CUDA architecture - # or run with "gloo" on CPUs - dist.init_process_group(backend="nccl") - - -From EmbeddingBag to EmbeddingBagCollection -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -PyTorch represents embeddings through |torch.nn.Embedding|_ and |torch.nn.EmbeddingBag|_. -EmbeddingBag is a pooled version of Embedding. - -TorchRec extends these modules by creating collections of embeddings. We -will use |EmbeddingBagCollection|_ to represent a group of EmbeddingBags. - -Here, we create an EmbeddingBagCollection (EBC) with two embedding bags. -Each table, ``product_table`` and ``user_table``, is represented by a 64 -dimension embedding of size 4096. Note how we initially allocate the EBC -on device “meta”. This will tell EBC to not allocate memory yet. - -.. code:: python - - ebc = torchrec.EmbeddingBagCollection( - device="meta", - tables=[ - torchrec.EmbeddingBagConfig( - name="product_table", - embedding_dim=64, - num_embeddings=4096, - feature_names=["product"], - pooling=torchrec.PoolingType.SUM, - ), - torchrec.EmbeddingBagConfig( - name="user_table", - embedding_dim=64, - num_embeddings=4096, - feature_names=["user"], - pooling=torchrec.PoolingType.SUM, - ) - ] - ) - - -DistributedModelParallel -~~~~~~~~~~~~~~~~~~~~~~~~ - -Now, we’re ready to wrap our model with |DistributedModelParallel|_ (DMP). Instantiating DMP will: - -1. Decide how to shard the model. DMP will collect the available - ‘sharders’ and come up with a ‘plan’ of the optimal way to shard the - embedding table(s) (i.e., the EmbeddingBagCollection). -2. Actually shard the model. This includes allocating memory for each - embedding table on the appropriate device(s). - -In this toy example, since we have two EmbeddingTables and one GPU, -TorchRec will place both on the single GPU. - -.. 
code:: python - - model = torchrec.distributed.DistributedModelParallel(ebc, device=torch.device("cuda")) - print(model) - print(model.plan) - - -Query vanilla nn.EmbeddingBag with input and offsets -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -We query |nn.Embedding|_ and |nn.EmbeddingBag|_ -with ``input`` and ``offsets``. Input is a 1-D tensor containing the -lookup values. Offsets is a 1-D tensor where the sequence is a -cumulative sum of the number of values to pool per example. - -Let’s look at an example, recreating the product EmbeddingBag above: - -:: - - |------------| - | product ID | - |------------| - | [101, 202] | - | [] | - | [303] | - |------------| - -.. code:: python - - product_eb = torch.nn.EmbeddingBag(4096, 64) - product_eb(input=torch.tensor([101, 202, 303]), offsets=torch.tensor([0, 2, 2])) - - -Representing minibatches with KeyedJaggedTensor -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -We need an efficient representation of multiple examples of an arbitrary -number of entity IDs per feature per example. In order to enable this -“jagged” representation, we use the TorchRec datastructure -|KeyedJaggedTensor|_ (KJT). - -Let’s take a look at how to lookup a collection of two embedding -bags, “product” and “user”. Assume the minibatch is made up of three -examples for three users. The first of which has two product IDs, the -second with none, and the third with one product ID. - -:: - - |------------|------------| - | product ID | user ID | - |------------|------------| - | [101, 202] | [404] | - | [] | [505] | - | [303] | [606] | - |------------|------------| - -The query should be: - -.. code:: python - - mb = torchrec.KeyedJaggedTensor( - keys = ["product", "user"], - values = torch.tensor([101, 202, 303, 404, 505, 606]).cuda(), - lengths = torch.tensor([2, 0, 1, 1, 1, 1], dtype=torch.int64).cuda(), - ) - - print(mb.to(torch.device("cpu"))) - - -Note that the KJT batch size is -``batch_size = len(lengths)//len(keys)``. In the above example, -batch_size is 3. - - - -Putting it all together, querying our distributed model with a KJT minibatch -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Finally, we can query our model using our minibatch of products and -users. - -The resulting lookup will contain a KeyedTensor, where each key (or -feature) contains a 2D tensor of size 3x64 (batch_size x embedding_dim). - -.. code:: python - - pooled_embeddings = model(mb) - print(pooled_embeddings) - - -More resources --------------- - -For more information, please see our -`dlrm `__ -example, which includes multinode training on the criteo terabyte -dataset, using Meta’s `DLRM `__. +Redirecting... +.. raw:: html -.. |DistributedModelParallel| replace:: ``DistributedModelParallel`` -.. _DistributedModelParallel: https://pytorch.org/torchrec/torchrec.distributed.html#torchrec.distributed.model_parallel.DistributedModelParallel -.. |EmbeddingBagCollection| replace:: ``EmbeddingBagCollection`` -.. _EmbeddingBagCollection: https://pytorch.org/torchrec/torchrec.modules.html#torchrec.modules.embedding_modules.EmbeddingBagCollection -.. |KeyedJaggedTensor| replace:: ``KeyedJaggedTensor`` -.. _KeyedJaggedTensor: https://pytorch.org/torchrec/torchrec.sparse.html#torchrec.sparse.jagged_tensor.JaggedTensor -.. |torch.nn.Embedding| replace:: ``torch.nn.Embedding`` -.. _torch.nn.Embedding: https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html -.. |torch.nn.EmbeddingBag| replace:: ``torch.nn.EmbeddingBag`` -.. 
_torch.nn.EmbeddingBag: https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html -.. |nn.Embedding| replace:: ``nn.Embedding`` -.. _nn.Embedding: https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html -.. |nn.EmbeddingBag| replace:: ``nn.EmbeddingBag`` -.. _nn.EmbeddingBag: https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html + diff --git a/intermediate_source/torchserve_with_ipex.rst b/intermediate_source/torchserve_with_ipex.rst deleted file mode 100644 index 1a11b4180f4..00000000000 --- a/intermediate_source/torchserve_with_ipex.rst +++ /dev/null @@ -1,394 +0,0 @@ -Grokking PyTorch Intel CPU performance from first principles -============================================================ - -A case study on the TorchServe inference framework optimized with `Intel® Extension for PyTorch* `_. - -Authors: Min Jean Cho, Mark Saroufim - -Reviewers: Ashok Emani, Jiong Gong - -Getting a strong out-of-box performance for deep learning on CPUs can be tricky but it’s much easier if you’re aware of the main problems that affect performance, how to measure them and how to solve them. - -TL;DR - -+-----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+ -| Problem | How to measure it | Solution | -+-----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+ -| Bottlenecked GEMM execution units | - `Imbalance or Serial Spinning `_ | Avoid using logical cores by setting thread affinity to physical cores via core pinning | -| | - `Front-End Bound `_ | | -| | - `Core Bound `_ | | -+-----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+ -| Non Uniform Memory Access (NUMA) | - Local vs. remote memory access | Avoid cross-socket computation by setting thread affinity to a specific socket via core pinning | -| | - `UPI Utilization `_ | | -| | - Latency in memory accesses | | -| | - Thread migration | | -+-----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+ - -*GEMM (General Matrix Multiply)* run on fused-multiply-add (FMA) or dot-product (DP) execution units which will be bottlenecked and cause delays in thread waiting/*spinning at synchronization* barrier when *hyperthreading* is enabled - because using logical cores causes insufficient concurrency for all working threads as each logical thread *contends for the same core resources*. Instead, if we use 1 thread per physical core, we avoid this contention. 
So we generally recommend *avoiding logical cores* by setting CPU *thread affinity* to physical cores via *core pinning*. - -Multi-socket systems have *Non-Uniform Memory Access (NUMA)* which is a shared memory architecture that describes the placement of main memory modules with respect to processors. But if a process is not NUMA-aware, slow *remote memory* is frequently accessed when *threads migrate* cross socket via *Intel Ultra Path Interconnect (UPI)* during run time. We address this problem by setting CPU *thread affinity* to a specific socket via *core pinning*. - -Knowing these principles in mind, proper CPU runtime configuration can significantly boost out-of-box performance. - -In this blog, we'll walk you through the important runtime configurations you should be aware of from `CPU Performance Tuning Guide `_, explain how they work, how to profile them and how to integrate them within a model serving framework like `TorchServe `_ via an easy to use `launch script `_ which we’ve `integrated `_ :superscript:`1` natively. - -We’ll explain all of these ideas :strong:`visually` from :strong:`first principles` with lots of :strong:`profiles` and show you how we applied our learnings to make out of the box CPU performance on TorchServe better. - -1. The feature has to be explicitly enabled by setting *cpu_launcher_enable=true* in *config.properties*. - -Avoid logical cores for deep learning -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Avoiding logical cores for deep learning workloads generally improves performance. To understand this, let us take a step back to GEMM. - -:strong:`Optimizing GEMM optimizes deep learning` - -The majority of time in deep learning training or inference is spent on millions of repeated operations of GEMM which is at the core of fully connected layers. Fully connected layers have been used for decades since multi-layer perceptrons (MLP) `proved to be a universal approximator of any continuous function `_. Any MLP can be entirely represented as GEMM. And even a convolution can be represented as a GEMM by using a `Toepliz matrix `_. - -Returning to the original topic, most GEMM operators benefit from using non-hyperthreading, because the majority of time in deep learning training or inference is spent on millions of repeated operations of GEMM running on fused-multiply-add (FMA) or dot-product (DP) execution units shared by hyperthreading cores. With hyperthreading enabled, OpenMP threads will contend for the same GEMM execution units. - -.. figure:: /_static/img/torchserve-ipex-images/1_.png - :width: 70% - :align: center - -And if 2 logical threads run GEMM at the same time, they will be sharing the same core resources causing front end bound, such that the overhead from this front end bound is greater than the gain from running both logical threads at the same time. - -Therefore we generally recommend avoiding using logical cores for deep learning workloads to achieve good performance. The launch script by default uses physical cores only; however, users can easily experiment with logical vs. physical cores by simply toggling the ``--use_logical_core`` launch script knob. - -:strong:`Exercise` - -We'll use the following example of feeding ResNet50 dummy tensor: - -.. 
code:: python - - import torch - import torchvision.models as models - import time - - model = models.resnet50(pretrained=False) - model.eval() - data = torch.rand(1, 3, 224, 224) - - # warm up - for _ in range(100): - model(data) - - start = time.time() - for _ in range(100): - model(data) - end = time.time() - print('Inference took {:.2f} ms in average'.format((end-start)/100*1000)) - -Throughout the blog, we'll use `Intel® VTune™ Profiler `_ to profile and verify optimizations. And we'll run all exercises on a machine with two Intel(R) Xeon(R) Platinum 8180M CPUs. The CPU information is shown in Figure 2.1. - -Environment variable ``OMP_NUM_THREADS`` is used to set the number of threads for parallel region. We'll compare ``OMP_NUM_THREADS=2`` with (1) use of logical cores and (2) use of physical cores only. - -(1) Both OpenMP threads trying to utilize the same GEMM execution units shared by hyperthreading cores (0, 56) - -We can visualize this by running ``htop`` command on Linux as shown below. - -.. figure:: /_static/img/torchserve-ipex-images/2.png - :width: 100% - :align: center - - -.. figure:: /_static/img/torchserve-ipex-images/3.png - :width: 100% - :align: center - -We notice that the Spin Time is flagged, and Imbalance or Serial Spinning contributed to the majority of it - 4.980 seconds out of the 8.982 seconds total. The Imbalance or Serial Spinning when using logical cores is due to insufficient concurrency of working threads as each logical thread contends for the same core resources. - -The Top Hotspots section of the execution summary indicates that ``__kmp_fork_barrier`` took 4.589 seconds of CPU time - during 9.33% of the CPU execution time, threads were just spinning at this barrier due to thread synchronization. - -(2) Each OpenMP thread utilizing GEMM execution units in respective physical cores (0,1) - - -.. figure:: /_static/img/torchserve-ipex-images/4.png - :width: 80% - :align: center - - -.. figure:: /_static/img/torchserve-ipex-images/5.png - :width: 80% - :align: center - -We first note that the execution time dropped from 32 seconds to 23 seconds by avoiding logical cores. While there's still some non-negligible Imbalance or Serial Spinning, we note relative improvement from 4.980 seconds to 3.887 seconds. - -By not using logical threads (instead, using 1 thread per physical core), we avoid logical threads contending for the same core resources. The Top Hotspots section also indicates relative improvement of ``__kmp_fork_barrier`` time from 4.589 seconds to 3.530 seconds. - -Local memory access is always faster than remote memory access -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -We generally recommend binding a process to a local socket such that the process does not migrate across sockets. Generally the goal of doing so is to utilize high speed cache on local memory and to avoid remote memory access which can be ~2x slower. - - -.. figure:: /_static/img/torchserve-ipex-images/6.png - :width: 80% - :align: center -Figure 1. Two-socket configuration - -Figure 1. shows a typical two-socket configuration. Notice that each socket has its own local memory. Sockets are connected to each other via Intel Ultra Path Interconnect (UPI) which allows each socket to access the local memory of another socket called remote memory. Local memory access is always faster than remote memory access. - -.. figure:: /_static/img/torchserve-ipex-images/7.png - :width: 50% - :align: center -Figure 2.1. 
CPU information - -Users can get their CPU information by running ``lscpu`` command on their Linux machine. Figure 2.1. shows an example of ``lscpu`` execution on a machine with two Intel(R) Xeon(R) Platinum 8180M CPUs. Notice that there are 28 cores per socket, and 2 threads per core (i.e., hyperthreading is enabled). In other words, there are 28 logical cores in addition to 28 physical cores, giving a total of 56 cores per socket. And there are 2 sockets, giving a total of 112 cores (``Thread(s) per core`` x ``Core(s) per socket`` x ``Socket(s)``). - -.. figure:: /_static/img/torchserve-ipex-images/8.png - :width: 100% - :align: center -Figure 2.2. CPU information - -The 2 sockets are mapped to 2 NUMA nodes (NUMA node 0, NUMA node 1) respectively. Physical cores are indexed prior to logical cores. As shown in Figure 2.2., the first 28 physical cores (0-27) and the first 28 logical cores (56-83) on the first socket are on NUMA node 0. And the second 28 physical cores (28-55) and the second 28 logical cores (84-111) on the second socket are on NUMA node 1. Cores on the same socket share local memory and last level cache (LLC) which is much faster than cross-socket communication via Intel UPI. - -Now that we understand NUMA, cross-socket (UPI) traffic, local vs. remote memory access in multi-processor systems, let's profile and verify our understanding. - -:strong:`Exercise` - -We'll reuse the ResNet50 example above. - -As we did not pin threads to processor cores of a specific socket, the operating system periodically schedules threads on processor cores located in different sockets. - -.. figure:: /_static/img/torchserve-ipex-images/9.gif - :width: 100% - :align: center - -Figure 3. CPU usage of non NUMA-aware application. 1 main worker thread was launched, then it launched a physical core number (56) of threads on all cores, including logical cores. - -(Aside: If the number of threads is not set by `torch.set_num_threads `_, the default number of threads is the number of physical cores in a hyperthreading enabled system. This can be verified by `torch.get_num_threads `_. Hence we see above about half of the cores busy running the example script.) - -.. figure:: /_static/img/torchserve-ipex-images/10.png - :width: 100% - :align: center -Figure 4. Non-Uniform Memory Access Analysis graph - - -Figure 4. compares local vs. remote memory access over time. We verify usage of remote memory which could result in sub-optimal performance. - -:strong:`Set thread affinity to reduce remote memory access and cross-socket (UPI) traffic` - -Pinning threads to cores on the same socket helps maintain locality of memory access. In this example, we'll pin to the physical cores on the first NUMA node (0-27). With the launch script, users can easily experiment with NUMA nodes configuration by simply toggling the ``--node_id`` launch script knob. - -Let's visualize the CPU usage now. - -.. figure:: /_static/img/torchserve-ipex-images/11.gif - :width: 100% - :align: center -Figure 5. CPU usage of NUMA-aware application - -1 main worker thread was launched, then it launched threads on all physical cores on the first numa node. - -.. figure:: /_static/img/torchserve-ipex-images/12.png - :width: 100% - :align: center -Figure 6. Non-Uniform Memory Access Analysis graph - -As shown in Figure 6., now almost all memory accesses are local accesses. 
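-
-The launcher handles this pinning for you, but the effect can be reproduced by hand. The following is a minimal sketch (not part of the original exercise) that restricts the current process to the first NUMA node's physical cores (0-27 on the machine from Figure 2.2) using the standard Linux-only ``os.sched_setaffinity`` API, and sizes the OpenMP thread pool to match:
-
-.. code:: python
-
-    import os
-    import torch
-
-    # Physical cores of NUMA node 0 on this machine (see the lscpu output in Figure 2.2).
-    node0_physical_cores = set(range(0, 28))
-
-    # Restrict the current process (pid 0 = self) to those cores; Linux only.
-    os.sched_setaffinity(0, node0_physical_cores)
-
-    # One OpenMP thread per physical core, matching the affinity mask.
-    torch.set_num_threads(len(node0_physical_cores))
-
-    print(os.sched_getaffinity(0), torch.get_num_threads())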
- -Efficient CPU usage with core pinning for multi-worker inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -When running multi-worker inference, cores are overlapped (or shared) between workers causing inefficient CPU usage. To address this problem, the launch script equally divides the number of available cores by the number of workers such that each worker is pinned to assigned cores during runtime. - -:strong:`Exercise with TorchServe` - -For this exercise, let's apply the CPU performance tuning principles and recommendations that we have discussed so far to `TorchServe apache-bench benchmarking `_. - -We'll use ResNet50 with 4 workers, concurrency 100, requests 10,000. All other parameters (e.g., batch_size, input, etc) are the same as the `default parameters `_. - -We'll compare the following three configurations: - -(1) default TorchServe setting (no core pinning) - -(2) `torch.set_num_threads `_ = ``number of physical cores / number of workers`` (no core pinning) - -(3) core pinning via the launch script (Required Torchserve>=0.6.1) - -After this exercise, we'll have verified that we prefer avoiding logical cores and prefer local memory access via core pinning with a real TorchServe use case. - -1. Default TorchServe setting (no core pinning) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The `base_handler `_ doesn't explicitly set `torch.set_num_threads `_. Hence the default number of threads is the number of physical CPU cores as described `here `_. Users can check the number of threads by `torch.get_num_threads `_ in the base_handler. Each of the 4 main worker threads launches a physical core number (56) of threads, launching a total of 56x4 = 224 threads, which is more than the total number of cores 112. Therefore cores are guaranteed to be heavily overlapped with high logical core utilization- multiple workers using multiple cores at the same time. Furthermore, because threads are not affinitized to specific CPU cores, the operating system periodically schedules threads to cores located in different sockets. - -1. CPU usage - -.. figure:: /_static/img/torchserve-ipex-images/13.png - :width: 100% - :align: center - -4 main worker threads were launched, then each launched a physical core number (56) of threads on all cores, including logical cores. - -2. Core Bound stalls - -.. figure:: /_static/img/torchserve-ipex-images/14.png - :width: 80% - :align: center - -We observe a very high Core Bound stall of 88.4%, decreasing pipeline efficiency. Core Bound stalls indicate sub-optimal use of available execution units in the CPU. For example, several GEMM instructions in a row competing for fused-multiply-add (FMA) or dot-product (DP) execution units shared by hyperthreading cores could cause Core Bound stalls. And as described in the previous section, use of logical cores amplifies this problem. - - -.. figure:: /_static/img/torchserve-ipex-images/15.png - :width: 40% - :align: center - -.. figure:: /_static/img/torchserve-ipex-images/16.png - :width: 50% - :align: center - -An empty pipeline slot not filled with micro-ops (uOps) is attributed to a stall. For example, without core pinning CPU usage may not effectively be on compute but on other operations like thread scheduling from Linux kernel. We see above that ``__sched_yield`` contributed to the majority of the Spin Time. - -3. Thread Migration - -Without core pinning, scheduler may migrate thread executing on a core to a different core. 
Thread migration can disassociate the thread from data that has already been fetched into the caches resulting in longer data access latencies. This problem is exacerbated in NUMA systems when thread migrates across sockets. Data that has been fetched to high speed cache on local memory now becomes remote memory, which is much slower. - -.. figure:: /_static/img/torchserve-ipex-images/17.png - :width: 50% - :align: center - -Generally the total number of threads should be less than or equal to the total number of threads supported by the core. In the above example, we notice a large number of threads executing on core_51 instead of the expected 2 threads (since hyperthreading is enabled in Intel(R) Xeon(R) Platinum 8180 CPUs) . This indicates thread migration. - -.. figure:: /_static/img/torchserve-ipex-images/18.png - :width: 80% - :align: center - -Additionally, notice that thread (TID:97097) was executing on a large number of CPU cores, indicating CPU migration. For example, this thread was executing on cpu_81, then migrated to cpu_14, then migrated to cpu_5, and so on. Furthermore, note that this thread migrated cross socket back and forth many times, resulting in very inefficient memory access. For example, this thread executed on cpu_70 (NUMA node 0), then migrated to cpu_100 (NUMA node 1), then migrated to cpu_24 (NUMA node 0). - -4. Non Uniform Memory Access Analysis - -.. figure:: /_static/img/torchserve-ipex-images/19.png - :width: 100% - :align: center - -Compare local vs. remote memory access over time. We observe that about half, 51.09%, of the memory accesses were remote accesses, indicating sub-optimal NUMA configuration. - -2. torch.set_num_threads = ``number of physical cores / number of workers`` (no core pinning) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -For an apple-to-apple comparison with launcher's core pinning, we'll set the number of threads to the number of cores divided by the number of workers (launcher does this internally). Add the following code snippet in the `base_handler `_: - -.. code:: python - - torch.set_num_threads(num_physical_cores/num_workers) - -As before without core pinning, these threads are not affinitized to specific CPU cores, causing the operating system to periodically schedule threads on cores located in different sockets. - -1. CPU usage - -.. figure:: /_static/img/torchserve-ipex-images/20.gif - :width: 100% - :align: center - -4 main worker threads were launched, then each launched a ``num_physical_cores/num_workers`` number (14) of threads on all cores, including logical cores. - -2. Core Bound stalls - -.. figure:: /_static/img/torchserve-ipex-images/21.png - :width: 80% - :align: center - -Although the percentage of Core Bound stalls has decreased from 88.4% to 73.5%, the Core Bound is still very high. - -.. figure:: /_static/img/torchserve-ipex-images/22.png - :width: 40% - :align: center - -.. figure:: /_static/img/torchserve-ipex-images/23.png - :width: 50% - :align: center - -3. Thread Migration - -.. figure:: /_static/img/torchserve-ipex-images/24.png - :width: 75% - :align: center - -Similar as before, without core pinning thread (TID:94290) was executing on a large number of CPU cores, indicating CPU migration. We notice again cross-socket thread migration, resulting in very inefficient memory access. For example, this thread executed on cpu_78 (NUMA node 0), then migrated to cpu_108 (NUMA node 1). - -4. Non Uniform Memory Access Analysis - -.. 
figure:: /_static/img/torchserve-ipex-images/25.png - :width: 100% - :align: center - -Although an improvement from the original 51.09%, still 40.45% of memory access is remote, indicating sub-optimal NUMA configuration. - -3. launcher core pinning -~~~~~~~~~~~~~~~~~~~~~~~~ -Launcher will internally equally distribute physical cores to workers, and bind them to each worker. As a reminder, launcher by default uses physical cores only. In this example, launcher will bind worker 0 to cores 0-13 (NUMA node 0), worker 1 to cores 14-27 (NUMA node 0), worker 2 to cores 28-41 (NUMA node 1), and worker 3 to cores 42-55 (NUMA node 1). Doing so ensures that cores are not overlapped among workers and avoids logical core usage. - -1. CPU usage - -.. figure:: /_static/img/torchserve-ipex-images/26.gif - :width: 100% - :align: center - -4 main worker threads were launched, then each launched a ``num_physical_cores/num_workers`` number (14) of threads affinitized to the assigned physical cores. - -2. Core Bound stalls - -.. figure:: /_static/img/torchserve-ipex-images/27.png - :width: 80% - :align: center - -Core Bound stalls has decreased significantly from the original 88.4% to 46.2% - almost a 2x improvement. - -.. figure:: /_static/img/torchserve-ipex-images/28.png - :width: 40% - :align: center - -.. figure:: /_static/img/torchserve-ipex-images/29.png - :width: 50% - :align: center - -We verify that with core binding, most CPU time is effectively used on compute - Spin Time of 0.256s. - -3. Thread Migration - -.. figure:: /_static/img/torchserve-ipex-images/30.png - :width: 100% - :align: center - -We verify that `OMP Primary Thread #0` was bound to assigned physical cores (42-55), and did not migrate cross-socket. - -4. Non Uniform Memory Access Analysis - -.. figure:: /_static/img/torchserve-ipex-images/31.png - :width: 100% - :align: center - -Now almost all, 89.52%, memory accesses are local accesses. - -Conclusion -~~~~~~~~~~ - -In this blog, we've showcased that properly setting your CPU runtime configuration can significantly boost out-of-box CPU performance. - -We have walked through some general CPU performance tuning principles and recommendations: - -- In a hyperthreading enabled system, avoid logical cores by setting thread affinity to physical cores only via core pinning. -- In a multi-socket system with NUMA, avoid cross-socket remote memory access by setting thread affinity to a specific socket via core pinning. - -We have visually explained these ideas from first principles and have verified the performance boost with profiling. And finally, we have applied all of our learnings to TorchServe to boost out-of-box TorchServe CPU performance. - -These principles can be automatically configured via an easy to use launch script which has already been integrated into TorchServe. - -For interested readers, please check out the following documents: - -- `CPU specific optimizations `_ -- `Maximize Performance of Intel® Software Optimization for PyTorch* on CPU `_ -- `Performance Tuning Guide `_ -- `Launch Script Usage Guide `_ -- `Top-down Microarchitecture Analysis Method `_ -- `Configuring oneDNN for Benchmarking `_ -- `Intel® VTune™ Profiler `_ -- `Intel® VTune™ Profiler User Guide `_ - -And stay tuned for a follow-up posts on optimized kernels on CPU via `Intel® Extension for PyTorch* `_ and advanced launcher configurations such as memory allocator. 
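-
-For reference, enabling the integrated launch script in TorchServe (see footnote 1 above) comes down to a couple of lines in *config.properties*; the knobs passed via *cpu_launcher_args* (here, binding to the first NUMA node) should be adapted to your machine::
-
-    cpu_launcher_enable=true
-    cpu_launcher_args=--node_id 0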
- -Acknowledgement -~~~~~~~~~~~~~~~ - -We would like to thank Ashok Emani (Intel) and Jiong Gong (Intel) for their immense guidance and support, and thorough feedback and reviews throughout many steps of this blog. We would also like to thank Hamid Shojanazeri (Meta), Li Ning (AWS) and Jing Xu (Intel) for helpful feedback in code review. And Suraj Subramanian (Meta) and Geeta Chauhan (Meta) for helpful feedback on the blog. diff --git a/intermediate_source/torchserve_with_ipex_2.rst b/intermediate_source/torchserve_with_ipex_2.rst deleted file mode 100644 index 64f3db6b27c..00000000000 --- a/intermediate_source/torchserve_with_ipex_2.rst +++ /dev/null @@ -1,447 +0,0 @@ -Grokking PyTorch Intel CPU performance from first principles (Part 2) -===================================================================== - -Authors: `Min Jean Cho `_, `Jing Xu `_, `Mark Saroufim `_ - -In the `Grokking PyTorch Intel CPU Performance From First Principles `_ tutorial -, we have introduced how to tune CPU runtime configurations, how to profile them, and how to integrate them into `TorchServe `_ for optimized CPU performance. - -In this tutorial, we will demonstrate boosting performance with memory allocator via the `Intel® Extension for PyTorch* Launcher `_ -, and optimized kernels on CPU via `Intel® Extension for PyTorch* `_ -, and apply them to TorchServe showcasing 7.71x throughput speedup for ResNet50 and 2.20x throughput speedup for BERT. - -.. figure:: /_static/img/torchserve-ipex-images-2/1.png - :width: 100% - :align: center - -Prerequisites -------------- -Throughout this tutorial, we will use `Top-down Microarchitecture Analysis (TMA) `_ to profile and show that the Back End Bound (Memory Bound, Core Bound) is often the primary bottleneck for under-optimized or under-tuned deep learning workloads, and demonstrate optimization techniques via Intel® Extension for PyTorch* for improving Back End Bound. We will use `toplev `_, a tool part of `pmu-tools `_ built on top of `Linux perf `_, for TMA. - -We will also use `Intel® VTune™ Profiler's Instrumentation and Tracing Technology (ITT) `__ to profile at finer granularity. - -Top-down Microarchitecture Analysis Method (TMA) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -When tuning CPU for optimal performance, it's useful to know where the bottleneck is. Most CPU cores have on-chip Performance Monitoring Units (PMUs). PMUs are dedicated pieces of logic within a CPU core that count specific hardware events as they occur on the system. Examples of these events may be Cache Misses or Branch Mispredictions. PMUs are used for Top-down Microarchitecture Analysis (TMA) to identify the bottlenecks. TMA consists of hierarchical levels as shown: - -.. figure:: /_static/img/torchserve-ipex-images-2/2.png - :width: 100% - :align: center - -The top level, level-1, metrics collect *Retiring*, *Bad Speculation*, *Front End Bound*, *Back End Bound*. The pipeline of CPU can conceptually be simplified and divided into two: the frontend and the backend. The *frontend* is responsible for fetching the program code and decoding them into low-level hardware operations called micro-ops (uOps). The uOps are then fed to the *backend* in a process called allocation. Once allocated, the backend is responsible for executing the uOp in an available execution unit. A completion of uOp's execution is called *retirement*. In contrast, a *bad speculation* is when speculatively fetched uOps are canceled before retiring such as in the case of mispredicted branches. 
Each of these metrics can further be broken down in the subsequent levels to pinpoint the bottleneck. - -Tune for the Back End Bound -+++++++++++++++++++++++++++ -The majority of untuned deep learning workloads will be Back End Bound. Resolving Back End bound is often resolving sources of latency causing retirement to take longer than necessary. As shown above, Back End Bound has two sub-metrics – Core Bound and Memory Bound. - -Memory Bound stalls have causes related to the memory subsystem. For example, last-level cache (LLC or L3 cache) miss causing access to DRAM. Scaling deep learning models often requires significant compute. And high compute utilization requires that data is available when the execution units need it to execute the uOps. This requires prefetching the data and reusing the data in cache instead of fetching that same data multiple times from main memory which causes execution units to be starved while data is being returned. Throughout this tutorial, we wll show that a more efficient memory allocator, operator fusion, memory layout format optimization reduce overhead on Memory Bound with better cache locality. - -Core Bound stalls indicate sub-optimal use of available execution units while there are no uncompleted memory accesses. For example, several general matrix-matrix multiplication (GEMM) instructions in a row competing for fused-multiply-add (FMA) or dot-product (DP) execution units could cause Core Bound stalls. Key deep learning kernels, including the DP kernels, have been well optimized by `oneDNN library `_ (oneAPI Deep Neural Network Library), reducing overhead on Core Bound. - -Operations like GEMM, convolution, deconvolution are compute-intensive. While operations like pooling, batch normalization, activation functions like ReLU are memory-bound. - -Intel® VTune™ Profiler's Instrumentation and Tracing Technology (ITT) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The ITT APIs of Intel® VTune Profiler is a useful tool to annotate a region of your workload for tracing to profile and visualize at a finer granularity of your annotation – OP/function/sub-function granularity. By annotating at the granularity of your PyTorch model's OPs, Intel® VTune Profiler's ITT enables op-level profiling. Intel® VTune Profiler's ITT has been integrated into `PyTorch Autograd Profiler `_. :superscript:`1` - -1. The feature has to be explicitly enabled by *with torch.autograd.profiler.emit_itt()*. - -TorchServe with Intel® Extension for PyTorch* ---------------------------------------------- -`Intel® Extension for PyTorch* `__ is a Python package to extend PyTorch with optimizations for extra performance boost on Intel hardware. - -Intel® Extension for PyTorch* has already been integrated into TorchServe to improve the performance out-of-box. :superscript:`2` For custom handler scripts, we recommend adding the *intel_extension_for_pytorch* package in. - -2. The feature has to be explicitly enabled by setting *ipex_enable=true* in *config.properties*. - -Throughout this section, we will show that Back End Bound is often the primary bottleneck for under-optimized or under-tuned deep learning workloads, and demonstrate optimization techniques via Intel® Extension for PyTorch* for improving Back End Bound, which has two submetrics - Memory Bound, and Core Bound. A more efficient memory allocator, operator fusion, memory layout format optimization improve Memory Bound. 
Ideally, Memory Bound can be improved to Core Bound by optimized operators and better cache locality. And key deep learning primitives, such as convolution, matrix multiplication, dot-product, have been well optimized by Intel® Extension for PyTorch* and oneDNN library, improving Core Bound. - -Leveraging Advanced Launcher Configuration: Memory Allocator -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Memory allocator plays an important role from performance perspective. A more efficient memory usage reduces overhead on unnecessary memory allocations or destructions, and thus faster execution. For deep learning workloads in practice, especially those running on large multi-core systems or servers like TorchServe, TCMalloc, or JeMalloc can generally get better memory usage than the default PyTorch memory allocator, PTMalloc. - -TCMalloc, JeMalloc, PTMalloc -++++++++++++++++++++++++++++ -Both TCMalloc and JeMalloc use thread-local caches to reduce overhead on thread synchronization, and lock contention by using spinlocks and per-thread arenas respectively. TCMalloc and JeMalloc reduce overhead on unnecessary memory allocation and deallocation. Both allocators categorize memory allocations by sizes to reduce overhead on memory fragmentation. - -With the launcher, users can easily experiment with different memory allocators by choosing one of the three launcher knobs *--enable_tcmalloc* (TCMalloc), *--enable_jemalloc* (JeMalloc), *--use_default_allocator* (PTMalloc). - -Exercise -^^^^^^^^ -Let's profile PTMalloc vs. JeMalloc. - -We will use the launcher to designate the memory allocator, and to bind the workload to physical cores of the first socket to avoid any NUMA complication – to profile the effect of memory allocator only. - -The following example measures the average inference time of ResNet50: - -.. code:: python - - import torch - import torchvision.models as models - import time - - model = models.resnet50(pretrained=False) - model.eval() - batch_size = 32 - data = torch.rand(batch_size, 3, 224, 224) - - # warm up - for _ in range(100): - model(data) - - # measure - # Intel® VTune Profiler's ITT context manager - with torch.autograd.profiler.emit_itt(): - start = time.time() - for i in range(100): - # Intel® VTune Profiler's ITT to annotate each step - torch.profiler.itt.range_push('step_{}'.format(i)) - model(data) - torch.profiler.itt.range_pop() - end = time.time() - - print('Inference took {:.2f} ms in average'.format((end-start)/100*1000)) - -Let's collect level-1 TMA metrics. - -.. figure:: /_static/img/torchserve-ipex-images-2/3.png - :width: 100% - :align: center - -Level-1 TMA shows that both PTMalloc and JeMalloc are bounded by the backend. More than half of the execution time was stalled by the backend. Let's go one level deeper. - -.. figure:: /_static/img/torchserve-ipex-images-2/4.png - :width: 100% - :align: center - -Level-2 TMA shows that the Back End Bound was caused by Memory Bound. Let's go one level deeper. - -.. figure:: /_static/img/torchserve-ipex-images-2/5.png - :width: 100% - :align: center - -Most of the metrics under the Memory Bound identify which level of the memory hierarchy from the L1 cache to main memory is the bottleneck. A hotspot bounded at a given level indicates that most of the data was being retrieved from that cache or memory-level. Optimizations should focus on moving data closer to the core. Level-3 TMA shows that PTMalloc was bottlenecked by DRAM Bound. 
On the other hand, JeMalloc was bottlenecked by L1 Bound – JeMalloc moved data closer to the core, and thus faster execution. - -Let's look at Intel® VTune Profiler ITT trace. In the example script, we have annotated each *step_x* of the inference loop. - -.. figure:: /_static/img/torchserve-ipex-images-2/6.png - :width: 100% - :align: center - -Each step is traced in the timeline graph. The duration of model inference on the last step (step_99) decreased from 304.308 ms to 261.843 ms. - -Exercise with TorchServe -^^^^^^^^^^^^^^^^^^^^^^^^ -Let's profile PTMalloc vs. JeMalloc with TorchServe. - -We will use `TorchServe apache-bench benchmarking `_ with ResNet50 FP32, batch size 32, concurrency 32, requests 8960. All other parameters are the same as the `default parameters `_. - -As in the previous exercise, we will use the launcher to designate the memory allocator, and to bind the workload to physical cores of the first socket. To do so, user simply needs to add a few lines in `config.properties `__: - -PTMalloc - -.. code:: python - - cpu_launcher_enable=true - cpu_launcher_args=--node_id 0 --use_default_allocator - -JeMalloc - -.. code:: python - - cpu_launcher_enable=true - cpu_launcher_args=--node_id 0 --enable_jemalloc - -Let's collect level-1 TMA metrics. - -.. figure:: /_static/img/torchserve-ipex-images-2/7.png - :width: 100% - :align: center - -Let's go one level deeper. - -.. figure:: /_static/img/torchserve-ipex-images-2/8.png - :width: 100% - :align: center - -Let's use Intel® VTune Profiler ITT to annotate `TorchServe inference scope `_ to profile at inference-level granularity. As `TorchServe Architecture `_ consists of several sub-components, including the Java frontend for handling request/response, and the Python backend for running the actual inference on the models, it is helpful to use Intel® VTune Profiler ITT to limit the collection of trace data at inference-level. - -.. figure:: /_static/img/torchserve-ipex-images-2/9.png - :width: 100% - :align: center - -Each inference call is traced in the timeline graph. The duration of the last model inference decreased from 561.688 ms to 251.287 ms - 2.2x speedup. - -.. figure:: /_static/img/torchserve-ipex-images-2/10.png - :width: 100% - :align: center - -The timeline graph can be expanded to see op-level profiling results. The duration of *aten::conv2d* decreased from 16.401 ms to 6.392 ms - 2.6x speedup. - -In this section, we have demonstrated that JeMalloc can give better performance than the default PyTorch memory allocator, PTMalloc, with efficient thread-local caches improving Back-End-Bound. 
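-
-A quick sanity check that is not part of the benchmark above: on Linux you can confirm which allocator a running worker actually loaded by scanning its memory maps, for example:
-
-.. code:: python
-
-    import os
-
-    # List the allocator shared libraries mapped into this process (Linux only).
-    with open(f"/proc/{os.getpid()}/maps") as f:
-        mapped = f.read()
-
-    for name in ("jemalloc", "tcmalloc"):
-        print(name, "loaded:", name in mapped)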
- -Intel® Extension for PyTorch* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The three major `Intel® Extension for PyTorch* `__ optimization techniques, Operator, Graph, Runtime, are as shown: - -+------------------------------------------------------------------------------------------------------------------------+ -| Intel® Extension for PyTorch* Optimization Techniques | -+======================================================+=======================================+=========================+ -| Operator | Graph | Runtime | -+------------------------------------------------------+---------------------------------------+-------------------------+ -| - Vectorization and Multi-threading | - Constant folding to reduce compute | - Thread affinitization | -| - Low-precision BF16/INT8 compute | - Op fusion for better cache locality | - Memory buffer pooling | -| - Data layout optimization for better cache locality | | - GPU runtime | -| | | - Launcher | -+------------------------------------------------------+---------------------------------------+-------------------------+ - -Operator Optimization -+++++++++++++++++++++ -Optimized operators and kernels are registered through PyTorch dispatching mechanism. These operators and kernels are accelerated from native vectorization feature and matrix calculation feature of Intel hardware. During execution, Intel® Extension for PyTorch* intercepts invocation of ATen operators, and replaces the original ones with these optimized ones. Popular operators like Convolution, Linear have been optimized in Intel® Extension for PyTorch*. - -Exercise -^^^^^^^^ -Let's profile optimized operator with Intel® Extension for PyTorch*. We will compare with and without the lines in code changes. - -As in the previous exercises, we will bind the workload to physical cores of the first socket. - -.. code:: python - - import torch - - class Model(torch.nn.Module): - def __init__(self): - super(Model, self).__init__() - self.conv = torch.nn.Conv2d(16, 33, 3, stride=2) - self.relu = torch.nn.ReLU() - - def forward(self, x): - x = self.conv(x) - x = self.relu(x) - return x - - model = Model() - model.eval() - data = torch.rand(20, 16, 50, 100) - - #################### code changes #################### - import intel_extension_for_pytorch as ipex - model = ipex.optimize(model) - ###################################################### - - print(model) - -The model consists of two operations—Conv2d and ReLU. By printing the model object, we get the following output. - -.. figure:: /_static/img/torchserve-ipex-images-2/11.png - :width: 60% - :align: center - -Let's collect level-1 TMA metrics. - -.. figure:: /_static/img/torchserve-ipex-images-2/12.png - :width: 100% - :align: center - -Notice the Back End Bound reduced from 68.9 to 38.5 – 1.8x speedup. - -Additionally, let's profile with PyTorch Profiler. - -.. figure:: /_static/img/torchserve-ipex-images-2/13.png - :width: 100% - :align: center - -Notice the CPU time reduced from 851 us to 310 us – 2.7X speedup. - -Graph Optimization -++++++++++++++++++ -It is highly recommended for users to take advantage of Intel® Extension for PyTorch* with `TorchScript `_ for further graph optimizations. To optimize performance further with TorchScript, Intel® Extension for PyTorch* supports oneDNN fusion of frequently used FP32/BF16 operator patterns, like Conv2D+ReLU, Linear+ReLU, and more to reduce operator/kernel invocation overheads, and for better cache locality. 
Some operator fusions allow to maintain temporary calculations, data type conversions, data layouts for better cache locality. As well as for INT8, Intel® Extension for PyTorch* has built-in quantization recipes to deliver good statistical accuracy for popular DL workloads including CNN, NLP and recommendation models. The quantized model is then optimized with oneDNN fusion support. - -Exercise -^^^^^^^^ -Let's profile FP32 graph optimization with TorchScript. - -As in the previous exercises, we will bind the workload to physical cores of the first socket. - -.. code:: python - - import torch - - class Model(torch.nn.Module): - def __init__(self): - super(Model, self).__init__() - self.conv = torch.nn.Conv2d(16, 33, 3, stride=2) - self.relu = torch.nn.ReLU() - - def forward(self, x): - x = self.conv(x) - x = self.relu(x) - return x - - model = Model() - model.eval() - data = torch.rand(20, 16, 50, 100) - - #################### code changes #################### - import intel_extension_for_pytorch as ipex - model = ipex.optimize(model) - ###################################################### - - # torchscript - with torch.no_grad(): - model = torch.jit.trace(model, data) - model = torch.jit.freeze(model) - -Let's collect level-1 TMA metrics. - -.. figure:: /_static/img/torchserve-ipex-images-2/14.png - :width: 100% - :align: center - -Notice the Back End Bound reduced from 67.1 to 37.5 – 1.8x speedup. - -Additionally, let's profile with PyTorch Profiler. - -.. figure:: /_static/img/torchserve-ipex-images-2/15.png - :width: 100% - :align: center - -Notice that with Intel® Extension for PyTorch* Conv + ReLU operators are fused, and the CPU time reduced from 803 us to 248 us – 3.2X speedup. The oneDNN eltwise post-op enables fusing a primitive with an elementwise primitive. This is one of the most popular kinds of fusion: an eltwise (typically an activation function such as ReLU) with preceding convolution or inner product. Have a look at the oneDNN verbose log shown in the next section. - -Channels Last Memory Format -+++++++++++++++++++++++++++ -When invoking *ipex.optimize* on model, Intel® Extension for PyTorch* automatically converts the model to optimized memory format, channels last. Channels last is a memory format that is more friendly to Intel Architecture. Compared to PyTorch default channels first NCHW (batch, channels, height, width) memory format, channels last NHWC (batch, height, width, channels) memory format generally accelerates convolutional neural networks with better cache locality. - -One thing to note is that it is expensive to convert memory format. So it's better to convert the memory format prior to deployment once, and keep the memory format conversion minimum during deployment. As the data propagates through model's layers the channels last memory format is preserved through consecutive channels last supported layers (for example, Conv2d -> ReLU -> Conv2d) and conversions are only made in between channels last unsupported layers. See `Memory Format Propagation `_ for more details. - -Exercise -^^^^^^^^ -Let's demonstrate channels last optimization. - -.. 
code:: python - - import torch - - class Model(torch.nn.Module): - def __init__(self): - super(Model, self).__init__() - self.conv = torch.nn.Conv2d(16, 33, 3, stride=2) - self.relu = torch.nn.ReLU() - - def forward(self, x): - x = self.conv(x) - x = self.relu(x) - return x - - model = Model() - model.eval() - data = torch.rand(20, 16, 50, 100) - - import intel_extension_for_pytorch as ipex - ############################### code changes ############################### - ipex.disable_auto_channels_last() # omit this line for channels_last (default) - ############################################################################ - model = ipex.optimize(model) - - with torch.no_grad(): - model = torch.jit.trace(model, data) - model = torch.jit.freeze(model) - -We will use `oneDNN verbose mode `_, a tool to help collect information at oneDNN graph level such as operator fusions, kernel execution time spent on executing oneDNN primitives. For more information, refer to the `oneDNN Documentation `_. - -.. figure:: /_static/img/torchserve-ipex-images-2/16.png - :width: 15% - :align: center - -.. figure:: /_static/img/torchserve-ipex-images-2/17.png - :width: 100% - :align: center - -Above is oneDNN verbose from channels first. We can verify that there are reorders from weight and data, then do computation, and finally reorder output back. - -.. figure:: /_static/img/torchserve-ipex-images-2/18.png - :width: 80% - :align: center - -Above is oneDNN verbose from channels last. We can verify that channels last memory format avoids unnecessary reorders. - -Performance Boost with Intel® Extension for PyTorch* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Below summarizes performance boost of TorchServe with Intel® Extension for PyTorch* for ResNet50 and BERT-base-uncased. - -.. figure:: /_static/img/torchserve-ipex-images-2/19.png - :width: 100% - :align: center - -Exercise with TorchServe -~~~~~~~~~~~~~~~~~~~~~~~~ -Let's profile Intel® Extension for PyTorch* optimizations with TorchServe. - -We will use `TorchServe apache-bench benchmarking `_ with ResNet50 FP32 TorchScript, batch size 32, concurrency 32, requests 8960. All other parameters are the same as the `default parameters `_. - -As in the previous exercise, we will use the launcher to bind the workload to physical cores of the first socket. To do so, user simply needs to add a few lines in `config.properties `__: - -.. code:: python - - cpu_launcher_enable=true - cpu_launcher_args=--node_id 0 - -Let's collect level-1 TMA metrics. - -.. figure:: /_static/img/torchserve-ipex-images-2/20.png - :width: 100% - :align: center - -Level-1 TMA shows that both are bounded by the backend. As discussed earlier, the majority of untuned deep learning workloads will be Back End Bound. Notice the Back End Bound reduced from 70.0 to 54.1. Let's go one level deeper. - -.. figure:: /_static/img/torchserve-ipex-images-2/21.png - :width: 100% - :align: center - -As discussed earlier, Back End Bound has two submetrics – Memory Bound and Core Bound. Memory Bound indicates the workload is under-optimized or under-utilized, and ideally memory-bound operations can be improved to core-bound by optimizing the OPs and improving cache locality. Level-2 TMA shows that the Back End Bound improved from Memory Bound to Core Bound. Let's go one level deeper. - -.. 
figure:: /_static/img/torchserve-ipex-images-2/22.png - :width: 100% - :align: center - -Scaling deep learning models for production on a model serving framework like TorchServe requires high compute utilization. This requires that data is available through prefetching and reusing the data in cache when the execution units need it to execute the uOps. Level-3 TMA shows that the Back End Memory Bound improved from DRAM Bound to Core Bound. - -As in the previous exercise with TorchServe, let's use Intel® VTune Profiler ITT to annotate `TorchServe inference scope `_ to profile at inference-level granularity. - -.. figure:: /_static/img/torchserve-ipex-images-2/23.png - :width: 100% - :align: center - -Each inference call is traced in the timeline graph. The duration of the last inference call decreased from 215.731 ms to 95.634 ms - 2.3x speedup. - -.. figure:: /_static/img/torchserve-ipex-images-2/24.png - :width: 100% - :align: center - -The timeline graph can be expanded to see op-level profiling results. Notice that Conv + ReLU has been fused, and the duration decreased from 6.393 ms + 1.731 ms to 3.408 ms - 2.4x speedup. - -Conclusion ------------ -In this tutorial, we have used Top-down Microarchitecture Analysis (TMA) and Intel® VTune™ Profiler's Instrumentation and Tracing Technology (ITT) to demonstrate that - -- Often the primary bottleneck of under-optimized or under-tuned deep learning workloads are Back End Bound, which has two submetrics, Memory Bound and Core Bound. - -- A more efficient memory allocator, operator fusion, memory layout format optimization by Intel® Extension for PyTorch* improve Memory Bound. - -- Key deep learning primitives, such as convolution, matrix multiplication, dot-product, etc have been well optimized by Intel® Extension for PyTorch* and oneDNN library, improving Core Bound. - -- Intel® Extension for PyTorch* has been integrated into TorchServe with an ease-of-use API. - -- TorchServe with Intel® Extension for PyTorch* shows 7.71x throughput speedup for ResNet50, and 2.20x throughput speedup for BERT. - -Related Readings ----------------- -`Top-down Microarchitecture Analysis Method `_ - -`Top-Down performance analysis methodology `_ - -`Accelerating PyTorch with Intel® Extension for PyTorch* `_ - -Acknowledgement ---------------- -We would like to thank Ashok Emani (Intel) and Jiong Gong (Intel) for their immense guidance and support, and thorough feedback and reviews throughout many steps of this tutorial. We would also like to thank Hamid Shojanazeri (Meta) and Li Ning (AWS) for their helpful feedback in code review and the tutorial. diff --git a/intermediate_source/torchvision_tutorial.py b/intermediate_source/torchvision_tutorial.py index d1e4c5c5d5e..a19d3930436 100644 --- a/intermediate_source/torchvision_tutorial.py +++ b/intermediate_source/torchvision_tutorial.py @@ -406,14 +406,14 @@ def get_transform(train): ###################################################################### -# Let’s now write the main function which performs the training and the -# validation: +# We want to be able to train our model on an `accelerator `_ +# such as CUDA, MPS, MTIA, or XPU. 
Let’s now write the main function which performs the training and the validation: from engine import train_one_epoch, evaluate -# train on the GPU or on the CPU, if a GPU is not available -device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') +# train on the accelerator or on the CPU, if an accelerator is not available +device = torch.accelerator.current_accelerator() if torch.accelerator.is_available() else torch.device('cpu') # our dataset has two classes only - background and person num_classes = 2 diff --git a/intermediate_source/transformer_building_blocks.py b/intermediate_source/transformer_building_blocks.py new file mode 100644 index 00000000000..decaf0602f7 --- /dev/null +++ b/intermediate_source/transformer_building_blocks.py @@ -0,0 +1,846 @@ +""" +.. meta:: + :description: Learn how to optimize transformer models by replacing nn.Transformer with Nested Tensors and torch.compile() for significant performance gains in PyTorch. + +Accelerating PyTorch Transformers by replacing ``nn.Transformer`` with Nested Tensors and ``torch.compile()`` +============================================================================================================= +**Author:** `Mikayla Gawarecki `_ + + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * Learn about the low-level building blocks PyTorch provides to build custom transformer layers ( + nested tensors, ``scaled_dot_product_attention``, ``torch.compile()``, and ``FlexAttention``) + * Discover how the above improve memory usage and performance using MultiHeadAttention as an example + * Explore advanced customizations using the aforementioned building blocks + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch v.2.6.0 or later + + +Over the past few years, the PyTorch team has developed various lower level +features that, when composed, can create a variety of transformer variants. These +include: + +* Nested Tensors with the ``torch.jagged`` layout (AKA NJTs) +* ``scaled_dot_product_attention`` +* ``torch.compile()`` +* ``FlexAttention`` + +This tutorial will give a brief overview of the above technologies and +demonstrate how they can be composed to yield flexible and performant transformer \ +layers with improved user experience. + +One may observe that the ``torch.nn`` module currently provides various ``Transformer``-related layers. +In particular, it includes ``TransformerEncoderLayer``, ``TransformerEncoder``, ``TransformerDecoderLayer``, +``TransformerDecoder``, ``Transformer`` and ``MultiheadAttention``. This family +of layers was initially implemented following the `Attention is All +You Need `_ paper. The components discussed in +this tutorial provide improved user experience, flexibility and performance over +the existing ``nn`` layers. + + +Is this tutorial for me? +======================== + +If you are wondering about what building blocks the ``torch`` library provides +for writing your own transformer layers and best practices, you are in the +right place. Please keep reading! 
+
+If you are looking for an out-of-the-box implementation of a popular transformer
+architecture, note that there are many open-source libraries that provide them,
+including:
+
+* `HuggingFace transformers `_
+* `xformers `_
+* `torchtune `_
+
+If you are only interested in performant attention score modifications, please
+check out the `FlexAttention blog `_ that
+contains a `gym of masks `_.
+
+"""
+
+################################################################################
+# Introducing the Building Blocks
+# ===============================
+# First, we will briefly introduce the four technologies mentioned in the
+# introduction:
+#
+# * `torch.nested `_
+#
+# Nested tensors generalize the shape of regular dense tensors, allowing for
+# representation of ragged-sized data with the same tensor UX. In the context of
+# transformers, we can think of nested tensors as a tool for representing variable
+# sequence lengths. They eliminate the need for the bug-prone practices of explicit
+# padding and masking (think ``key_padding_mask`` in ``nn.MultiHeadAttention``).
+#
+# * `scaled_dot_product_attention `_
+#
+# ``scaled_dot_product_attention`` is a primitive for
+# :math:`\text{softmax}(\frac{QK^T}{\sqrt{E}} + B)V` that dispatches into either fused
+# implementations of the operator or a fallback implementation. It works out of
+# the box in eager mode (i.e. the default mode of using PyTorch where operations
+# are executed on the fly as they are encountered) and also integrates seamlessly
+# with ``torch.compile()``. As of 2.6, it will also offer grouped query attention
+# natively.
+#
+# * `torch.compile() `_
+#
+# ``torch.compile()`` is a compiler introduced in version 2.0 that is able to
+# capture a graph of PyTorch code and perform various optimizations on it, such as
+# fusing together sequences of ops. Nested tensors with the ``torch.jagged`` layout
+# and ``scaled_dot_product_attention`` work seamlessly with compile. In the
+# context of transformers, the value-add of using compile with nested tensors
+# and SDPA is that compile can remove the framework overhead one sees in eager mode
+# and fuse sequences of ops in transformers together, such as projection and
+# activation.
+#
+# * `FlexAttention `_
+#
+# ``FlexAttention`` is a primitive that allows users to modify attention scores
+# prior to the softmax operation. It generalizes the additive ``B`` term above
+# for ``scaled_dot_product_attention``, allowing for arbitrary calculation. It
+# requires compile to achieve good performance.
+#
+# The above building blocks are "All You Need" (as of October 2024)
+# ==================================================================
+#
+# The main premise in this section is that most transformer variations are
+# GPT-style, consisting of layers like Embedding, Positional Encoding, Attention
+# Blocks and Feed Forward networks. If we were to try to classify the differences
+# in this space, we might land on something like:
+#
+# 1. Layer type (activation functions such as ``SwiGLU`` and others, normalization
+#    functions such as ``RMSNorm`` and others, and positional encodings such as
+#    Sinusoidal and Rotary).
+# 2. Layer ordering, such as where to apply norms and positional encoding.
+# 3. Modifications to the attention score, such as ``ALiBi``, Relative Positional
+#    Bias, and so on.
+#
+#
+# In a pre-compiler environment, you might write a custom transformer and notice
+# that it functions correctly but is slow.
+# To address this, you might develop a custom fused kernel for the specific
+# series of operations. In a compiler environment, you can simply write the
+# straightforward implementation, compile it, and benefit from improved performance.
+
+
+###############################################################################
+# MultiheadAttention
+# ------------------
+# Remember that MultiheadAttention takes in a query, key, and value, and consists
+# of an input projection, a ``scaled_dot_product_attention`` operator and an
+# output projection. The main takeaway we want to demonstrate here is the
+# improvement yielded when we replace padded/masked inputs with nested tensors.
+# The improvements are threefold:
+#
+# * **User Experience**
+#   Remember that ``nn.MultiheadAttention`` requires ``query``, ``key``, and
+#   ``value`` to be dense ``torch.Tensors``. It also provides a
+#   ``key_padding_mask`` that is used to mask out padding tokens in the ``key``
+#   that arise due to different sequence lengths within a batch. Since there is
+#   no ``query_padding_mask`` in ``nn.MHA``, users have to take care to mask/slice
+#   the outputs appropriately to account for query sequence lengths. ``NestedTensor``
+#   cleanly removes the need for this sort of error-prone padding mask.
+#
+# * **Memory**
+#   Instead of materializing a dense ``[B, S, D]`` tensor with a ``[B, S]``
+#   padding mask (where ``B`` is batch size, ``S`` is max sequence length in the
+#   batch and ``D`` is embedding size), nested tensors allow you to cleanly
+#   represent the batch of varying sequence lengths. As a result, the input and
+#   intermediate activations will use less memory.
+#
+# * **Performance**
+#   Since padding is not materialized and unnecessary computation on padding is
+#   skipped, performance and memory usage improve.
+#
+# We'll demonstrate the above by building upon the ``MultiheadAttention`` layer in the
+# `Nested Tensor tutorial `_
+# and comparing it to the ``nn.MultiheadAttention`` layer.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class MultiHeadAttention(nn.Module):
+    """
+    Computes multi-head attention. Supports nested or padded tensors.
+
+    Args:
+        E_q (int): Size of embedding dim for query
+        E_k (int): Size of embedding dim for key
+        E_v (int): Size of embedding dim for value
+        E_total (int): Total embedding dim of combined heads post input projection. Each head
+            has dim E_total // nheads
+        nheads (int): Number of heads
+        dropout (float, optional): Dropout probability. Default: 0.0
+        bias (bool, optional): Whether to add bias to input projection.
Default: True + """ + + def __init__( + self, + E_q: int, + E_k: int, + E_v: int, + E_total: int, + nheads: int, + dropout: float = 0.0, + bias=True, + device=None, + dtype=None, + ): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.nheads = nheads + self.dropout = dropout + self._qkv_same_embed_dim = E_q == E_k and E_q == E_v + if self._qkv_same_embed_dim: + self.packed_proj = nn.Linear(E_q, E_total * 3, bias=bias, **factory_kwargs) + else: + self.q_proj = nn.Linear(E_q, E_total, bias=bias, **factory_kwargs) + self.k_proj = nn.Linear(E_k, E_total, bias=bias, **factory_kwargs) + self.v_proj = nn.Linear(E_v, E_total, bias=bias, **factory_kwargs) + E_out = E_q + self.out_proj = nn.Linear(E_total, E_out, bias=bias, **factory_kwargs) + assert E_total % nheads == 0, "Embedding dim is not divisible by nheads" + self.E_head = E_total // nheads + self.bias = bias + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask=None, + is_causal=False, + ) -> torch.Tensor: + """ + Forward pass; runs the following process: + 1. Apply input projection + 2. Split heads and prepare for SDPA + 3. Run SDPA + 4. Apply output projection + + Args: + query (torch.Tensor): query of shape (``N``, ``L_q``, ``E_qk``) + key (torch.Tensor): key of shape (``N``, ``L_kv``, ``E_qk``) + value (torch.Tensor): value of shape (``N``, ``L_kv``, ``E_v``) + attn_mask (torch.Tensor, optional): attention mask of shape (``N``, ``L_q``, ``L_kv``) to pass to SDPA. Default: None + is_causal (bool, optional): Whether to apply causal mask. Default: False + + Returns: + attn_output (torch.Tensor): output of shape (N, L_t, E_q) + """ + # Step 1. Apply input projection + if self._qkv_same_embed_dim: + if query is key and key is value: + result = self.packed_proj(query) + query, key, value = torch.chunk(result, 3, dim=-1) + else: + q_weight, k_weight, v_weight = torch.chunk( + self.packed_proj.weight, 3, dim=0 + ) + if self.bias: + q_bias, k_bias, v_bias = torch.chunk( + self.packed_proj.bias, 3, dim=0 + ) + else: + q_bias, k_bias, v_bias = None, None, None + query, key, value = ( + F.linear(query, q_weight, q_bias), + F.linear(key, k_weight, k_bias), + F.linear(value, v_weight, v_bias), + ) + + else: + query = self.q_proj(query) + key = self.k_proj(key) + value = self.v_proj(value) + + # Step 2. Split heads and prepare for SDPA + # reshape query, key, value to separate by head + # (N, L_t, E_total) -> (N, L_t, nheads, E_head) -> (N, nheads, L_t, E_head) + query = query.unflatten(-1, [self.nheads, self.E_head]).transpose(1, 2) + # (N, L_s, E_total) -> (N, L_s, nheads, E_head) -> (N, nheads, L_s, E_head) + key = key.unflatten(-1, [self.nheads, self.E_head]).transpose(1, 2) + # (N, L_s, E_total) -> (N, L_s, nheads, E_head) -> (N, nheads, L_s, E_head) + value = value.unflatten(-1, [self.nheads, self.E_head]).transpose(1, 2) + + # Step 3. Run SDPA + # (N, nheads, L_t, E_head) + attn_output = F.scaled_dot_product_attention( + query, key, value, dropout_p=self.dropout, is_causal=is_causal + ) + # (N, nheads, L_t, E_head) -> (N, L_t, nheads, E_head) -> (N, L_t, E_total) + attn_output = attn_output.transpose(1, 2).flatten(-2) + + # Step 4. 
Apply output projection
+        # (N, L_t, E_total) -> (N, L_t, E_out)
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output
+
+
+###############################################################################
+# Utilities
+# ~~~~~~~~~
+# In this section, we include a utility to generate semi-realistic data using a
+# ``Zipf`` distribution for sentence lengths. This is used to generate the nested
+# query, key, and value tensors. We also include a benchmark utility.
+
+
+import numpy as np
+
+
+def zipf_sentence_lengths(alpha: float, batch_size: int) -> torch.Tensor:
+    # generate fake corpus by unigram Zipf distribution
+    # from wikitext-2 corpus, we get rank "." = 3, "!" = 386, "?" = 858
+    sentence_lengths = np.empty(batch_size, dtype=int)
+    for ibatch in range(batch_size):
+        sentence_lengths[ibatch] = 1
+        word = np.random.zipf(alpha)
+        while word != 3 and word != 386 and word != 858:
+            sentence_lengths[ibatch] += 1
+            word = np.random.zipf(alpha)
+    return torch.tensor(sentence_lengths)
+
+
+# Generate a batch of semi-realistic data using Zipf distribution for sentence lengths
+# in the form of nested tensors with the jagged layout.
+def gen_batch(N, E_q, E_k, E_v, device, dtype=torch.float32, query_seq_len_1=False):
+    # generate semi-realistic data using Zipf distribution for sentence lengths
+    sentence_lengths = zipf_sentence_lengths(alpha=1.2, batch_size=N)
+
+    # Note: the torch.jagged layout is a nested tensor layout that supports a single ragged
+    # dimension and works with torch.compile. The resulting nested tensor has shape
+    # (B, S*, D), where B = batch size, S* = ragged sequence length, and D = embedding
+    # dimension; each individual batch item has shape (S*, D).
+    if query_seq_len_1:
+        query = torch.nested.nested_tensor(
+            [torch.randn(1, E_q, dtype=dtype, device=device) for l in sentence_lengths],
+            layout=torch.jagged,
+        )
+    else:
+        query = torch.nested.nested_tensor(
+            [
+                torch.randn(l.item(), E_q, dtype=dtype, device=device)
+                for l in sentence_lengths
+            ],
+            layout=torch.jagged,
+        )
+
+    key = torch.nested.nested_tensor(
+        [
+            torch.randn(s.item(), E_k, dtype=dtype, device=device)
+            for s in sentence_lengths
+        ],
+        layout=torch.jagged,
+    )
+
+    value = torch.nested.nested_tensor(
+        [
+            torch.randn(s.item(), E_v, dtype=dtype, device=device)
+            for s in sentence_lengths
+        ],
+        layout=torch.jagged,
+    )
+
+    return query, key, value, sentence_lengths
+
+
+import math
+import timeit
+
+
+def benchmark(func, *args, **kwargs):
+    torch.cuda.synchronize()
+    torch.cuda.reset_peak_memory_stats()
+    begin = timeit.default_timer()
+    output = func(*args, **kwargs)
+    torch.cuda.synchronize()
+    end = timeit.default_timer()
+    return output, (end - begin), torch.cuda.max_memory_allocated()
+
+
+##############################################################################
+# We will now demonstrate the performance improvements of using nested tensors
+# in the ``MultiheadAttention`` layer + compile for self attention. We compare this against
+# the traditional ``nn.MultiheadAttention`` + compile with padding and masking.
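+#
+# Before the benchmark itself, here is a small illustration added for clarity
+# (it is not part of the benchmark, and the tensor shapes and variable names are
+# made up): a jagged nested tensor can always be materialized into the dense,
+# zero-padded tensor that the padded baseline consumes via ``to_padded_tensor``;
+# the nested layout simply avoids storing and computing on that padding.
+
+# Illustrative only: two sequences of different lengths packed into one nested
+# tensor, then converted to a zero-padded dense tensor of the max length.
+nt_example = torch.nested.nested_tensor(
+    [torch.randn(2, 4), torch.randn(3, 4)], layout=torch.jagged
+)
+padded_example = nt_example.to_padded_tensor(0.0)
+print(padded_example.shape)  # batch of 2, padded to length 3, embedding dim 4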
+ +N, E_q, E_k, E_v, E_total = 512, 512, 512, 512, 512 +E_out = E_q +d_model = E_q +nheads = 8 +dropout = 0.0 +bias = True +device = "cuda" +torch.manual_seed(6) +query, key, value, sentence_lengths = gen_batch(N, E_q, E_k, E_v, device) +S = sentence_lengths.max().item() +print( + f"Total sequence length in nested query {sentence_lengths.sum().item()}, max sequence length {S}" +) +padded_query, padded_key, padded_value = ( + t.to_padded_tensor(0.0) for t in (query, key, value) +) + +torch.manual_seed(6) +mha_layer = MultiHeadAttention( + E_q, E_k, E_v, E_total, nheads, dropout=dropout, bias=bias, device="cuda" +) +torch.manual_seed(6) +vanilla_mha_layer = nn.MultiheadAttention( + E_q, nheads, dropout=dropout, batch_first=True, bias=bias, device="cuda" +) + +# ``nn.MultiheadAttention`` uses a non conventional initialization for layers, so do this for exact parity :( +mha_layer.out_proj.weight = nn.Parameter( + vanilla_mha_layer.out_proj.weight.clone().detach() +) +mha_layer.packed_proj.weight = nn.Parameter( + vanilla_mha_layer.in_proj_weight.clone().detach() +) +mha_layer.out_proj.bias = nn.Parameter(vanilla_mha_layer.out_proj.bias.clone().detach()) +mha_layer.packed_proj.bias = nn.Parameter( + vanilla_mha_layer.in_proj_bias.clone().detach() +) + +new_mha_layer = torch.compile(mha_layer) +# warmup compile +nested_result_warmup = new_mha_layer(query, query, query, is_causal=True) + +# benchmark +nested_result, nested_time, nested_peak_memory = benchmark( + new_mha_layer, query, query, query, is_causal=True +) +padded_nested_result = nested_result.to_padded_tensor(0.0) + +# For the vanilla ``nn.MultiheadAttention``, we need to construct the ``key_padding_mask`` +# Further, ``nn.MultiheadAttention`` forces one to materialize the ``attn_mask`` even if using ``is_causal`` +src_key_padding_mask = torch.where(padded_query == 0.0, -math.inf, 0)[:, :, 0] +attn_mask = torch.empty((N, S, S), device=device).fill_(float("-inf")) +for i, s in enumerate(sentence_lengths): + attn_mask[i, :s, :s] = nn.Transformer.generate_square_subsequent_mask(s) +attn_mask = attn_mask.unsqueeze(1).expand(N, nheads, S, S).reshape(N * nheads, S, S) + +vanilla_mha_layer = torch.compile(vanilla_mha_layer) +# warmup compile +warmup_vanilla_result = vanilla_mha_layer( + padded_query, + padded_query, + padded_query, + attn_mask=attn_mask, + key_padding_mask=src_key_padding_mask, + need_weights=False, + is_causal=True, +) + +# benchmark +(padded_result, _), padded_time, padded_peak_memory = benchmark( + vanilla_mha_layer, + padded_query, + padded_query, + padded_query, + key_padding_mask=src_key_padding_mask, + need_weights=False, + attn_mask=attn_mask, + is_causal=True, +) + +print(f"{padded_time=:.5f}, padded_peak_memory={padded_peak_memory/1e9:.2f} GB") +print(f"{nested_time=:.5f}, nested_peak_memory={nested_peak_memory/1e9:.2f} GB") +print( + "Max difference between vanilla and nested result", + (padded_result - padded_nested_result).abs().max().item(), +) +print(f"Nested speedup: {(padded_time/nested_time):.2f}") +print( + f"Nested peak memory reduction {((padded_peak_memory - nested_peak_memory)/1e9):.2f} GB" +) + +###################################################################################### +# For reference, here are some sample outputs on A100: +# +# .. 
code:: +# +# padded_time=0.03454, padded_peak_memory=4.14 GB +# nested_time=0.00612, nested_peak_memory=0.76 GB +# Max difference between vanilla and nested result 0.0 +# Nested speedup: 5.65 +# Nested peak memory reduction 3.39 GB +# +# We can also see the same for backward pass + +for i, entry_length in enumerate(sentence_lengths): + # padding-specific step: remove output projection bias from padded entries for fair comparison + padded_result[i, entry_length:, :] = 0.0 + +_, padded_bw_time, padded_bw_peak_mem = benchmark( + lambda: padded_result.sum().backward() +) +_, nested_bw_time, nested_bw_peak_mem = benchmark( + lambda: padded_nested_result.sum().backward() +) + +print(f"{padded_bw_time=:.5f}, padded_bw_peak_mem={padded_bw_peak_mem/1e9:.2f} GB") +print(f"{nested_bw_time=:.5f}, nested_bw_peak_mem={nested_bw_peak_mem/1e9:.2f} GB") +print(f"Nested backward speedup: {(padded_bw_time/nested_bw_time):.2f}") +print( + f"Nested backward peak memory reduction {((padded_bw_peak_mem - nested_bw_peak_mem)/1e9):.2f} GB" +) + +print( + "Difference in out_proj.weight.grad", + (mha_layer.out_proj.weight.grad - vanilla_mha_layer.out_proj.weight.grad) + .abs() + .max() + .item(), +) +print( + "Difference in packed_proj.weight.grad", + (mha_layer.packed_proj.weight.grad - vanilla_mha_layer.in_proj_weight.grad) + .abs() + .max() + .item(), +) +print( + "Difference in out_proj.bias.grad", + (mha_layer.out_proj.bias.grad - vanilla_mha_layer.out_proj.bias.grad) + .abs() + .max() + .item(), +) +print( + "Difference in packed_proj.bias.grad", + (mha_layer.packed_proj.bias.grad - vanilla_mha_layer.in_proj_bias.grad) + .abs() + .max() + .item(), +) + +################################################################################## +# Sample outputs on A100: +# +# .. code:: +# +# padded_bw_time=2.09337, padded_bw_peak_mem=5.10 GB +# nested_bw_time=0.01452, nested_bw_peak_mem=3.24 GB +# Nested backward speedup: 144.13 +# Nested backward peak memory reduction 1.86 GB +# Difference in out_proj.weight.grad 0.000244140625 +# Difference in packed_proj.weight.grad 0.001556396484375 +# Difference in out_proj.bias.grad 0.0 +# Difference in packed_proj.bias.grad 0.001953125 +# + +################################################################################## +# GPT-style layer +# --------------- +# A basic GPT-style transformer layer consists of a causal self-attention layer +# followed by a feed-forward network (FFN) with skip connections. Implementing +# this is fairly straightforward using the ``MultiheadAttention`` layer above and +# gives equivalent results to an ``nn.TransformerEncoderLayer`` with +# ``is_causal=True``. +# +# We demonstrate examples of implementing the rest of the ``nn`` layers +# `here `_ +# but omit that from this tutorial for brevity. + + +############################################################################### +# Going one step further +# ---------------------- +# So far, we have demonstrated how to implement a performant ``MultiheadAttention`` +# layer that follows the traditional ``nn.MultiheadAttention``. Going back to our +# classification of modifications to the transformer architecture, remember that we +# classified the modifications into layer type, layer ordering, and modifications +# to the attention score. We trust that changing layer type and layer ordering +# (such as swapping ``LayerNorm`` for ``RMSNorm``) is fairly straightforward. 
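+#
+# As a rough illustration (a minimal sketch added for this discussion, not a
+# building block used elsewhere in this tutorial), an RMSNorm module can be
+# written in a few lines and swapped in wherever ``nn.LayerNorm`` appears in a
+# custom transformer layer:
+
+
+class RMSNorm(nn.Module):
+    """Root-mean-square normalization with a learnable scale (illustrative sketch)."""
+
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # normalize each vector by its root mean square over the last dimension,
+        # then apply the learnable per-feature scale
+        rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
+        return x * rms * self.weight
+
+
+###############################################################################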
+# +# In this section, we will discuss various functionalities using the +# aforementioned building blocks, including the following: +# +# * Cross Attention +# * Fully masked rows no longer cause NaNs +# * Packed Projection + +############################################################################### +# Cross Attention +# --------------- +# Cross attention is a form of attention where the query and key/value tensors +# are from different sequences. +# +# One example of this is in ``nn.TransformerDecoderLayer`` where the query comes +# from the decoder and the key/value come from the encoder. +# +# The above MultiheadAttention layer nicely generalizes to this case with nested +# tensors for both query and key/value. + +query, _, _, q_len = gen_batch(N, E_q, E_k, E_v, device) +_, key, value, kv_len = gen_batch(N, E_q, E_k, E_v, device) + +print( + f"Total sequence length in nested query {q_len.sum().item()}, max sequence length {q_len.max().item()}" +) +print( + f"Total sequence length in nested key/value {kv_len.sum().item()}, max sequence length {kv_len.max().item()}" +) +out = new_mha_layer(query, key, value, is_causal=False) + +######################################################################################## +# As above, we can compare this against the vanilla compiled ``nn.MultiheadAttention``. + +torch.manual_seed(6) +query, _, _, q_len = gen_batch(N, E_q, E_k, E_v, device) +_, key, value, kv_len = gen_batch(N, E_q, E_k, E_v, device) +padded_query, padded_key, padded_value = ( + t.to_padded_tensor(0.0) for t in (query, key, value) +) + +key_padding_mask = torch.where(padded_key == 0.0, -math.inf, 0)[:, :, 0] + +# warmup compile +warmup_nested_result = new_mha_layer(query, key, value, is_causal=False) +warmup_vanilla_result = vanilla_mha_layer( + padded_query, + padded_key, + padded_value, + key_padding_mask=key_padding_mask, + need_weights=False, + is_causal=False, +) + +nested_result, nested_time, nested_peak_memory = benchmark( + new_mha_layer, query, key, value, is_causal=False +) +(padded_result, _), padded_time, padded_peak_memory = benchmark( + vanilla_mha_layer, + padded_query, + padded_key, + padded_value, + key_padding_mask=key_padding_mask, + need_weights=False, + is_causal=False, +) +padded_nested_result = nested_result.to_padded_tensor(0.0) +for i, entry_length in enumerate(q_len): + # padding-specific step: remove output projection bias from padded entries for fair comparison + padded_result[i, entry_length:, :] = 0.0 + +print( + "Max difference between vanilla and nested result", + (padded_result - padded_nested_result).abs().max().item(), +) +print(f"Nested speedup: {(padded_time/nested_time):.2f}") +print( + f"Nested peak memory reduction {((padded_peak_memory - nested_peak_memory)/1e9):.2f} GB" +) + +################################################################################## +# Sample outputs on A100: +# +# .. code:: +# +# Max difference between vanilla and nested result 0.0 +# Nested speedup: 4.01 +# Nested peak memory reduction 1.40 GB +# + +################################################################################ +# Fully masked rows no longer cause NaNs +# -------------------------------------- +# +# There has been a long standing issue with ``nn.MultiheadAttention`` and +# ``scaled_dot_product_attention`` where if a row was fully masked out, the output +# of the attention layer would be NaN. See `issue `_. +# This is because the softmax over an empty set is undefined. +# +# Thanks to `this PR `_ +# this is no longer the case. 
Instead, the output corresponding to fully masked rows +# in ``scaled_dot_product_attention`` will be 0. For cases where ``nn.MHA`` does +# not employ the "fast-path", this will also apply. +# +# Using a custom MHA layer with NJTs is strongly recommended over the +# existing "fast-path" in ``nn.MultiheadAttention`` as NJT's ability to model raggedness +# appropriately makes it possible to properly express empty sequences. + + +############################################################################### +# Packed Projection +# ----------------- +# +# Packed projection is a technique that makes use of the fact that when the input +# for projection (matrix multiplications) are the same (self-attention), we can pack the projection +# weights and biases into single tensors. It is especially useful when the individual +# projections are memory bound rather than compute bound. There are +# two examples that we will demonstrate here: +# +# * Input projection for MultiheadAttention +# * SwiGLU activation in feed-forward network of Transformer Layer +# +# Input projection for MultiheadAttention +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# When doing self-attention, the ``query``, ``key``, and ``value`` +# are the same tensor. Each of these tensors is projected with a +# ``Linear(E_q, E_total)`` layer. Instead, we can pack this into one layer, +# which is what we do in the MultiheadAttention layer above. +# +# Let us compare the performance of the packed projection against the usual method: + + +class InputProjection(nn.Module): + def __init__(self, E_q, E_total, bias=False, device=None, dtype=None): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.q_proj = nn.Linear(E_q, E_total, bias=bias, **factory_kwargs) + self.k_proj = nn.Linear(E_q, E_total, bias=bias, **factory_kwargs) + self.v_proj = nn.Linear(E_q, E_total, bias=bias, **factory_kwargs) + + def forward(self, x): + return self.q_proj(x), self.k_proj(x), self.v_proj(x) + + +class PackedInputProjection(nn.Module): + def __init__(self, E_q, E_total, bias=False, device=None, dtype=None): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.packed_proj = nn.Linear(E_q, E_total * 3, bias=bias, **factory_kwargs) + + def forward(self, query): + return torch.chunk(self.packed_proj(query), 3, dim=-1) + + +B, D, dtype = 256, 8192, torch.bfloat16 + +torch.set_float32_matmul_precision("high") +in_proj = torch.compile(InputProjection(D, D, device="cuda", dtype=torch.bfloat16)) +packed_in_proj = torch.compile( + PackedInputProjection(D, D, device="cuda", dtype=torch.bfloat16) +) + +q, _, _, sequence_lengths = gen_batch(B, D, D, D, device="cuda", dtype=torch.bfloat16) + +# warmup +in_proj(q) +packed_in_proj(q) + +# benchmark +(q_out, k_out, v_out), time, _ = benchmark(in_proj, q) +(q_out, k_out, v_out), time_packed, _ = benchmark(packed_in_proj, q) +# On my A100 prints 1.05x speedup +print( + f"InputProjection: {time:5f} s, PackedInputProjection: {time_packed:5f} s, speedup: {time/time_packed:.2f}x" +) + +################################################## +# SwiGLU feed forward network of Transformer Layer +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Swish-Gated Linear Unit (SwiGLU) is a non-linear activation function that is increasingly popular in the feed-forward +# network of the transformer layer (e.g. Llama). 
A feed-forward network with SwiGLU activation is defined as: + + +class SwiGLUFFN(nn.Module): + def __init__( + self, + dim, + hidden_dim, + multiple_of, + ffn_dim_multiplier=None, + device=None, + dtype=None, + ): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + hidden_dim = int(2 * hidden_dim / 3) + # custom dim factor multiplier + if ffn_dim_multiplier is not None: + hidden_dim = int(ffn_dim_multiplier * hidden_dim) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + self.w1 = nn.Linear(dim, hidden_dim, bias=False, **factory_kwargs) + self.w2 = nn.Linear(hidden_dim, dim, bias=False, **factory_kwargs) + self.w3 = nn.Linear(dim, hidden_dim, bias=False, **factory_kwargs) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + +######################################################################## +# An alternative way of implementing this that uses packed projection is + + +class PackedSwiGLUFFN(nn.Module): + def __init__( + self, + dim, + hidden_dim, + multiple_of, + ffn_dim_multiplier=None, + device=None, + dtype=None, + ): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + hidden_dim = int(2 * hidden_dim / 3) + # custom dim factor multiplier + if ffn_dim_multiplier is not None: + hidden_dim = int(ffn_dim_multiplier * hidden_dim) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + self.w13 = nn.Linear(dim, 2 * hidden_dim, bias=False, **factory_kwargs) + self.w2 = nn.Linear(hidden_dim, dim, bias=False, **factory_kwargs) + + def forward(self, x): + x1, x3 = torch.chunk(self.w13(x), 2, dim=-1) + return self.w2(F.silu(x1) * x3) + + +################################################################################ +# We can compare the performance of the two implementations as follows +# Depending on your hardware, you might see different results. On an A100 I see +# 1.12x speedup for D=128. +D = 128 + +swigluffn = torch.compile(SwiGLUFFN(D, D * 4, 256, device="cuda", dtype=torch.bfloat16)) +packed_swigluffn = torch.compile( + PackedSwiGLUFFN(D, D * 4, 256, device="cuda", dtype=torch.bfloat16) +) + +q, _, _, sentence_lengths = gen_batch(D, D, D, D, device="cuda", dtype=torch.bfloat16) + +# warmup +swigluffn(q) +packed_swigluffn(q) + +# benchmark +_, time, _ = benchmark(swigluffn, q) +_, time_packed, _ = benchmark(packed_swigluffn, q) +# On my A100 prints 1.08x speedup +print( + f"SwiGLUFFN: {time} s, PackedSwiGLUFFN: {time_packed} s, speedup: {time/time_packed:.2f}x" +) + +################################################################################ +# Extended examples +# ----------------- +# +# We intend to update this tutorial to demonstrate more examples of how to use +# the various performant building blocks such as KV-Caching, Grouped Query Attention +# etc. Further, there are several good examples of using various performant building blocks to +# implement various transformer architectures. Some examples include +# +# * `gpt-fast `_ +# * `segment-anything-fast `_ +# * `lucidrains implementation of NaViT with nested tensors `_ +# * `torchtune's implementation of VisionTransformer `_ + +################################################################################ +# Conclusion +# ---------- +# +# In this tutorial, we have introduced the low level building blocks PyTorch +# provides for writing transformer layers and demonstrated examples how to compose +# them. 
It is our hope that this tutorial has educated the reader on the ease with +# which flexible and performant transformer layers can be implemented by users of PyTorch. diff --git a/intermediate_source/visualizing_gradients_tutorial.py b/intermediate_source/visualizing_gradients_tutorial.py new file mode 100644 index 00000000000..ff78fa3f01a --- /dev/null +++ b/intermediate_source/visualizing_gradients_tutorial.py @@ -0,0 +1,298 @@ +""" +Visualizing Gradients +===================== + +**Author:** `Justin Silver `__ + +This tutorial explains how to extract and visualize gradients at any +layer in a neural network. By inspecting how information flows from the +end of the network to the parameters we want to optimize, we can debug +issues such as `vanishing or exploding +gradients `__ that occur during +training. + +Before starting, make sure you understand `tensors and how to manipulate +them `__. +A basic knowledge of `how autograd +works `__ +would also be useful. + +""" + + +###################################################################### +# Setup +# ----- +# +# First, make sure `PyTorch is +# installed `__ and then import +# the necessary libraries. +# + +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +import matplotlib.pyplot as plt + + +###################################################################### +# Next, we’ll be creating a network intended for the MNIST dataset, +# similar to the architecture described by the `batch normalization +# paper `__. +# +# To illustrate the importance of gradient visualization, we will +# instantiate one version of the network with batch normalization +# (BatchNorm), and one without it. Batch normalization is an extremely +# effective technique to resolve `vanishing/exploding +# gradients `__, and we will be verifying +# that experimentally. +# +# The model we use has a configurable number of repeating fully-connected +# layers which alternate between ``nn.Linear``, ``norm_layer``, and +# ``nn.Sigmoid``. If batch normalization is enabled, then ``norm_layer`` +# will use +# `BatchNorm1d `__, +# otherwise it will use the +# `Identity `__ +# transformation. +# + +def fc_layer(in_size, out_size, norm_layer): + """Return a stack of linear->norm->sigmoid layers""" + return nn.Sequential(nn.Linear(in_size, out_size), norm_layer(out_size), nn.Sigmoid()) + +class Net(nn.Module): + """Define a network that has num_layers of linear->norm->sigmoid transformations""" + def __init__(self, in_size=28*28, hidden_size=128, + out_size=10, num_layers=3, batchnorm=False): + super().__init__() + if batchnorm is False: + norm_layer = nn.Identity + else: + norm_layer = nn.BatchNorm1d + + layers = [] + layers.append(fc_layer(in_size, hidden_size, norm_layer)) + + for i in range(num_layers-1): + layers.append(fc_layer(hidden_size, hidden_size, norm_layer)) + + layers.append(nn.Linear(hidden_size, out_size)) + + self.layers = nn.Sequential(*layers) + + def forward(self, x): + x = torch.flatten(x, 1) + return self.layers(x) + + +###################################################################### +# Next we set up some dummy data, instantiate two versions of the model, +# and initialize the optimizers. 
+#
+
+# set up dummy data
+x = torch.randn(10, 28, 28)
+y = torch.randint(10, (10, ))
+
+# init model
+model_bn = Net(batchnorm=True, num_layers=3)
+model_nobn = Net(batchnorm=False, num_layers=3)
+
+model_bn.train()
+model_nobn.train()
+
+optimizer_bn = optim.SGD(model_bn.parameters(), lr=0.01, momentum=0.9)
+optimizer_nobn = optim.SGD(model_nobn.parameters(), lr=0.01, momentum=0.9)
+
+
+
+######################################################################
+# We can verify that batch normalization is only being applied to one of
+# the models by probing one of the internal layers:
+#
+
+print(model_bn.layers[0])
+print(model_nobn.layers[0])
+
+
+######################################################################
+# Registering hooks
+# -----------------
+#
+
+
+######################################################################
+# Because we wrapped up the logic and state of our model in an
+# ``nn.Module``, we need another method to access the intermediate
+# gradients if we want to avoid modifying the module code directly. This
+# is done by `registering a
+# hook `__.
+#
+# .. warning::
+#
+#     Using backward pass hooks attached to output tensors is preferred over
+#     using ``retain_grad()`` on the tensors themselves. An alternative method
+#     is to directly attach module hooks (e.g. ``register_full_backward_hook()``)
+#     so long as the ``nn.Module`` instance does not perform any in-place
+#     operations. For more information, please refer to `this issue `__.
+#
+# The following code defines our hooks and gathers descriptive names for
+# the network’s layers.
+#
+
+# note that wrapper functions are used for Python closure
+# so that we can pass arguments.
+
+def hook_forward(module_name, grads, hook_backward):
+    def hook(module, args, output):
+        """Forward pass hook which attaches backward pass hooks to intermediate tensors"""
+        output.register_hook(hook_backward(module_name, grads))
+    return hook
+
+def hook_backward(module_name, grads):
+    def hook(grad):
+        """Backward pass hook which appends gradients"""
+        grads.append((module_name, grad))
+    return hook
+
+def get_all_layers(model, hook_forward, hook_backward):
+    """Register forward pass hook (which registers a backward hook) to model outputs
+
+    Returns:
+        - layers: a dict with keys as layer/module and values as layer/module names
+          e.g. layers[nn.Conv2d] = layer1.0.conv1
+        - grads: a list of tuples with module name and tensor output gradient
+          e.g.
grads[0] == (layer1.0.conv1, tensor.Torch(...)) + """ + layers = dict() + grads = [] + for name, layer in model.named_modules(): + # skip Sequential and/or wrapper modules + if any(layer.children()) is False: + layers[layer] = name + layer.register_forward_hook(hook_forward(name, grads, hook_backward)) + return layers, grads + +# register hooks +layers_bn, grads_bn = get_all_layers(model_bn, hook_forward, hook_backward) +layers_nobn, grads_nobn = get_all_layers(model_nobn, hook_forward, hook_backward) + + +###################################################################### +# Training and visualization +# -------------------------- +# +# Let’s now train the models for a few epochs: +# + +epochs = 10 + +for epoch in range(epochs): + + # important to clear, because we append to + # outputs everytime we do a forward pass + grads_bn.clear() + grads_nobn.clear() + + optimizer_bn.zero_grad() + optimizer_nobn.zero_grad() + + y_pred_bn = model_bn(x) + y_pred_nobn = model_nobn(x) + + loss_bn = F.cross_entropy(y_pred_bn, y) + loss_nobn = F.cross_entropy(y_pred_nobn, y) + + loss_bn.backward() + loss_nobn.backward() + + optimizer_bn.step() + optimizer_nobn.step() + + +###################################################################### +# After running the forward and backward pass, the gradients for all the +# intermediate tensors should be present in ``grads_bn`` and +# ``grads_nobn``. We compute the mean absolute value of each gradient +# matrix so that we can compare the two models. +# + +def get_grads(grads): + layer_idx = [] + avg_grads = [] + for idx, (name, grad) in enumerate(grads): + if grad is not None: + avg_grad = grad.abs().mean() + avg_grads.append(avg_grad) + # idx is backwards since we appended in backward pass + layer_idx.append(len(grads) - 1 - idx) + return layer_idx, avg_grads + +layer_idx_bn, avg_grads_bn = get_grads(grads_bn) +layer_idx_nobn, avg_grads_nobn = get_grads(grads_nobn) + + +###################################################################### +# With the average gradients computed, we can now plot them and see how +# the values change as a function of the network depth. Notice that when +# we don’t apply batch normalization, the gradient values in the +# intermediate layers fall to zero very quickly. The batch normalization +# model, however, maintains non-zero gradients in its intermediate layers. +# + +fig, ax = plt.subplots() +ax.plot(layer_idx_bn, avg_grads_bn, label="With BatchNorm", marker="o") +ax.plot(layer_idx_nobn, avg_grads_nobn, label="Without BatchNorm", marker="x") +ax.set_xlabel("Layer depth") +ax.set_ylabel("Average gradient") +ax.set_title("Gradient flow") +ax.grid(True) +ax.legend() +plt.show() + + +###################################################################### +# Conclusion +# ---------- +# +# In this tutorial, we demonstrated how to visualize the gradient flow +# through a neural network wrapped in a ``nn.Module`` class. We +# qualitatively showed how batch normalization helps to alleviate the +# vanishing gradient issue which occurs with deep neural networks. +# +# If you would like to learn more about how PyTorch’s autograd system +# works, please visit the `references <#references>`__ below. If you have +# any feedback for this tutorial (improvements, typo fixes, etc.) then +# please use the `PyTorch Forums `__ and/or +# the `issue tracker `__ to +# reach out. 
+# + + +###################################################################### +# (Optional) Additional exercises +# ------------------------------- +# +# - Try increasing the number of layers (``num_layers``) in our model and +# see what effect this has on the gradient flow graph +# - How would you adapt the code to visualize average activations instead +# of average gradients? (*Hint: in the hook_forward() function we have +# access to the raw tensor output*) +# - What are some other methods to deal with vanishing and exploding +# gradients? +# + + +###################################################################### +# References +# ---------- +# +# - `A Gentle Introduction to +# torch.autograd `__ +# - `Automatic Differentiation with +# torch.autograd `__ +# - `Autograd +# mechanics `__ +# - `Batch Normalization: Accelerating Deep Network Training by Reducing +# Internal Covariate Shift `__ +# - `On the difficulty of training Recurrent Neural +# Networks `__ +# \ No newline at end of file diff --git a/intro.rst b/intro.rst new file mode 100644 index 00000000000..1f9c8ceb4ef --- /dev/null +++ b/intro.rst @@ -0,0 +1,66 @@ +Intro +===== + +This is a collection of beginner-friendly resources to help you get +started with PyTorch. These tutorials cover fundamental concepts, +basic operations, and essential workflows to build a solid foundation +for your deep learning journey. Perfect for newcomers looking to +understand PyTorch's core functionality through step-by-step guidance. + +.. grid:: 2 + + .. grid-item-card:: :octicon:`file-code;1em` + Learn the Basics + :link: https://pytorch.org/tutorials/beginner/basics/intro.html + :link-type: url + + A step-by-step interactive series for those just starting out + with PyTorch. + +++ + :octicon:`code;1em` Code + + .. grid-item-card:: :octicon:`file-code;1em` + Introduction to PyTorch - YouTube Series + :link: https://pytorch.org/tutorials/beginner/introyt/introyt_index.html + :link-type: url + + Learn the fundamentals of PyTorch through our video series, + perfect for those who prefer learning from videos. + +++ + :octicon:`code;1em` Code :octicon:`square-fill;1em` :octicon:`video;1em` Video + +.. grid:: 2 + + .. grid-item-card:: :octicon:`file-code;1em` + Learning PyTorch + :link: https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html + :link-type: url + + Quickly grasp the basics of PyTorch with these bite-sized + foundational recipes. + +++ + :octicon:`code;1em` Code + +.. toctree:: + :maxdepth: 1 + :hidden: + :includehidden: + :caption: Getting Started with PyTorch + + beginner/basics/intro + beginner/introyt/introyt_index + +.. toctree:: + :maxdepth: 1 + :hidden: + :includehidden: + :caption: Learning PyTorch + + beginner/deep_learning_60min_blitz + beginner/pytorch_with_examples + beginner/nn_tutorial + beginner/understanding_leaf_vs_nonleaf_tutorial + intermediate/nlp_from_scratch_index + intermediate/tensorboard_tutorial + intermediate/pinmem_nonblock + intermediate/visualizing_gradients_tutorial diff --git a/lychee.toml b/lychee.toml new file mode 100644 index 00000000000..b07496f7876 --- /dev/null +++ b/lychee.toml @@ -0,0 +1 @@ +exclude_path = [".jenkins/build.sh", "_static/img/", "_static/images/"] diff --git a/prototype_source/README.txt b/prototype_source/README.txt deleted file mode 100644 index 4ab9ce8f6a9..00000000000 --- a/prototype_source/README.txt +++ /dev/null @@ -1,38 +0,0 @@ -Prototype Tutorials ------------------- -1. 
distributed_rpc_profiling.rst - Profiling PyTorch RPC-Based Workloads - https://github.com/pytorch/tutorials/blob/main/prototype_source/distributed_rpc_profiling.rst - -2. graph_mode_static_quantization_tutorial.py - Graph Mode Post Training Static Quantization in PyTorch - https://pytorch.org/tutorials/prototype/graph_mode_static_quantization_tutorial.html - -3. graph_mode_dynamic_bert_tutorial.rst - Graph Mode Dynamic Quantization on BERT - https://github.com/pytorch/tutorials/blob/main/prototype_source/graph_mode_dynamic_bert_tutorial.rst - -4. numeric_suite_tutorial.py - PyTorch Numeric Suite Tutorial - https://github.com/pytorch/tutorials/blob/main/prototype_source/numeric_suite_tutorial.py - -5. torchscript_freezing.py - Model Freezing in TorchScript - https://github.com/pytorch/tutorials/blob/main/prototype_source/torchscript_freezing.py - -6. vulkan_workflow.rst - Vulkan Backend User Workflow - https://pytorch.org/tutorials/intermediate/vulkan_workflow.html - -7. fx_graph_mode_ptq_static.rst - FX Graph Mode Post Training Static Quantization - https://pytorch.org/tutorials/prototype/fx_graph_mode_ptq_static.html - -8. fx_graph_mode_ptq_dynamic.py - FX Graph Mode Post Training Dynamic Quantization - https://pytorch.org/tutorials/prototype/fx_graph_mode_ptq_dynamic.html - -9. fx_graph_mode_quant_guide.py - FX Graph Mode Quantization User Guide - https://pytorch.org/tutorials/prototype/fx_graph_mode_quant_guide.html - diff --git a/prototype_source/backend_config_tutorial.rst b/prototype_source/backend_config_tutorial.rst deleted file mode 100644 index ba6729285e5..00000000000 --- a/prototype_source/backend_config_tutorial.rst +++ /dev/null @@ -1,326 +0,0 @@ -(prototype) PyTorch BackendConfig Tutorial -========================================== -**Author**: `Andrew Or `_ - -The BackendConfig API enables developers to integrate their backends -with PyTorch quantization. It is currently only supported in FX graph -mode quantization, but support may be extended to other modes of -quantization in the future. In this tutorial, we will demonstrate how to -use this API to customize quantization support for specific backends. -For more information on the motivation and implementation details behind -BackendConfig, please refer to this -`README `__. - -Suppose we are a backend developer and we wish to integrate our backend -with PyTorch's quantization APIs. Our backend consists of two ops only: -quantized linear and quantized conv-relu. In this section, we will walk -through how to achieve this by quantizing an example model using a custom -BackendConfig through `prepare_fx` and `convert_fx`. - -.. code:: ipython3 - - import torch - from torch.ao.quantization import ( - default_weight_observer, - get_default_qconfig_mapping, - MinMaxObserver, - QConfig, - QConfigMapping, - ) - from torch.ao.quantization.backend_config import ( - BackendConfig, - BackendPatternConfig, - DTypeConfig, - DTypeWithConstraints, - ObservationType, - ) - from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx - -1. Derive reference pattern for each quantized operator --------------------------------------------------------- - -For quantized linear, suppose our backend expects the reference pattern -`[dequant - fp32_linear - quant]` and lowers it into a single quantized -linear op. 
The way to achieve this is to first insert quant-dequant ops -before and after the float linear op, such that we produce the following -reference model:: - - quant1 - [dequant1 - fp32_linear - quant2] - dequant2 - -Similarly, for quantized conv-relu, we wish to produce the following -reference model, where the reference pattern in the square brackets will -be lowered into a single quantized conv-relu op:: - - quant1 - [dequant1 - fp32_conv_relu - quant2] - dequant2 - -2. Set DTypeConfigs with backend constraints ---------------------------------------------- - -In the reference patterns above, the input dtype specified in the -DTypeConfig will be passed as the dtype argument to quant1, while the -output dtype will be passed as the dtype argument to quant2. If the output -dtype is fp32, as in the case of dynamic quantization, then the output -quant-dequant pair will not be inserted. This example also shows how to -specify restrictions on quantization and scale ranges on a particular dtype. - -.. code:: ipython3 - - quint8_with_constraints = DTypeWithConstraints( - dtype=torch.quint8, - quant_min_lower_bound=0, - quant_max_upper_bound=255, - scale_min_lower_bound=2 ** -12, - ) - - # Specify the dtypes passed to the quantized ops in the reference model spec - weighted_int8_dtype_config = DTypeConfig( - input_dtype=quint8_with_constraints, - output_dtype=quint8_with_constraints, - weight_dtype=torch.qint8, - bias_dtype=torch.float) - -3. Set up fusion for conv-relu -------------------------------- - -Note that the original user model contains separate conv and relu ops, -so we need to first fuse the conv and relu ops into a single conv-relu -op (`fp32_conv_relu`), and then quantize this op similar to how the linear -op is quantized. We can set up fusion by defining a function that accepts -3 arguments, where the first is whether or not this is for QAT, and the -remaining arguments refer to the individual items of the fused pattern. - -.. code:: ipython3 - - def fuse_conv2d_relu(is_qat, conv, relu): - """Return a fused ConvReLU2d from individual conv and relu modules.""" - return torch.ao.nn.intrinsic.ConvReLU2d(conv, relu) - -4. Define the BackendConfig ----------------------------- - -Now we have all the necessary pieces, so we go ahead and define our -BackendConfig. Here we use different observers (will be renamed) for -the input and output for the linear op, so the quantization params -passed to the two quantize ops (quant1 and quant2) will be different. -This is commonly the case for weighted ops like linear and conv. - -For the conv-relu op, the observation type is the same. However, we -need two BackendPatternConfigs to support this op, one for fusion -and one for quantization. For both conv-relu and linear, we use the -DTypeConfig defined above. - -.. 
code:: ipython3 - - linear_config = BackendPatternConfig() \ - .set_pattern(torch.nn.Linear) \ - .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \ - .add_dtype_config(weighted_int8_dtype_config) \ - .set_root_module(torch.nn.Linear) \ - .set_qat_module(torch.nn.qat.Linear) \ - .set_reference_quantized_module(torch.ao.nn.quantized.reference.Linear) - - # For fusing Conv2d + ReLU into ConvReLU2d - # No need to set observation type and dtype config here, since we are not - # inserting quant-dequant ops in this step yet - conv_relu_config = BackendPatternConfig() \ - .set_pattern((torch.nn.Conv2d, torch.nn.ReLU)) \ - .set_fused_module(torch.ao.nn.intrinsic.ConvReLU2d) \ - .set_fuser_method(fuse_conv2d_relu) - - # For quantizing ConvReLU2d - fused_conv_relu_config = BackendPatternConfig() \ - .set_pattern(torch.ao.nn.intrinsic.ConvReLU2d) \ - .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \ - .add_dtype_config(weighted_int8_dtype_config) \ - .set_root_module(torch.nn.Conv2d) \ - .set_qat_module(torch.ao.nn.intrinsic.qat.ConvReLU2d) \ - .set_reference_quantized_module(torch.ao.nn.quantized.reference.Conv2d) - - backend_config = BackendConfig("my_backend") \ - .set_backend_pattern_config(linear_config) \ - .set_backend_pattern_config(conv_relu_config) \ - .set_backend_pattern_config(fused_conv_relu_config) - -5. Set up QConfigMapping that satisfies the backend constraints ----------------------------------------------------------------- - -In order to use the ops defined above, the user must define a QConfig -that satisfies the constraints specified in the DTypeConfig. For more -detail, see the documentation for `DTypeConfig `__. -We will then use this QConfig for all the modules used in the patterns -we wish to quantize. - -.. code:: ipython3 - - # Note: Here we use a quant_max of 127, but this could be up to 255 (see `quint8_with_constraints`) - activation_observer = MinMaxObserver.with_args(quant_min=0, quant_max=127, eps=2 ** -12) - qconfig = QConfig(activation=activation_observer, weight=default_weight_observer) - - # Note: All individual items of a fused pattern, e.g. Conv2d and ReLU in - # (Conv2d, ReLU), must have the same QConfig - qconfig_mapping = QConfigMapping() \ - .set_object_type(torch.nn.Linear, qconfig) \ - .set_object_type(torch.nn.Conv2d, qconfig) \ - .set_object_type(torch.nn.BatchNorm2d, qconfig) \ - .set_object_type(torch.nn.ReLU, qconfig) - -6. Quantize the model through prepare and convert --------------------------------------------------- - -Finally, we quantize the model by passing the BackendConfig we defined -into prepare and convert. This produces a quantized linear module and -a fused quantized conv-relu module. - -.. code:: ipython3 - - class MyModel(torch.nn.Module): - def __init__(self, use_bn: bool): - super().__init__() - self.linear = torch.nn.Linear(10, 3) - self.conv = torch.nn.Conv2d(3, 3, 3) - self.bn = torch.nn.BatchNorm2d(3) - self.relu = torch.nn.ReLU() - self.sigmoid = torch.nn.Sigmoid() - self.use_bn = use_bn - - def forward(self, x): - x = self.linear(x) - x = self.conv(x) - if self.use_bn: - x = self.bn(x) - x = self.relu(x) - x = self.sigmoid(x) - return x - - example_inputs = (torch.rand(1, 3, 10, 10, dtype=torch.float),) - model = MyModel(use_bn=False) - prepared = prepare_fx(model, qconfig_mapping, example_inputs, backend_config=backend_config) - prepared(*example_inputs) # calibrate - converted = convert_fx(prepared, backend_config=backend_config) - -.. 
parsed-literal:: - - >>> print(converted) - - GraphModule( - (linear): QuantizedLinear(in_features=10, out_features=3, scale=0.012136868201196194, zero_point=67, qscheme=torch.per_tensor_affine) - (conv): QuantizedConvReLU2d(3, 3, kernel_size=(3, 3), stride=(1, 1), scale=0.0029353597201406956, zero_point=0) - (sigmoid): Sigmoid() - ) - - def forward(self, x): - linear_input_scale_0 = self.linear_input_scale_0 - linear_input_zero_point_0 = self.linear_input_zero_point_0 - quantize_per_tensor = torch.quantize_per_tensor(x, linear_input_scale_0, linear_input_zero_point_0, torch.quint8); x = linear_input_scale_0 = linear_input_zero_point_0 = None - linear = self.linear(quantize_per_tensor); quantize_per_tensor = None - conv = self.conv(linear); linear = None - dequantize_2 = conv.dequantize(); conv = None - sigmoid = self.sigmoid(dequantize_2); dequantize_2 = None - return sigmoid - -(7. Experiment with faulty BackendConfig setups) -------------------------------------------------- - -As an experiment, here we modify the model to use conv-bn-relu -instead of conv-relu, but use the same BackendConfig, which doesn't -know how to quantize conv-bn-relu. As a result, only linear is -quantized, but conv-bn-relu is neither fused nor quantized. - -.. code:: ipython3 - # Only linear is quantized, since there's no rule for fusing conv-bn-relu - example_inputs = (torch.rand(1, 3, 10, 10, dtype=torch.float),) - model = MyModel(use_bn=True) - prepared = prepare_fx(model, qconfig_mapping, example_inputs, backend_config=backend_config) - prepared(*example_inputs) # calibrate - converted = convert_fx(prepared, backend_config=backend_config) - -.. parsed-literal:: - - >>> print(converted) - - GraphModule( - (linear): QuantizedLinear(in_features=10, out_features=3, scale=0.015307803638279438, zero_point=95, qscheme=torch.per_tensor_affine) - (conv): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1)) - (bn): BatchNorm2d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU() - (sigmoid): Sigmoid() - ) - - def forward(self, x): - linear_input_scale_0 = self.linear_input_scale_0 - linear_input_zero_point_0 = self.linear_input_zero_point_0 - quantize_per_tensor = torch.quantize_per_tensor(x, linear_input_scale_0, linear_input_zero_point_0, torch.quint8); x = linear_input_scale_0 = linear_input_zero_point_0 = None - linear = self.linear(quantize_per_tensor); quantize_per_tensor = None - dequantize_1 = linear.dequantize(); linear = None - conv = self.conv(dequantize_1); dequantize_1 = None - bn = self.bn(conv); conv = None - relu = self.relu(bn); bn = None - sigmoid = self.sigmoid(relu); relu = None - return sigmoid - -As another experiment, here we use the default QConfigMapping that -doesn't satisfy the dtype constraints specified in the backend. As -a result, nothing is quantized since the QConfigs are simply ignored. - -.. code:: ipython3 - # Nothing is quantized or fused, since backend constraints are not satisfied - example_inputs = (torch.rand(1, 3, 10, 10, dtype=torch.float),) - model = MyModel(use_bn=True) - prepared = prepare_fx(model, get_default_qconfig_mapping(), example_inputs, backend_config=backend_config) - prepared(*example_inputs) # calibrate - converted = convert_fx(prepared, backend_config=backend_config) - -.. 
parsed-literal:: - - >>> print(converted) - - GraphModule( - (linear): Linear(in_features=10, out_features=3, bias=True) - (conv): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1)) - (bn): BatchNorm2d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU() - (sigmoid): Sigmoid() - ) - - def forward(self, x): - linear = self.linear(x); x = None - conv = self.conv(linear); linear = None - bn = self.bn(conv); conv = None - relu = self.relu(bn); bn = None - sigmoid = self.sigmoid(relu); relu = None - return sigmoid - - -Built-in BackendConfigs ------------------------ - -PyTorch quantization supports a few built-in native BackendConfigs under -the ``torch.ao.quantization.backend_config`` namespace: - -- `get_fbgemm_backend_config `__: - for server target settings -- `get_qnnpack_backend_config `__: - for mobile and edge device target settings, also supports XNNPACK - quantized ops -- `get_native_backend_config `__ - (default): a BackendConfig that supports a union of the operator - patterns supported in the FBGEMM and QNNPACK BackendConfigs - -There are also other BackendConfigs under development (e.g. for -TensorRT and x86), but these are still mostly experimental at the -moment. If the user wishes to integrate a new, custom backend with -PyTorch’s quantization API, they may define their own BackendConfigs -using the same set of APIs used to define the natively supported -ones as in the example above. - -Further Reading ---------------- - -How BackendConfig is used in FX graph mode quantization: -https://github.com/pytorch/pytorch/blob/master/torch/ao/quantization/fx/README.md - -Motivation and implementation details behind BackendConfig: -https://github.com/pytorch/pytorch/blob/master/torch/ao/quantization/backend_config/README.md - -Early design of BackendConfig: -https://github.com/pytorch/rfcs/blob/master/RFC-0019-Extending-PyTorch-Quantization-to-Custom-Backends.md diff --git a/prototype_source/fx_graph_mode_ptq_dynamic.py b/prototype_source/fx_graph_mode_ptq_dynamic.py deleted file mode 100644 index 84d6ccb1832..00000000000 --- a/prototype_source/fx_graph_mode_ptq_dynamic.py +++ /dev/null @@ -1,310 +0,0 @@ -""" -(prototype) FX Graph Mode Post Training Dynamic Quantization -============================================================ - -**Author**: `Jerry Zhang `_ - -This tutorial introduces the steps to do post training dynamic quantization in graph mode based on ``torch.fx``. -We have a separate tutorial for `FX Graph Mode Post Training Static Quantization `_, -comparison between FX Graph Mode Quantization and Eager Mode Quantization can be found in the `quantization docs `_ - -tldr; The FX Graph Mode API for dynamic quantization looks like the following: - -.. code:: python - - import torch - from torch.ao.quantization import default_dynamic_qconfig, QConfigMapping - # Note that this is temporary, we'll expose these functions to torch.ao.quantization after official releasee - from torch.quantization.quantize_fx import prepare_fx, convert_fx - - float_model.eval() - # The old 'fbgemm' is still available but 'x86' is the recommended default. 
- qconfig = get_default_qconfig("x86") - qconfig_mapping = QConfigMapping().set_global(qconfig) - prepared_model = prepare_fx(float_model, qconfig_mapping, example_inputs) # fuse modules and insert observers - # no calibration is required for dynamic quantization - quantized_model = convert_fx(prepared_model) # convert the model to a dynamically quantized model - -In this tutorial, we’ll apply dynamic quantization to an LSTM-based next word-prediction model, -closely following the word language model from the PyTorch examples. -We will copy the code from `Dynamic Quantization on an LSTM Word Language Model `_ -and omit the descriptions. - -""" - - -################################################### -# 1. Define the Model, Download Data and Model -# -------------------------------------------- -# -# Download the `data `_ -# and unzip to data folder -# -# .. code:: -# -# mkdir data -# cd data -# wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip -# unzip wikitext-2-v1.zip -# -# Download model to the data folder: -# -# .. code:: -# -# wget https://s3.amazonaws.com/pytorch-tutorial-assets/word_language_model_quantize.pth -# -# Define the model: - -# imports -import os -from io import open -import time -import copy - -import torch -import torch.nn as nn -import torch.nn.functional as F - -# Model Definition -class LSTMModel(nn.Module): - """Container module with an encoder, a recurrent module, and a decoder.""" - - def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5): - super(LSTMModel, self).__init__() - self.drop = nn.Dropout(dropout) - self.encoder = nn.Embedding(ntoken, ninp) - self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) - self.decoder = nn.Linear(nhid, ntoken) - - self.init_weights() - - self.nhid = nhid - self.nlayers = nlayers - - def init_weights(self): - initrange = 0.1 - self.encoder.weight.data.uniform_(-initrange, initrange) - self.decoder.bias.data.zero_() - self.decoder.weight.data.uniform_(-initrange, initrange) - - def forward(self, input, hidden): - emb = self.drop(self.encoder(input)) - output, hidden = self.rnn(emb, hidden) - output = self.drop(output) - decoded = self.decoder(output) - return decoded, hidden - - -def init_hidden(lstm_model, bsz): - # get the weight tensor and create hidden layer in the same device - weight = lstm_model.encoder.weight - # get weight from quantized model - if not isinstance(weight, torch.Tensor): - weight = weight() - device = weight.device - nlayers = lstm_model.rnn.num_layers - nhid = lstm_model.rnn.hidden_size - return (torch.zeros(nlayers, bsz, nhid, device=device), - torch.zeros(nlayers, bsz, nhid, device=device)) - - -# Load Text Data -class Dictionary(object): - def __init__(self): - self.word2idx = {} - self.idx2word = [] - - def add_word(self, word): - if word not in self.word2idx: - self.idx2word.append(word) - self.word2idx[word] = len(self.idx2word) - 1 - return self.word2idx[word] - - def __len__(self): - return len(self.idx2word) - - -class Corpus(object): - def __init__(self, path): - self.dictionary = Dictionary() - self.train = self.tokenize(os.path.join(path, 'wiki.train.tokens')) - self.valid = self.tokenize(os.path.join(path, 'wiki.valid.tokens')) - self.test = self.tokenize(os.path.join(path, 'wiki.test.tokens')) - - def tokenize(self, path): - """Tokenizes a text file.""" - assert os.path.exists(path) - # Add words to the dictionary - with open(path, 'r', encoding="utf8") as f: - for line in f: - words = line.split() + [''] - for word in words: - 
self.dictionary.add_word(word) - - # Tokenize file content - with open(path, 'r', encoding="utf8") as f: - idss = [] - for line in f: - words = line.split() + [''] - ids = [] - for word in words: - ids.append(self.dictionary.word2idx[word]) - idss.append(torch.tensor(ids).type(torch.int64)) - ids = torch.cat(idss) - - return ids - -model_data_filepath = 'data/' - -corpus = Corpus(model_data_filepath + 'wikitext-2') - -ntokens = len(corpus.dictionary) - -# Load Pretrained Model -model = LSTMModel( - ntoken = ntokens, - ninp = 512, - nhid = 256, - nlayers = 5, -) - -model.load_state_dict( - torch.load( - model_data_filepath + 'word_language_model_quantize.pth', - map_location=torch.device('cpu') - ) - ) - -model.eval() -print(model) - -bptt = 25 -criterion = nn.CrossEntropyLoss() -eval_batch_size = 1 - -# create test data set -def batchify(data, bsz): - # Work out how cleanly we can divide the dataset into bsz parts. - nbatch = data.size(0) // bsz - # Trim off any extra elements that wouldn't cleanly fit (remainders). - data = data.narrow(0, 0, nbatch * bsz) - # Evenly divide the data across the bsz batches. - return data.view(bsz, -1).t().contiguous() - -test_data = batchify(corpus.test, eval_batch_size) -example_inputs = (next(iter(test_data))[0]) - -# Evaluation functions -def get_batch(source, i): - seq_len = min(bptt, len(source) - 1 - i) - data = source[i:i+seq_len] - target = source[i+1:i+1+seq_len].reshape(-1) - return data, target - -def repackage_hidden(h): - """Wraps hidden states in new Tensors, to detach them from their history.""" - - if isinstance(h, torch.Tensor): - return h.detach() - else: - return tuple(repackage_hidden(v) for v in h) - -def evaluate(model_, data_source): - # Turn on evaluation mode which disables dropout. - model_.eval() - total_loss = 0. - hidden = init_hidden(model_, eval_batch_size) - with torch.no_grad(): - for i in range(0, data_source.size(0) - 1, bptt): - data, targets = get_batch(data_source, i) - output, hidden = model_(data, hidden) - hidden = repackage_hidden(hidden) - output_flat = output.view(-1, ntokens) - total_loss += len(data) * criterion(output_flat, targets).item() - return total_loss / (len(data_source) - 1) - -###################################################################### -# 2. Post Training Dynamic Quantization -# ------------------------------------- -# Now we can dynamically quantize the model. -# We can use the same function as post training static quantization but with a dynamic qconfig. 
- -from torch.quantization.quantize_fx import prepare_fx, convert_fx -from torch.ao.quantization import default_dynamic_qconfig, float_qparams_weight_only_qconfig, QConfigMapping - -# Full docs for supported qconfig for floating point modules/ops can be found in `quantization docs `_ -# Full docs for `QConfigMapping `_ -qconfig_mapping = (QConfigMapping() - .set_object_type(nn.Embedding, float_qparams_weight_only_qconfig) - .set_object_type(nn.LSTM, default_dynamic_qconfig) - .set_object_type(nn.Linear, default_dynamic_qconfig) -) -# Load model to create the original model because quantization api changes the model inplace and we want -# to keep the original model for future comparison - - -model_to_quantize = LSTMModel( - ntoken = ntokens, - ninp = 512, - nhid = 256, - nlayers = 5, -) - -model_to_quantize.load_state_dict( - torch.load( - model_data_filepath + 'word_language_model_quantize.pth', - map_location=torch.device('cpu') - ) - ) - -model_to_quantize.eval() - - -prepared_model = prepare_fx(model_to_quantize, qconfig_mapping, example_inputs) -print("prepared model:", prepared_model) -quantized_model = convert_fx(prepared_model) -print("quantized model", quantized_model) - - -###################################################################### -# For dynamically quantized objects, we didn't do anything in ``prepare_fx`` for modules, -# but will insert observers for weight for dynamically quantizable forunctionals and torch ops. -# We also fuse the modules like Conv + Bn, Linear + ReLU. -# -# In convert we'll convert the float modules to dynamically quantized modules and -# convert float ops to dynamically quantized ops. We can see in the example model, -# ``nn.Embedding``, ``nn.Linear`` and ``nn.LSTM`` are dynamically quantized. -# -# Now we can compare the size and runtime of the quantized model. - -def print_size_of_model(model): - torch.save(model.state_dict(), "temp.p") - print('Size (MB):', os.path.getsize("temp.p")/1e6) - os.remove('temp.p') - -print_size_of_model(model) -print_size_of_model(quantized_model) - -###################################################################### -# There is a 4x size reduction because we quantized all the weights -# in the model (nn.Embedding, nn.Linear and nn.LSTM) from float (4 bytes) to quantized int (1 byte). - -torch.set_num_threads(1) - -def time_model_evaluation(model, test_data): - s = time.time() - loss = evaluate(model, test_data) - elapsed = time.time() - s - print('''loss: {0:.3f}\nelapsed time (seconds): {1:.1f}'''.format(loss, elapsed)) - -time_model_evaluation(model, test_data) -time_model_evaluation(quantized_model, test_data) - -##################################################################### -# There is a roughly 2x speedup for this model. Also note that the speedup -# may vary depending on model, device, build, input batch sizes, threading etc. -# -# 3. Conclusion -# ------------- -# This tutorial introduces the api for post training dynamic quantization in FX Graph Mode, -# which dynamically quantizes the same modules as Eager Mode Quantization. 
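-######################################################################
-# As a point of comparison, the eager mode dynamic quantization API reaches
-# the ``nn.LSTM`` and ``nn.Linear`` modules of the same model with a single
-# call. The sketch below is illustrative only and assumes the ``model``,
-# ``print_size_of_model`` and imports defined above; it is not part of the
-# FX graph mode flow.
-
-eager_quantized_model = torch.ao.quantization.quantize_dynamic(
-    copy.deepcopy(model),     # keep the original float model intact
-    {nn.LSTM, nn.Linear},     # module types to replace with dynamically quantized versions
-    dtype=torch.qint8,        # weights are stored as int8
-)
-print_size_of_model(eager_quantized_model)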
diff --git a/prototype_source/fx_graph_mode_ptq_static.rst b/prototype_source/fx_graph_mode_ptq_static.rst deleted file mode 100644 index a7165f713f8..00000000000 --- a/prototype_source/fx_graph_mode_ptq_static.rst +++ /dev/null @@ -1,411 +0,0 @@ -(prototype) FX Graph Mode Post Training Static Quantization -=========================================================== -**Author**: `Jerry Zhang `_ **Edited by**: `Charles Hernandez `_ - -This tutorial introduces the steps to do post training static quantization in graph mode based on -`torch.fx `_. -The advantage of FX graph mode quantization is that we can perform quantization fully automatically on the model. -Although there might be some effort required to make the model compatible with FX Graph Mode Quantization (symbolically traceable with ``torch.fx``), -we'll have a separate tutorial to show how to make the part of the model we want to quantize compatible with FX Graph Mode Quantization. -We also have a tutorial for `FX Graph Mode Post Training Dynamic Quantization `_. -tldr; The FX Graph Mode API looks like the following: - -.. code:: python - - import torch - from torch.ao.quantization import get_default_qconfig - from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx - from torch.ao.quantization import QConfigMapping - float_model.eval() - # The old 'fbgemm' is still available but 'x86' is the recommended default. - qconfig = get_default_qconfig("x86") - qconfig_mapping = QConfigMapping().set_global(qconfig) - def calibrate(model, data_loader): - model.eval() - with torch.no_grad(): - for image, target in data_loader: - model(image) - example_inputs = (next(iter(data_loader))[0]) # get an example input - prepared_model = prepare_fx(float_model, qconfig_mapping, example_inputs) # fuse modules and insert observers - calibrate(prepared_model, data_loader_test) # run calibration on sample data - quantized_model = convert_fx(prepared_model) # convert the calibrated model to a quantized model - - - -1. Motivation of FX Graph Mode Quantization -------------------------------------------- - -Currently, PyTorch only has eager mode quantization as an alternative: `Static Quantization with Eager Mode in PyTorch `_. - -We can see there are multiple manual steps involved in the eager mode quantization process, including: - -- Explicitly quantize and dequantize activations-this is time consuming when floating point and quantized operations are mixed in a model. -- Explicitly fuse modules-this requires manually identifying the sequence of convolutions, batch norms and relus and other fusion patterns. -- Special handling is needed for pytorch tensor operations (like add, concat etc.) -- Functionals did not have first class support (functional.conv2d and functional.linear would not get quantized) - -Most of these required modifications comes from the underlying limitations of eager mode quantization. Eager mode works in module level since it can not inspect the code that is actually run (in the forward function), quantization is achieved by module swapping, and we don’t know how the modules are used in forward function in eager mode, so it requires users to insert QuantStub and DeQuantStub manually to mark the points they want to quantize or dequantize. -In graph mode, we can inspect the actual code that’s been executed in forward function (e.g. aten function calls) and quantization is achieved by module and graph manipulations. 
Since graph mode has full visibility of the code that is run, our tool is able to automatically figure out things like which modules to fuse and where to insert observer calls, quantize/dequantize functions etc., we are able to automate the whole quantization process. - -Advantages of FX Graph Mode Quantization are: - -- Simple quantization flow, minimal manual steps -- Unlocks the possibility of doing higher level optimizations like automatic precision selection - -2. Define Helper Functions and Prepare Dataset ----------------------------------------------- - -We’ll start by doing the necessary imports, defining some helper functions and prepare the data. -These steps are identitcal to `Static Quantization with Eager Mode in PyTorch `_. - -To run the code in this tutorial using the entire ImageNet dataset, first download imagenet by following the instructions at here `ImageNet Data `_. Unzip the downloaded file into the 'data_path' folder. - -Download the `torchvision resnet18 model `_ and rename it to -``data/resnet18_pretrained_float.pth``. - -.. code:: python - - import os - import sys - import time - import numpy as np - - import torch - from torch.ao.quantization import get_default_qconfig, QConfigMapping - from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx, fuse_fx - import torch.nn as nn - from torch.utils.data import DataLoader - - import torchvision - from torchvision import datasets - from torchvision.models.resnet import resnet18 - import torchvision.transforms as transforms - - # Set up warnings - import warnings - warnings.filterwarnings( - action='ignore', - category=DeprecationWarning, - module=r'.*' - ) - warnings.filterwarnings( - action='default', - module=r'torch.ao.quantization' - ) - - # Specify random seed for repeatable results - _ = torch.manual_seed(191009) - - - class AverageMeter(object): - """Computes and stores the average and current value""" - def __init__(self, name, fmt=':f'): - self.name = name - self.fmt = fmt - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' - return fmtstr.format(**self.__dict__) - - - def accuracy(output, target, topk=(1,)): - """Computes the accuracy over the k top predictions for the specified values of k""" - with torch.no_grad(): - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - - def evaluate(model, criterion, data_loader): - model.eval() - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') - cnt = 0 - with torch.no_grad(): - for image, target in data_loader: - output = model(image) - loss = criterion(output, target) - cnt += 1 - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - top1.update(acc1[0], image.size(0)) - top5.update(acc5[0], image.size(0)) - print('') - - return top1, top5 - - def load_model(model_file): - model = resnet18(pretrained=False) - state_dict = torch.load(model_file) - model.load_state_dict(state_dict) - model.to("cpu") - return model - - def print_size_of_model(model): - if isinstance(model, torch.jit.RecursiveScriptModule): 
- torch.jit.save(model, "temp.p") - else: - torch.jit.save(torch.jit.script(model), "temp.p") - print("Size (MB):", os.path.getsize("temp.p")/1e6) - os.remove("temp.p") - - def prepare_data_loaders(data_path): - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - dataset = torchvision.datasets.ImageNet( - data_path, split="train", transform=transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - dataset_test = torchvision.datasets.ImageNet( - data_path, split="val", transform=transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])) - - train_sampler = torch.utils.data.RandomSampler(dataset) - test_sampler = torch.utils.data.SequentialSampler(dataset_test) - - data_loader = torch.utils.data.DataLoader( - dataset, batch_size=train_batch_size, - sampler=train_sampler) - - data_loader_test = torch.utils.data.DataLoader( - dataset_test, batch_size=eval_batch_size, - sampler=test_sampler) - - return data_loader, data_loader_test - - data_path = '~/.data/imagenet' - saved_model_dir = 'data/' - float_model_file = 'resnet18_pretrained_float.pth' - - train_batch_size = 30 - eval_batch_size = 50 - - data_loader, data_loader_test = prepare_data_loaders(data_path) - example_inputs = (next(iter(data_loader))[0]) - criterion = nn.CrossEntropyLoss() - float_model = load_model(saved_model_dir + float_model_file).to("cpu") - float_model.eval() - - # create another instance of the model since - # we need to keep the original model around - model_to_quantize = load_model(saved_model_dir + float_model_file).to("cpu") - -3. Set model to eval mode -------------------------- -For post training quantization, we'll need to set model to eval mode. - -.. code:: python - - model_to_quantize.eval() - - -4. Specify how to quantize the model with ``QConfigMapping`` ------------------------------------------------------------- - -.. code:: python - - qconfig_mapping = QConfigMapping.set_global(default_qconfig) - -We use the same qconfig used in eager mode quantization, ``qconfig`` is just a named tuple -of the observers for activation and weight. ``QConfigMapping`` contains mapping information from ops to qconfigs: - -.. code:: python - - qconfig_mapping = (QConfigMapping() - .set_global(qconfig_opt) # qconfig_opt is an optional qconfig, either a valid qconfig or None - .set_object_type(torch.nn.Conv2d, qconfig_opt) # can be a callable... - .set_object_type("reshape", qconfig_opt) # ...or a string of the method - .set_module_name_regex("foo.*bar.*conv[0-9]+", qconfig_opt) # matched in order, first match takes precedence - .set_module_name("foo.bar", qconfig_opt) - .set_module_name_object_type_order() - ) - # priority (in increasing order): global, object_type, module_name_regex, module_name - # qconfig == None means fusion and quantization should be skipped for anything - # matching the rule (unless a higher priority match is found) - - -Utility functions related to ``qconfig`` can be found in the `qconfig `_ file -while those for ``QConfigMapping`` can be found in the `qconfig_mapping ` - -.. code:: python - - # The old 'fbgemm' is still available but 'x86' is the recommended default. - qconfig = get_default_qconfig("x86") - qconfig_mapping = QConfigMapping().set_global(qconfig) - -5. Prepare the Model for Post Training Static Quantization ----------------------------------------------------------- - -.. 
code:: python
-
-    prepared_model = prepare_fx(model_to_quantize, qconfig_mapping, example_inputs)
-
-``prepare_fx`` folds BatchNorm modules into the preceding Conv2d modules and inserts observers
-in appropriate places in the model.
-
-.. code:: python
-
-    prepared_model = prepare_fx(model_to_quantize, qconfig_mapping, example_inputs)
-    print(prepared_model.graph)
-
-6. Calibration
---------------
-The calibration function is run after the observers are inserted in the model.
-The purpose of calibration is to run through some sample inputs that are representative of the workload
-(for example, a sample of the training data set) so that the observers in the model can record
-the statistics of the tensors; we can later use this information to calculate quantization parameters.
-
-.. code:: python
-
-    def calibrate(model, data_loader):
-        model.eval()
-        with torch.no_grad():
-            for image, target in data_loader:
-                model(image)
-    calibrate(prepared_model, data_loader_test)  # run calibration on sample data
-
-7. Convert the Model to a Quantized Model
------------------------------------------
-``convert_fx`` takes a calibrated model and produces a quantized model.
-
-.. code:: python
-
-    quantized_model = convert_fx(prepared_model)
-    print(quantized_model)
-
-8. Evaluation
--------------
-We can now print the size and accuracy of the quantized model.
-
-.. code:: python
-
-    print("Size of model before quantization")
-    print_size_of_model(float_model)
-    print("Size of model after quantization")
-    print_size_of_model(quantized_model)
-    top1, top5 = evaluate(quantized_model, criterion, data_loader_test)
-    print("[before serialization] Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg))
-
-    fx_graph_mode_model_file_path = saved_model_dir + "resnet18_fx_graph_mode_quantized.pth"
-
-    # this does not run due to some errors loading the convrelu module:
-    # ModuleAttributeError: 'ConvReLU2d' object has no attribute '_modules'
-    # save the whole model directly
-    # torch.save(quantized_model, fx_graph_mode_model_file_path)
-    # loaded_quantized_model = torch.load(fx_graph_mode_model_file_path)
-
-    # save with state_dict
-    # torch.save(quantized_model.state_dict(), fx_graph_mode_model_file_path)
-    # import copy
-    # model_to_quantize = copy.deepcopy(float_model)
-    # prepared_model = prepare_fx(model_to_quantize, {"": qconfig})
-    # loaded_quantized_model = convert_fx(prepared_model)
-    # loaded_quantized_model.load_state_dict(torch.load(fx_graph_mode_model_file_path))
-
-    # save with script
-    torch.jit.save(torch.jit.script(quantized_model), fx_graph_mode_model_file_path)
-    loaded_quantized_model = torch.jit.load(fx_graph_mode_model_file_path)
-
-    top1, top5 = evaluate(loaded_quantized_model, criterion, data_loader_test)
-    print("[after serialization/deserialization] Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg))
-
-If you want to get better accuracy or performance, try changing the ``qconfig_mapping``.
-We plan to add support for graph mode in the Numerical Suite so that you can
-easily determine the sensitivity towards quantization of different modules in a model. For more information, see the `PyTorch Numeric Suite Tutorial `_.
-
-9. Debugging Quantized Model
-----------------------------
-We can also print the weights of a quantized and a non-quantized convolution op to see the difference.
-We'll first call ``fuse_fx`` explicitly to fuse the convolution and batch norm in the model.
-Note that ``fuse_fx`` only works in eval mode.
-
-..
code:: python - - fused = fuse_fx(float_model) - - conv1_weight_after_fuse = fused.conv1[0].weight[0] - conv1_weight_after_quant = quantized_model.conv1.weight().dequantize()[0] - - print(torch.max(abs(conv1_weight_after_fuse - conv1_weight_after_quant))) - -10. Comparison with Baseline Float Model and Eager Mode Quantization --------------------------------------------------------------------- - -.. code:: python - - scripted_float_model_file = "resnet18_scripted.pth" - - print("Size of baseline model") - print_size_of_model(float_model) - - top1, top5 = evaluate(float_model, criterion, data_loader_test) - print("Baseline Float Model Evaluation accuracy: %2.2f, %2.2f"%(top1.avg, top5.avg)) - torch.jit.save(torch.jit.script(float_model), saved_model_dir + scripted_float_model_file) - -In this section, we compare the model quantized with FX graph mode quantization with the model -quantized in eager mode. FX graph mode and eager mode produce very similar quantized models, -so the expectation is that the accuracy and speedup are similar as well. - -.. code:: python - - print("Size of Fx graph mode quantized model") - print_size_of_model(quantized_model) - top1, top5 = evaluate(quantized_model, criterion, data_loader_test) - print("FX graph mode quantized model Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg)) - - from torchvision.models.quantization.resnet import resnet18 - eager_quantized_model = resnet18(pretrained=True, quantize=True).eval() - print("Size of eager mode quantized model") - eager_quantized_model = torch.jit.script(eager_quantized_model) - print_size_of_model(eager_quantized_model) - top1, top5 = evaluate(eager_quantized_model, criterion, data_loader_test) - print("eager mode quantized model Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg)) - eager_mode_model_file = "resnet18_eager_mode_quantized.pth" - torch.jit.save(eager_quantized_model, saved_model_dir + eager_mode_model_file) - -We can see that the model size and accuracy of FX graph mode and eager mode quantized model are pretty similar. - -Running the model in AIBench (with single threading) gives the following result: - -.. code:: - - Scripted Float Model: - Self CPU time total: 192.48ms - - Scripted Eager Mode Quantized Model: - Self CPU time total: 50.76ms - - Scripted FX Graph Mode Quantized Model: - Self CPU time total: 50.63ms - -As we can see for resnet18 both FX graph mode and eager mode quantized model get similar speedup over the floating point model, -which is around 2-4x faster than the floating point model. But the actual speedup over floating point model may vary -depending on model, device, build, input batch sizes, threading etc. diff --git a/prototype_source/fx_graph_mode_quant_guide.rst b/prototype_source/fx_graph_mode_quant_guide.rst deleted file mode 100644 index 4ae8496ed52..00000000000 --- a/prototype_source/fx_graph_mode_quant_guide.rst +++ /dev/null @@ -1,324 +0,0 @@ -(prototype) FX Graph Mode Quantization User Guide -=========================================================== - -**Author**: `Jerry Zhang `_ - -FX Graph Mode Quantization requires a symbolically traceable model. -We use the FX framework to convert a symbolically traceable nn.Module instance to IR, -and we operate on the IR to execute the quantization passes. -Please post your question about symbolically tracing your model in `PyTorch Discussion Forum `_ - -Quantization will only work on the symbolically traceable parts of your model. 
-The data dependent control flow-if statements / for loops, and so on using symbolically traced values-are one common pattern which is not supported. -If your model is not symbolically traceable end to end, you have a couple of options to enable FX Graph Mode Quantization only on a part of the model. -You can use any combination of these options: - -1. Non traceable code doesn’t need to be quantized - a. Symbolically trace only the code that needs to be quantized - b. Skip symbolic tracing the non-traceable code - -2. Non traceable code needs to be quantized - a. Refactor your code to make it symbolically traceable - b. Write your own observed and quantized submodule - - -If the code that is not symbolically traceable does not need to be quantized, we have the following two options -to run FX Graph Mode Quantization: - - -Symbolically trace only the code that needs to be quantized ------------------------------------------------------------------ -When the whole model is not symbolically traceable but the submodule we want to quantize is -symbolically traceable, we can run quantization only on that submodule. - -before: - -.. code:: python - - class M(nn.Module): - def forward(self, x): - x = non_traceable_code_1(x) - x = traceable_code(x) - x = non_traceable_code_2(x) - return x - -after: - -.. code:: python - - class FP32Traceable(nn.Module): - def forward(self, x): - x = traceable_code(x) - return x - - class M(nn.Module): - def __init__(self): - self.traceable_submodule = FP32Traceable(...) - def forward(self, x): - x = self.traceable_code_1(x) - # We'll only symbolic trace/quantize this submodule - x = self.traceable_submodule(x) - x = self.traceable_code_2(x) - return x - -quantization code: - -.. code:: python - - qconfig_mapping = QConfigMapping().set_global(qconfig) - model_fp32.traceable_submodule = \ - prepare_fx(model_fp32.traceable_submodule, qconfig_mapping, example_inputs) - -Note if original model needs to be preserved, you will have to -copy it yourself before calling the quantization APIs. - - -Skip symbolically trace the non-traceable code ---------------------------------------------------- -When we have some non-traceable code in the module, and this part of code doesn’t need to be quantized, -we can factor out this part of the code into a submodule and skip symbolically trace that submodule. - - -before - -.. code:: python - - class M(nn.Module): - - def forward(self, x): - x = self.traceable_code_1(x) - x = non_traceable_code(x) - x = self.traceable_code_2(x) - return x - - -after, non-traceable parts moved to a module and marked as a leaf - -.. code:: python - - class FP32NonTraceable(nn.Module): - - def forward(self, x): - x = non_traceable_code(x) - return x - - class M(nn.Module): - - def __init__(self): - ... - self.non_traceable_submodule = FP32NonTraceable(...) - - def forward(self, x): - x = self.traceable_code_1(x) - # we will configure the quantization call to not trace through - # this submodule - x = self.non_traceable_submodule(x) - x = self.traceable_code_2(x) - return x - -quantization code: - -.. 
code:: python - - qconfig_mapping = QConfigMapping.set_global(qconfig) - - prepare_custom_config_dict = { - # option 1 - "non_traceable_module_name": "non_traceable_submodule", - # option 2 - "non_traceable_module_class": [MNonTraceable], - } - model_prepared = prepare_fx( - model_fp32, - qconfig_mapping, - example_inputs, - prepare_custom_config_dict=prepare_custom_config_dict, - ) - -If the code that is not symbolically traceable needs to be quantized, we have the following two options: - -Refactor your code to make it symbolically traceable --------------------------------------------------------- -If it is easy to refactor the code and make the code symbolically traceable, -we can refactor the code and remove the use of non-traceable constructs in python. - -More information about symbolic tracing support can be found `here `_. - -before: - -.. code:: python - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - -This is not symbolically traceable because in x.view(*new_x_shape) -unpacking is not supported, however, it is easy to remove the unpacking -since x.view also supports list input. - - -after: - -.. code:: python - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(new_x_shape) - return x.permute(0, 2, 1, 3) - - -This can be combined with other approaches and the quantization code -depends on the model. - -Write your own observed and quantized submodule ------------------------------------------------------ - -If the non-traceable code can’t be refactored to be symbolically traceable, -for example it has some loops that can’t be eliminated, like nn.LSTM, -we’ll need to factor out the non-traceable code to a submodule (we call it CustomModule in fx graph mode quantization) and -define the observed and quantized version of the submodule (in post training static quantization or quantization aware training for static quantization) -or define the quantized version (in post training dynamic and weight only quantization) - - -before: - -.. code:: python - - class M(nn.Module): - - def forward(self, x): - x = traceable_code_1(x) - x = non_traceable_code(x) - x = traceable_code_1(x) - return x - -after: - -1. Factor out non_traceable_code to FP32NonTraceable -non-traceable logic, wrapped in a module - -.. code:: python - - class FP32NonTraceable: - ... - -2. Define observed version of -FP32NonTraceable - -.. code:: python - - class ObservedNonTraceable: - - @classmethod - def from_float(cls, ...): - ... - -3. Define statically quantized version of FP32NonTraceable -and a class method "from_observed" to convert from ObservedNonTraceable -to StaticQuantNonTraceable - -.. code:: python - - class StaticQuantNonTraceable: - - @classmethod - def from_observed(cls, ...): - ... - - -.. code:: python - - # refactor parent class to call FP32NonTraceable - class M(nn.Module): - - def __init__(self): - ... - self.non_traceable_submodule = FP32NonTraceable(...) - - def forward(self, x): - x = self.traceable_code_1(x) - # this part will be quantized manually - x = self.non_traceable_submodule(x) - x = self.traceable_code_1(x) - return x - - -quantization code: - - -.. 
code:: python - - # post training static quantization or - # quantization aware training (that produces a statically quantized module)v - prepare_custom_config_dict = { - "float_to_observed_custom_module_class": { - "static": { - FP32NonTraceable: ObservedNonTraceable, - } - }, - } - - model_prepared = prepare_fx( - model_fp32, - qconfig_mapping, - example_inputs, - prepare_custom_config_dict=prepare_custom_config_dict) - -calibrate / train (not shown) - -.. code:: python - - convert_custom_config_dict = { - "observed_to_quantized_custom_module_class": { - "static": { - ObservedNonTraceable: StaticQuantNonTraceable, - } - }, - } - model_quantized = convert_fx( - model_prepared, - convert_custom_config_dict) - -post training dynamic/weight only quantization -in these two modes we don't need to observe the original model, so we -only need to define thee quantized model - -.. code:: python - - class DynamicQuantNonTraceable: # or WeightOnlyQuantMNonTraceable - ... - @classmethod - def from_observed(cls, ...): - ... - - prepare_custom_config_dict = { - "non_traceable_module_class": [ - FP32NonTraceable - ] - } - - -.. code:: python - - # The example is for post training quantization - model_fp32.eval() - model_prepared = prepare_fx( - model_fp32, - qconfig_mapping, - example_inputs, - prepare_custom_config_dict=prepare_custom_config_dict) - - convert_custom_config_dict = { - "observed_to_quantized_custom_module_class": { - "dynamic": { - FP32NonTraceable: DynamicQuantNonTraceable, - } - }, - } - model_quantized = convert_fx( - model_prepared, - convert_custom_config_dict) - -You can also find examples for custom modules in test ``test_custom_module_class`` in ``torch/test/quantization/test_quantize_fx.py``. diff --git a/prototype_source/graph_mode_dynamic_bert_tutorial.rst b/prototype_source/graph_mode_dynamic_bert_tutorial.rst deleted file mode 100644 index 949002a55dc..00000000000 --- a/prototype_source/graph_mode_dynamic_bert_tutorial.rst +++ /dev/null @@ -1,544 +0,0 @@ -(prototype) Graph Mode Dynamic Quantization on BERT -=================================================== - - -**Author**: `Supriya Rao `_ - -Introduction ------------- - -This tutorial introduces the steps to do post training Dynamic Quantization with Graph Mode Quantization. Dynamic quantization converts a float model to a quantized model with static int8 data types for the weights and dynamic quantization for the activations. The activations are quantized dynamically (per batch) to int8 while the weights are statically quantized to int8. Graph Mode Quantization flow operates on the model graph and requires minimal user intervention to quantize the model. To be able to use graph mode, the float model needs to be either traced or scripted first. - -Advantages of graph mode quantization are: - -- In graph mode, we can inspect the code that is executed in forward function (e.g. aten function calls) and quantization is achieved by module and graph manipulations. -- Simple quantization flow, minimal manual steps. -- Unlocks the possibility of doing higher level optimizations like automatic precision selection. - -For additional details on Graph Mode Quantization please refer to the `Graph Mode Static Quantization Tutorial `_. - -tl;dr The Graph Mode Dynamic `Quantization API `_: - -.. 
code:: python - - import torch - from torch.quantization import per_channel_dynamic_qconfig - from torch.quantization import quantize_dynamic_jit - - ts_model = torch.jit.script(float_model) # or torch.jit.trace(float_model, input) - - quantized = quantize_dynamic_jit(ts_model, {'': per_channel_dynamic_qconfig}) - -1. Quantizing BERT Model ------------------------- - -The installaion steps and details about the model are identical to the steps in the Eager Mode Tutorial. Please refer to the tutorial `here `_ for more details. - -1.1 Setup -^^^^^^^^^ -Once all the necesessary packages are downloaded and installed we setup the code. We first start with the necessary imports and setup for the model. - -.. code:: python - - import logging - import numpy as np - import os - import random - import sys - import time - import torch - - from argparse import Namespace - from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) - from tqdm import tqdm - from transformers import (BertConfig, BertForSequenceClassification, BertTokenizer,) - from transformers import glue_compute_metrics as compute_metrics - from transformers import glue_output_modes as output_modes - from transformers import glue_processors as processors - from transformers import glue_convert_examples_to_features as convert_examples_to_features - from torch.quantization import per_channel_dynamic_qconfig - from torch.quantization import quantize_dynamic_jit - - def ids_tensor(shape, vocab_size): - # Creates a random int32 tensor of the shape within the vocab size - return torch.randint(0, vocab_size, shape=shape, dtype=torch.int, device='cpu') - - # Setup logging - logger = logging.getLogger(__name__) - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.WARN) - - logging.getLogger("transformers.modeling_utils").setLevel( - logging.WARN) # Reduce logging - - print(torch.__version__) - - torch.set_num_threads(1) - print(torch.__config__.parallel_info()) - -1.2 Download GLUE dataset -^^^^^^^^^^^^^^^^^^^^^^^^^ -Before running MRPC tasks we download the GLUE data by running this script and unpack it to a directory glue_data. - -.. code:: shell - - python download_glue_data.py --data_dir='glue_data' --tasks='MRPC' - -1.3 Set global BERT configurations -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -To run this experiment we first need a fine tuned BERT model. We provide the fined-tuned BERT model for MRPC task `here `_. To save time, you can download the model file (~400 MB) directly into your local folder $OUT_DIR. - - -.. code:: python - - configs = Namespace() - - # The output directory for the fine-tuned model, $OUT_DIR. - configs.output_dir = "./MRPC/" - - # The data directory for the MRPC task in the GLUE benchmark, $GLUE_DIR/$TASK_NAME. - configs.data_dir = "./glue_data/MRPC" - - # The model name or path for the pre-trained model. - configs.model_name_or_path = "bert-base-uncased" - # The maximum length of an input sequence - configs.max_seq_length = 128 - - # Prepare GLUE task. - configs.task_name = "MRPC".lower() - configs.processor = processors[configs.task_name]() - configs.output_mode = output_modes[configs.task_name] - configs.label_list = configs.processor.get_labels() - configs.model_type = "bert".lower() - configs.do_lower_case = True - - # Set the device, batch size, topology, and caching flags. 
- configs.device = "cpu" - configs.per_gpu_eval_batch_size = 8 - configs.n_gpu = 0 - configs.local_rank = -1 - configs.overwrite_cache = False - - # Set random seed for reproducibility. - def set_seed(seed): - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - set_seed(42) - - tokenizer = BertTokenizer.from_pretrained( - configs.output_dir, do_lower_case=configs.do_lower_case) - - model = BertForSequenceClassification.from_pretrained(configs.output_dir, torchscript=True) - model.to(configs.device) - -1.4 Quantizing BERT model with Graph Mode Quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -1.4.1 Script/Trace the model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The input for graph mode quantization is a TorchScript model, so you'll need to either script or trace the model first. Currently, scripting the BERT model is not supported so we trace the model here. - -We first identify the inputs to be passed to the model. Here, we trace the model with the largest possible input size that will be passed during the evaluation. -We choose a batch size of 8 and sequence lenght of 128 based on the input sizes passed in during the evaluation step below. Using the max possible shape during inference while tracing is a limitation of the huggingface BERT model as mentioned `here `_. - -We trace the model using ``torch.jit.trace``. - -.. code:: python - - input_ids = ids_tensor([8, 128], 2) - token_type_ids = ids_tensor([8, 128], 2) - attention_mask = ids_tensor([8, 128], vocab_size=2) - dummy_input = (input_ids, attention_mask, token_type_ids) - traced_model = torch.jit.trace(model, dummy_input) - -1.4.2 Specify qconfig_dict -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. code:: - - qconfig_dict = {'': per_channel_dynamic_qconfig} - -qconfig is a named tuple of the observers for activation and weight. For dynamic quantization we use a dummy activation observer to mimic the dynamic quantization process that happens in the operator during runtime. For the weight tensors we recommend using per-channel quantization which helps improve the final accuracy. -``qconfig_dict`` is a dictionary with names of sub modules as key and qconfig for that module as value, empty key means the qconfig will be applied to whole model unless it’s overwritten by more specific configurations, the qconfig for each module is either found in the dictionary or fallback to the qconfig of parent module. - -Right now qconfig_dict is the only way to configure how the model is quantized, and it is done in the granularity of module, that is, we only support one type of qconfig for each module, and the qconfig for sub module will override the qconfig for parent module. For example, if we have - -.. code:: - - qconfig = { - '' : qconfig_global, - 'sub' : qconfig_sub, - 'sub.fc1' : qconfig_fc, - 'sub.fc2': None - } - -Module ``sub.fc1`` will be configured with ``qconfig_fc``, and all other child modules in ``sub`` will be configured with ``qconfig_sub`` and ``sub.fc2`` will not be quantized. All other modules in the model will be quantized with qconfig_global - -.. code:: python - - qconfig_dict = {'': per_channel_dynamic_qconfig} - -1.4.3 Quantize the model (one-line API) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We call the one line API (similar to eager mode) to perform quantization as follows. - -.. code:: python - - quantized_model = quantize_dynamic_jit(traced_model, qconfig_dict) - -2. Evaluation -------------- - -We reuse the tokenize and evaluation function from Huggingface. - -.. 
code:: python - - def evaluate(args, model, tokenizer, prefix=""): - # Loop to handle MNLI double evaluation (matched, mis-matched) - eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) - eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,) - - results = {} - for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs): - eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True) - - if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: - os.makedirs(eval_output_dir) - - args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) - # Note that DistributedSampler samples randomly - eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) - - # multi-gpu eval - if args.n_gpu > 1: - model = torch.nn.DataParallel(model) - - # Eval! - logger.info("***** Running evaluation {} *****".format(prefix)) - logger.info(" Num examples = %d", len(eval_dataset)) - logger.info(" Batch size = %d", args.eval_batch_size) - nb_eval_steps = 0 - preds = None - out_label_ids = None - for batch in tqdm(eval_dataloader, desc="Evaluating"): - model.eval() - batch = tuple(t.to(args.device) for t in batch) - - with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1]} - labels = batch[3] - if args.model_type != 'distilbert': - inputs['input'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids - outputs = model(**inputs) - logits = outputs[0] - nb_eval_steps += 1 - if preds is None: - preds = logits.detach().cpu().numpy() - out_label_ids = labels.detach().cpu().numpy() - else: - preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, labels.detach().cpu().numpy(), axis=0) - - if args.output_mode == "classification": - preds = np.argmax(preds, axis=1) - elif args.output_mode == "regression": - preds = np.squeeze(preds) - result = compute_metrics(eval_task, preds, out_label_ids) - results.update(result) - - output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results {} *****".format(prefix)) - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - - return results - - def load_and_cache_examples(args, task, tokenizer, evaluate=False): - if args.local_rank not in [-1, 0] and not evaluate: - torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - - processor = processors[task]() - output_mode = output_modes[task] - # Load data features from cache or dataset file - cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format( - 'dev' if evaluate else 'train', - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length), - str(task))) - if os.path.exists(cached_features_file) and not args.overwrite_cache: - logger.info("Loading features from cached file %s", cached_features_file) - features = torch.load(cached_features_file) - else: - logger.info("Creating features from dataset file at %s", args.data_dir) - label_list = processor.get_labels() - if task in ['mnli', 
'mnli-mm'] and args.model_type in ['roberta']: - # HACK(label indices are swapped in RoBERTa pretrained model) - label_list[1], label_list[2] = label_list[2], label_list[1] - examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) - features = convert_examples_to_features(examples, - tokenizer, - label_list=label_list, - max_length=args.max_seq_length, - output_mode=output_mode,) - if args.local_rank in [-1, 0]: - logger.info("Saving features into cached file %s", cached_features_file) - torch.save(features, cached_features_file) - - if args.local_rank == 0 and not evaluate: - torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - - # Convert to Tensors and build dataset - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) - all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) - if output_mode == "classification": - all_labels = torch.tensor([f.label for f in features], dtype=torch.long) - elif output_mode == "regression": - all_labels = torch.tensor([f.label for f in features], dtype=torch.float) - - dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) - return dataset - - def time_model_evaluation(model, configs, tokenizer): - eval_start_time = time.time() - result = evaluate(configs, model, tokenizer, prefix="") - eval_end_time = time.time() - eval_duration_time = eval_end_time - eval_start_time - print(result) - print("Evaluate total time (seconds): {0:.1f}".format(eval_duration_time)) - - -2.1 Check Model Size -^^^^^^^^^^^^^^^^^^^^ - -We print the model size to account for wins from quantization - -.. code:: python - - def print_size_of_model(model): - if isinstance(model, torch.jit.RecursiveScriptModule): - torch.jit.save(model, "temp.p") - else: - torch.jit.save(torch.jit.script(model), "temp.p") - print('Size (MB):', os.path.getsize("temp.p")/1e6) - os.remove('temp.p') - - print("Size of model before quantization") - print_size_of_model(traced_model) - print("Size of model after quantization") - - print_size_of_model(quantized_model) - -.. code:: - - Size of model before quantization - Size (MB): 438.242141 - Size of model after quantization - Size (MB): 184.354759 - -2.2 Run the evaluation -^^^^^^^^^^^^^^^^^^^^^^ -We evaluate the FP32 and quantized model and compare the F1 score. Note that the performance numbers below are on a dev machine and they would likely improve on a production server. - -.. code:: python - - time_model_evaluation(traced_model, configs, tokenizer) - time_model_evaluation(quantized_model, configs, tokenizer) - -.. code:: - - FP32 model results - - 'f1': 0.901 - Time taken - 188.0s - - INT8 model results - - 'f1': 0.902 - Time taken - 157.4s - -3. Debugging the Quantized Model --------------------------------- - -We can debug the quantized model by passing in the debug option. - -.. code:: - - quantized_model = quantize_dynamic_jit(traced_model, qconfig_dict, debug=True) - -If debug is set to True: - -- We can access the attributes of the quantized model the same way as in a torchscript model, e.g. model.fc1.weight (might be harder if you use a module list or sequential). -- The arithmetic operations all occur in floating point with the numerics being identical to the final quantized model, allowing for debugging. - -.. 
code:: python - - quantized_model_debug = quantize_dynamic_jit(traced_model, qconfig_dict, debug=True) - -Calling ``quantize_dynamic_jit`` is equivalent to calling ``prepare_dynamic_jit`` followed by ``convert_dynamic_jit``. Usage of the one-line API is recommended. But if you wish to debug or analyze the model after each step, the multi-line API comes into use. - -3.1. Evaluate the Debug Model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. code:: python - - # Evaluate the debug model - time_model_evaluation(quantized_model_debug, configs, tokenizer) - -.. code:: - - Size (MB): 438.406429 - - INT8 (debug=True) model results - - 'f1': 0.897 - -Note that the accuracy of the debug version is close to, but not exactly the same as the non-debug version as the debug version uses floating point ops to emulate quantized ops and the numerics match is approximate. -This is the case only for per-channel quantization (we are working on improving this). Per-tensor quantization (using default_dynamic_qconfig) has exact numerics match between debug and non-debug version. - -.. code:: python - - print(str(quantized_model_debug.graph)) - -Snippet of the graph printed - - -.. code:: - - %111 : Tensor = prim::GetAttr[name="bias"](%110) - %112 : Tensor = prim::GetAttr[name="weight"](%110) - %113 : Float(768:1) = prim::GetAttr[name="4_scale_0"](%110) - %114 : Int(768:1) = prim::GetAttr[name="4_zero_point_0"](%110) - %115 : int = prim::GetAttr[name="4_axis_0"](%110) - %116 : int = prim::GetAttr[name="4_scalar_type_0"](%110) - %4.quant.6 : Tensor = aten::quantize_per_channel(%112, %113, %114, %115, %116) - %4.dequant.6 : Tensor = aten::dequantize(%4.quant.6) - %1640 : bool = prim::Constant[value=1]() - %input.5.scale.1 : float, %input.5.zero_point.1 : int = aten::_choose_qparams_per_tensor(%input.5, %1640) - %input.5.quant.1 : Tensor = aten::quantize_per_tensor(%input.5, %input.5.scale.1, %input.5.zero_point.1, %74) - %input.5.dequant.1 : Float(8:98304, 128:768, 768:1) = aten::dequantize(%input.5.quant.1) - %119 : Tensor = aten::linear(%input.5.dequant.1, %4.dequant.6, %111) - -We can see that there is no ``quantized::linear_dynamic`` in the model, but the numerically equivalent pattern of ``aten::_choose_qparams_per_tensor`` - ``aten::quantize_per_tensor`` - ``aten::dequantize`` - ``aten::linear``. - -.. code:: python - - # Get the size of the debug model - print_size_of_model(quantized_model_debug) - -.. code:: - - Size (MB): 438.406429 - -Size of the debug model is the close to the floating point model because all the weights are in float and not yet quantized and frozen, this allows people to inspect the weight. -You may access the weight attributes directly in the torchscript model. Accessing the weight in the debug model is the same as accessing the weight in a TorchScript model: - -.. code:: python - - print(quantized_model.bert.encoder.layer._c.getattr('0').attention.self.query.weight) - -.. code:: - - tensor([[-0.0157, 0.0257, -0.0269, ..., 0.0158, 0.0764, 0.0548], - [-0.0325, 0.0345, -0.0423, ..., -0.0528, 0.1382, 0.0069], - [ 0.0106, 0.0335, 0.0113, ..., -0.0275, 0.0253, -0.0457], - ..., - [-0.0090, 0.0512, 0.0555, ..., 0.0277, 0.0543, -0.0539], - [-0.0195, 0.0943, 0.0619, ..., -0.1040, 0.0598, 0.0465], - [ 0.0009, -0.0949, 0.0097, ..., -0.0183, -0.0511, -0.0085]], - grad_fn=) - -Accessing the scale and zero_point for the corresponding weight can be done as follows - - -.. 
code:: python - - print(quantized_model.bert.encoder.layer._c.getattr('0').attention.self.query.getattr('4_scale_0')) - print(quantized_model.bert.encoder.layer._c.getattr('0').attention.self.query.getattr('4_zero_point_0')) - -Since we use per-channel quantization, we get per-channel scales tensor. - -.. code:: - - tensor([0.0009, 0.0011, 0.0010, 0.0011, 0.0034, 0.0013, 0.0010, 0.0010, 0.0013, - 0.0012, 0.0011, 0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0009, 0.0015, - 0.0016, 0.0036, 0.0012, 0.0009, 0.0010, 0.0014, 0.0008, 0.0008, 0.0008, - ..., - 0.0019, 0.0023, 0.0013, 0.0018, 0.0012, 0.0031, 0.0015, 0.0013, 0.0014, - 0.0022, 0.0011, 0.0024]) - -Zero-point tensor - - -.. code:: - - tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - .., - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - dtype=torch.int32) - -4. Comparing Results with Eager Mode ------------------------------------- - -Following results show the F1 score and model size for Eager Mode Quantization of the same model by following the steps mentioned in the `tutorial `_. Results show that Eager and Graph Mode Quantization on the model produce identical results. - -.. code:: - - FP32 model results - - Size (MB): 438.016605 - 'f1': 0.901 - - INT8 model results - - Size (MB): 182.878029 - 'f1': 0.902 - -5. Benchmarking the Model -------------------------- - -We benchmark the model with dummy input and compare the Float model with Eager and Graph Mode Quantized Model on a production server machine. - -.. code:: python - - def benchmark(model): - model = torch.jit.load(model) - model.eval() - torch.set_num_threads(1) - input_ids = ids_tensor([8, 128], 2) - token_type_ids = ids_tensor([8, 128], 2) - attention_mask = ids_tensor([8, 128], vocab_size=2) - elapsed = 0 - for _i in range(50): - start = time.time() - output = model(input_ids, token_type_ids, attention_mask) - end = time.time() - elapsed = elapsed + (end - start) - print('Elapsed time: ', (elapsed / 50), ' s') - return - print("Running benchmark for Float model") - benchmark(args.jit_model_path_float) - print("Running benchmark for Eager Mode Quantized model") - benchmark(args.jit_model_path_eager) - print("Running benchmark for Graph Mode Quantized model") - benchmark(args.jit_model_path_graph) - -.. code:: - - Running benchmark for Float model - Elapsed time: 4.49 s - Running benchmark for Eager Mode Quantized model - Elapsed time: 2.67 s - Running benchmark for Graph Mode Quantized model - Elapsed time: 2.69 s - As we can see both graph mode and eager mode quantized model have a similar speed up over the floating point model. - -Conclusion ----------- - -In this tutorial, we demonstrated how to convert a well-known state-of-the-art NLP model like BERT into dynamic quantized model using graph mode with same performance as eager mode. -Dynamic quantization can reduce the size of the model while only having a limited implication on accuracy. - -Thanks for reading! As always, we welcome any feedback, so please create an issue `here `_ if you have any. 
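-
-As a quick reference, the eager mode model used for the comparison in Section 4 can be
-produced with a short sketch like the following. It assumes the fine-tuned ``model``
-loaded in Section 1.3 and quantizes only the ``torch.nn.Linear`` modules, which is the
-usual eager mode recipe for BERT; the exact settings in the eager mode tutorial may differ.
-
-.. code:: python
-
-    # Sketch: eager mode dynamic quantization of the same fine-tuned model.
-    # Linear weights are stored as int8; activations are quantized per batch at runtime.
-    eager_quantized_model = torch.quantization.quantize_dynamic(
-        model, {torch.nn.Linear}, dtype=torch.qint8
-    )
-    print(eager_quantized_model)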
diff --git a/prototype_source/inductor_cpp_wrapper_tutorial.rst b/prototype_source/inductor_cpp_wrapper_tutorial.rst deleted file mode 100644 index 4bcc9009075..00000000000 --- a/prototype_source/inductor_cpp_wrapper_tutorial.rst +++ /dev/null @@ -1,159 +0,0 @@ -Inductor C++ Wrapper Tutorial -============================================================== - -**Author**: `Chunyuan Wu `_, `Bin Bao `__, `Jiong Gong `__ - -Prerequisites: ----------------- -- `torch.compile and TorchInductor concepts in PyTorch `__ - -Introduction ------------- - -Python, as the primary interface of PyTorch, is easy to use and efficient for development and debugging. -The Inductor's default wrapper generates Python code to invoke generated kernels and external kernels. -However, in deployments requiring high performance, Python, as an interpreted language, runs relatively slower compared to compiled languages. - -We implemented an Inductor C++ wrapper by leveraging the PyTorch C++ APIs -to generate pure C++ code that combines the generated and external kernels. -This allows for the execution of each captured Dynamo graph in pure C++, -thereby reducing the Python overhead within the graph. - - -Enabling the API ----------------- -This feature is still in prototype stage. To activate this feature, add the following to your code: - -.. code:: python - - import torch._inductor.config as config - config.cpp_wrapper = True - -This will speed up your models by reducing the Python overhead of the Inductor wrapper. - - -Example code ------------- - -We will use the below frontend code as an example: - -.. code:: python - - import torch - - def fn(x): - return torch.tensor(list(range(2, 40, 2)), device=x.device) + x - - x = torch.randn(1) - opt_fn = torch.compile()(fn) - y = opt_fn(x) - - -**For CPU** - -The main part of Inductor-generated code with the default Python wrapper will look like this: - -.. code:: python - - def call(args): - arg0_1, = args - args.clear() - assert_size_stride(arg0_1, (1, ), (1, )) - buf0 = empty_strided((19, ), (1, ), device='cpu', dtype=torch.float32) - cpp_fused_add_lift_fresh_0(c_void_p(constant0.data_ptr()), c_void_p(arg0_1.data_ptr()), c_void_p(buf0.data_ptr())) - del arg0_1 - return (buf0, ) - -By turning on the C++ wrapper, the generated code for the ``call`` function becomes a C++ function -``inductor_entry_cpp`` of the C++ extension ``module``: - -.. code:: python - - std::vector inductor_entry_cpp(const std::vector& args) { - at::Tensor arg0_1 = args[0]; - at::Tensor constant0 = args[1]; - auto buf0 = at::empty_strided({19L, }, {1L, }, at::device(at::kCPU).dtype(at::kFloat)); - cpp_fused_add_lift_fresh_0((long*)(constant0.data_ptr()), (float*)(arg0_1.data_ptr()), (float*)(buf0.data_ptr())); - arg0_1.reset(); - return {buf0}; - } - - module = CppWrapperCodeCache.load(cpp_wrapper_src, 'inductor_entry_cpp', 'c2buojsvlqbywxe3itb43hldieh4jqulk72iswa2awalwev7hjn2', False) - - def _wrap_func(f): - def g(args): - args_tensor = [arg if isinstance(arg, torch.Tensor) else torch.tensor(arg) for arg in args] - constants_tensor = [constant0] - args_tensor.extend(constants_tensor) - - return f(args_tensor) - return g - call = _wrap_func(module.inductor_entry_cpp) - -**For GPU** - -Based on the same example code, the generated code for GPU will look like this: - -.. 
code:: python - - def call(args): - arg0_1, = args - args.clear() - assert_size_stride(arg0_1, (1, ), (1, )) - with torch.cuda._DeviceGuard(0): - torch.cuda.set_device(0) # no-op to ensure context - buf0 = empty_strided((19, ), (1, ), device='cuda', dtype=torch.float32) - # Source Nodes: [add, tensor], Original ATen: [aten.add, aten.lift_fresh] - stream0 = get_cuda_stream(0) - triton_poi_fused_add_lift_fresh_0.run(constant0, arg0_1, buf0, 19, grid=grid(19), stream=stream0) - run_intermediate_hooks('add', buf0) - del arg0_1 - return (buf0, ) - -With the C++ wrapper turned on, the below equivalent C++ code will be generated: - -.. code:: python - - std::vector inductor_entry_cpp(const std::vector& args) { - at::Tensor arg0_1 = args[0]; - at::Tensor constant0 = args[1]; - - at::cuda::CUDAGuard device_guard(0); - auto buf0 = at::empty_strided({19L, }, {1L, }, at::TensorOptions(c10::Device(at::kCUDA, 0)).dtype(at::kFloat)); - // Source Nodes: [add, tensor], Original ATen: [aten.add, aten.lift_fresh] - if (triton_poi_fused_add_lift_fresh_0 == nullptr) { - triton_poi_fused_add_lift_fresh_0 = loadKernel("/tmp/torchinductor_user/mm/cmm6xjgijjffxjku4akv55eyzibirvw6bti6uqmfnruujm5cvvmw.cubin", "triton_poi_fused_add_lift_fresh_0_0d1d2d3"); - } - CUdeviceptr var_0 = reinterpret_cast(constant0.data_ptr()); - CUdeviceptr var_1 = reinterpret_cast(arg0_1.data_ptr()); - CUdeviceptr var_2 = reinterpret_cast(buf0.data_ptr()); - auto var_3 = 19; - void* kernel_args_var_0[] = {&var_0, &var_1, &var_2, &var_3}; - cudaStream_t stream0 = at::cuda::getCurrentCUDAStream(0); - launchKernel(triton_poi_fused_add_lift_fresh_0, 1, 1, 1, 1, 0, kernel_args_var_0, stream0); - arg0_1.reset(); - return {buf0}; - } - - module = CppWrapperCodeCache.load(cpp_wrapper_src, 'inductor_entry_cpp', 'czbpeilh4qqmbyejdgsbpdfuk2ss5jigl2qjb7xs4gearrjvuwem', True) - - def _wrap_func(f): - def g(args): - args_tensor = [arg if isinstance(arg, torch.Tensor) else torch.tensor(arg) for arg in args] - constants_tensor = [constant0] - args_tensor.extend(constants_tensor) - - return f(args_tensor) - return g - call = _wrap_func(module.inductor_entry_cpp) - - -Conclusion ------------- - -In this tutorial, we introduced a new C++ wrapper in TorchInductor to speed up your models with just two lines of code changes. -We explained the motivation of this new feature and walked through the easy-to-use API to activate this experimental feature. -Furthermore, we demonstrated the Inductor-generated code using the default Python wrapper and the new C++ wrapper on both CPU and GPU -to visually showcase the difference between these two wrappers. - -This feature is still in prototype stage. If you have any feature requests or run into any issues, please file a bug report at `GitHub issues `_. diff --git a/prototype_source/ios_coreml_workflow.rst b/prototype_source/ios_coreml_workflow.rst deleted file mode 100644 index bfaccd77a10..00000000000 --- a/prototype_source/ios_coreml_workflow.rst +++ /dev/null @@ -1,128 +0,0 @@ -(Prototype) Convert Mobilenetv2 to Core ML -========================================== - -**Author**: `Tao Xu `_ - -Introduction ------------- - -Core ML provides access to powerful and efficient NPUs(Neural Process Unit) on modern iPhone devices. This tutorial shows how to prepare a computer vision model (mobilenetv2) to use the PyTorch Core ML mobile backend. 
- -Note that this feature is currently in the “prototype” phase and only supports a limited numbers of operators, but we expect to solidify the integration and expand our operator support over time. The APIs are subject to change in the future. - -Environment Setup (MacOS) -------------------------- - -Let's start off by creating a new conda environment. - -.. code:: shell - - conda create --name 1.10 python=3.8 --yes - conda activate 1.10 - -Next, since the Core ML delegate is a prototype feature, let's install the PyTorch nightly build and coremltools - -.. code:: shell - - pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - - pip3 install coremltools==5.0b5 protobuf==3.20.1 - - -Model Preparation -------------------- - -To convert a pre-trained mobilenetv2 model to be Core ML compatible, we're going to use the ``to_backend()`` API, which is a prototype feature for delegating model executions to some specific backends. The following python code shows how to use it to convert the mobilenetv2 torchscript model. - -.. code:: python - - import torch - import torchvision - - from torch.backends._coreml.preprocess import ( - CompileSpec, - TensorSpec, - CoreMLComputeUnit, - ) - - def mobilenetv2_spec(): - return { - "forward": CompileSpec( - inputs=( - TensorSpec( - shape=[1, 3, 224, 224], - ), - ), - outputs=( - TensorSpec( - shape=[1, 1000], - ), - ), - backend=CoreMLComputeUnit.ALL, - allow_low_precision=True, - ), - } - - - def main(): - model = torchvision.models.mobilenet_v2(pretrained=True) - model.eval() - example = torch.rand(1, 3, 224, 224) - model = torch.jit.trace(model, example) - compile_spec = mobilenetv2_spec() - mlmodel = torch._C._jit_to_backend("coreml", model, compile_spec) - mlmodel._save_for_lite_interpreter("./mobilenetv2_coreml.ptl") - - - if __name__ == "__main__": - main() - - -First, we need to call ``.eval()`` to set the model to inference mode. Secondly, we defined a ``mobilenetv2_spec()`` function to tell Core ML what the model looks like. Note that the ``CoreMLComputeUnit`` corresponds to `Apple's processing unit `_ whose value can be ``CPU``, ``CPUAndGPU`` and ``ALL``. In our example, we set the ``backend`` type to ``ALL`` which means Core ML will try to run the model on Neural Engine. Finally, we called the ``to_backend`` API to convert the torchscript model to a Core ML compatible model and save it to the disk. - -Run the python script. If everything works well, you should see following outputs from coremltools - -.. code:: shell - - Converting Frontend ==> MIL Ops: 100%|███████████████████████████████████████████████████████████████████████████████▊| 384/385 [00:00<00:00, 1496.98 ops/s] - Running MIL Common passes: 0%| - 0/33 [00:00 NeuralNetwork Ops: 100%|██████████████████████████████████████████████████████████████████████████| 495/495 [00:00<00:00, 1977.15 ops/s] - [W backend_detail.cpp:376] Warning: Backend [coreml] is not available. Execution of this Module is still possible by saving and loading on a device where the backend is available. (function codegen_backend_module) - -We can safely ignore the warning above, as we don't plan to run our model on desktop. - -iOS app integration ---------------------- - -Now that the model is ready, we can integrate it to our app. We'll be using the pytorch nightly cocoapods which contains the code for executing the Core ML model. Simply add the following code to your Podfile - -.. 
code:: shell - - pod LibTorch-Lite-Nightly - -In this tutorial, we'll be reusing our `HelloWorld `_ project. Feel free to walk through the code there. - -To benchmark the latency, you can simply put the following code before and after the PyTorch ``forward`` function - -.. code:: objective-c - - caffe2::Timer t; - auto outputTensor = _impl.forward({tensor}).toTensor().cpu(); - std::cout << "forward took: " << t.MilliSeconds() << std::endl; - -Conclusion ----------- - -In this tutorial, we demonstrated how to convert a mobilenetv2 model to a Core ML compatible model. Please be aware of that Core ML feature is still under development, new operators/models will continue to be added. APIs are subject to change in the future versions. - -Thanks for reading! As always, we welcome any feedback, so please create an issue `here `_ if you have any. - -Learn More ----------- - -- The `Mobilenetv2 `_ from Torchvision -- Information about `Core ML `_ diff --git a/prototype_source/ios_gpu_workflow.rst b/prototype_source/ios_gpu_workflow.rst deleted file mode 100644 index cb7a0034b23..00000000000 --- a/prototype_source/ios_gpu_workflow.rst +++ /dev/null @@ -1,142 +0,0 @@ -(Prototype) Use iOS GPU in PyTorch -================================== - -**Author**: `Tao Xu `_ - -Introduction ------------- - -This tutorial introduces the steps to run your models on iOS GPU. We'll be using the mobilenetv2 model as an example. Since the mobile GPU features are currently in the prototype stage, you'll need to build a custom pytorch binary from source. For the time being, only a limited number of operators are supported, and certain client side APIs are subject to change in the future versions. - -Model Preparation -------------------- - -Since GPUs consume weights in a different order, the first step we need to do is to convert our TorchScript model to a GPU compatible model. This step is also known as "prepacking". - -PyTorch with Metal -^^^^^^^^^^^^^^^^^^ -To do that, we'll install a pytorch nightly binary that includes the Metal backend. Go ahead run the command below - -.. code:: shell - - conda install pytorch -c pytorch-nightly - // or - pip3 install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - -Also, you can build a custom pytorch binary from source that includes the Metal backend. Just checkout the pytorch source code from github and run the command below - -.. code:: shell - - cd PYTORCH_ROOT - USE_PYTORCH_METAL_EXPORT=ON python setup.py install --cmake - -The command above will build a custom pytorch binary from master. The ``install`` argument simply tells ``setup.py`` to override the existing PyTorch on your desktop. Once the build finished, open another terminal to check the PyTorch version to see if the installation was successful. As the time of writing of this recipe, the version is ``1.8.0a0+41237a4``. You might be seeing different numbers depending on when you check out the code from master, but it should be greater than 1.7.0. - -.. code:: python - - import torch - torch.__version__ #1.8.0a0+41237a4 - -Metal Compatible Model -^^^^^^^^^^^^^^^^^^^^^^ - -The next step is going to be converting the mobilenetv2 torchscript model to a Metal compatible model. We'll be leveraging the ``optimize_for_mobile`` API from the ``torch.utils`` module. As shown below - -.. 
code:: python - - import torch - import torchvision - from torch.utils.mobile_optimizer import optimize_for_mobile - - model = torchvision.models.mobilenet_v2(pretrained=True) - scripted_model = torch.jit.script(model) - optimized_model = optimize_for_mobile(scripted_model, backend='metal') - print(torch.jit.export_opnames(optimized_model)) - optimized_model._save_for_lite_interpreter('./mobilenetv2_metal.pt') - -Note that the ``torch.jit.export_opnames(optimized_model)`` is going to dump all the optimized operators from the ``optimized_mobile``. If everything works well, you should be able to see the following ops being printed out from the console - - -.. code:: shell - - ['aten::adaptive_avg_pool2d', - 'aten::add.Tensor', - 'aten::addmm', - 'aten::reshape', - 'aten::size.int', - 'metal::copy_to_host', - 'metal_prepack::conv2d_run'] - -Those are all the ops we need to run the mobilenetv2 model on iOS GPU. Cool! Now that you have the ``mobilenetv2_metal.pt`` saved on your disk, let's move on to the iOS part. - - -Use PyTorch iOS library with Metal ----------------------------------- -The PyTorch iOS library with Metal support ``LibTorch-Lite-Nightly`` is available in Cocoapods. You can read the `Using the Nightly PyTorch iOS Libraries in CocoaPods `_ section from the iOS tutorial for more detail about its usage. - -We also have the `HelloWorld-Metal example `_ that shows how to conect all pieces together. - -Note that if you run the HelloWorld-Metal example, you may notice that the results are slighly different from the `results `_ we got from the CPU model as shown in the iOS tutorial. - -.. code:: shell - - - timber wolf, grey wolf, gray wolf, Canis lupus - - malamute, malemute, Alaskan malamute - - Eskimo dog, husky - -This is because by default Metal uses fp16 rather than fp32 to compute. The precision loss is expected. - - -Use LibTorch-Lite Built from Source ------------------------------------ - -You can also build a custom LibTorch-Lite from Source and use it to run GPU models on iOS Metal. In this section, we'll be using the `HelloWorld example `_ to demonstrate this process. - -First, make sure you have deleted the **build** folder from the "Model Preparation" step in PyTorch root directory. Then run the command below - -.. code:: shell - - IOS_ARCH=arm64 USE_PYTORCH_METAL=1 ./scripts/build_ios.sh - -Note ``IOS_ARCH`` tells the script to build a arm64 version of Libtorch-Lite. This is because in PyTorch, Metal is only available for the iOS devices that support the Apple A9 chip or above. Once the build finished, follow the `Build PyTorch iOS libraries from source `_ section from the iOS tutorial to setup the XCode settings properly. Don't forget to copy the ``./mobilenetv2_metal.pt`` to your XCode project and modify the model file path accordingly. - -Next we need to make some changes in ``TorchModule.mm`` - -.. code:: objective-c - - ... - // #import - // If it's built from source with Xcode, comment out the line above - // and use following headers - #include - #include - #include - ... - - - (NSArray*)predictImage:(void*)imageBuffer { - c10::InferenceMode mode; - at::Tensor tensor = torch::from_blob(imageBuffer, {1, 3, 224, 224}, at::kFloat).metal(); - auto outputTensor = _impl.forward({tensor}).toTensor().cpu(); - ... - } - ... - -As you can see, we simply just call ``.metal()`` to move our input tensor from CPU to GPU, and then call ``.cpu()`` to move the result back. 
Internally, ``.metal()`` will copy the input data from the CPU buffer to a GPU buffer with a GPU compatible memory format. When ``.cpu()`` is invoked, the GPU command buffer will be flushed and synced. After `forward` finished, the final result will then be copied back from the GPU buffer back to a CPU buffer. - -The last step we have to do is to add the ``Accelerate.framework`` and the ``MetalPerformanceShaders.framework`` to your xcode project (Open your project via XCode, go to your project target’s "General" tab, locate the "Frameworks, Libraries and Embedded Content" section and click the "+" button). - -If everything works fine, you should be able to see the inference results on your phone. - - -Conclusion ----------- - -In this tutorial, we demonstrated how to convert a mobilenetv2 model to a GPU compatible model. We walked through a HelloWorld example to show how to use the C++ APIs to run models on iOS GPU. Please be aware of that GPU feature is still under development, new operators will continue to be added. APIs are subject to change in the future versions. - -Thanks for reading! As always, we welcome any feedback, so please create an issue `here `_ if you have any. - -Learn More ----------- - -- The `Mobilenetv2 `_ from Torchvision -- To learn more about how to use ``optimize_for_mobile``, please refer to the `Mobile Perf Recipe `_ diff --git a/prototype_source/nnapi_mobilenetv2.rst b/prototype_source/nnapi_mobilenetv2.rst deleted file mode 100644 index ed9548a387d..00000000000 --- a/prototype_source/nnapi_mobilenetv2.rst +++ /dev/null @@ -1,218 +0,0 @@ -(Beta) Convert MobileNetV2 to NNAPI -======================================== - -Introduction ------------- - -This tutorial shows how to prepare a computer vision model to use -`Android's Neural Networks API (NNAPI) `_. -NNAPI provides access to powerful and efficient computational cores -on many modern Android devices. - -PyTorch's NNAPI is currently in the "prototype" phase and only supports -a limited range of operators, but we expect to solidify the integration -and expand our operator support over time. - - -Environment ------------ - -Install PyTorch and torchvision. - -``pip install torch==1.10.0 torchvision==0.11.1`` - - -Model Preparation ------------------ - -First, we must prepare our model to execute with NNAPI. -This step runs on your training server or laptop. -The key conversion function to call is -``torch.backends._nnapi.prepare.convert_model_to_nnapi``, -but some extra steps are required to ensure that -the model is properly structured. -Most notably, quantizing the model is required -in order to run the model on certain accelerators. - -You can copy/paste this entire Python script and run it, -or make your own modifications. -By default, it will save the models to ``~/mobilenetv2-nnapi/``. -Please create that directory first. - -.. code:: python - - #!/usr/bin/env python - import sys - import os - import torch - import torch.utils.bundled_inputs - import torch.utils.mobile_optimizer - import torch.backends._nnapi.prepare - import torchvision.models.quantization.mobilenet - from pathlib import Path - - - # This script supports 3 modes of quantization: - # - "none": Fully floating-point model. - # - "core": Quantize the core of the model, but wrap it a - # quantizer/dequantizer pair, so the interface uses floating point. - # - "full": Quantize the model, and use quantized tensors - # for input and output. 
- # - # "none" maintains maximum accuracy - # "core" sacrifices some accuracy for performance, - # but maintains the same interface. - # "full" maximized performance (with the same accuracy as "core"), - # but requires the application to use quantized tensors. - # - # There is a fourth option, not supported by this script, - # where we include the quant/dequant steps as NNAPI operators. - def make_mobilenetv2_nnapi(output_dir_path, quantize_mode): - quantize_core, quantize_iface = { - "none": (False, False), - "core": (True, False), - "full": (True, True), - }[quantize_mode] - - model = torchvision.models.quantization.mobilenet.mobilenet_v2(pretrained=True, quantize=quantize_core) - model.eval() - - # Fuse BatchNorm operators in the floating point model. - # (Quantized models already have this done.) - # Remove dropout for this inference-only use case. - if not quantize_core: - model.fuse_model() - assert type(model.classifier[0]) == torch.nn.Dropout - model.classifier[0] = torch.nn.Identity() - - input_float = torch.zeros(1, 3, 224, 224) - input_tensor = input_float - - # If we're doing a quantized model, we need to trace only the quantized core. - # So capture the quantizer and dequantizer, use them to prepare the input, - # and replace them with identity modules so we can trace without them. - if quantize_core: - quantizer = model.quant - dequantizer = model.dequant - model.quant = torch.nn.Identity() - model.dequant = torch.nn.Identity() - input_tensor = quantizer(input_float) - - # Many NNAPI backends prefer NHWC tensors, so convert our input to channels_last, - # and set the "nnapi_nhwc" attribute for the converter. - input_tensor = input_tensor.contiguous(memory_format=torch.channels_last) - input_tensor.nnapi_nhwc = True - - # Trace the model. NNAPI conversion only works with TorchScript models, - # and traced models are more likely to convert successfully than scripted. - with torch.no_grad(): - traced = torch.jit.trace(model, input_tensor) - nnapi_model = torch.backends._nnapi.prepare.convert_model_to_nnapi(traced, input_tensor) - - # If we're not using a quantized interface, wrap a quant/dequant around the core. - if quantize_core and not quantize_iface: - nnapi_model = torch.nn.Sequential(quantizer, nnapi_model, dequantizer) - model.quant = quantizer - model.dequant = dequantizer - # Switch back to float input for benchmarking. - input_tensor = input_float.contiguous(memory_format=torch.channels_last) - - # Optimize the CPU model to make CPU-vs-NNAPI benchmarks fair. - model = torch.utils.mobile_optimizer.optimize_for_mobile(torch.jit.script(model)) - - # Bundle sample inputs with the models for easier benchmarking. - # This step is optional. - class BundleWrapper(torch.nn.Module): - def __init__(self, mod): - super().__init__() - self.mod = mod - def forward(self, arg): - return self.mod(arg) - nnapi_model = torch.jit.script(BundleWrapper(nnapi_model)) - torch.utils.bundled_inputs.augment_model_with_bundled_inputs( - model, [(torch.utils.bundled_inputs.bundle_large_tensor(input_tensor),)]) - torch.utils.bundled_inputs.augment_model_with_bundled_inputs( - nnapi_model, [(torch.utils.bundled_inputs.bundle_large_tensor(input_tensor),)]) - - # Save both models. 
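        # Note: ``_save_for_lite_interpreter`` writes the models in the lite
        # interpreter format; these are the files loaded by
        # ``speed_benchmark_torch`` in the benchmarking section below.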
- model._save_for_lite_interpreter(str(output_dir_path / ("mobilenetv2-quant_{}-cpu.pt".format(quantize_mode)))) - nnapi_model._save_for_lite_interpreter(str(output_dir_path / ("mobilenetv2-quant_{}-nnapi.pt".format(quantize_mode)))) - - - if __name__ == "__main__": - for quantize_mode in ["none", "core", "full"]: - make_mobilenetv2_nnapi(Path(os.environ["HOME"]) / "mobilenetv2-nnapi", quantize_mode) - - -Running Benchmarks ------------------- - -Now that the models are ready, we can benchmark them on our Android devices. -See `our performance recipe `_ for details. -The best-performing models are likely to be the "fully-quantized" models: -``mobilenetv2-quant_full-cpu.pt`` and ``mobilenetv2-quant_full-nnapi.pt``. - -Because these models have bundled inputs, we can run the benchmark as follows: - -.. code:: shell - - ./speed_benchmark_torch --pthreadpool_size=1 --model=mobilenetv2-quant_full-nnapi.pt --use_bundled_input=0 --warmup=5 --iter=200 - -Adjusting increasing the thread pool size can can reduce latency, -at the cost of increased CPU usage. -Omitting that argument will use one thread per big core. -The CPU models can get improved performance (at the cost of memory usage) -by passing ``--use_caching_allocator=true``. - - -Running model on host ---------------------- - -We can now run models on your linux machine using the reference implementation -of NNAPI. You need to build the NNAPI library from Android source code: - -* Make sure you have at least 200GB of disk space -* Follow `these instructions `_ to install ``repo`` - -.. code:: shell - - mkdir ~/android-nnapi && cd ~/android-nnapi - repo init -u https://android.googlesource.com/platform/manifest -b master - repo sync --network-only -j 16 - repo sync -l - . build/envsetup.sh - lunch aosp_x86_64-eng - mm -j16 out/host/linux-x86/lib64/libneuralnetworks.so - - -With the host build of ``libneuralnetworks.so`` you can run Pytorch NNAPI models on -your linux machine: - -.. code:: python - - #!/usr/bin/env python - import ctypes - import torch - from pathlib import Path - - ctypes.cdll.LoadLibrary(Path.home() / "android-nnapi/out/host/linux-x86/lib64/libneuralnetworks.so") - model = torch.jit.load(Path.home() / "mobilenetv2-nnapi/mobilenetv2-quant_full-nnapi.pt") - print(model(*model.get_all_bundled_inputs()[0])) - - -Integration ------------ - -The converted models are ordinary TorchScript models. -You can use them in your app just like any other PyTorch model. -See `https://pytorch.org/mobile/android/ `_ -for an introduction to using PyTorch on Android. - - -Learn More ----------- - -- Learn more about optimization in our - `Mobile Performance Recipe `_ -- `MobileNetV2 `_ from torchvision -- Information about `NNAPI `_ diff --git a/prototype_source/numeric_suite_tutorial.py b/prototype_source/numeric_suite_tutorial.py deleted file mode 100644 index a630d27e6a6..00000000000 --- a/prototype_source/numeric_suite_tutorial.py +++ /dev/null @@ -1,420 +0,0 @@ -# -*- coding: utf-8 -*- -""" -PyTorch Numeric Suite Tutorial -============================== - -Introduction ------------- - -Quantization is good when it works, but it’s difficult to know what's wrong when it doesn't satisfy the accuracy we expect. Debugging the accuracy issue of quantization is not easy and time consuming. - -One important step of debugging is to measure the statistics of the float model and its corresponding quantized model to know where are they differ most. 
We built a suite of numeric tools called PyTorch Numeric Suite in PyTorch quantization to enable the measurement of the statistics between quantized module and float module to support quantization debugging efforts. Even for the quantized model with good accuracy, PyTorch Numeric Suite can still be used as the profiling tool to better understand the quantization error within the model and provide the guidance for further optimization. - -PyTorch Numeric Suite currently supports models quantized through both static quantization and dynamic quantization with unified APIs. - -In this tutorial we will first use ResNet18 as an example to show how to use PyTorch Numeric Suite to measure the statistics between static quantized model and float model in eager mode. Then we will use LSTM based sequence model as an example to show the usage of PyTorch Numeric Suite for dynamic quantized model. - -Numeric Suite for Static Quantization -------------------------------------- - -Setup -^^^^^^ -We’ll start by doing the necessary imports: -""" - -############################################################################## - -import numpy as np -import torch -import torch.nn as nn -import torchvision -from torchvision import models, datasets -import torchvision.transforms as transforms -import os -import torch.quantization -import torch.quantization._numeric_suite as ns -from torch.quantization import ( - default_eval_fn, - default_qconfig, - quantize, -) - -############################################################################## -# Then we load the pretrained float ResNet18 model, and quantize it into qmodel. We cannot compare two arbitrary models, only a float model and the quantized model derived from it can be compared. - - -float_model = torchvision.models.quantization.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1, quantize=False) -float_model.to('cpu') -float_model.eval() -float_model.fuse_model() -float_model.qconfig = torch.quantization.default_qconfig -img_data = [(torch.rand(2, 3, 10, 10, dtype=torch.float), torch.randint(0, 1, (2,), dtype=torch.long)) for _ in range(2)] -qmodel = quantize(float_model, default_eval_fn, [img_data], inplace=False) - -############################################################################## -# 1. Compare the weights of float and quantized models -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# The first thing we usually want to compare are the weights of quantized model and float model. -# We can call ``compare_weights()`` from PyTorch Numeric Suite to get a dictionary ``wt_compare_dict`` with key corresponding to module names and each entry is a dictionary with two keys 'float' and 'quantized', containing the float and quantized weights. -# ``compare_weights()`` takes in floating point and quantized state dict and returns a dict, with keys corresponding to the -# floating point weights and values being a dictionary of floating point and quantized weights - -wt_compare_dict = ns.compare_weights(float_model.state_dict(), qmodel.state_dict()) - -print('keys of wt_compare_dict:') -print(wt_compare_dict.keys()) - -print("\nkeys of wt_compare_dict entry for conv1's weight:") -print(wt_compare_dict['conv1.weight'].keys()) -print(wt_compare_dict['conv1.weight']['float'].shape) -print(wt_compare_dict['conv1.weight']['quantized'].shape) - - -############################################################################## -# Once get ``wt_compare_dict``, users can process this dictionary in whatever way they want. 
Here as an example we compute the quantization error of the weights of float and quantized models as following. -# Compute the Signal-to-Quantization-Noise Ratio (SQNR) of the quantized tensor ``y``. The SQNR reflects the -# relationship between the maximum nominal signal strength and the quantization error introduced in the -# quantization. Higher SQNR corresponds to lower quantization error. - -def compute_error(x, y): - Ps = torch.norm(x) - Pn = torch.norm(x-y) - return 20*torch.log10(Ps/Pn) - -for key in wt_compare_dict: - print(key, compute_error(wt_compare_dict[key]['float'], wt_compare_dict[key]['quantized'].dequantize())) - -############################################################################## -# As another example ``wt_compare_dict`` can also be used to plot the histogram of the weights of floating point and quantized models. - -import matplotlib.pyplot as plt - -f = wt_compare_dict['conv1.weight']['float'].flatten() -plt.hist(f, bins = 100) -plt.title("Floating point model weights of conv1") -plt.show() - -q = wt_compare_dict['conv1.weight']['quantized'].flatten().dequantize() -plt.hist(q, bins = 100) -plt.title("Quantized model weights of conv1") -plt.show() - - - -############################################################################## -# -# 2. Compare float point and quantized models at corresponding locations -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# The second tool allows for comparison of weights and activations between float and quantized models at corresponding locations for the same input as shown in the figure below. Red arrows indicate the locations of the comparison. -# -# .. figure:: /_static/img/compare_output.png -# -# We call ``compare_model_outputs()`` from PyTorch Numeric Suite to get the activations in float model and quantized model at corresponding locations for the given input data. This API returns a dict with module names being keys. Each entry is itself a dict with two keys 'float' and 'quantized' containing the activations. -data = img_data[0][0] - -# Take in floating point and quantized model as well as input data, and returns a dict, with keys -# corresponding to the quantized module names and each entry being a dictionary with two keys 'float' and -# 'quantized', containing the activations of floating point and quantized model at matching locations. -act_compare_dict = ns.compare_model_outputs(float_model, qmodel, data) - -print('keys of act_compare_dict:') -print(act_compare_dict.keys()) - -print("\nkeys of act_compare_dict entry for conv1's output:") -print(act_compare_dict['conv1.stats'].keys()) -print(act_compare_dict['conv1.stats']['float'][0].shape) -print(act_compare_dict['conv1.stats']['quantized'][0].shape) - -############################################################################## -# This dict can be used to compare and compute the quantization error of the activations of float and quantized models as following. -for key in act_compare_dict: - print(key, compute_error(act_compare_dict[key]['float'][0], act_compare_dict[key]['quantized'][0].dequantize())) - -############################################################################## -# If we want to do the comparison for more than one input data, we can do the following. -# Prepare the model by attaching the logger to both floating point module and quantized -# module if they are in the ``white_list``. 
Default logger is ``OutputLogger``, and default white_list -# is ``DEFAULT_NUMERIC_SUITE_COMPARE_MODEL_OUTPUT_WHITE_LIST`` -ns.prepare_model_outputs(float_model, qmodel) - -for data in img_data: - float_model(data[0]) - qmodel(data[0]) - -# Find the matching activation between floating point and quantized modules, and return a dict with key -# corresponding to quantized module names and each entry being a dictionary with two keys 'float' -# and 'quantized', containing the matching floating point and quantized activations logged by the logger -act_compare_dict = ns.get_matching_activations(float_model, qmodel) - - -############################################################################## -# The default logger used in above APIs is ``OutputLogger``, which is used to log the outputs of the modules. We can inherit from base ``Logger`` class and create our own logger to perform different functionalities. For example we can make a new ``MyOutputLogger`` class as below. - -class MyOutputLogger(ns.Logger): - r"""Customized logger class - """ - - def __init__(self): - super(MyOutputLogger, self).__init__() - - def forward(self, x): - # Custom functionalities - # ... - return x - -############################################################################## -# And then we can pass this logger into above APIs such as: - -data = img_data[0][0] -act_compare_dict = ns.compare_model_outputs(float_model, qmodel, data, logger_cls=MyOutputLogger) - -############################################################################## -# or: - -ns.prepare_model_outputs(float_model, qmodel, MyOutputLogger) -for data in img_data: - float_model(data[0]) - qmodel(data[0]) -act_compare_dict = ns.get_matching_activations(float_model, qmodel) - - - -############################################################################## -# -# 3. Compare a module in a quantized model with its float point equivalent, with the same input data -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# The third tool allows for comparing a quantized module in a model with its float point counterpart, feeding both of them the same input and comparing their outputs as shown below. -# -# .. figure:: /_static/img/compare_stub.png -# -# In practice we call prepare_model_with_stubs() to swap the quantized module that we want to compare with the Shadow module, which is illustrated as below: -# -# .. figure:: /_static/img/shadow.png -# -# The Shadow module takes quantized module, float module and logger as input, and creates a forward path inside to make the float module to shadow quantized module sharing the same input tensor. -# -# The logger can be customizable, default logger is ``ShadowLogger`` and it will save the outputs of the quantized module and float module that can be used to compute the module level quantization error. -# -# Notice before each call of ``compare_model_outputs()`` and ``compare_model_stub()`` we need to have clean float and quantized model. This is because ``compare_model_outputs()`` and ``compare_model_stub()`` modify float and quantized model inplace, and it will cause unexpected results if call one right after another. 
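##############################################################################
# To make the shadow idea above more concrete, here is a rough, simplified
# sketch of what such a wrapper does. This is *not* the actual ``ns.Shadow``
# implementation (which, among other things, also handles dequantizing the
# input for the float module); it only illustrates the data flow: both modules
# receive the same input, and a logger records both outputs for later
# comparison.

class SketchShadow(nn.Module):
    """Illustrative sketch only, not the real Shadow module."""

    def __init__(self, q_module, float_module, logger_cls):
        super().__init__()
        self.q_module = q_module
        self.float_module = float_module
        self.logger = logger_cls()

    def forward(self, x):
        q_output = self.q_module(x)          # output of the quantized module
        float_output = self.float_module(x)  # output of its float "shadow"
        self.logger(float_output, q_output)  # log both for later comparison
        return q_output

##############################################################################
# As noted above, we recreate clean float and quantized models before the next
# comparison, since the previous APIs modified them in place.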
- -float_model = torchvision.models.quantization.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1, quantize=False) -float_model.to('cpu') -float_model.eval() -float_model.fuse_model() -float_model.qconfig = torch.quantization.default_qconfig -img_data = [(torch.rand(2, 3, 10, 10, dtype=torch.float), torch.randint(0, 1, (2,), dtype=torch.long)) for _ in range(2)] -qmodel = quantize(float_model, default_eval_fn, [img_data], inplace=False) - -############################################################################## -# In the following example we call ``compare_model_stub()`` from PyTorch Numeric Suite to compare ``QuantizableBasicBlock`` module with its float point equivalent. This API returns a dict with key corresponding to module names and each entry being a dictionary with two keys 'float' and 'quantized', containing the output tensors of quantized and its matching float shadow module. - -data = img_data[0][0] -module_swap_list = [torchvision.models.quantization.resnet.QuantizableBasicBlock] - -# Takes in floating point and quantized model as well as input data, and returns a dict with key -# corresponding to module names and each entry being a dictionary with two keys 'float' and -# 'quantized', containing the output tensors of quantized module and its matching floating point shadow module. -ob_dict = ns.compare_model_stub(float_model, qmodel, module_swap_list, data) - -print('keys of ob_dict:') -print(ob_dict.keys()) - -print("\nkeys of ob_dict entry for layer1.0's output:") -print(ob_dict['layer1.0.stats'].keys()) -print(ob_dict['layer1.0.stats']['float'][0].shape) -print(ob_dict['layer1.0.stats']['quantized'][0].shape) - -############################################################################## -# This dict can be then used to compare and compute the module level quantization error. - -for key in ob_dict: - print(key, compute_error(ob_dict[key]['float'][0], ob_dict[key]['quantized'][0].dequantize())) - -############################################################################## -# If we want to do the comparison for more than one input data, we can do the following. - -ns.prepare_model_with_stubs(float_model, qmodel, module_swap_list, ns.ShadowLogger) -for data in img_data: - qmodel(data[0]) -ob_dict = ns.get_logger_dict(qmodel) - -############################################################################## -# The default logger used in above APIs is ``ShadowLogger``, which is used to log the outputs of the quantized module and its matching float shadow module. We can inherit from base ``Logger`` class and create our own logger to perform different functionalities. For example we can make a new ``MyShadowLogger`` class as below. - -class MyShadowLogger(ns.Logger): - r"""Customized logger class - """ - - def __init__(self): - super(MyShadowLogger, self).__init__() - - def forward(self, x, y): - # Custom functionalities - # ... 
- return x - -############################################################################## -# And then we can pass this logger into above APIs such as: - -data = img_data[0][0] -ob_dict = ns.compare_model_stub(float_model, qmodel, module_swap_list, data, logger_cls=MyShadowLogger) - -############################################################################## -# or: - -ns.prepare_model_with_stubs(float_model, qmodel, module_swap_list, MyShadowLogger) -for data in img_data: - qmodel(data[0]) -ob_dict = ns.get_logger_dict(qmodel) - -############################################################################### -# Numeric Suite for Dynamic Quantization -# -------------------------------------- -# -# Numeric Suite APIs are designed in such as way that they work for both dynamic quantized model and static quantized model. We will use a model with both LSTM and Linear modules to demonstrate the usage of Numeric Suite on dynamic quantized model. This model is the same one used in the tutorial of dynamic quantization on LSTM word language model [1]. -# - -################################# -# Setup -# ^^^^^^ -# First we define the model as below. Notice that within this model only ``nn.LSTM`` and ``nn.Linear`` modules will be quantized dynamically and ``nn.Embedding`` will remain as floating point module after quantization. - -class LSTMModel(nn.Module): - """Container module with an encoder, a recurrent module, and a decoder.""" - - def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5): - super(LSTMModel, self).__init__() - self.encoder = nn.Embedding(ntoken, ninp) - self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) - self.decoder = nn.Linear(nhid, ntoken) - - self.init_weights() - - self.nhid = nhid - self.nlayers = nlayers - - def init_weights(self): - initrange = 0.1 - self.encoder.weight.data.uniform_(-initrange, initrange) - self.decoder.bias.data.zero_() - self.decoder.weight.data.uniform_(-initrange, initrange) - - def forward(self, input, hidden): - emb = self.encoder(input) - output, hidden = self.rnn(emb, hidden) - decoded = self.decoder(output) - return decoded, hidden - - def init_hidden(self, bsz): - weight = next(self.parameters()) - return (weight.new_zeros(self.nlayers, bsz, self.nhid), - weight.new_zeros(self.nlayers, bsz, self.nhid)) - -############################################################################## -# Then we create the ``float_model`` and quantize it into qmodel. - -ntokens = 10 - -float_model = LSTMModel( - ntoken = ntokens, - ninp = 512, - nhid = 256, - nlayers = 5, -) - -float_model.eval() - -qmodel = torch.quantization.quantize_dynamic( - float_model, {nn.LSTM, nn.Linear}, dtype=torch.qint8 -) - -############################################################################## -# -# 1. Compare the weights of float and quantized models -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# We first call ``compare_weights()`` from PyTorch Numeric Suite to get a dictionary ``wt_compare_dict`` with key corresponding to module names and each entry is a dictionary with two keys 'float' and 'quantized', containing the float and quantized weights. - -wt_compare_dict = ns.compare_weights(float_model.state_dict(), qmodel.state_dict()) - -############################################################################## -# Once we get ``wt_compare_dict``, it can be used to compare and compute the quantization error of the weights of float and quantized models as following. 
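# Note the ``is_quantized`` check below: with dynamic quantization only the
# ``nn.LSTM`` and ``nn.Linear`` weights are quantized, while modules such as
# the ``nn.Embedding`` encoder stay in floating point, so those entries are
# compared directly without dequantizing.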
- -for key in wt_compare_dict: - if wt_compare_dict[key]['quantized'].is_quantized: - print(key, compute_error(wt_compare_dict[key]['float'], wt_compare_dict[key]['quantized'].dequantize())) - else: - print(key, compute_error(wt_compare_dict[key]['float'], wt_compare_dict[key]['quantized'])) - -############################################################################## -# -# The Inf value in ``encoder.weight`` entry above is because encoder module is not quantized and the weights are the same in both floating point and quantized models. -# -# 2. Compare float point and quantized models at corresponding locations -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# Then we call ``compare_model_outputs()`` from PyTorch Numeric Suite to get the activations in float model and quantized model at corresponding locations for the given input data. This API returns a dict with module names being keys. Each entry is itself a dict with two keys 'float' and 'quantized' containing the activations. Notice that this sequence model has two inputs, and we can pass both inputs into ``compare_model_outputs()`` and ``compare_model_stub()``. - - -input_ = torch.randint(ntokens, (1, 1), dtype=torch.long) -hidden = float_model.init_hidden(1) - -act_compare_dict = ns.compare_model_outputs(float_model, qmodel, input_, hidden) -print(act_compare_dict.keys()) - -############################################################################## -# This dict can be used to compare and compute the quantization error of the activations of float and quantized models as following. The LSTM module in this model has two outputs, in this example we compute the error of the first output. - - -for key in act_compare_dict: - print(key, compute_error(act_compare_dict[key]['float'][0][0], act_compare_dict[key]['quantized'][0][0])) - -############################################################################## -# -# 3. Compare a module in a quantized model with its float point equivalent, with the same input data -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# Next we call ``compare_model_stub()`` from PyTorch Numeric Suite to compare LSTM and Linear module with its float point equivalent. This API returns a dict with key corresponding to module names and each entry being a dictionary with two keys 'float' and 'quantized', containing the output tensors of quantized and its matching float shadow module. -# -# We reset the model first. - - -float_model = LSTMModel( - ntoken = ntokens, - ninp = 512, - nhid = 256, - nlayers = 5, -) -float_model.eval() - -qmodel = torch.quantization.quantize_dynamic( - float_model, {nn.LSTM, nn.Linear}, dtype=torch.qint8 -) - -############################################################################## -# Next we call ``compare_model_stub()`` from PyTorch Numeric Suite to compare LSTM and Linear module with its float point equivalent. This API returns a dict with key corresponding to module names and each entry being a dictionary with two keys 'float' and 'quantized', containing the output tensors of quantized and its matching float shadow module. - -module_swap_list = [nn.Linear, nn.LSTM] -ob_dict = ns.compare_model_stub(float_model, qmodel, module_swap_list, input_, hidden) -print(ob_dict.keys()) - -############################################################################## -# This dict can be then used to compare and compute the module level quantization error. 
- -for key in ob_dict: - print(key, compute_error(ob_dict[key]['float'][0], ob_dict[key]['quantized'][0])) - -############################################################################## -# SQNR of 40 dB is high and this is a situation where we have very good numerical alignment between the floating point and quantized model. -# -# Conclusion -# ---------- -# In this tutorial, we demonstrated how to use PyTorch Numeric Suite to measure and compare the statistics between quantized model and float model in eager mode with unified APIs for both static quantization and dynamic quantization. -# -# Thanks for reading! As always, we welcome any feedback, so please create an issue `here `_ if you have any. -# -# References -# ---------- -# [1] `DYNAMIC QUANTIZATION ON AN LSTM WORD LANGUAGE MODEL `_. diff --git a/prototype_source/prototype_index.rst b/prototype_source/prototype_index.rst deleted file mode 100644 index 8d965194f88..00000000000 --- a/prototype_source/prototype_index.rst +++ /dev/null @@ -1,258 +0,0 @@ -PyTorch Prototype Recipes ---------------------------------------------- -Prototype features are not available as part of binary distributions like PyPI or Conda (except maybe behind run-time flags). To test these features we would, depending on the feature, recommend building from master or using the nightly wheels that are made available on `pytorch.org `_. - -*Level of commitment*: We are committing to gathering high bandwidth feedback only on these features. Based on this feedback and potential further engagement between community members, we as a community will decide if we want to upgrade the level of commitment or to fail fast. - - -.. raw:: html - -
- -.. Add prototype tutorial cards below this line - -.. Quantization - -.. customcarditem:: - :header: FX Graph Mode Quantization User Guide - :card_description: Learn about FX Graph Mode Quantization. - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/fx_graph_mode_quant_guide.html - :tags: FX,Quantization - -.. customcarditem:: - :header: FX Graph Mode Post Training Dynamic Quantization - :card_description: Learn how to do post training dynamic quantization in graph mode based on torch.fx. - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/fx_graph_mode_ptq_dynamic.html - :tags: FX,Quantization - -.. customcarditem:: - :header: FX Graph Mode Post Training Static Quantization - :card_description: Learn how to do post training static quantization in graph mode based on torch.fx. - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/fx_graph_mode_ptq_static.html - :tags: FX,Quantization - -.. customcarditem:: - :header: Graph Mode Dynamic Quantization on BERT - :card_description: Learn how to do post training dynamic quantization with graph mode quantization on BERT models. - :image: ../_static/img/thumbnails/cropped/graph-mode-dynamic-bert.png - :link: ../prototype/graph_mode_dynamic_bert_tutorial.html - :tags: Text,Quantization - -.. customcarditem:: - :header: PyTorch Numeric Suite Tutorial - :card_description: Learn how to use the PyTorch Numeric Suite to support quantization debugging efforts. - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/numeric_suite_tutorial.html - :tags: Debugging,Quantization - -.. customcarditem:: - :header: How to Write a Quantizer for PyTorch 2 Export Quantization - :card_description: Learn how to implement a Quantizer for PT2 Export Quantization - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/pt2e_quantizer.html - :tags: Quantization - -.. customcarditem:: - :header: PyTorch 2 Export Post Training Quantization - :card_description: Learn how to use Post Training Quantization in PyTorch 2 Export. - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/pt2e_quant_ptq.html - :tags: Quantization - -.. customcarditem:: - :header: PyTorch 2 Export Quantization-Aware Training - :card_description: Learn how to use Quantization-Aware-Training in PyTorch 2 Export. - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/pt2e_quant_qat.html - :tags: Quantization - -.. customcarditem:: - :header: PyTorch 2 Export Quantization with X86 Backend through Inductor - :card_description: Learn how to use PT2 Export Quantization with X86 Backend through Inductor. - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/pt2e_quant_x86_inductor.html - :tags: Quantization - -.. Sparsity - -.. customcarditem:: - :header: (prototype) Accelerating BERT with semi-structured (2:4) sparsity - :card_description: Prune BERT to be 2:4 sparse and accelerate for inference. - :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: prototype/semi_structured_sparse.html - :tags: Model-Optimiziation - -.. Mobile - -.. customcarditem:: - :header: Use iOS GPU in PyTorch - :card_description: Learn how to run your models on iOS GPU. - :image: ../_static/img/thumbnails/cropped/ios.png - :link: ../prototype/ios_gpu_workflow.html - :tags: Mobile - -.. 
customcarditem:: - :header: Convert MobileNetV2 to NNAPI - :card_description: Learn how to prepare a computer vision model to use Android’s Neural Networks API (NNAPI). - :image: ../_static/img/thumbnails/cropped/android.png - :link: ../prototype/nnapi_mobilenetv2.html - :tags: Mobile - -.. customcarditem:: - :header: PyTorch Vulkan Backend User Workflow - :card_description: Learn how to use the Vulkan backend on mobile GPUs. - :image: ../_static/img/thumbnails/cropped/android.png - :link: ../prototype/vulkan_workflow.html - :tags: Mobile - -.. customcarditem:: - :header: Tracing-based Selective Build Mobile Interpreter in Android and iOS - :card_description: Learn how to optimize the mobile interpreter size with a tracing-based selective build. - :image: ../_static/img/thumbnails/cropped/mobile.png - :link: ../prototype/tracing_based_selective_build.html - :tags: Mobile - -.. customcarditem:: - :header: Convert Mobilenetv2 to Core ML - :card_description: Learn how to prepare a computer vision model to use the PyTorch Core ML mobile backend. - :image: ../_static/img/thumbnails/cropped/ios.png - :link: ../prototype/ios_coreml_workflow.html - :tags: Mobile - -.. Modules - -.. customcarditem:: - :header: Skipping Module Parameter Initialization in PyTorch 1.10 - :card_description: Describes skipping parameter initialization during module construction in PyTorch 1.10, avoiding wasted computation. - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/skip_param_init.html - :tags: Modules - -.. TorchScript - -.. customcarditem:: - :header: Model Freezing in TorchScript - :card_description: Freezing is the process of inlining Pytorch module parameters and attributes values into the TorchScript internal representation. - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/torchscript_freezing.html - :tags: TorchScript - -.. vmap - -.. customcarditem:: - :header: Using torch.vmap - :card_description: Learn about torch.vmap, an autovectorizer for PyTorch operations. - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/vmap_recipe.html - :tags: vmap - -.. NestedTensor - -.. customcarditem:: - :header: Nested Tensor - :card_description: Learn about nested tensors, the new way to batch heterogeneous-length data - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/nestedtensor.html - :tags: NestedTensor - -.. MaskedTensor - -.. customcarditem:: - :header: MaskedTensor Overview - :card_description: Learn about masked tensors, the source of truth for specified and unspecified values - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/maskedtensor_overview.html - :tags: MaskedTensor - -.. customcarditem:: - :header: Masked Tensor Sparsity - :card_description: Learn about how to leverage sparse layouts (e.g. COO and CSR) in MaskedTensor - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/maskedtensor_sparsity.html - :tags: MaskedTensor - -.. customcarditem:: - :header: Masked Tensor Advanced Semantics - :card_description: Learn more about Masked Tensor's advanced semantics (reductions and comparing vs. NumPy's MaskedArray) - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/maskedtensor_advanced_semantics.html - :tags: MaskedTensor - -.. 
customcarditem:: - :header: MaskedTensor: Simplifying Adagrad Sparse Semantics - :card_description: See a showcase on how masked tensors can enable sparse semantics and provide for a cleaner dev experience - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/maskedtensor_adagrad.html - :tags: MaskedTensor - -.. Model-Optimization - -.. customcarditem:: - :header: Inductor Cpp Wrapper Tutorial - :card_description: Speed up your models with Inductor Cpp Wrapper - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/inductor_cpp_wrapper_tutorial.html - :tags: Model-Optimization - -.. End of tutorial card section - -.. raw:: html - -
- -.. ----------------------------------------- -.. Page TOC -.. ----------------------------------------- -.. toctree:: - :hidden: - - prototype/fx_graph_mode_quant_guide.html - prototype/fx_graph_mode_ptq_dynamic.html - prototype/fx_graph_mode_ptq_static.html - prototype/graph_mode_dynamic_bert_tutorial.html - prototype/inductor_cpp_wrapper_tutorial.html - prototype/pt2e_quantizer.html - prototype/pt2e_quant_ptq.html - prototype/pt2e_quant_qat.html - prototype/ios_gpu_workflow.html - prototype/nnapi_mobilenetv2.html - prototype/tracing_based_selective_build.html - prototype/ios_coreml_workflow.html - prototype/numeric_suite_tutorial.html - prototype/torchscript_freezing.html - prototype/vmap_recipe.html - prototype/vulkan_workflow.html - prototype/nestedtensor.html - prototype/maskedtensor_overview.html - prototype/maskedtensor_sparsity.html - prototype/maskedtensor_advanced_semantics.html - prototype/maskedtensor_adagrad.html diff --git a/prototype_source/pt2e_quant_ptq.rst b/prototype_source/pt2e_quant_ptq.rst deleted file mode 100644 index 7f46c86e42e..00000000000 --- a/prototype_source/pt2e_quant_ptq.rst +++ /dev/null @@ -1,596 +0,0 @@ -(prototype) PyTorch 2 Export Post Training Quantization -================================================================ -**Author**: `Jerry Zhang `_ - -This tutorial introduces the steps to do post training static quantization in -graph mode based on -`torch._export.export `_. Compared -to `FX Graph Mode Quantization `_, -this flow is expected to have significantly higher model coverage -(`88% on 14K models `_), -better programmability, and a simplified UX. - -Exportable by `torch.export.export` is a prerequisite to use the flow, you can -find what are the constructs that's supported in `Export DB `_. - -The high level architecture of quantization 2 with quantizer could look like -this: - -:: - - float_model(Python) Example Input - \ / - \ / - —------------------------------------------------------- - | export | - —------------------------------------------------------- - | - FX Graph in ATen Backend Specific Quantizer - | / - —-------------------------------------------------------- - | prepare_pt2e | - —-------------------------------------------------------- - | - Calibrate/Train - | - —-------------------------------------------------------- - | convert_pt2e | - —-------------------------------------------------------- - | - Quantized Model - | - —-------------------------------------------------------- - | Lowering | - —-------------------------------------------------------- - | - Executorch, Inductor or - - -The PyTorch 2 export quantization API looks like this: - -.. code:: python - - import torch - from torch._export import capture_pre_autograd_graph - class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(5, 10) - - def forward(self, x): - return self.linear(x) - - - example_inputs = (torch.randn(1, 5),) - m = M().eval() - - # Step 1. program capture - # NOTE: this API will be updated to torch.export API in the future, but the captured - # result shoud mostly stay the same - m = capture_pre_autograd_graph(m, *example_inputs) - # we get a model with aten ops - - - # Step 2. 
quantization - from torch.ao.quantization.quantize_pt2e import ( - prepare_pt2e, - convert_pt2e, - ) - - from torch.ao.quantization.quantizer import ( - XNNPACKQuantizer, - get_symmetric_quantization_config, - ) - # backend developer will write their own Quantizer and expose methods to allow - # users to express how they - # want the model to be quantized - quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config()) - m = prepare_pt2e(m, quantizer) - - # calibration omitted - - m = convert_pt2e(m) - # we have a model with aten ops doing integer computations when possible - - -Motivation of PyTorch 2 Export Quantization ---------------------------------------------- - -In PyTorch versions prior to 2, we have FX Graph Mode Quantization that uses -`QConfigMapping `_ -and `BackendConfig `_ -for customizations. ``QConfigMapping`` allows modeling users to specify how -they want their model to be quantized, ``BackendConfig`` allows backend -developers to specify the supported ways of quantization in their backend. While -that API covers most use cases relatively well, it is not fully extensible. -There are two main limitations for the current API: - -* Limitation around expressing quantization intentions for complicated operator - patterns (how an operator pattern should be observed/quantized) using existing - objects: ``QConfig`` and ``QConfigMapping``. - -* Limited support on how user can express their intention of how they want - their model to be quantized. For example, if users want to quantize the every - other linear in the model, or the quantization behavior has some dependency on - the actual shape of the Tensor (for example, only observe/quantize inputs - and outputs when the linear has a 3D input), backend developer or modeling - users need to change the core quantization API/flow. - -A few improvements could make the existing flow better: - -* We use ``QConfigMapping`` and ``BackendConfig`` as separate objects, - ``QConfigMapping`` describes user’s intention of how they want their model to - be quantized, ``BackendConfig`` describes what kind of quantization a backend - supports. ``BackendConfig`` is backend-specific, but ``QConfigMapping`` is not, - and the user can provide a ``QConfigMapping`` that is incompatible with a specific - ``BackendConfig``, this is not a great UX. Ideally, we can structure this better - by making both configuration (``QConfigMapping``) and quantization capability - (``BackendConfig``) backend-specific, so there will be less confusion about - incompatibilities. -* In ``QConfig`` we are exposing observer/ ``fake_quant`` observer classes as an - object for the user to configure quantization, this increases the things that - the user may need to care about. For example, not only the ``dtype`` but also - how the observation should happen, these could potentially be hidden from the - user so that the user flow is simpler. - -Here is a summary of the benefits of the new API: - -- **Programmability** (addressing 1. and 2.): When a user’s quantization needs - are not covered by available quantizers, users can build their own quantizer and - compose it with other quantizers as mentioned above. -- **Simplified UX** (addressing 3.): Provides a single instance with which both - backend and users interact. Thus you no longer have the user facing quantization - config mapping to map users intent and a separate quantization config that - backends interact with to configure what backend support. 
We will still have a - method for users to query what is supported in a quantizer. With a single - instance, composing different quantization capabilities also becomes more - natural than previously. - - For example XNNPACK does not support ``embedding_byte`` - and we have natively support for this in ExecuTorch. Thus, if we had - ``ExecuTorchQuantizer`` that only quantized ``embedding_byte``, then it can be - composed with ``XNNPACKQuantizer``. (Previously, this used to be concatenating the - two ``BackendConfig`` together and since options in ``QConfigMapping`` are not - backend specific, user also need to figure out how to specify the configurations - by themselves that matches the quantization capabilities of the combined - backend. With a single quantizer instance, we can compose two quantizers and - query the composed quantizer for capabilities, which makes it less error prone - and cleaner, for example, ``composed_quantizer.quantization_capabilities())``. - -- **Separation of concerns** (addressing 4.): As we design the quantizer API, we - also decouple specification of quantization, as expressed in terms of ``dtype``, - min/max (# of bits), symmetric, and so on, from the observer concept. - Currently, the observer captures both quantization specification and how to - observe (Histogram vs MinMax observer). Modeling users are freed from - interacting with observer and fake quant objects with this change. - -Define Helper Functions and Prepare Dataset -------------------------------------------- - -We’ll start by doing the necessary imports, defining some helper functions and -prepare the data. These steps are identitcal to -`Static Quantization with Eager Mode in PyTorch `_. - -To run the code in this tutorial using the entire ImageNet dataset, first -download Imagenet by following the instructions at here -`ImageNet Data `_. Unzip the downloaded file -into the ``data_path`` folder. - -Download the `torchvision resnet18 model `_ -and rename it to ``data/resnet18_pretrained_float.pth``. - -.. code:: python - - import os - import sys - import time - import numpy as np - - import torch - import torch.nn as nn - from torch.utils.data import DataLoader - - import torchvision - from torchvision import datasets - from torchvision.models.resnet import resnet18 - import torchvision.transforms as transforms - - # Set up warnings - import warnings - warnings.filterwarnings( - action='ignore', - category=DeprecationWarning, - module=r'.*' - ) - warnings.filterwarnings( - action='default', - module=r'torch.ao.quantization' - ) - - # Specify random seed for repeatable results - _ = torch.manual_seed(191009) - - - class AverageMeter(object): - """Computes and stores the average and current value""" - def __init__(self, name, fmt=':f'): - self.name = name - self.fmt = fmt - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' - return fmtstr.format(**self.__dict__) - - - def accuracy(output, target, topk=(1,)): - """ - Computes the accuracy over the k top predictions for the specified - values of k. 
- """ - with torch.no_grad(): - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - - def evaluate(model, criterion, data_loader): - model.eval() - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') - cnt = 0 - with torch.no_grad(): - for image, target in data_loader: - output = model(image) - loss = criterion(output, target) - cnt += 1 - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - top1.update(acc1[0], image.size(0)) - top5.update(acc5[0], image.size(0)) - print('') - - return top1, top5 - - def load_model(model_file): - model = resnet18(pretrained=False) - state_dict = torch.load(model_file) - model.load_state_dict(state_dict) - model.to("cpu") - return model - - def print_size_of_model(model): - if isinstance(model, torch.jit.RecursiveScriptModule): - torch.jit.save(model, "temp.p") - else: - torch.jit.save(torch.jit.script(model), "temp.p") - print("Size (MB):", os.path.getsize("temp.p")/1e6) - os.remove("temp.p") - - def prepare_data_loaders(data_path): - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - dataset = torchvision.datasets.ImageNet( - data_path, split="train", transform=transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - dataset_test = torchvision.datasets.ImageNet( - data_path, split="val", transform=transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])) - - train_sampler = torch.utils.data.RandomSampler(dataset) - test_sampler = torch.utils.data.SequentialSampler(dataset_test) - - data_loader = torch.utils.data.DataLoader( - dataset, batch_size=train_batch_size, - sampler=train_sampler) - - data_loader_test = torch.utils.data.DataLoader( - dataset_test, batch_size=eval_batch_size, - sampler=test_sampler) - - return data_loader, data_loader_test - - data_path = '~/.data/imagenet' - saved_model_dir = 'data/' - float_model_file = 'resnet18_pretrained_float.pth' - - train_batch_size = 30 - eval_batch_size = 50 - - data_loader, data_loader_test = prepare_data_loaders(data_path) - example_inputs = (next(iter(data_loader))[0]) - criterion = nn.CrossEntropyLoss() - float_model = load_model(saved_model_dir + float_model_file).to("cpu") - float_model.eval() - - # create another instance of the model since - # we need to keep the original model around - model_to_quantize = load_model(saved_model_dir + float_model_file).to("cpu") - -Set the model to eval mode --------------------------- - -For post training quantization, we'll need to set the model to the eval mode. - -.. code:: python - - model_to_quantize.eval() - -Export the model with torch.export ----------------------------------- - -Here is how you can use ``torch.export`` to export the model: - -.. 
code-block:: python - - from torch._export import capture_pre_autograd_graph - - example_inputs = (torch.rand(2, 3, 224, 224),) - exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs) - # or capture with dynamic dimensions - # from torch._export import dynamic_dim - # exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs, constraints=[dynamic_dim(example_inputs[0], 0)]) - - -``capture_pre_autograd_graph`` is a short term API, it will be updated to use the offical ``torch.export`` API when that is ready. - - -Import the Backend Specific Quantizer and Configure how to Quantize the Model ------------------------------------------------------------------------------ - -The following code snippets describes how to quantize the model: - -.. code-block:: python - - from torch.ao.quantization.quantizer.xnnpack_quantizer import ( - XNNPACKQuantizer, - get_symmetric_quantization_config, - ) - quantizer = XNNPACKQuantizer() - quantizer.set_global(get_symmetric_quantization_config()) - -``Quantizer`` is backend specific, and each ``Quantizer`` will provide their -own way to allow users to configure their model. Just as an example, here is -the different configuration APIs supported by ``XNNPackQuantizer``: - -.. code-block:: python - - quantizer.set_global(qconfig_opt) # qconfig_opt is an optional quantization config - .set_object_type(torch.nn.Conv2d, qconfig_opt) # can be a module type - .set_object_type(torch.nn.functional.linear, qconfig_opt) # or torch functional op - .set_module_name("foo.bar", qconfig_opt) - -.. note:: - - Check out our - `tutorial `_ - that describes how to write a new ``Quantizer``. - -Prepare the Model for Post Training Quantization ----------------------------------------------------------- - -``prepare_pt2e`` folds ``BatchNorm`` operators into preceding ``Conv2d`` -operators, and inserts observers in appropriate places in the model. - -.. code-block:: python - - prepared_model = prepare_pt2e(exported_model, quantizer) - print(prepared_model.graph) - -Calibration --------------- - -The calibration function is run after the observers are inserted in the model. -The purpose for calibration is to run through some sample examples that is -representative of the workload (for example a sample of the training data set) -so that the observers in themodel are able to observe the statistics of the -Tensors and we can later use this information to calculate quantization -parameters. - -.. code-block:: python - - def calibrate(model, data_loader): - model.eval() - with torch.no_grad(): - for image, target in data_loader: - model(image) - calibrate(prepared_model, data_loader_test) # run calibration on sample data - -Convert the Calibrated Model to a Quantized Model -------------------------------------------------- - -``convert_pt2e`` takes a calibrated model and produces a quantized model. - -.. code-block:: python - - quantized_model = convert_pt2e(prepared_model) - print(quantized_model) - -At this step, we currently have two representations that you can choose from, but exact representation -we offer in the long term might change based on feedback from PyTorch users. - -* Q/DQ Representation (default) - - Previous documentation for `representations `_ all quantized operators are represented as ``dequantize -> fp32_op -> qauntize``. - -.. 
code-block:: python - - def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): - x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( - x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8) - weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( - weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max, torch.int8) - weight_permuted = torch.ops.aten.permute_copy.default(weight_fp32, [1, 0]); - out_fp32 = torch.ops.aten.addmm.default(bias_fp32, x_fp32, weight_permuted) - out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor( - out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8) - return out_i8 - -* Reference Quantized Model Representation (available in the nightly build) - - We will have a special representation for selected ops, for example, quantized linear. Other ops are represented as ``dq -> float32_op -> q`` and ``q/dq`` are decomposed into more primitive operators. - You can get this representation by using ``convert_pt2e(..., use_reference_representation=True)``. - -.. code-block:: python - - # Reference Quantized Pattern for quantized linear - def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): - x_int16 = x_int8.to(torch.int16) - weight_int16 = weight_int8.to(torch.int16) - acc_int32 = torch.ops.out_dtype(torch.mm, torch.int32, (x_int16 - x_zero_point), (weight_int16 - weight_zero_point)) - bias_scale = x_scale * weight_scale - bias_int32 = out_dtype(torch.ops.aten.div.Tensor, torch.int32, bias_fp32, bias_scale) - acc_int32 = acc_int32 + bias_int32 - acc_int32 = torch.ops.out_dtype(torch.ops.aten.mul.Scalar, torch.int32, acc_int32, x_scale * weight_scale / output_scale) + output_zero_point - out_int8 = torch.ops.aten.clamp(acc_int32, qmin, qmax).to(torch.int8) - return out_int8 - - -See `here `_ for the most up-to-date reference representations. - - -Checking Model Size and Accuracy Evaluation ----------------------------------------------- - -Now we can compare the size and model accuracy with baseline model. - -.. code-block:: python - - # Baseline model size and accuracy - scripted_float_model_file = "resnet18_scripted.pth" - - print("Size of baseline model") - print_size_of_model(float_model) - - top1, top5 = evaluate(float_model, criterion, data_loader_test) - print("Baseline Float Model Evaluation accuracy: %2.2f, %2.2f"%(top1.avg, top5.avg)) - - # Quantized model size and accuracy - print("Size of model after quantization") - print_size_of_model(quantized_model) - - top1, top5 = evaluate(quantized_model, criterion, data_loader_test) - print("[before serilaization] Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg)) - - -.. note:: - We can't do performance evaluation now since the model is not lowered to - target device, it's just a representation of quantized computation in ATen - operators. - -.. note:: - The weights are still in fp32 right now, we may do constant propagation for quantize op to - get integer weights in the future. - -If you want to get better accuracy or performance, try configuring -``quantizer`` in different ways, and each ``quantizer`` will have its own way -of configuration, so please consult the documentation for the -quantizer you are using to learn more about how you can have more control -over how to quantize a model. 
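
As one illustration of the point above, here is a minimal sketch of re-running the PTQ flow with a customized ``XNNPACKQuantizer`` that mixes the global, per-operator-type, and per-module-name settings listed earlier in this tutorial. It reuses ``model_to_quantize``, ``example_inputs``, ``calibrate``, and ``data_loader_test`` defined above; ``"foo.bar"`` is only a placeholder module name, and the particular configuration choices are examples rather than recommendations.

.. code-block:: python

    # A sketch only: re-quantize the model with a customized quantizer configuration.
    import torch
    from torch._export import capture_pre_autograd_graph
    from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e
    from torch.ao.quantization.quantizer.xnnpack_quantizer import (
        XNNPACKQuantizer,
        get_symmetric_quantization_config,
    )

    custom_quantizer = XNNPACKQuantizer()
    # default config for every quantizable op in the model
    custom_quantizer.set_global(get_symmetric_quantization_config())
    # override the config for one operator type
    custom_quantizer.set_object_type(
        torch.nn.functional.linear, get_symmetric_quantization_config()
    )
    # override the config for one submodule, addressed by its fully qualified name
    # ("foo.bar" is a placeholder)
    custom_quantizer.set_module_name("foo.bar", get_symmetric_quantization_config())

    # re-capture the floating point model, then prepare/calibrate/convert again
    exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs)
    prepared_model = prepare_pt2e(exported_model, custom_quantizer)
    calibrate(prepared_model, data_loader_test)
    quantized_model = convert_pt2e(prepared_model)
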
- -Save and Load Quantized Model ---------------------------------- - -We'll show how to save and load the quantized model. - - -.. code-block:: python - - # 0. Store reference output, for example, inputs, and check evaluation accuracy: - example_inputs = (next(iter(data_loader))[0],) - ref = quantized_model(*example_inputs) - top1, top5 = evaluate(quantized_model, criterion, data_loader_test) - print("[before serialization] Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg)) - - # 1. Export the model and Save ExportedProgram - pt2e_quantized_model_file_path = saved_model_dir + "resnet18_pt2e_quantized.pth" - # capture the model to get an ExportedProgram - quantized_ep = torch.export.export(quantized_model, example_inputs) - # use torch.export.save to save an ExportedProgram - torch.export.save(quantized_ep, pt2e_quantized_model_file_path) - - - # 2. Load the saved ExportedProgram - loaded_quantized_ep = torch.export.load(pt2e_quantized_model_file_path) - loaded_quantized_model = loaded_quantized_ep.module() - - # 3. Check results for example inputs and check evaluation accuracy again: - res = loaded_quantized_model(*example_inputs) - print("diff:", ref - res) - - top1, top5 = evaluate(loaded_quantized_model, criterion, data_loader_test) - print("[after serialization/deserialization] Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg)) - - -Output: - - -.. code-block:: python - - [before serialization] Evaluation accuracy on test dataset: 79.82, 94.55 - diff: tensor([[0., 0., 0., ..., 0., 0., 0.], - [0., 0., 0., ..., 0., 0., 0.], - [0., 0., 0., ..., 0., 0., 0.], - ..., - [0., 0., 0., ..., 0., 0., 0.], - [0., 0., 0., ..., 0., 0., 0.], - [0., 0., 0., ..., 0., 0., 0.]]) - - [after serialization/deserialization] Evaluation accuracy on test dataset: 79.82, 94.55 - - -Debugging the Quantized Model ------------------------------- - -You can use `Numeric Suite `_ -that can help with debugging in eager mode and FX graph mode. The new version of -Numeric Suite working with PyTorch 2 Export models is still in development. - -Lowering and Performance Evaluation ------------------------------------- - -The model produced at this point is not the final model that runs on the device, -it is a reference quantized model that captures the intended quantized computation -from the user, expressed as ATen operators and some additional quantize/dequantize operators, -to get a model that runs on real devices, we'll need to lower the model. -For example, for the models that run on edge devices, we can lower with delegation and ExecuTorch runtime -operators. - -Conclusion --------------- - -In this tutorial, we went through the overall quantization flow in PyTorch 2 -Export Quantization using ``XNNPACKQuantizer`` and got a quantized model that -could be further lowered to a backend that supports inference with XNNPACK -backend. To use this for your own backend, please first follow the -`tutorial `__ and -implement a ``Quantizer`` for your backend, and then quantize the model with -that ``Quantizer``. 
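
As a concrete illustration of the lowering step described in the Lowering and Performance Evaluation section above, the sketch below compiles the reference quantized model with ``torch.compile`` for server CPU inference, the same path used by the Inductor-based tutorial elsewhere in this repository. This is only one possible lowering path (edge deployment goes through ExecuTorch instead), and as noted in that tutorial you may need to run with ``TORCHINDUCTOR_FREEZING=1``.

.. code-block:: python

    # A sketch: lower the reference quantized model through TorchInductor on CPU.
    # Assumes `quantized_model` and `example_inputs` from the sections above.
    import torch

    with torch.no_grad():
        lowered_model = torch.compile(quantized_model)
        _ = lowered_model(*example_inputs)  # warm-up / sanity check
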
diff --git a/prototype_source/pt2e_quant_qat.rst b/prototype_source/pt2e_quant_qat.rst deleted file mode 100644 index 6d995d368e0..00000000000 --- a/prototype_source/pt2e_quant_qat.rst +++ /dev/null @@ -1,476 +0,0 @@ -(prototype) PyTorch 2 Export Quantization-Aware Training (QAT) -================================================================ -**Author**: `Andrew Or `_ - -This tutorial shows how to perform quantization-aware training (QAT) in -graph mode based on `torch.export.export `_. -For more details about PyTorch 2 Export Quantization in general, refer -to the `post training quantization tutorial `_. - -The PyTorch 2 Export QAT flow looks like the following—it is similar -to the post training quantization (PTQ) flow for the most part: - -.. code:: python - - import torch - from torch._export import capture_pre_autograd_graph - from torch.ao.quantization.quantize_pt2e import ( - prepare_qat_pt2e, - convert_pt2e, - ) - from torch.ao.quantization.quantizer import ( - XNNPACKQuantizer, - get_symmetric_quantization_config, - ) - - class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(5, 10) - - def forward(self, x): - return self.linear(x) - - - example_inputs = (torch.randn(1, 5),) - m = M() - - # Step 1. program capture - # NOTE: this API will be updated to torch.export API in the future, but the captured - # result shoud mostly stay the same - m = capture_pre_autograd_graph(m, *example_inputs) - # we get a model with aten ops - - # Step 2. quantization-aware training - # backend developer will write their own Quantizer and expose methods to allow - # users to express how they want the model to be quantized - quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config()) - m = prepare_qat_pt2e(m, quantizer) - - # train omitted - - m = convert_pt2e(m) - # we have a model with aten ops doing integer computations when possible - - # move the quantized model to eval mode, equivalent to `m.eval()` - torch.ao.quantization.move_exported_model_to_eval(m) - -Note that calling ``model.eval()`` or ``model.train()`` after program capture is -not allowed, because these methods no longer correctly change the behavior of -certain ops like dropout and batch normalization. Instead, please use -``torch.ao.quantization.move_exported_model_to_eval()`` and -``torch.ao.quantization.move_exported_model_to_train()`` (coming soon) -respectively. - - -Define Helper Functions and Prepare the Dataset ------------------------------------------------ - -To run the code in this tutorial using the entire ImageNet dataset, first -download ImageNet by following the instructions in -`ImageNet Data `_. Unzip the downloaded file -into the ``data_path`` folder. - -Next, download the `torchvision resnet18 model `_ -and rename it to ``data/resnet18_pretrained_float.pth``. - -We’ll start by doing the necessary imports, defining some helper functions and -prepare the data. These steps are very similar to the ones defined in the -`static eager mode post training quantization tutorial `_: - -.. 
code:: python - - import os - import sys - import time - import numpy as np - - import torch - import torch.nn as nn - from torch.utils.data import DataLoader - - import torchvision - from torchvision import datasets - from torchvision.models.resnet import resnet18 - import torchvision.transforms as transforms - - # Set up warnings - import warnings - warnings.filterwarnings( - action='ignore', - category=DeprecationWarning, - module=r'.*' - ) - warnings.filterwarnings( - action='default', - module=r'torch.ao.quantization' - ) - - # Specify random seed for repeatable results - _ = torch.manual_seed(191009) - - class AverageMeter(object): - """Computes and stores the average and current value""" - def __init__(self, name, fmt=':f'): - self.name = name - self.fmt = fmt - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' - return fmtstr.format(**self.__dict__) - - def accuracy(output, target, topk=(1,)): - """ - Computes the accuracy over the k top predictions for the specified - values of k. - """ - with torch.no_grad(): - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - def evaluate(model, criterion, data_loader, device): - torch.ao.quantization.move_exported_model_to_eval(model) - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') - cnt = 0 - with torch.no_grad(): - for image, target in data_loader: - image = image.to(device) - target = target.to(device) - output = model(image) - loss = criterion(output, target) - cnt += 1 - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - top1.update(acc1[0], image.size(0)) - top5.update(acc5[0], image.size(0)) - print('') - - return top1, top5 - - def load_model(model_file): - model = resnet18(pretrained=False) - state_dict = torch.load(model_file) - model.load_state_dict(state_dict) - return model - - def print_size_of_model(model): - if isinstance(model, torch.jit.RecursiveScriptModule): - torch.jit.save(model, "temp.p") - else: - torch.jit.save(torch.jit.script(model), "temp.p") - print("Size (MB):", os.path.getsize("temp.p")/1e6) - os.remove("temp.p") - - def prepare_data_loaders(data_path): - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - dataset = torchvision.datasets.ImageNet( - data_path, split="train", transform=transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - dataset_test = torchvision.datasets.ImageNet( - data_path, split="val", transform=transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])) - - train_sampler = torch.utils.data.RandomSampler(dataset) - test_sampler = torch.utils.data.SequentialSampler(dataset_test) - - data_loader = torch.utils.data.DataLoader( - dataset, batch_size=train_batch_size, - sampler=train_sampler) - - data_loader_test = torch.utils.data.DataLoader( - dataset_test, batch_size=eval_batch_size, - sampler=test_sampler) - - return data_loader, data_loader_test 
- - def train_one_epoch(model, criterion, optimizer, data_loader, device, ntrain_batches): - # Note: do not call model.train() here, since this doesn't work on an exported model. - # Instead, call `torch.ao.quantization.move_exported_model_to_train(model)`, which will - # be added in the near future - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') - avgloss = AverageMeter('Loss', '1.5f') - - cnt = 0 - for image, target in data_loader: - start_time = time.time() - print('.', end = '') - cnt += 1 - image, target = image.to(device), target.to(device) - output = model(image) - loss = criterion(output, target) - optimizer.zero_grad() - loss.backward() - optimizer.step() - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - top1.update(acc1[0], image.size(0)) - top5.update(acc5[0], image.size(0)) - avgloss.update(loss, image.size(0)) - if cnt >= ntrain_batches: - print('Loss', avgloss.avg) - - print('Training: * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' - .format(top1=top1, top5=top5)) - return - - print('Full imagenet train set: * Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f}' - .format(top1=top1, top5=top5)) - return - - data_path = '~/.data/imagenet' - saved_model_dir = 'data/' - float_model_file = 'resnet18_pretrained_float.pth' - - train_batch_size = 32 - eval_batch_size = 32 - - data_loader, data_loader_test = prepare_data_loaders(data_path) - example_inputs = (next(iter(data_loader))[0]) - criterion = nn.CrossEntropyLoss() - float_model = load_model(saved_model_dir + float_model_file).to("cuda") - - -Export the model with torch.export ----------------------------------- - -Here is how you can use ``torch.export`` to export the model: - -.. code:: python - - from torch._export import capture_pre_autograd_graph - - example_inputs = (torch.rand(2, 3, 224, 224),) - exported_model = capture_pre_autograd_graph(float_model, example_inputs) - - -.. code:: python - - # or, to capture with dynamic dimensions: - from torch._export import dynamic_dim - - example_inputs = (torch.rand(2, 3, 224, 224),) - exported_model = capture_pre_autograd_graph( - float_model, - example_inputs, - constraints=[dynamic_dim(example_inputs[0], 0)], - ) -.. note:: - - ``capture_pre_autograd_graph`` is a short term API, it will be updated to use the offical ``torch.export`` API when that is ready. - - -Import the Backend Specific Quantizer and Configure how to Quantize the Model ------------------------------------------------------------------------------ - -The following code snippets describe how to quantize the model: - -.. code-block:: python - - from torch.ao.quantization.quantizer.xnnpack_quantizer import ( - XNNPACKQuantizer, - get_symmetric_quantization_config, - ) - quantizer = XNNPACKQuantizer() - quantizer.set_global(get_symmetric_quantization_config(is_qat=True)) - -``Quantizer`` is backend specific, and each ``Quantizer`` will provide their -own way to allow users to configure their model. - -.. note:: - - Check out our - `tutorial `_ - that describes how to write a new ``Quantizer``. - - -Prepare the Model for Quantization-Aware Training ----------------------------------------------------------- - -``prepare_qat_pt2e`` inserts fake quantizes in appropriate places in the model -and performs the appropriate QAT "fusions", such as ``Conv2d`` + ``BatchNorm2d``, -for better training accuracies. The fused operations are represented as a subgraph -of ATen ops in the prepared graph. - -.. 
code-block:: python - - prepared_model = prepare_qat_pt2e(exported_model, quantizer) - print(prepared_model) - -.. note:: - - If your model contains batch normalization, the actual ATen ops you get - in the graph depend on the model's device when you export the model. - If the model is on CPU, then you'll get ``torch.ops.aten._native_batch_norm_legit``. - If the model is on CUDA, then you'll get ``torch.ops.aten.cudnn_batch_norm``. - However, this is not fundamental and may be subject to change in the future. - - Between these two ops, it has been shown that ``torch.ops.aten.cudnn_batch_norm`` - provides better numerics on models like MobileNetV2. To get this op, either - call ``model.cuda()`` before export, or run the following after prepare to manually - swap the ops: - - .. code:: python - - for n in prepared_model.graph.nodes: - if n.target == torch.ops.aten._native_batch_norm_legit.default: - n.target = torch.ops.aten.cudnn_batch_norm.default - prepared_model.recompile() - - In the future, we plan to consolidate the batch normalization ops such that - the above will no longer be necessary. - -Training Loop ------------------------------------------------------------------------------ - -The training loop is similar to the ones in previous versions of QAT. To achieve -better accuracies, you may optionally disable observers and updating batch -normalization statistics after a certain number of epochs, or evaluate the QAT -or the quantized model trained so far every ``N`` epochs. - -.. code:: python - - num_epochs = 10 - num_train_batches = 20 - num_eval_batches = 20 - num_observer_update_epochs = 4 - num_batch_norm_update_epochs = 3 - num_epochs_between_evals = 2 - - # QAT takes time and one needs to train over a few epochs. - # Train and check accuracy after each epoch - for nepoch in range(num_epochs): - train_one_epoch(prepared_model, criterion, optimizer, data_loader, "cuda", num_train_batches) - - # Optionally disable observer/batchnorm stats after certain number of epochs - if epoch >= num_observer_update_epochs: - print("Disabling observer for subseq epochs, epoch = ", epoch) - prepared_model.apply(torch.ao.quantization.disable_observer) - if epoch >= num_batch_norm_update_epochs: - print("Freezing BN for subseq epochs, epoch = ", epoch) - for n in prepared_model.graph.nodes: - # Args: input, weight, bias, running_mean, running_var, training, momentum, eps - # We set the `training` flag to False here to freeze BN stats - if n.target in [ - torch.ops.aten._native_batch_norm_legit.default, - torch.ops.aten.cudnn_batch_norm.default, - ]: - new_args = list(n.args) - new_args[5] = False - n.args = new_args - prepared_model.recompile() - - # Check the quantized accuracy every N epochs - # Note: If you wish to just evaluate the QAT model (not the quantized model), - # then you can just call `torch.ao.quantization.move_exported_model_to_eval/train`. - # However, the latter API is not ready yet and will be available in the near future. 
- if (nepoch + 1) % num_epochs_between_evals == 0: - prepared_model_copy = copy.deepcopy(prepared_model) - quantized_model = convert_pt2e(prepared_model_copy) - top1, top5 = evaluate(quantized_model, criterion, data_loader_test, neval_batches=num_eval_batches) - print('Epoch %d: Evaluation accuracy on %d images, %2.2f' % (nepoch, num_eval_batches * eval_batch_size, top1.avg)) - - -Saving and Loading Model Checkpoints ----------------------------------------------------------- - -Model checkpoints for the PyTorch 2 Export QAT flow are -the same as in any other training flow. They are useful for -pausing training and resuming it later, recovering from -failed training runs, and performing inference on different -machines at a later time. You can save model checkpoints -during or after training as follows: - -.. code:: python - - checkpoint_path = "/path/to/my/checkpoint_%s.pth" % nepoch - torch.save(prepared_model.state_dict(), "checkpoint_path") - -To load the checkpoints, you must export and prepare the -model the exact same way it was initially exported and -prepared. For example: - -.. code:: python - - from torch._export import capture_pre_autograd_graph - from torch.ao.quantization.quantizer.xnnpack_quantizer import ( - XNNPACKQuantizer, - get_symmetric_quantization_config, - ) - from torchvision.models.resnet import resnet18 - - example_inputs = (torch.rand(2, 3, 224, 224),) - float_model = resnet18(pretrained=False) - exported_model = capture_pre_autograd_graph(float_model, example_inputs) - quantizer = XNNPACKQuantizer() - quantizer.set_global(get_symmetric_quantization_config(is_qat=True)) - prepared_model = prepare_qat_pt2e(exported_model, quantizer) - prepared_model.load_state_dict(torch.load(checkpoint_path)) - - # resume training or perform inference - - -Convert the Trained Model to a Quantized Model ----------------------------------------------------------- - -``convert_pt2e`` takes a calibrated model and produces a quantized model. -Note that, before inference, you must first call -``torch.ao.quantization.move_exported_model_to_eval()`` to ensure certain ops -like dropout behave correctly in the eval graph. Otherwise, we would continue -to incorrectly apply dropout in the forward pass during inference, for example. - -.. code-block:: python - - quantized_model = convert_pt2e(prepared_model) - - # move certain ops like dropout to eval mode, equivalent to `m.eval()` - torch.ao.quantization.move_exported_model_to_eval(m) - - print(quantized_model) - - top1, top5 = evaluate(quantized_model, criterion, data_loader_test, neval_batches=num_eval_batches) - print('Final evaluation accuracy on %d images, %2.2f' % (num_eval_batches * eval_batch_size, top1.avg)) - -.. TODO: add results here - - -Conclusion --------------- - -In this tutorial, we demonstrated how to run Quantization-Aware Training (QAT) -flow in PyTorch 2 Export Quantization. After convert, the rest of the flow -is the same as Post-Training Quantization (PTQ); the user can -serialize/deserialize the model and further lower it to a backend that supports -inference with XNNPACK backend. For more detail, follow the -`PTQ tutorial `_. 
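
The conclusion above notes that the converted model can be serialized and deserialized like any other exported model. Below is a minimal sketch that mirrors the save/load steps from the PTQ tutorial, assuming ``quantized_model`` and a tuple of ``example_inputs`` from the sections above; the file path is hypothetical.

.. code-block:: python

    # A sketch: save and reload the QAT-converted model as an ExportedProgram.
    import torch

    qat_model_path = "data/resnet18_pt2e_qat_quantized.pth"  # hypothetical path

    quantized_ep = torch.export.export(quantized_model, example_inputs)
    torch.export.save(quantized_ep, qat_model_path)

    loaded_quantized_model = torch.export.load(qat_model_path).module()
    _ = loaded_quantized_model(*example_inputs)  # sanity check after reload
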
diff --git a/prototype_source/pt2e_quant_x86_inductor.rst b/prototype_source/pt2e_quant_x86_inductor.rst deleted file mode 100644 index f9836d6e371..00000000000 --- a/prototype_source/pt2e_quant_x86_inductor.rst +++ /dev/null @@ -1,313 +0,0 @@ -PyTorch 2 Export Quantization with X86 Backend through Inductor -================================================================== - -**Author**: `Leslie Fang `_, `Weiwen Xia `_, `Jiong Gong `_, `Jerry Zhang `_ - -Prerequisites ---------------- - -- `PyTorch 2 Export Post Training Quantization `_ -- `PyTorch 2 Export Quantization-Aware Training `_ -- `TorchInductor and torch.compile concepts in PyTorch `_ -- `Inductor C++ Wrapper concepts `_ - -Introduction --------------- - -This tutorial introduces the steps for utilizing the PyTorch 2 Export Quantization flow to generate a quantized model customized -for the x86 inductor backend and explains how to lower the quantized model into the inductor. - -The pytorch 2 export quantization flow uses the torch.export to capture the model into a graph and perform quantization transformations on top of the ATen graph. -This approach is expected to have significantly higher model coverage, better programmability, and a simplified UX. -TorchInductor is the new compiler backend that compiles the FX Graphs generated by TorchDynamo into optimized C++/Triton kernels. - -This flow of quantization 2 with Inductor supports both static and dynamic quantization. Static quantization works best for CNN models, like ResNet-50. And dynamic quantization is more suitable for NLP models, like RNN and BERT. -For the difference between the two quantization types, please refer to the `following page `__. - -The quantization flow mainly includes three steps: - -- Step 1: Capture the FX Graph from the eager Model based on the `torch export mechanism `_. -- Step 2: Apply the Quantization flow based on the captured FX Graph, including defining the backend-specific quantizer, generating the prepared model with observers, - performing the prepared model's calibration or quantization-aware training, and converting the prepared model into the quantized model. -- Step 3: Lower the quantized model into inductor with the API ``torch.compile``. - -The high-level architecture of this flow could look like this: - -:: - - float_model(Python) Example Input - \ / - \ / - —-------------------------------------------------------- - | export | - —-------------------------------------------------------- - | - FX Graph in ATen - | X86InductorQuantizer - | / - —-------------------------------------------------------- - | prepare_pt2e | - | | | - | Calibrate/Train | - | | | - | convert_pt2e | - —-------------------------------------------------------- - | - Quantized Model - | - —-------------------------------------------------------- - | Lower into Inductor | - —-------------------------------------------------------- - | - Inductor - -Combining Quantization in PyTorch 2 Export and TorchInductor, we have flexibility and productivity with the new Quantization frontend -and outstanding out-of-box performance with the compiler backend. Especially on Intel fourth generation (SPR) Xeon processors which can -further boost the models' performance by leveraging the -`advanced-matrix-extensions `_ feature. - -Post Training Quantization ----------------------------- - -Now, we will walk you through a step-by-step tutorial for how to use it with `torchvision resnet18 model `_ -for post training quantization. - -1. 
Capture FX Graph -^^^^^^^^^^^^^^^^^^^^^ - -We will start by performing the necessary imports, capturing the FX Graph from the eager module. - -:: - - import torch - import torchvision.models as models - import copy - from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e - import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq - from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer - from torch._export import capture_pre_autograd_graph - - # Create the Eager Model - model_name = "resnet18" - model = models.__dict__[model_name](pretrained=True) - - # Set the model to eval mode - model = model.eval() - - # Create the data, using the dummy data here as an example - traced_bs = 50 - x = torch.randn(traced_bs, 3, 224, 224).contiguous(memory_format=torch.channels_last) - example_inputs = (x,) - - # Capture the FX Graph to be quantized - with torch.no_grad(): - # if you are using the PyTorch nightlies or building from source with the pytorch master, - # use the API of `capture_pre_autograd_graph` - # Note 1: `capture_pre_autograd_graph` is also a short-term API, it will be updated to use the official `torch.export` API when that is ready. - exported_model = capture_pre_autograd_graph( - model, - example_inputs - ) - # Note 2: if you are using the PyTorch 2.1 release binary or building from source with the PyTorch 2.1 release branch, - # please use the API of `torch._dynamo.export` to capture the FX Graph. - # exported_model, guards = torch._dynamo.export( - # model, - # *copy.deepcopy(example_inputs), - # aten_graph=True, - # ) - - -Next, we will have the FX Module to be quantized. - -2. Apply Quantization -^^^^^^^^^^^^^^^^^^^^^^^ - -After we capture the FX Module to be quantized, we will import the Backend Quantizer for X86 CPU and configure how to -quantize the model. - -:: - - quantizer = X86InductorQuantizer() - quantizer.set_global(xiq.get_default_x86_inductor_quantization_config()) - -.. note:: - - The default quantization configuration in ``X86InductorQuantizer`` uses 8-bits for both activations and weights. - When Vector Neural Network Instruction is not available, the oneDNN backend silently chooses kernels that assume - `multiplications are 7-bit x 8-bit `_. In other words, potential - numeric saturation and accuracy issue may happen when running on CPU without Vector Neural Network Instruction. - -The quantization config is for static quantization by default. To apply dynamic quantization, add an argument ``is_dynamic=True`` when getting the config. - -.. code-block:: python - - quantizer = X86InductorQuantizer() - quantizer.set_global(xiq.get_default_x86_inductor_quantization_config(is_dynamic=True)) - - -After we import the backend-specific Quantizer, we will prepare the model for post-training quantization. -``prepare_pt2e`` folds BatchNorm operators into preceding Conv2d operators, and inserts observers in appropriate places in the model. - -:: - - prepared_model = prepare_pt2e(exported_model, quantizer) - -Now, we will calibrate the ``prepared_model`` after the observers are inserted in the model. This step is needed for static quantization only. 
- -:: - - # We use the dummy data as an example here - prepared_model(*example_inputs) - - # Alternatively: user can define the dataset to calibrate - # def calibrate(model, data_loader): - # model.eval() - # with torch.no_grad(): - # for image, target in data_loader: - # model(image) - # calibrate(prepared_model, data_loader_test) # run calibration on sample data - -Finally, we will convert the calibrated Model to a quantized Model. ``convert_pt2e`` takes a calibrated model and produces a quantized model. - -:: - - converted_model = convert_pt2e(prepared_model) - -After these steps, we finished running the quantization flow and we will get the quantized model. - - -3. Lower into Inductor -^^^^^^^^^^^^^^^^^^^^^^^^ - -After we get the quantized model, we will further lower it to the inductor backend. The default Inductor wrapper -generates Python code to invoke both generated kernels and external kernels. Additionally, Inductor supports -C++ wrapper that generates pure C++ code. This allows seamless integration of the generated and external kernels, -effectively reducing Python overhead. In the future, leveraging the C++ wrapper, we can extend the capability -to achieve pure C++ deployment. For more comprehensive details about C++ Wrapper in general, please refer to the -dedicated tutorial on `Inductor C++ Wrapper Tutorial `_. - -:: - - # Optional: using the C++ wrapper instead of default Python wrapper - import torch._inductor.config as config - config.cpp_wrapper = True - -:: - - with torch.no_grad(): - optimized_model = torch.compile(converted_model) - - # Running some benchmark - optimized_model(*example_inputs) - -In a more advanced scenario, int8-mixed-bf16 quantization comes into play. In this instance, -a Convolution or GEMM operator produces BFloat16 output data type instead of Float32 in the absence -of a subsequent quantization node. Subsequently, the BFloat16 tensor seamlessly propagates through -subsequent pointwise operators, effectively minimizing memory usage and potentially enhancing performance. -The utilization of this feature mirrors that of regular BFloat16 Autocast, as simple as wrapping the -script within the BFloat16 Autocast context. - -:: - - with torch.autocast(device_type="cpu", dtype=torch.bfloat16, enabled=True), torch.no_grad(): - # Turn on Autocast to use int8-mixed-bf16 quantization. After lowering into Inductor CPP Backend, - # For operators such as QConvolution and QLinear: - # * The input data type is consistently defined as int8, attributable to the presence of a pair - of quantization and dequantization nodes inserted at the input. - # * The computation precision remains at int8. - # * The output data type may vary, being either int8 or BFloat16, contingent on the presence - # of a pair of quantization and dequantization nodes at the output. - # For non-quantizable pointwise operators, the data type will be inherited from the previous node, - # potentially resulting in a data type of BFloat16 in this scenario. - # For quantizable pointwise operators such as QMaxpool2D, it continues to operate with the int8 - # data type for both input and output. - optimized_model = torch.compile(converted_model) - - # Running some benchmark - optimized_model(*example_inputs) - -Put all these codes together, we will have the toy example code. -Please note that since the Inductor ``freeze`` feature does not turn on by default yet, run your example code with ``TORCHINDUCTOR_FREEZING=1``. 
- -For example: - -:: - - TORCHINDUCTOR_FREEZING=1 python example_x86inductorquantizer_pytorch_2_1.py - -With PyTorch 2.1 release, all CNN models from TorchBench test suite have been measured and proven effective comparing with Inductor FP32 inference path. Please refer -to `this document `_ -for detail benchmark number. - -Quantization Aware Training ------------------------------ - -The PyTorch 2 Export Quantization-Aware Training (QAT) is now supported on X86 CPU using X86InductorQuantizer, -followed by the subsequent lowering of the quantized model into Inductor. -For a more in-depth understanding of PT2 Export Quantization-Aware Training, -we recommend referring to the dedicated `PyTorch 2 Export Quantization-Aware Training `_. - -The PyTorch 2 Export QAT flow is largely similar to the PTQ flow: - -.. code:: python - - import torch - from torch._export import capture_pre_autograd_graph - from torch.ao.quantization.quantize_pt2e import ( - prepare_qat_pt2e, - convert_pt2e, - ) - import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq - from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer - - class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(1024, 1000) - - def forward(self, x): - return self.linear(x) - - example_inputs = (torch.randn(1, 1024),) - m = M() - - # Step 1. program capture - # NOTE: this API will be updated to torch.export API in the future, but the captured - # result shoud mostly stay the same - exported_model = capture_pre_autograd_graph(m, example_inputs) - # we get a model with aten ops - - # Step 2. quantization-aware training - # Use Backend Quantizer for X86 CPU - # To apply dynamic quantization, add an argument ``is_dynamic=True`` when getting the config. - quantizer = X86InductorQuantizer() - quantizer.set_global(xiq.get_default_x86_inductor_quantization_config(is_qat=True)) - prepared_model = prepare_qat_pt2e(exported_model, quantizer) - - # train omitted - - converted_model = convert_pt2e(prepared_model) - # we have a model with aten ops doing integer computations when possible - - # move the quantized model to eval mode, equivalent to `m.eval()` - torch.ao.quantization.move_exported_model_to_eval(converted_model) - - # Lower the model into Inductor - with torch.no_grad(): - optimized_model = torch.compile(converted_model) - _ = optimized_model(*example_inputs) - -Please note that the Inductor ``freeze`` feature is not enabled by default. -To use this feature, you need to run example code with ``TORCHINDUCTOR_FREEZING=1``. - -For example: - -:: - - TORCHINDUCTOR_FREEZING=1 python example_x86inductorquantizer_qat.py - -Conclusion ------------- - -With this tutorial, we introduce how to use Inductor with X86 CPU in PyTorch 2 Quantization. Users can learn about -how to use ``X86InductorQuantizer`` to quantize a model and lower it into the inductor with X86 CPU devices. 
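
The tutorial above only runs the optimized model a few times as a benchmark. If you want a rough latency comparison against the eager FP32 model, a simple wall-clock loop such as the following can be used. This is a sketch that assumes ``model``, ``optimized_model``, and ``example_inputs`` from the earlier sections; the measured numbers depend entirely on your hardware and are not reproduced here.

.. code-block:: python

    # A rough timing sketch; remember to run with TORCHINDUCTOR_FREEZING=1 as noted above.
    import time
    import torch

    def benchmark(fn, inputs, warmup=5, iters=20):
        with torch.no_grad():
            for _ in range(warmup):
                fn(*inputs)
            start = time.perf_counter()
            for _ in range(iters):
                fn(*inputs)
        return (time.perf_counter() - start) / iters

    fp32_latency = benchmark(model, example_inputs)
    int8_latency = benchmark(optimized_model, example_inputs)
    print(f"eager fp32: {fp32_latency:.4f} s/iter, inductor int8: {int8_latency:.4f} s/iter")
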
diff --git a/prototype_source/pt2e_quantizer.rst b/prototype_source/pt2e_quantizer.rst deleted file mode 100644 index df666b1f6af..00000000000 --- a/prototype_source/pt2e_quantizer.rst +++ /dev/null @@ -1,381 +0,0 @@ -How to Write a ``Quantizer`` for PyTorch 2 Export Quantization -================================================================ - -**Author**: `Leslie Fang `_, `Weiwen Xia `__, `Jiong Gong `__, `Kimish Patel `__, `Jerry Zhang `__ - -Prerequisites: -^^^^^^^^^^^^^^^^ - -Required: - -- `Torchdynamo concepts in PyTorch `__ - -- `Quantization concepts in PyTorch `__ - -- `(prototype) PyTorch 2 Export Post Training Quantization `__ - -Optional: - -- `FX Graph Mode post training static quantization `__ - -- `BackendConfig in PyTorch Quantization FX Graph Mode `__ - -- `QConfig and QConfigMapping in PyTorch Quantization FX Graph Mode `__ - -Introduction -^^^^^^^^^^^^^ - -`(prototype) PyTorch 2 Export Post Training Quantization `__ introduced the overall API for pytorch 2 export quantization, main difference from fx graph mode quantization in terms of API is that we made it explicit that quantiation is targeting a specific backend. So to use the new flow, backend need to implement a ``Quantizer`` class that encodes: -(1). What is supported quantized operator or patterns in the backend -(2). How can users express the way they want their floating point model to be quantized, for example, quantized the whole model to be int8 symmetric quantization, or quantize only linear layers etc. - -Please see `here `__ For motivations for the new API and ``Quantizer``. - -An existing quantizer object defined for ``XNNPACK`` is in -`QNNPackQuantizer `__ - -Annotation API -^^^^^^^^^^^^^^^^^^^ - -``Quantizer`` uses annotation API to convey quantization intent for different operators/patterns. -Annotation API mainly consists of -`QuantizationSpec `__ -and -`QuantizationAnnotation `__. - -``QuantizationSpec`` is used to convey intent of how a tensor will be quantized, -e.g. dtype, bitwidth, min, max values, symmetric vs. asymmetric etc. -Furthermore, ``QuantizationSpec`` also allows quantizer to specify how a -tensor value should be observed, e.g. ``MinMaxObserver``, or ``HistogramObserver`` -, or some customized observer. - -``QuantizationAnnotation`` composed of ``QuantizationSpec`` objects is used to annotate input tensors -and output tensor of a pattern. Annotating input tensors is equivalent of annotating input edges, -while annotating output tensor is equivalent of annotating node. ``QuantizationAnnotation`` is a ``dataclass`` -with several fields: - -- ``input_qspec_map`` field is of class ``Dict`` to map each input tensor (as input edge) to a ``QuantizationSpec``. -- ``output_qspec`` field expresses the ``QuantizationSpec`` used to annotate the output tensor; -- ``_annotated`` field indicates if this node has already been annotated by quantizer. - -To conclude, annotation API requires quantizer to annotate edges (input tensors) or -nodes (output tensor) of the graph. Now, we will have a step-by-step tutorial for -how to use the annotation API with different types of ``QuantizationSpec``. - -1. Annotate Common Operator Patterns --------------------------------------------------------- - -In order to use the quantized pattern/operators, e.g. ``quantized add``, -backend developers will have intent to quantize (as expressed by ``QuantizationSpec``) -inputs, output of the pattern. 
Following is an example flow (take ``add`` operator as example) -of how this intent is conveyed in the quantization workflow with annotation API. - -- Step 1: Identify the original floating point pattern in the FX graph. There are - several ways to identify this pattern: Quantizer may use a pattern matcher - to match the operator pattern; Quantizer may go through the nodes from start to the end and compare - the node's target type to match the operator pattern. In this example, we can use the - `get_source_partitions `__ - to match this pattern. The original floating point ``add`` pattern only contain a single ``add`` node. - -:: - - add_partitions = get_source_partitions(gm.graph, [operator.add, torch.add]) - add_partitions = list(itertools.chain(*add_partitions.values())) - for add_partition in add_partitions: - add_node = add_partition.output_nodes[0] - -- Step 2: Define the ``QuantizationSpec`` for inputs and output of the pattern. ``QuantizationSpec`` - defines the ``data type``, ``qscheme``, and other quantization parameters about users' intent of - how to observe or fake quantize a tensor. - -:: - - act_quantization_spec = QuantizationSpec( - dtype=torch.int8, - quant_min=-128, - quant_max=127, - qscheme=torch.per_tensor_affine, - is_dynamic=False, - observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12), - ) - - input_act_qspec = act_quantization_spec - output_act_qspec = act_quantization_spec - -- Step 3: Annotate the inputs and output of the pattern with ``QuantizationAnnotation``. - In this example, we will create the ``QuantizationAnnotation`` object with the ``QuantizationSpec`` - created in above step 2 for two inputs and one output of the ``add`` node. - -:: - - input_qspec_map = {} - input_act0 = add_node.args[0] - input_qspec_map[input_act0] = input_act_qspec - - input_act1 = add_node.args[1] - input_qspec_map[input_act1] = input_act_qspec - - add_node.meta["quantization_annotation"] = QuantizationAnnotation( - input_qspec_map=input_qspec_map, - output_qspec=output_act_qspec, - _annotated=True, - ) - -After we annotate the ``add`` node like this, in the following up quantization flow, ``HistogramObserver`` will -be inserted at its two input nodes and one output node in prepare phase. And ``HistogramObserver`` will be substituted with -``quantize`` node and ``dequantize`` node in the convert phase. - -2. Annotate Operators that Shares Quantization Params --------------------------------------------------------- - -It is natural that users want to annotate a quantized model where quantization -parameters can be shared among some tensors explicitly. Two typical use cases are: - -- Example 1: One example is for ``add`` where having both inputs sharing quantization - parameters makes operator implementation much easier. Without using of - `SharedQuantizationSpec `__, - we must annotate ``add`` as example in above section 1, in which two inputs of ``add`` - has different quantization parameters. -- Example 2: Another example is that of sharing quantization parameters between inputs and output. - This typically results from operators such as ``maxpool``, ``average_pool``, ``concat`` etc. - -``SharedQuantizationSpec`` is designed for this use case to annotate tensors whose quantization -parameters are shared with other tensors. Input of ``SharedQuantizationSpec`` is an ``EdgeOrNode`` object which -can be an input edge or an output value. - -.. 
note:: - - * Sharing is transitive - - Some tensors might be effectively using shared quantization spec due to: - - * Two nodes/edges are configured to use ``SharedQuantizationSpec``. - * There is existing sharing of some nodes. - - For example, let's say we have two ``conv`` nodes ``conv1`` and ``conv2``, and both of them are fed into a ``cat`` - node: ``cat([conv1_out, conv2_out], ...)``. Let's say the output of ``conv1``, ``conv2``, and the first input of ``cat`` are configured - with the same configurations of ``QuantizationSpec``. The second input of ``cat`` is configured to use ``SharedQuantizationSpec`` - with the first input. - - .. code-block:: - - conv1_out: qspec1(dtype=torch.int8, ...) - conv2_out: qspec1(dtype=torch.int8, ...) - cat_input0: qspec1(dtype=torch.int8, ...) - cat_input1: SharedQuantizationSpec((conv1, cat)) # conv1 node is the first input of cat - - First of all, the output of ``conv1`` is implicitly sharing quantization parameters (and observer object) - with the first input of ``cat``, and the same is true for the output of ``conv2`` and the second input of ``cat``. - Therefore, since the user configures the two inputs of ``cat`` to share quantization parameters, by transitivity, - ``conv2_out`` and ``conv1_out`` will also be sharing quantization parameters. In the observed graph, you - will see the following: - - .. code-block:: - - conv1 -> obs -> cat - conv2 -> obs / - - and both ``obs`` will be the same observer instance. - - -- Input edge is the connection between input node and the node consuming the input, - so it's a ``Tuple[Node, Node]``. -- Output value is an FX ``Node``. - -Now, if we want to rewrite ``add`` annotation example with ``SharedQuantizationSpec`` to indicate -two input tensors as sharing quantization parameters. We can define its ``QuantizationAnnotation`` -as this: - -- Step 1: Identify the original floating point pattern in the FX graph. We can use the same - methods introduced in ``QuantizationSpec`` example to identify the ``add`` pattern. -- Step 2: Annotate input_act0 of ``add`` with ``QuantizationSpec``. -- Step 3: Create a ``SharedQuantizationSpec`` object with input edge defined as ``(input_act0, add_node)`` which means to - share the observer used for this edge. Then, user can annotate input_act1 with this ``SharedQuantizationSpec`` - object. - -:: - - input_qspec_map = {} - share_qparams_with_input_act0_qspec = SharedQuantizationSpec((input_act0, add_node)) - input_qspec_map = {input_act0: act_quantization_spec, input_act1: share_qparams_with_input_act0_qspec} - - add_node.meta["quantization_annotation"] = QuantizationAnnotation( - input_qspec_map=input_qspec_map, - output_qspec=act_quantization_spec, - _annotated=True, - ) - -3. Annotate Operators with Fixed Quantization Parameters ---------------------------------------------------------- - -Another typical use case to annotate a quantized model is for tensors whose -quantization parameters are known beforehand. For example, operator like ``sigmoid``, which has -predefined and fixed scale/zero_point at input and output tensors. -`FixedQParamsQuantizationSpec `__ -is designed for this use case. To use ``FixedQParamsQuantizationSpec``, users need to pass in parameters -of ``scale`` and ``zero_point`` explicitly. - -- Step 1: Identify the original floating point pattern in the FX graph. We can use the same - methods introduced in ``QuantizationSpec`` example to identify the ``sigmoid`` pattern. 
-- Step 2: Create a ``FixedQParamsQuantizationSpec`` object with fixed ``scale`` and ``zero_point`` values as inputs.
-  These values will be used to create the ``quantize`` and ``dequantize`` nodes in the convert phase.
-- Step 3: Annotate the inputs and output to use this ``FixedQParamsQuantizationSpec`` object.
-
-::
-
-    act_qspec = FixedQParamsQuantizationSpec(
-        dtype=torch.uint8,
-        quant_min=0,
-        quant_max=255,
-        qscheme=torch.per_tensor_affine,
-        scale=1.0 / 256.0,
-        zero_point=0,
-    )
-    sigmoid_node.meta["quantization_annotation"] = QuantizationAnnotation(
-        input_qspec_map={input_act: act_qspec},
-        output_qspec=act_qspec,
-        _annotated=True,
-    )
-
-4. Annotate Tensors with Derived Quantization Parameters
----------------------------------------------------------------
-
-Another use case is to define the constraint for tensors whose quantization parameters are derived from other tensors.
-For example, we may want to annotate a convolution node and define the ``scale`` of its bias input tensor
-as the product of the activation tensor's ``scale`` and the weight tensor's ``scale``. We can use
-`DerivedQuantizationSpec `__
-to annotate this conv node.
-
-- Step 1: Identify the original floating point pattern in the FX graph. We can use the same
-  methods introduced in the ``QuantizationSpec`` example to identify the ``convolution`` pattern.
-- Step 2: Define a ``derive_qparams_fn`` function, which accepts a list of ``ObserverOrFakeQuantize`` (
-  `ObserverBase `__
-  or `FakeQuantizeBase `__)
-  objects as input. From each ``ObserverOrFakeQuantize`` object, the user can get the ``scale`` and ``zero point`` values.
-  The user can define their own heuristic for deriving new ``scale`` and ``zero point`` values based on the
-  quantization parameters calculated from the observer or fake quant instances.
-- Step 3: Define a ``DerivedQuantizationSpec`` object. Its inputs are: a list of ``EdgeOrNode`` objects
-  (the observer corresponding to each ``EdgeOrNode`` object will be passed into the ``derive_qparams_fn`` function);
-  the ``derive_qparams_fn`` function; and several other quantization parameters such as ``dtype`` and ``qscheme``.
-- Step 4: Annotate the inputs and output of this conv node with ``QuantizationAnnotation``.
-
-::
-
-    def derive_qparams_fn(obs_or_fqs: List[ObserverOrFakeQuantize]) -> Tuple[Tensor, Tensor]:
-        assert len(obs_or_fqs) == 2, \
-            "Expecting two obs/fqs, one for activation and one for weight, got: {}".format(len(obs_or_fqs))
-        act_obs_or_fq = obs_or_fqs[0]
-        weight_obs_or_fq = obs_or_fqs[1]
-        act_scale, act_zp = act_obs_or_fq.calculate_qparams()
-        weight_scale, weight_zp = weight_obs_or_fq.calculate_qparams()
-        return torch.tensor([act_scale * weight_scale]).to(torch.float32), torch.tensor([0]).to(torch.int32)
-
-    bias_qspec = DerivedQuantizationSpec(
-        derived_from=[(input_act, node), (weight, node)],
-        derive_qparams_fn=derive_qparams_fn,
-        dtype=torch.int32,
-        quant_min=-2**31,
-        quant_max=2**31 - 1,
-        qscheme=torch.per_tensor_symmetric,
-    )
-    input_qspec_map = {input_act: act_quantization_spec, weight: weight_quantization_spec, bias: bias_qspec}
-    node.meta["quantization_annotation"] = QuantizationAnnotation(
-        input_qspec_map=input_qspec_map,
-        output_qspec=act_quantization_spec,
-        _annotated=True,
-    )
-
-5. A Toy Example with Resnet18
---------------------------------------------------------
-
-With the above annotation methods defined with the ``QuantizationAnnotation`` API, we can now put them together to construct a ``BackendQuantizer``
-and run a `toy example `__
-with ``Torchvision Resnet18``.
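Before looking at the full example, here is a minimal sketch of how such a quantizer class can be organized. The class name and the ``_annotate_conv2d`` / ``_annotate_add`` helpers below are hypothetical placeholders rather than the code of the linked example; the essential contract is that a ``Quantizer`` subclass (importable from ``torch.ao.quantization.quantizer`` in recent PyTorch releases) implements ``annotate``, which attaches ``QuantizationAnnotation`` objects as shown in the sections above, and ``validate``.

::

    import torch
    from torch.ao.quantization.quantizer import Quantizer

    class BackendQuantizer(Quantizer):
        # Illustrative skeleton only; see the linked toy example for a complete version.

        def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
            # One helper per supported pattern; each helper matches its pattern
            # and attaches a QuantizationAnnotation to the matched nodes.
            self._annotate_conv2d(model)
            self._annotate_add(model)
            return model

        def validate(self, model: torch.fx.GraphModule) -> None:
            # Optionally check that the annotated model only contains patterns
            # the backend can lower; left empty in this sketch.
            pass

        def _annotate_conv2d(self, model):
            ...  # e.g. annotate conv nodes with QuantizationSpec, as in section 1

        def _annotate_add(self, model):
            ...  # e.g. annotate add inputs with SharedQuantizationSpec, as in section 2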
To better understand the final example, here are the classes and utility
-functions that are used in the example:
-
-- `QuantizationConfig `__
-  consists of ``QuantizationSpec`` for activation, weight, and bias separately.
-- When annotating the model,
-  `get_input_act_qspec `__,
-  `get_output_act_qspec `__,
-  `get_weight_qspec `__, and
-  `get_bias_qspec `__
-  can be used to get the ``QuantizationSpec`` from ``QuantizationConfig`` for a specific pattern.
-
-A Note on IR for PT2E Quantization Flow
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-IR means the intermediate representation of the model, for example, ``torch`` IR (``torch.nn`` modules, ``torch.nn.functional`` ops) or ``aten`` IR (``torch.ops.aten.linear``, ...). The PT2E quantization flow uses pre-autograd ``aten`` IR (the output of the `torch.export` API) so that training is supported. As shown before, we need to match the operator or operator patterns before we can attach annotations on them, so the question is: how do we match the pattern?
-
-Motivation: Problem of Matching ``aten`` IR directly
--------------------------------------------------------
-
-The most straightforward way might be matching ``aten`` IR directly.
-
-Example::
-
-    for n in gm.graph.nodes:
-        if n.op != "call_function" or n.target not in [
-            torch.ops.aten.relu.default,
-            torch.ops.aten.relu_.default,
-        ]:
-            continue
-        relu_node = n
-        maybe_conv_node = n.args[0]
-        if (
-            not isinstance(maybe_conv_node, Node)
-            or maybe_conv_node.op != "call_function"
-            or maybe_conv_node.target
-            not in [
-                torch.ops.aten.conv1d.default,
-                torch.ops.aten.conv2d.default,
-            ]
-        ):
-            continue
-
-        # annotate conv and relu nodes
-        ...
-
-However, one problem with using this IR is that the representation might change if the PyTorch implementation of modules or functional ops changes. This could be unexpected, since modeling users typically assume that when the eager mode model code doesn't change, they should get the same model representation after program capture as well. One concrete effect of this problem is that if a ``Quantizer`` does annotations based on recognizing ``aten`` IR patterns, it may fail to recognize the pattern after a PyTorch version update, and the same eager mode floating point model may be left unquantized.
-
-Recommendation: Use ``SubgraphMatcherWithNameNodeMap`` for pattern matching
------------------------------------------------------------------------------
-Because of this, we recommend recognizing the pattern through ``SubgraphMatcherWithNameNodeMap`` (an improved version of ``SubgraphMatcher`` that makes it easier to query the nodes that people want to annotate), by capturing a ``torch`` IR pattern (with the same program capture used for capturing the floating point model), instead of using the ``aten`` IR pattern directly.
-
-Example::
-
-    def conv_relu_pattern(input, weight, bias):
-        conv = torch.nn.functional.conv2d(input, weight, bias)
-        output = torch.nn.functional.relu(conv)
-        # returns an additional dict that includes a map from name to node that we want to annotate
-        return output, {"input": input, "weight": weight, "bias": bias, "output": output}
-
-    matcher = SubgraphMatcherWithNameNodeMap(conv_relu_pattern)
-    matches = matcher.match(model)
-    for match in matches:
-        # find input and output of the pattern
-        # annotate the nodes
-        name_node_map = match.name_node_map
-        input_node = name_node_map["input"]
-        weight_node = name_node_map["weight"]
-        bias_node = name_node_map["bias"]
-        output_node = name_node_map["output"]
-        input_node.users[0].meta["quantization_annotation"] = ...
-        weight_node.users[0].meta["quantization_annotation"] = ...
-        bias_node.users[0].meta["quantization_annotation"] = ...
-        output_node.meta["quantization_annotation"] = ...
-
-With this, the ``Quantizer`` will still be valid even when the implementation for nn modules and functionals changes: the ``aten`` IR for the floating point model will change, but since we capture the pattern again instead of hardcoding the ``aten`` IR for the pattern, we'll get the updated ``aten`` IR as well and will still be able to match the pattern.
-
-One caveat is that if inputs of the pattern have multiple users, we don't have a good way to identify which user node we want to annotate except for checking the aten op target.
-
-Another caveat is that we need to make sure we have an exhaustive list of examples (e.g. 2D, 3D, 4D inputs, real vs. symbolic inputs, training=True vs. training=False, etc.) for the pattern to make sure we cover the different possible ``aten`` IR outcomes captured from the ``torch`` IR pattern.
-
-Note: We may provide some (pattern, list of example_inputs) pairs or some pre-generated matcher objects so people can just use them directly in the future.
-
-Conclusion
-^^^^^^^^^^^^^^^^^^^
-
-With this tutorial, we introduced the new quantization path in PyTorch 2. Users can learn about
-how to define a ``BackendQuantizer`` with the ``QuantizationAnnotation API`` and integrate it into the PyTorch 2 Export Quantization flow.
-Examples of ``QuantizationSpec``, ``SharedQuantizationSpec``, ``FixedQParamsQuantizationSpec``, and ``DerivedQuantizationSpec``
-are given for specific annotation use cases. You can use `XNNPACKQuantizer `_ as an example to start implementing your own ``Quantizer``. After that please follow `this tutorial `_ to actually quantize your model.
diff --git a/prototype_source/semi_structured_sparse.rst b/prototype_source/semi_structured_sparse.rst
deleted file mode 100644
index c7b82fd43cd..00000000000
--- a/prototype_source/semi_structured_sparse.rst
+++ /dev/null
@@ -1,537 +0,0 @@
-(prototype) Accelerating BERT with semi-structured (2:4) sparsity
-=================================================================
-**Author**: `Jesse Cai `_
-
-Like other forms of sparsity, **semi-structured sparsity** is a model optimization technique that seeks to reduce the memory overhead and latency of a neural network at the expense of some model accuracy.
-It is also known as **fine-grained structured sparsity** or **2:4 structured sparsity**.
-
-Semi-structured sparsity derives its name from its unique sparsity pattern, where n out of every 2n elements are pruned.
We most often see n=2, hence 2:4 sparsity -Semi-structured sparsity is particularly interesting because it can be efficiently accelerated on GPUs and doesn't degrade model accuracy as much as other sparsity patterns. - -With the introduction of `semi-structured sparsity support `_, it is possible to prune and accelerate a semi-structured sparse model without leaving PyTorch. -We will explain this process in this tutorial. - -.. image:: ../../_static/img/pruning_flow.jpg - -By the end of this tutorial, we will have sparsified a BERT question-answering model to be 2:4 sparse, fine-tuning it to recover nearly all F1 loss (86.92 dense vs 86.48 sparse). -Finally, we will accelerate this 2:4 sparse model for inference, yielding a 1.3x speedup. - -Requirements --------------- - -* PyTorch >= 2.1. -* A NVIDIA GPU with semi-structured sparsity support (Compute Capability 8.0+). - -.. note:: - - This tutorial is designed for beginners to semi-structured sparsity / sparsity in general. - For users with existing 2:4 sparse models, accelerating ``nn.Linear`` layers for inference with ``to_sparse_semi_structured`` is as easy as: - - .. code:: python - - import torch - from torch.sparse import to_sparse_semi_structured, SparseSemiStructuredTensor - from torch.utils.benchmark import Timer - SparseSemiStructuredTensor._FORCE_CUTLASS = True - - # mask Linear weight to be 2:4 sparse - mask = torch.Tensor([0, 0, 1, 1]).tile((3072, 2560)).cuda().bool() - linear = torch.nn.Linear(10240, 3072).half().cuda().eval() - linear.weight = torch.nn.Parameter(mask * linear.weight) - - x = torch.rand(3072, 10240).half().cuda() - - with torch.inference_mode(): - dense_output = linear(x) - dense_t = Timer(stmt="linear(x)", - globals={"linear": linear, - "x": x}).blocked_autorange().median * 1e3 - - # accelerate via SparseSemiStructuredTensor - linear.weight = torch.nn.Parameter(to_sparse_semi_structured(linear.weight)) - - sparse_output = linear(x) - sparse_t = Timer(stmt="linear(x)", - globals={"linear": linear, - "x": x}).blocked_autorange().median * 1e3 - - # sparse and dense matmul are numerically equivalent - assert torch.allclose(sparse_output, dense_output, atol=1e-3) - print(f"Dense: {dense_t:.3f}ms Sparse: {sparse_t:.3f}ms | Speedup: {(dense_t / sparse_t):.3f}x") - - On an A100 80GB, we see: `Dense: 0.870ms Sparse: 0.630ms | Speedup: 1.382x` - - -What problem does semi-structured sparsity solve? -------------------------------------------------- -The general motivation behind sparsity is simple: if there are zeros in your network, you can avoid storing / doing compute with those parameters. -However, the specifics of sparsity are tricky. Zeroing out parameters doesn't affect the latency / memory overhead of our model out of the box. - -This is because the dense tensor still contains the pruned (zero) elements, which the dense matrix multiplication kernel will still operate on this elements. -In order to realize performance gains, we need to swap out dense kernels for sparse kernels, which skip calculation involving pruned elements. - -To do this, these kernels work on sparse matrices, which do not store the pruned elements and store the specified elements in a compressed format. - -For semi-structured sparsity, we store exactly half of the original parameters along with some compressed metadata about how the elements were arranged. - -.. 
image:: https://developer-blogs.nvidia.com/wp-content/uploads/2023/06/2-4-structured-sparsity-pattern.png - :align: center - :width: 80% - - Image sourced from `NVIDIA blog post `_ on semi-structured sparsity. - -There are many different sparse layouts, each with their own benefits and drawbacks. The 2:4 semi-structured sparse layout is particularly interesting for two reasons: -1. Unlike previous sparse formats, semi-structured sparsity was designed to be efficiently accelerated on GPUs. - In 2020, NVIDIA introduced hardware support for semi-structured sparsity with their Ampere architecture, and have also released fast sparse kernels via CUTLASS/`cuSPARSELt `_. -2. At the same time, semi-structured sparsity tends to have a milder impact on model accuracy compared to other sparse formats, especially when accounting for more advanced pruning / fine-tuning methods. - NVIDIA has shown in their `white paper `_ that a simple paradigm of magnitude pruning once to be 2:4 sparse and then retraining the model yields nearly identical model accuracies. - -Semi-structured exists in a sweet spot, providing a 2x (theoretical) speedup at a much lower sparsity level (50%), while still being granular enough to preserve model accuracy. - - -+---------------------+-------------+--------+------------+-------------+ -| Network | Data Set | Metric | Dense FP16 | Sparse FP16 | -+=====================+=============+========+============+=============+ -| ResNet-50 | ImageNet | Top-1 | 76.1 | 76.2 | -+---------------------+-------------+--------+------------+-------------+ -| ResNeXt-101_32x8d | ImageNet | Top-1 | 79.3 | 79.3 | -+---------------------+-------------+--------+------------+-------------+ -| Xception | ImageNet | Top-1 | 79.2 | 79.2 | -+---------------------+-------------+--------+------------+-------------+ -| SSD-RN50 | COCO2017 | bbAP | 24.8 | 24.8 | -+---------------------+-------------+--------+------------+-------------+ -| MaskRCNN-RN50 | COCO2017 | bbAP | 37.9 | 37.9 | -+---------------------+-------------+--------+------------+-------------+ -| FairSeq Transformer | EN-DE WMT14 | BLEU | 28.2 | 28.5 | -+---------------------+-------------+--------+------------+-------------+ -| BERT-Large | SQuAD v1.1 | F1 | 91.9 | 91.9 | -+---------------------+-------------+--------+------------+-------------+ - -Semi-structured sparsity has an additional advantage from a workflow perspective. -Because the sparsity level is fixed at 50%, it is easier to decompose the problem of sparsifying a model into two distinct subproblems: - -* Accuracy - How can we find a set of 2:4 sparse weights that minimize the accuracy degradation of our model? -* Performance - How can we accelerate our 2:4 sparse weights for inference and reduced memory overhead? - -.. math:: - \begin{bmatrix} - 1 & 1 & 0 & 0 \\ - 0 & 0 & 1 & 1 \\ - 1 & 0 & 0 & 0 \\ - 0 & 0 & 1 & 1 \\ - \end{bmatrix} - -The natural handoff point between these two problems are zeroed-out dense tensors. Our inference solution is designed to compress and accelerate tensors in this format. -We anticipate many users coming up with custom masking solution, as this is an active area of research. - -Now that we've learned a little more about semi-structured sparsity, let's apply it to a BERT model trained on a question answering task, SQuAD. - -Intro & Setup -------------- -Let's start by importing all the packages we need. - -.. 
code:: python - - import collections - import datasets - import evaluate - import numpy as np - import torch - import torch.utils.benchmark as benchmark - from torch import nn - from torch.sparse import to_sparse_semi_structured, SparseSemiStructuredTensor - from torch.ao.pruning import WeightNormSparsifier - import transformers - - # force CUTLASS use if cuSPARSELt is not available - SparseSemiStructuredTensor._FORCE_CUTLASS = True - torch.manual_seed(100) - -We'll also need to define some helper functions that are specific to the dataset / task at hand. -These were adapted from `this `_ huggingface course as a reference. - -.. code:: python - - def preprocess_validation_function(examples, tokenizer): - inputs = tokenizer( - [q.strip() for q in examples["question"]], - examples["context"], - max_length=384, - truncation="only_second", - return_overflowing_tokens=True, - return_offsets_mapping=True, - padding="max_length", - ) - sample_map = inputs.pop("overflow_to_sample_mapping") - example_ids = [] - - for i in range(len(inputs["input_ids"])): - sample_idx = sample_map[i] - example_ids.append(examples["id"][sample_idx]) - sequence_ids = inputs.sequence_ids(i) - offset = inputs["offset_mapping"][i] - inputs["offset_mapping"][i] = [ - o if sequence_ids[k] == 1 else None for k, o in enumerate(offset) - ] - - inputs["example_id"] = example_ids - return inputs - - - def preprocess_train_function(examples, tokenizer): - inputs = tokenizer( - [q.strip() for q in examples["question"]], - examples["context"], - max_length=384, - truncation="only_second", - return_offsets_mapping=True, - padding="max_length", - ) - - offset_mapping = inputs["offset_mapping"] - answers = examples["answers"] - start_positions = [] - end_positions = [] - - for i, (offset, answer) in enumerate(zip(offset_mapping, answers)): - start_char = answer["answer_start"][0] - end_char = start_char + len(answer["text"][0]) - sequence_ids = inputs.sequence_ids(i) - - # Find the start and end of the context - idx = 0 - while sequence_ids[idx] != 1: - idx += 1 - context_start = idx - while sequence_ids[idx] == 1: - idx += 1 - context_end = idx - 1 - - # If the answer is not fully inside the context, label it (0, 0) - if offset[context_start][0] > end_char or offset[context_end][1] < start_char: - start_positions.append(0) - end_positions.append(0) - else: - # Otherwise it's the start and end token positions - idx = context_start - while idx <= context_end and offset[idx][0] <= start_char: - idx += 1 - start_positions.append(idx - 1) - - idx = context_end - while idx >= context_start and offset[idx][1] >= end_char: - idx -= 1 - end_positions.append(idx + 1) - - inputs["start_positions"] = start_positions - inputs["end_positions"] = end_positions - return inputs - - - def compute_metrics(start_logits, end_logits, features, examples): - n_best = 20 - max_answer_length = 30 - metric = evaluate.load("squad") - - example_to_features = collections.defaultdict(list) - for idx, feature in enumerate(features): - example_to_features[feature["example_id"]].append(idx) - - predicted_answers = [] - # for example in tqdm(examples): - for example in examples: - example_id = example["id"] - context = example["context"] - answers = [] - - # Loop through all features associated with that example - for feature_index in example_to_features[example_id]: - start_logit = start_logits[feature_index] - end_logit = end_logits[feature_index] - offsets = features[feature_index]["offset_mapping"] - - start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : 
-1].tolist() - end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist() - for start_index in start_indexes: - for end_index in end_indexes: - # Skip answers that are not fully in the context - if offsets[start_index] is None or offsets[end_index] is None: - continue - # Skip answers with a length that is either < 0 - # or > max_answer_length - if ( - end_index < start_index - or end_index - start_index + 1 > max_answer_length - ): - continue - - answer = { - "text": context[ - offsets[start_index][0] : offsets[end_index][1] - ], - "logit_score": start_logit[start_index] + end_logit[end_index], - } - answers.append(answer) - - # Select the answer with the best score - if len(answers) > 0: - best_answer = max(answers, key=lambda x: x["logit_score"]) - predicted_answers.append( - {"id": example_id, "prediction_text": best_answer["text"]} - ) - else: - predicted_answers.append({"id": example_id, "prediction_text": ""}) - - theoretical_answers = [ - {"id": ex["id"], "answers": ex["answers"]} for ex in examples - ] - return metric.compute(predictions=predicted_answers, references=theoretical_answers) - -Now that those are defined, we just need one additional helper function, which will help us benchmark our model. - -.. code:: python - - def measure_execution_time(model, batch_sizes, dataset): - dataset_for_model = dataset.remove_columns(["example_id", "offset_mapping"]) - dataset_for_model.set_format("torch") - model.cuda() - batch_size_to_time_sec = {} - for batch_size in batch_sizes: - batch = { - k: dataset_for_model[k][:batch_size].to(model.device) - for k in dataset_for_model.column_names - } - - with torch.inference_mode(): - timer = benchmark.Timer( - stmt="model(**batch)", globals={"model": model, "batch": batch} - ) - p50 = timer.blocked_autorange().median * 1000 - batch_size_to_time_sec[batch_size] = p50 - return batch_size_to_time_sec - - - -We will get started by loading our model and tokenizer, and then setting up our dataset. - -.. code:: python - - # load model - model_name = "bert-base-cased" - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) - model = transformers.AutoModelForQuestionAnswering.from_pretrained(model_name) - print(f"Loading tokenizer: {model_name}") - print(f"Loading model: {model_name}") - - # set up train and val dataset - squad_dataset = datasets.load_dataset("squad") - tokenized_squad_dataset = {} - tokenized_squad_dataset["train"] = squad_dataset["train"].map( - lambda x: preprocess_train_function(x, tokenizer), batched=True - ) - tokenized_squad_dataset["validation"] = squad_dataset["validation"].map( - lambda x: preprocess_validation_function(x, tokenizer), - batched=True, - remove_columns=squad_dataset["train"].column_names, - ) - data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer) - - -Next, we'll train a quick baseline of our model on SQuAD. This task asks our model to identify spans, or segments of text, in a given context (Wikipedia articles) that answer a given question. -Running the following code gives me an F1 score of 86.9. This is quite close to the reported NVIDIA score and the difference is likely due to BERT-base vs. BERT-large or fine-tuning hyperparams. - -.. 
code:: python - - training_args = transformers.TrainingArguments( - "trainer", - num_train_epochs=1, - lr_scheduler_type="constant", - per_device_train_batch_size=64, - per_device_eval_batch_size=512, - ) - - trainer = transformers.Trainer( - model, - training_args, - train_dataset=tokenized_squad_dataset["train"], - eval_dataset=tokenized_squad_dataset["validation"], - data_collator=data_collator, - tokenizer=tokenizer, - ) - - trainer.train() - - # batch sizes to compare for eval - batch_sizes = [4, 16, 64, 256] - # 2:4 sparsity require fp16, so we cast here for a fair comparison - with torch.autocast("cuda"): - with torch.inference_mode(): - predictions = trainer.predict(tokenized_squad_dataset["validation"]) - start_logits, end_logits = predictions.predictions - fp16_baseline = compute_metrics( - start_logits, - end_logits, - tokenized_squad_dataset["validation"], - squad_dataset["validation"], - ) - fp16_time = measure_execution_time( - model, - batch_sizes, - tokenized_squad_dataset["validation"], - ) - print("fp16", fp16_baseline) - print("cuda_fp16 time", fp16_time) - - # fp16 {'exact_match': 78.53358561967833, 'f1': 86.9280493093186} - # cuda_fp16 time {4: 10.927572380751371, 16: 19.607915310189128, 64: 73.18846387788653, 256: 286.91255673766136} - -Pruning BERT to be 2:4 sparse ------------------------------ -Now that we have our baseline, it's time we prune BERT. There are many different pruning strategies, but one of the most common is **magnitude pruning**, which seeks to remove the weights -with the lowest L1 norm. Magnitude pruning was used by NVIDIA in all their results and is a common baseline. - -To do this, we will use the ``torch.ao.pruning`` package, which contains a weight-norm (magnitude) sparsifier. -These sparsifiers work by applying mask parameterizations to the weight tensors in a model. This lets them simulate sparsity by masking out the pruned weights. - -We'll also have to decide what layers of the model to apply sparsity to, which in this case is all of the `nn.Linear` layers, except for the task-specific head outputs. -That's because semi-structured sparsity has `shape constraints `_, and the task-specific nn.Linear layers do not satisfy them. - -.. code:: python - - sparsifier = WeightNormSparsifier( - # apply sparsity to all blocks - sparsity_level=1.0, - # shape of 4 elemens is a block - sparse_block_shape=(1, 4), - # two zeros for every block of 4 - zeros_per_block=2 - ) - - # add to config if nn.Linear and in the BERT model. - sparse_config = [ - {"tensor_fqn": f"{fqn}.weight"} - for fqn, module in model.named_modules() - if isinstance(module, nn.Linear) and "layer" in fqn - ] - -The first step for pruning the model is to insert paramterizations for masking the weights of the model. This is done by the prepare step. -Anytime we try to access the ``.weight`` we will get ``mask * weight`` instead. - -.. code:: python - - # Prepare the model, insert fake-sparsity parameterizations for training - sparsifier.prepare(model, sparse_config) - print(model.bert.encoder.layer[0].output) - - # BertOutput( - # (dense): ParametrizedLinear( - # in_features=3072, out_features=768, bias=True - # (parametrizations): ModuleDict( - # (weight): ParametrizationList( - # (0-5): 6 x FakeSparsity() - # ) - # ) - # ) - # (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) - # (dropout): Dropout(p=0.1, inplace=False) - # ) - -Then, we'll take a single pruning step. 
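Before taking that step, it can be useful to confirm that the parametrization described above is in place. The following check is a sketch rather than part of the original flow; it assumes the ``model`` prepared above, relies on the standard ``torch.nn.utils.parametrize`` accessors, and expects the mask to still be all ones before ``sparsifier.step()`` has been called.

.. code:: python

    layer = model.bert.encoder.layer[0].output.dense
    # Accessing .weight now goes through the FakeSparsity parametrization,
    # so it returns mask * original_weight rather than the raw parameter.
    masked_weight = layer.weight
    raw_weight = layer.parametrizations.weight.original
    print(masked_weight.shape == raw_weight.shape)  # same shape
    # Before sparsifier.step() the mask should still be all ones,
    # so the two tensors are expected to match.
    print(torch.equal(masked_weight, raw_weight))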
All pruners implement an ``update_mask()`` method that updates the mask with the logic being determined by the pruner implementation.
-The ``step`` method calls this ``update_mask`` function for the weights specified in the sparse config.
-
-We will also evaluate the model to show the accuracy degradation of zero-shot pruning, or pruning without fine-tuning / retraining.
-
-.. code:: python
-
-   sparsifier.step()
-   with torch.autocast("cuda"):
-       with torch.inference_mode():
-           predictions = trainer.predict(tokenized_squad_dataset["validation"])
-       pruned = compute_metrics(
-           *predictions.predictions,
-           tokenized_squad_dataset["validation"],
-           squad_dataset["validation"],
-       )
-       print("pruned eval metrics:", pruned)
-   # pruned eval metrics: {'exact_match': 40.59602649006622, 'f1': 56.51610004515979}
-
-In this state, we can start fine-tuning the model, updating the elements that wouldn't be pruned to better account for the accuracy loss.
-Once we've reached a satisfactory state, we can call ``squash_mask`` to fuse the mask and the weight together. This will remove the parameterizations, and we are left with a zeroed-out 2:4 dense model.
-
-.. code:: python
-
-   trainer.train()
-   sparsifier.squash_mask()
-   torch.set_printoptions(edgeitems=4)
-   print(model.bert.encoder.layer[0].intermediate.dense.weight)
-
-   # Parameter containing:
-   # tensor([[ 0.0000, -0.0237,  0.0000,  0.0130, ..., -0.0462, -0.0000,  0.0000, -0.0272],
-   #         [ 0.0436, -0.0000, -0.0000,  0.0492, ..., -0.0000,  0.0844,  0.0340, -0.0000],
-   #         [-0.0302, -0.0350,  0.0000,  0.0000, ...,  0.0303,  0.0175, -0.0000,  0.0000],
-   #         [ 0.0000, -0.0000, -0.0529,  0.0327, ...,  0.0213,  0.0000, -0.0000,  0.0735],
-   #         ...,
-   #         [ 0.0000, -0.0000, -0.0258, -0.0239, ..., -0.0000, -0.0000,  0.0380,  0.0562],
-   #         [-0.0432, -0.0000,  0.0000, -0.0598, ...,  0.0000, -0.0000,  0.0262, -0.0227],
-   #         [ 0.0244,  0.0921, -0.0000, -0.0000, ..., -0.0000, -0.0784,  0.0000,  0.0761],
-   #         [ 0.0000,  0.0225, -0.0395, -0.0000, ..., -0.0000,  0.0684, -0.0344, -0.0000]], device='cuda:0', requires_grad=True)
-
-Accelerating 2:4 sparse models for inference
---------------------------------------------
-Now that we have a model in this format, we can accelerate it for inference just like in the QuickStart Guide.
-
-.. code:: python
-
-   model = model.cuda().half()
-   # accelerate for sparsity
-   for fqn, module in model.named_modules():
-       if isinstance(module, nn.Linear) and "layer" in fqn:
-           module.weight = nn.Parameter(to_sparse_semi_structured(module.weight))
-
-   with torch.inference_mode():
-       predictions = trainer.predict(tokenized_squad_dataset["validation"])
-   start_logits, end_logits = predictions.predictions
-   metrics_sparse = compute_metrics(
-       start_logits,
-       end_logits,
-       tokenized_squad_dataset["validation"],
-       squad_dataset["validation"],
-   )
-   print("sparse eval metrics: ", metrics_sparse)
-   sparse_perf = measure_execution_time(
-       model,
-       batch_sizes,
-       tokenized_squad_dataset["validation"],
-   )
-   print("sparse perf metrics: ", sparse_perf)
-
-   # sparse eval metrics: {'exact_match': 78.43897824030275, 'f1': 86.48718950090766}
-   # sparse perf metrics: {4: 12.621004460379481, 16: 15.368514601141214, 64: 58.702805917710066, 256: 244.19364519417286}
-
-Retraining our model after magnitude pruning has recovered nearly all of the F1 that was lost when the model was pruned. At the same time, we have achieved a 1.28x speedup for bs=16.
-Note that not all shapes are amenable to performance improvements.
When batch sizes are small and limited time is spent in compute sparse kernels may be slower than their dense counterparts. - -.. table:: results - - +--------------------+----------------+--------------+---------------------+ - | Metrics | fp16 | 2:4 sparse | delta / speedup | - +====================+================+==============+=====================+ - | Exact Match (%) | 78.53 | 78.44 | -0.09 | - +--------------------+----------------+--------------+---------------------+ - | F1 (%) | 86.93 | 86.49 | -0.44 | - +--------------------+----------------+--------------+---------------------+ - | Time (bs=4) | 10.93 | 12.62 | 0.87x | - +--------------------+----------------+--------------+---------------------+ - | Time (bs=16) | 19.61 | 15.37 | 1.28x | - +--------------------+----------------+--------------+---------------------+ - | Time (bs=64) | 73.19 | 58.70 | 1.25x | - +--------------------+----------------+--------------+---------------------+ - | Time (bs=256) | 286.91 | 244.19 | 1.18x | - +--------------------+----------------+--------------+---------------------+ - - -Conclusion ----------- -In this tutorial, we have shown how to prune BERT to be 2:4 sparse and how to accelerate a 2:4 sparse model for inference. -By taking advantage of our SparseSemiStructuredTensor subclass, we were able to achieve a 1.3x speedup over the fp16 baseline. -We also demonstrated the benefits of 2:4 sparsity by fine-tuning BERT to recover any lost F1 (86.92 dense vs 86.48 sparse). diff --git a/prototype_source/skip_param_init.rst b/prototype_source/skip_param_init.rst deleted file mode 100644 index 197877b4c6f..00000000000 --- a/prototype_source/skip_param_init.rst +++ /dev/null @@ -1,127 +0,0 @@ -Skipping Module Parameter Initialization -======================================== - -Introduction ------------- - -When a module is created, its learnable parameters are initialized according -to a default initialization scheme associated with the module type. For example, the `weight` -parameter for a :class:`torch.nn.Linear` module is initialized from a -`uniform(-1/sqrt(in_features), 1/sqrt(in_features))` distribution. If some other initialization -scheme is desired, this has traditionally required re-initializing the parameters -after module instantiation: - -:: - - from torch import nn - - # Initializes weight from the default distribution: uniform(-1/sqrt(10), 1/sqrt(10)). - m = nn.Linear(10, 5) - - # Re-initialize weight from a different distribution. - nn.init.orthogonal_(m.weight) - -In this case, the initialization done during construction is wasted computation, and it may be non-trivial if -the `weight` parameter is large. - -Skipping Initialization ------------------------ - -It is now possible to skip parameter initialization during module construction, avoiding -wasted computation. This is easily accomplished using the :func:`torch.nn.utils.skip_init` function: - -:: - - from torch import nn - from torch.nn.utils import skip_init - - m = skip_init(nn.Linear, 10, 5) - - # Example: Do custom, non-default parameter initialization. - nn.init.orthogonal_(m.weight) - -This can be applied to any module that satisfies the conditions described in the -:ref:`Updating` section below. Note that all modules provided by -`torch.nn` satisfy these conditions and thus support skipping init. - -.. 
_Updating: - -Updating Modules to Support Skipping Initialization ---------------------------------------------------- - -Due to the way :func:`torch.nn.utils.skip_init` is implemented (see :ref:`Details`), there are -two requirements that a module must meet to be compatible with the function. -You can opt in to the parameter initialization skipping functionality for your custom module -simply by adhering to these requirements: - - 1. The module must accept a `device` kwarg in its constructor that is passed to any parameters - or buffers created during construction. - - 2. The module must not perform any computation on parameters or buffers in its constructor except - initialization (i.e. functions from `torch.nn.init`). - -The following example demonstrates a module updated to support the `device` -kwarg by passing it along to any created parameters, buffers, or submodules: - -:: - - import torch - from torch import nn - - class MyModule(torch.nn.Module): - def __init__(self, foo, bar, device=None): - super().__init__() - - # ==== Case 1: Module creates parameters directly. ==== - # Pass device along to any created parameters. - self.param1 = nn.Parameter(torch.empty((foo, bar), device=device)) - self.register_parameter('param2', nn.Parameter(torch.empty(bar, device=device))) - - # To ensure support for the meta device, avoid using ops except those in - # torch.nn.init on parameters in your module's constructor. - with torch.no_grad(): - nn.init.kaiming_uniform_(self.param1) - nn.init.uniform_(self.param2) - - - # ==== Case 2: Module creates submodules. ==== - # Pass device along recursively. All submodules will need to support - # them as well; this is the case for all torch.nn provided modules. - self.fc = nn.Linear(bar, 5, device=device) - - # This also works with containers. - self.linears = nn.Sequential( - nn.Linear(5, 5, device=device), - nn.Linear(5, 1, device=device) - ) - - - # ==== Case 3: Module creates buffers. ==== - # Pass device along during buffer tensor creation. - self.register_buffer('some_buffer', torch.ones(7, device=device)) - - ... - -.. _Details: - -Implementation Details ----------------------- - -Behind the scenes, the :func:`torch.nn.utils.skip_init` function is implemented in terms of a two-step pattern: - -:: - - # 1. Initialize module on the meta device; all torch.nn.init ops have - # no-op behavior on the meta device. - m = nn.Linear(10, 5, device='meta') - - # 2. Materialize an uninitialized (empty) form of the module on the CPU device. - # The result of this is a module instance with uninitialized parameters. - m.to_empty(device='cpu') - -It works by instantiating the module onto a "meta" device, which has tensor shape information -but does not allocate any storage. The `torch.nn.init` ops are specially implemented for this meta device -so that they have no-op behavior. This results in the parameter intialization logic being essentially skipped. - -Note that this pattern only works for modules that properly support a `device` kwarg during construction, as -described in :ref:`Updating`. diff --git a/prototype_source/torchscript_freezing.py b/prototype_source/torchscript_freezing.py deleted file mode 100644 index ca21451d6e8..00000000000 --- a/prototype_source/torchscript_freezing.py +++ /dev/null @@ -1,134 +0,0 @@ -""" -Model Freezing in TorchScript -============================= - -In this tutorial, we introduce the syntax for *model freezing* in TorchScript. 
-Freezing is the process of inlining Pytorch module parameters and attributes -values into the TorchScript internal representation. Parameter and attribute -values are treated as final values and they cannot be modified in the resulting -Frozen module. - -Basic Syntax ------------- -Model freezing can be invoked using API below: - - ``torch.jit.freeze(mod : ScriptModule, names : str[]) -> ScriptModule`` - -Note the input module can either be the result of scripting or tracing. -See https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html - -Next, we demonstrate how freezing works using an example: -""" - -import torch, time - -class Net(torch.nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = torch.nn.Conv2d(1, 32, 3, 1) - self.conv2 = torch.nn.Conv2d(32, 64, 3, 1) - self.dropout1 = torch.nn.Dropout2d(0.25) - self.dropout2 = torch.nn.Dropout2d(0.5) - self.fc1 = torch.nn.Linear(9216, 128) - self.fc2 = torch.nn.Linear(128, 10) - - def forward(self, x): - x = self.conv1(x) - x = torch.nn.functional.relu(x) - x = self.conv2(x) - x = torch.nn.functional.max_pool2d(x, 2) - x = self.dropout1(x) - x = torch.flatten(x, 1) - x = self.fc1(x) - x = torch.nn.functional.relu(x) - x = self.dropout2(x) - x = self.fc2(x) - output = torch.nn.functional.log_softmax(x, dim=1) - return output - - @torch.jit.export - def version(self): - return 1.0 - -net = torch.jit.script(Net()) -fnet = torch.jit.freeze(net) - -print(net.conv1.weight.size()) -print(net.conv1.bias) - -try: - print(fnet.conv1.bias) - # without exception handling, prints: - # RuntimeError: __torch__.z.___torch_mangle_3.Net does not have a field - # with name 'conv1' -except RuntimeError: - print("field 'conv1' is inlined. It does not exist in 'fnet'") - -try: - fnet.version() - # without exception handling, prints: - # RuntimeError: __torch__.z.___torch_mangle_3.Net does not have a field - # with name 'version' -except RuntimeError: - print("method 'version' is not deleted in fnet. Only 'forward' is preserved") - -fnet2 = torch.jit.freeze(net, ["version"]) - -print(fnet2.version()) - -B=1 -warmup = 1 -iter = 1000 -input = torch.rand(B, 1,28, 28) - -start = time.time() -for i in range(warmup): - net(input) -end = time.time() -print("Scripted - Warm up time: {0:7.4f}".format(end-start), flush=True) - -start = time.time() -for i in range(warmup): - fnet(input) -end = time.time() -print("Frozen - Warm up time: {0:7.4f}".format(end-start), flush=True) - -start = time.time() -for i in range(iter): - input = torch.rand(B, 1,28, 28) - net(input) -end = time.time() -print("Scripted - Inference: {0:5.2f}".format(end-start), flush=True) - -start = time.time() -for i in range(iter): - input = torch.rand(B, 1,28, 28) - fnet2(input) -end = time.time() -print("Frozen - Inference time: {0:5.2f}".format(end-start), flush =True) - -############################################################### -# On my machine, I measured the time: -# -# * Scripted - Warm up time: 0.0107 -# * Frozen - Warm up time: 0.0048 -# * Scripted - Inference: 1.35 -# * Frozen - Inference time: 1.17 - -############################################################### -# In our example, warm up time measures the first two runs. The frozen model -# is 50% faster than the scripted model. On some more complex models, we -# observed even higher speed up of warm up time. freezing achieves this speed up -# because it is doing some the work TorchScript has to do when the first couple -# runs are initiated. 
-# -# Inference time measures inference execution time after the model is warmed up. -# Although we observed significant variation in execution time, the -# frozen model is often about 15% faster than the scripted model. When input is larger, -# we observe a smaller speed up because the execution is dominated by tensor operations. - -############################################################### -# Conclusion -# ----------- -# In this tutorial, we learned about model freezing. Freezing is a useful technique to -# optimize models for inference and it also can significantly reduce TorchScript warmup time. diff --git a/prototype_source/tracing_based_selective_build.rst b/prototype_source/tracing_based_selective_build.rst deleted file mode 100644 index 811ca1cf897..00000000000 --- a/prototype_source/tracing_based_selective_build.rst +++ /dev/null @@ -1,201 +0,0 @@ -(prototype) Tracing-based Selective Build Mobile Interpreter in Android and iOS -=============================================================================== - - -*Author*: Chen Lai , Dhruv Matani - -.. warning:: - Tracing-based selective build a prototype feature to minimize library size. Since the traced result relies on the model input and traced environment, if the tracer runs in a different environment than mobile interpreter, the operator list might be different from the actual used operator list and missing operators error might raise. - -Introduction ------------- - - -This tutorial introduces a new way to custom build mobile interpreter to further optimize mobile interpreter size. It restricts the set of operators included in the compiled binary to only the set of operators actually needed by target models. It is a technique to reduce the binary size of PyTorch for mobile deployments. Tracing Based Selective Build runs a model with specific representative inputs, and records which operators were called. The build then includes just those operators. - - -Following are the processes to use tracing-based selective approach to build a custom mobile interpreter. - -1. *Prepare model with bundled input* - -.. code:: python - - import numpy as np - import torch - import torch.jit - import torch.utils - import torch.utils.bundled_inputs - from PIL import Image - from torchvision import transforms - - # Step 1. Get the model - model = torch.hub.load('pytorch/vision:v0.7.0', 'deeplabv3_resnet50', pretrained=True) - model.eval() - - scripted_module = torch.jit.script(model) - # Export full jit version model (not compatible lite interpreter), leave it here for comparison - scripted_module.save("deeplabv3_scripted.pt") - # Export lite interpreter version model (compatible with lite interpreter) - # path = "" - - scripted_module._save_for_lite_interpreter(f"${path}/deeplabv3_scripted.ptl") - - model_file = f"${path}/deeplabv3_scripted.ptl" - - # Step 2. 
Prepare inputs for the model - input_image_1 = Image.open(f"${path}/dog.jpg") - preprocess = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ]) - - input_tensor_1 = preprocess(input_image_1) - input_batch_1 = input_tensor_1.unsqueeze(0) # create a mini-batch as expected by the model - - scripted_module = torch.jit.load(model_file) - scripted_module.forward(input_batch_1) # optional, to validate the model can run with the input_batch_1 - - input_image_2 = Image.open(f"${path}/deeplab.jpg") - input_tensor_2 = preprocess(input_image_2) - input_batch_2 = input_tensor_2.unsqueeze(0) # create a mini-batch as expected by the model - - scripted_module = torch.jit.load(model_file) - scripted_module.forward(input_batch_2) # optional, to validate the model can run with the input_batch_2 - - # Step 3. Bundle the model with the prepared input from step2. Can bundle as many input as possible. - bundled_model_input = [ - (torch.utils.bundled_inputs.bundle_large_tensor(input_batch_1), ), - (torch.utils.bundled_inputs.bundle_large_tensor(input_batch_2), )] - bundled_model = torch.utils.bundled_inputs.bundle_inputs(scripted_module, bundled_model_input) - bundled_model._save_for_lite_interpreter(f"${path}/deeplabv3_scripted_with_bundled_input.ptl") - -2. Build tracer - -.. code:: shell - - MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ MAX_JOBS=16 TRACING_BASED=1 python setup.py develop - -3. Run tracer with the model with bundled input - -.. code:: shell - - ./build/bin/model_tracer --model_input_path ${path}/deeplabv3_scripted_with_bundled_input.ptl --build_yaml_path ${path}/deeplabv3_scripted.yaml - - - -Android -------- - -Get the Image Segmentation demo app in Android: https://github.com/pytorch/android-demo-app/tree/master/ImageSegmentation - -1. **Tracing-based build libtorch lite for android**: Build libtorch for android for all 4 android abis (``armeabi-v7a``, ``arm64-v8a``, ``x86``, ``x86_64``) by running - -.. code-block:: bash - - SELECTED_OP_LIST=${path}/deeplabv3_scripted.yaml TRACING_BASED=1 ./scripts/build_pytorch_android.sh - -if it will be tested on Pixel 4 emulator with ``x86``, use cmd ``BUILD_LITE_INTERPRETER=1 ./scripts/build_pytorch_android.sh x86`` to specify abi to save build time. - -.. code-block:: bash - - SELECTED_OP_LIST=${path}/deeplabv3_scripted.yaml TRACING_BASED=1 ./scripts/build_pytorch_android.sh x86 - - -After the build finish, it will show the library path: - -.. code-block:: bash - - BUILD SUCCESSFUL in 55s - 134 actionable tasks: 22 executed, 112 up-to-date - + find /Users/chenlai/pytorch/android -type f -name '*aar' - + xargs ls -lah - -rw-r--r-- 1 chenlai staff 13M Feb 11 11:48 /Users/chenlai/pytorch/android/pytorch_android/build/outputs/aar/pytorch_android-release.aar - -rw-r--r-- 1 chenlai staff 36K Feb 9 16:45 /Users/chenlai/pytorch/android/pytorch_android_torchvision/build/outputs/aar/pytorch_android_torchvision-release.aar - -2. **Use the PyTorch Android libraries built from source in the ImageSegmentation app**: Create a folder `libs` in the path, the path from repository root will be `ImageSegmentation/app/libs`. Copy `pytorch_android-release` to the path ``ImageSegmentation/app/libs/pytorch_android-release.aar``. Copy `pytorch_android_torchvision` (downloaded from `Pytorch Android Torchvision Nightly `_) to the path ``ImageSegmentation/app/libs/pytorch_android_torchvision.aar``. Update the `dependencies` part of ``ImageSegmentation/app/build.gradle`` to - -.. 
code:: gradle - - dependencies { - implementation 'androidx.appcompat:appcompat:1.2.0' - implementation 'androidx.constraintlayout:constraintlayout:2.0.2' - testImplementation 'junit:junit:4.12' - androidTestImplementation 'androidx.test.ext:junit:1.1.2' - androidTestImplementation 'androidx.test.espresso:espresso-core:3.3.0' - - - implementation(name:'pytorch_android-release', ext:'aar') - implementation(name:'pytorch_android_torchvision', ext:'aar') - - implementation 'com.android.support:appcompat-v7:28.0.0' - implementation 'com.facebook.fbjni:fbjni-java-only:0.0.3' - } - -Update `all projects` part in ``ImageSegmentation/build.gradle`` to - - -.. code:: gradle - - allprojects { - repositories { - google() - jcenter() - flatDir { - dirs 'libs' - } - } - } - - -3. **Test app**: Build and run the `ImageSegmentation` app in Android Studio - - -iOS ---- - -Get ImageSegmentation demo app in iOS: https://github.com/pytorch/ios-demo-app/tree/master/ImageSegmentation - - -1. **Build libtorch lite for iOS**: - -.. code-block:: bash - - SELECTED_OP_LIST=${path}/deeplabv3_scripted.yaml TRACING_BASED=1 IOS_PLATFORM=SIMULATOR ./scripts/build_ios.sh - - -2. **Remove Cocoapods from the project** (this step is only needed if you ran `pod install`): - - -.. code-block:: bash - - pod deintegrate - - -3. **Link ImageSegmentation demo app with the custom built library**: - -Open your project in XCode, go to your project Target’s **Build Phases - Link Binaries With Libraries**, click the **+** sign and add all the library files located in `build_ios/install/lib`. Navigate to the project **Build Settings**, set the value **Header Search Paths** to `build_ios/install/include` and **Library Search Paths** to `build_ios/install/lib`. -In the build settings, search for **other linker flags**. Add a custom linker flag below `-all_load`. -Finally, disable bitcode for your target by selecting the Build Settings, searching for Enable Bitcode, and set the value to **No**. - - -4. **Build and test the app in Xcode.** - - - -Conclusion ----------- - -In this tutorial, we demonstrated a new way to custom build PyTorch's efficient mobile interpreter - tracing-based selective build, in an Android and iOS app. - -We walked through an Image Segmentation example to show how to bundle inputs to a model, generated operator list by tracing the model with bundled input, and build a custom torch library from source with the operator list from tracing result. - -The custom build is still under development, and we will continue improving its size in the future. Note, however, that the APIs are subject to change in future versions. - -Thanks for reading! As always, we welcome any feedback, so please create an issue here `. - -Learn More - - -- To learn more about PyTorch Mobile, please refer to PyTorch Mobile Home Page - -* To learn more about Image Segmentation, please refer to the Image Segmentation DeepLabV3 on Android Recipe _ diff --git a/prototype_source/vulkan_workflow.rst b/prototype_source/vulkan_workflow.rst deleted file mode 100644 index 2f78ac97d74..00000000000 --- a/prototype_source/vulkan_workflow.rst +++ /dev/null @@ -1,247 +0,0 @@ -PyTorch Vulkan Backend User Workflow -==================================== - -**Author**: `Ivan Kobzarev `_ - -Introduction ------------- -PyTorch 1.7 supports the ability to run model inference on GPUs that support the Vulkan graphics and compute API. The primary target devices are mobile GPUs on Android devices. 
The Vulkan backend can also be used on Linux, Mac, and Windows desktop builds to use Vulkan devices like Intel integrated GPUs. This feature is in the prototype stage and is subject to change. - -Building PyTorch with Vulkan backend -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Vulkan backend is not included by default. The main switch to include Vulkan backend is cmake option ``USE_VULKAN``, that can be set by environment variable ``USE_VULKAN``. - -To use PyTorch with Vulkan backend, we need to build it from source with additional settings. Checkout the PyTorch source code from GitHub master branch. - -Optional usage of vulkan wrapper -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -By default, Vulkan library will be loaded at runtime using the vulkan_wrapper library. If you specify the environment variable ``USE_VULKAN_WRAPPER=0`` libvulkan will be linked directly. - -Desktop build -^^^^^^^^^^^^^ - -Vulkan SDK -^^^^^^^^^^ -Download VulkanSDK from https://vulkan.lunarg.com/sdk/home and set environment variable ``VULKAN_SDK`` - -Unpack VulkanSDK to ``VULKAN_SDK_ROOT`` folder, install VulkanSDK following VulkanSDK instructions for your system. - -For Mac: - -:: - - cd $VULKAN_SDK_ROOT - source setup-env.sh - sudo python install_vulkan.py - - -Building PyTorch: - -For Linux: - -:: - - cd PYTORCH_ROOT - USE_VULKAN=1 USE_VULKAN_SHADERC_RUNTIME=1 USE_VULKAN_WRAPPER=0 python setup.py install - -For Mac: - -:: - - cd PYTORCH_ROOT - USE_VULKAN=1 USE_VULKAN_SHADERC_RUNTIME=1 USE_VULKAN_WRAPPER=0 MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install - -After successful build, open another terminal and verify the version of installed PyTorch. - -:: - - import torch - print(torch.__version__) - -At the time of writing of this recipe, the version is 1.8.0a0+41237a4. You might be seeing different numbers depending on when you check out the code from master, but it should be greater than 1.7.0. - - -Android build -^^^^^^^^^^^^^ - -To build LibTorch for android with Vulkan backend for specified ``ANDROID_ABI``. - -:: - - cd PYTORCH_ROOT - ANDROID_ABI=arm64-v8a USE_VULKAN=1 sh ./scripts/build_android.sh - - -To prepare pytorch_android aars that you can use directly in your app: - -:: - - cd $PYTORCH_ROOT - USE_VULKAN=1 sh ./scripts/build_pytorch_android.sh - - -Model preparation ------------------ - -Install torchvision, get the default pretrained float model. - -:: - - pip install torchvision - -Python script to save pretrained mobilenet_v2 to a file: - -:: - - import torch - import torchvision - - model = torchvision.models.mobilenet_v2(pretrained=True) - model.eval() - script_model = torch.jit.script(model) - torch.jit.save(script_model, "mobilenet2.pt") - -PyTorch 1.7 Vulkan backend supports only float 32bit operators. The default model needs additional step that will optimize operators fusing - -:: - - from torch.utils.mobile_optimizer import optimize_for_mobile - script_model_vulkan = optimize_for_mobile(script_model, backend='vulkan') - torch.jit.save(script_model_vulkan, "mobilenet2-vulkan.pt") - -The result model can be used only on Vulkan backend as it contains specific to the Vulkan backend operators. - -By default, ``optimize_for_mobile`` with ``backend='vulkan'`` rewrites the graph so that inputs are transferred to the Vulkan backend, and outputs are transferred to the CPU backend, therefore, the model can be run on CPU inputs and produce CPU outputs. To disable this, add the argument ``optimization_blocklist={MobileOptimizerType.VULKAN_AUTOMATIC_GPU_TRANSFER}`` to ``optimize_for_mobile``. 
(``MobileOptimizerType`` can be imported from ``torch.utils.mobile_optimizer``)
-
-For more information, see the `torch.utils.mobile_optimizer` `API documentation `_.
-
-Using Vulkan backend in code
-----------------------------
-
-C++ API
--------
-
-::
-
-    at::is_vulkan_available()
-    auto tensor = at::rand({1, 2, 2, 3}, at::device(at::kCPU).dtype(at::kFloat));
-    auto tensor_vulkan = tensor.vulkan();
-    auto module = torch::jit::load("$PATH");
-    auto tensor_output_vulkan = module.forward(inputs).toTensor();
-    auto tensor_output = tensor_output_vulkan.cpu();
-
-The ``at::is_vulkan_available()`` function tries to initialize the Vulkan backend; if a Vulkan device is successfully found and a context is created, it will return true, and false otherwise.
-
-The ``.vulkan()`` function called on a Tensor will copy the tensor to the Vulkan device, and for operators called with this tensor as input, the operator will run on the Vulkan device and its output will also be on the Vulkan device.
-
-The ``.cpu()`` function called on a Vulkan tensor will copy its data to a (default) CPU tensor.
-
-Operators called with a tensor on a Vulkan device as an input will be executed on the Vulkan device. If an operator is not supported for the Vulkan backend, an exception will be thrown.
-
-List of supported operators:
-
-::
-
-    _adaptive_avg_pool2d
-    _cat
-    add.Scalar
-    add.Tensor
-    add_.Tensor
-    addmm
-    avg_pool2d
-    clamp
-    convolution
-    empty.memory_format
-    empty_strided
-    hardtanh_
-    max_pool2d
-    mean.dim
-    mm
-    mul.Scalar
-    relu_
-    reshape
-    select.int
-    slice.Tensor
-    transpose.int
-    transpose_
-    unsqueeze
-    upsample_nearest2d
-    view
-
-These operators allow torchvision models for image classification to be used on the Vulkan backend.
-
-
-Python API
-----------
-
-``torch.is_vulkan_available()`` is exposed to the Python API.
-
-``tensor.to(device='vulkan')`` works like ``.vulkan()``, moving the tensor to the Vulkan device.
-
-``.vulkan()`` itself is not exposed to the Python API at the moment of writing of this tutorial, but it is planned to be.
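As a quick illustration of the Python API described above, here is a sketch (not part of the original recipe) that assumes a PyTorch build configured with ``USE_VULKAN=1`` and uses only operators from the supported list:

::

    import torch

    if torch.is_vulkan_available():
        cpu_tensor = torch.rand(1, 3, 224, 224)
        # Move the tensor to the Vulkan device; ops on it run on Vulkan.
        vulkan_tensor = cpu_tensor.to(device='vulkan')
        # add.Tensor is in the supported operator list above.
        vulkan_result = vulkan_tensor + vulkan_tensor
        # Copy back to the CPU to inspect the result.
        print(vulkan_result.cpu().shape)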
Add the Vulkan model to the test application assets -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Add the prepared model ``mobilenet2-vulkan.pt`` to the test application assets: - -:: - - cp mobilenet2-vulkan.pt $PYTORCH_ROOT/android/test_app/app/src/main/assets/ - - -3. Build and install the test application on a connected Android device -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:: - - cd $PYTORCH_ROOT - gradle -p android test_app:installMbvulkanLocalBaseDebug - -After successful installation, the application with the name 'MBQ' can be launched on the device. - - - - - -Testing models without uploading to an Android device ------------------------------------------------------ - -Software implementations of Vulkan (e.g., https://swiftshader.googlesource.com/SwiftShader) can be used to test whether a model can be run using the PyTorch Vulkan backend (e.g., to check that all model operators are supported). diff --git a/recipes_index.rst b/recipes_index.rst new file mode 100644 index 00000000000..b8cb9089e44 --- /dev/null +++ b/recipes_index.rst @@ -0,0 +1,356 @@ +Recipes +======== + +Recipes are bite-sized, actionable examples of +how to use specific PyTorch features, different +from our full-length tutorials. + +.. raw:: html + +
+ +.. Add recipe cards below this line + +.. Basics + +.. customcarditem:: + :header: Defining a Neural Network + :card_description: Learn how to use PyTorch's torch.nn package to create and define a neural network for the MNIST dataset. + :image: _static/img/thumbnails/cropped/defining-a-network.PNG + :link: recipes/recipes/defining_a_neural_network.html + :tags: Basics + +.. customcarditem:: + :header: What is a state_dict in PyTorch + :card_description: Learn how state_dict objects and Python dictionaries are used in saving or loading models from PyTorch. + :image: _static/img/thumbnails/cropped/what-is-a-state-dict.PNG + :link: recipes/recipes/what_is_state_dict.html + :tags: Basics + + +.. customcarditem:: + :header: Warmstarting model using parameters from a different model in PyTorch + :card_description: Learn how warmstarting the training process by partially loading a model or loading a partial model can help your model converge much faster than training from scratch. + :image: _static/img/thumbnails/cropped/warmstarting-models.PNG + :link: recipes/recipes/warmstarting_model_using_parameters_from_a_different_model.html + :tags: Basics + +.. customcarditem:: + :header: Zeroing out gradients in PyTorch + :card_description: Learn when you should zero out gradients and how doing so can help increase the accuracy of your model. + :image: _static/img/thumbnails/cropped/zeroing-out-gradients.PNG + :link: recipes/recipes/zeroing_out_gradients.html + :tags: Basics + +.. customcarditem:: + :header: PyTorch Benchmark + :card_description: Learn how to use PyTorch's benchmark module to measure and compare the performance of your code + :image: _static/img/thumbnails/cropped/profiler.png + :link: recipes/recipes/benchmark.html + :tags: Basics + +.. customcarditem:: + :header: PyTorch Benchmark (quick start) + :card_description: Learn how to measure snippet run times and collect instructions. + :image: _static/img/thumbnails/cropped/profiler.png + :link: recipes/recipes/timer_quick_start.html + :tags: Basics + +.. customcarditem:: + :header: PyTorch Profiler + :card_description: Learn how to use PyTorch's profiler to measure operators time and memory consumption + :image: _static/img/thumbnails/cropped/profiler.png + :link: recipes/recipes/profiler_recipe.html + :tags: Basics + +.. customcarditem:: + :header: PyTorch Profiler with Instrumentation and Tracing Technology API (ITT API) support + :card_description: Learn how to use PyTorch's profiler with Instrumentation and Tracing Technology API (ITT API) to visualize operators labeling in Intel® VTune™ Profiler GUI + :image: _static/img/thumbnails/cropped/profiler.png + :link: recipes/profile_with_itt.html + :tags: Basics + +.. customcarditem:: + :header: Dynamic Compilation Control with ``torch.compiler.set_stance`` + :card_description: Learn how to use torch.compiler.set_stance + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/torch_compiler_set_stance_tutorial.html + :tags: Compiler + +.. customcarditem:: + :header: Reasoning about Shapes in PyTorch + :card_description: Learn how to use the meta device to reason about shapes in your model. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/recipes/reasoning_about_shapes.html + :tags: Basics + +.. customcarditem:: + :header: Tips for Loading an nn.Module from a Checkpoint + :card_description: Learn tips for loading an nn.Module from a checkpoint. 
+ :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/recipes/module_load_state_dict_tips.html + :tags: Basics + +.. customcarditem:: + :header: (beta) Using TORCH_LOGS to observe torch.compile + :card_description: Learn how to use the torch logging APIs to observe the compilation process. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/torch_logs.html + :tags: Basics + +.. customcarditem:: + :header: Extension points in nn.Module for loading state_dict and tensor subclasses + :card_description: New extension points in nn.Module. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/recipes/swap_tensors.html + :tags: Basics + +.. customcarditem:: + :header: torch.export AOTInductor Tutorial for Python runtime + :card_description: Learn an end-to-end example of how to use AOTInductor for python runtime. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/torch_export_aoti_python.html + :tags: Basics + +.. customcarditem:: + :header: Demonstration of torch.export flow, common challenges and the solutions to address them + :card_description: Learn how to export models for popular usecases + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/torch_export_challenges_solutions.html + :tags: Compiler,TorchCompile + +.. Interpretability + +.. customcarditem:: + :header: Model Interpretability using Captum + :card_description: Learn how to use Captum attribute the predictions of an image classifier to their corresponding image features and visualize the attribution results. + :image: _static/img/thumbnails/cropped/model-interpretability-using-captum.png + :link: recipes/recipes/Captum_Recipe.html + :tags: Interpretability,Captum + +.. customcarditem:: + :header: How to use TensorBoard with PyTorch + :card_description: Learn basic usage of TensorBoard with PyTorch, and how to visualize data in TensorBoard UI + :image: _static/img/thumbnails/tensorboard_scalars.png + :link: recipes/recipes/tensorboard_with_pytorch.html + :tags: Visualization,TensorBoard + +.. Automatic Mixed Precision + +.. customcarditem:: + :header: Automatic Mixed Precision + :card_description: Use torch.cuda.amp to reduce runtime and save memory on NVIDIA GPUs. + :image: _static/img/thumbnails/cropped/amp.png + :link: recipes/recipes/amp_recipe.html + :tags: Model-Optimization + +.. Performance + +.. customcarditem:: + :header: Performance Tuning Guide + :card_description: Tips for achieving optimal performance. + :image: _static/img/thumbnails/cropped/profiler.png + :link: recipes/recipes/tuning_guide.html + :tags: Model-Optimization + +.. customcarditem:: + :header: Optimizing CPU Performance on Intel® Xeon® with run_cpu Script + :card_description: How to use run_cpu script for optimal runtime configurations on Intel® Xeon CPUs. + :image: _static/img/thumbnails/cropped/profiler.png + :link: recipes/xeon_run_cpu.html + :tags: Model-Optimization + + +.. (beta) Utilizing Torch Function modes with torch.compile + +.. customcarditem:: + :header: (beta) Utilizing Torch Function modes with torch.compile + :card_description: Override torch operators with Torch Function modes and torch.compile + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/torch_compile_torch_function_modes.html + :tags: Model-Optimization + +.. (beta) Compiling the Optimizer with torch.compile + +.. 
customcarditem:: + :header: (beta) Compiling the Optimizer with torch.compile + :card_description: Speed up the optimizer using torch.compile + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/compiling_optimizer.html + :tags: Model-Optimization + +.. (beta) Running the compiled optimizer with an LR Scheduler + +.. customcarditem:: + :header: (beta) Running the compiled optimizer with an LR Scheduler + :card_description: Speed up training with LRScheduler and torch.compiled optimizer + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/compiling_optimizer_lr_scheduler.html + :tags: Model-Optimization + +.. (beta) Explicit horizontal fusion with foreach_map and torch.compile +.. customcarditem:: + :header: (beta) Explicit horizontal fusion with foreach_map and torch.compile + :card_description: Horizontally fuse pointwise ops with torch.compile + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/foreach_map.py + :tags: Model-Optimization + +.. Using User-Defined Triton Kernels with ``torch.compile`` + +.. customcarditem:: + :header: Using User-Defined Triton Kernels with ``torch.compile`` + :card_description: Learn how to use user-defined kernels with ``torch.compile`` + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/torch_compile_user_defined_triton_kernel_tutorial.html + :tags: Model-Optimization + +.. Compile Time Caching in ``torch.compile`` + +.. customcarditem:: + :header: Compile Time Caching in ``torch.compile`` + :card_description: Learn how to use compile time caching in ``torch.compile`` + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/torch_compile_caching_tutorial.html + :tags: Model-Optimization + +.. Compile Time Caching Configurations + +.. customcarditem:: + :header: Compile Time Caching Configurations + :card_description: Learn how to configure compile time caching in ``torch.compile`` + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/torch_compile_caching_configuration_tutorial.html + :tags: Model-Optimization + +.. Reducing Cold Start Compilation Time with Regional Compilation + +.. customcarditem:: + :header: Reducing torch.compile cold start compilation time with regional compilation + :card_description: Learn how to use regional compilation to control cold start compile time + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/regional_compilation.html + :tags: Model-Optimization + +.. Intel(R) Neural Compressor for PyTorch* + +.. customcarditem:: + :header: Intel® Neural Compressor for PyTorch + :card_description: Ease-of-use quantization for PyTorch with Intel® Neural Compressor. + :image: _static/img/thumbnails/cropped/profiler.png + :link: recipes/intel_neural_compressor_for_pytorch.html + :tags: Quantization,Model-Optimization + +.. Distributed Training + +.. customcarditem:: + :header: Getting Started with DeviceMesh + :card_description: Learn how to use DeviceMesh + :image: _static/img/thumbnails/cropped/profiler.png + :link: recipes/distributed_device_mesh.html + :tags: Distributed-Training + +.. customcarditem:: + :header: Shard Optimizer States with ZeroRedundancyOptimizer + :card_description: How to use ZeroRedundancyOptimizer to reduce memory consumption. + :image: _static/img/thumbnails/cropped/profiler.png + :link: recipes/zero_redundancy_optimizer.html + :tags: Distributed-Training + +.. 
customcarditem:: + :header: Direct Device-to-Device Communication with TensorPipe RPC + :card_description: How to use RPC with direct GPU-to-GPU communication. + :image: _static/img/thumbnails/cropped/profiler.png + :link: recipes/cuda_rpc.html + :tags: Distributed-Training + +.. customcarditem:: + :header: Getting Started with Distributed Checkpoint (DCP) + :card_description: Learn how to checkpoint distributed models with Distributed Checkpoint package. + :image: _static/img/thumbnails/cropped/Getting-Started-with-DCP.png + :link: recipes/distributed_checkpoint_recipe.html + :tags: Distributed-Training + +.. customcarditem:: + :header: Asynchronous Checkpointing (DCP) + :card_description: Learn how to checkpoint distributed models with Distributed Checkpoint package. + :image: _static/img/thumbnails/cropped/Getting-Started-with-DCP.png + :link: recipes/distributed_async_checkpoint_recipe.html + :tags: Distributed-Training + +.. customcarditem:: + :header: Getting Started with CommDebugMode + :card_description: Learn how to use CommDebugMode for DTensors + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/distributed_comm_debug_mode.html + :tags: Distributed-Training + +.. customcarditem:: + :header: Reducing AoT cold start compilation time with regional compilation + :card_description: Learn how to use regional compilation to control AoT cold start compile time + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: recipes/regional_aot.html + :tags: Model-Optimization + +.. End of tutorial card section + +.. ----------------------------------------- +.. Page TOC +.. ----------------------------------------- +.. toctree:: + :maxdepth: 2 + :hidden: + + recipes/recipes/defining_a_neural_network + recipes/torch_logs + recipes/recipes/what_is_state_dict + recipes/recipes/warmstarting_model_using_parameters_from_a_different_model + recipes/recipes/zeroing_out_gradients + recipes/recipes/profiler_recipe + recipes/recipes/profile_with_itt + recipes/recipes/Captum_Recipe + recipes/recipes/tensorboard_with_pytorch + recipes/recipes/dynamic_quantization + recipes/recipes/amp_recipe + recipes/recipes/tuning_guide + recipes/recipes/xeon_run_cpu + recipes/compiling_optimizer + recipes/recipes/timer_quick_start + recipes/zero_redundancy_optimizer + recipes/distributed_comm_debug_mode + recipes/torch_export_challenges_solutions + recipes/recipes/benchmark + recipes/recipes/module_load_state_dict_tips + recipes/recipes/reasoning_about_shapes + recipes/recipes/swap_tensors + recipes/torch_export_aoti_python + recipes/recipes/tensorboard_with_pytorch + recipes/torch_compile_torch_function_modes + recipes/compiling_optimizer_lr_scheduler + recipes/foreach_map + recipes/torch_compile_user_defined_triton_kernel_tutorial + recipes/torch_compile_caching_tutorial + recipes/torch_compile_caching_configuration_tutorial + recipes/regional_compilation + recipes/regional_aot + recipes/intel_neural_compressor_for_pytorch + recipes/distributed_device_mesh + recipes/distributed_checkpoint_recipe + recipes/distributed_async_checkpoint_recipe diff --git a/recipes_source/README.txt b/recipes_source/README.txt index f6e4a5ca91a..bbc30dc3a6d 100644 --- a/recipes_source/README.txt +++ b/recipes_source/README.txt @@ -2,6 +2,6 @@ Recipes ------------------ 1. 
recipes/* and recipes_index.rst PyTorch Recipes - https://pytorch.org/tutorials/recipes/recipes_index.html + https://pytorch.org/tutorials/recipes_index.html diff --git a/recipes_source/amx.rst b/recipes_source/amx.rst deleted file mode 100644 index 459e7c5541b..00000000000 --- a/recipes_source/amx.rst +++ /dev/null @@ -1,134 +0,0 @@ -============================================== -Leverage Intel® Advanced Matrix Extensions -============================================== - -Introduction -============ - -Advanced Matrix Extensions (AMX), also known as Intel® Advanced Matrix Extensions (Intel® AMX), is an x86 extension, -which introduce two new components: a 2-dimensional register file called 'tiles' and an accelerator of Tile Matrix Multiplication (TMUL) that is able to operate on those tiles. -AMX is designed to work on matrices to accelerate deep-learning training and inference on the CPU and is ideal for workloads like natural-language processing, recommendation systems and image recognition. - -Intel advances AI capabilities with 4th Gen Intel® Xeon® Scalable processors and Intel® AMX, delivering 3x to 10x higher inference and training performance versus the previous generation, see `Accelerate AI Workloads with Intel® AMX`_. -Compared to 3rd Gen Intel Xeon Scalable processors running Intel® Advanced Vector Extensions 512 Neural Network Instructions (Intel® AVX-512 VNNI), -4th Gen Intel Xeon Scalable processors running Intel AMX can perform 2,048 INT8 operations per cycle, rather than 256 INT8 operations per cycle. They can also perform 1,024 BF16 operations per cycle, as compared to 64 FP32 operations per cycle, see page 4 of `Accelerate AI Workloads with Intel® AMX`_. -For more detailed information of AMX, see `Intel® AMX Overview`_. - - -AMX in PyTorch -============== - -PyTorch leverages AMX for computing intensive operators with BFloat16 and quantization with INT8 by its backend oneDNN -to get higher performance out-of-box on x86 CPUs with AMX support. -For more detailed information of oneDNN, see `oneDNN`_. - -The operation is fully handled by oneDNN according to the execution code path generated. For example, when a supported operation gets executed into oneDNN implementation on a hardware platform with AMX support, AMX instructions will be invoked automatically inside oneDNN. -Since oneDNN is the default acceleration library for PyTorch CPU, no manual operations are required to enable the AMX support. - -Guidelines of leveraging AMX with workloads -------------------------------------------- - -This section provides guidelines on how to leverage AMX with various workloads. - -- BFloat16 data type: - - - Using ``torch.cpu.amp`` or ``torch.autocast("cpu")`` would utilize AMX acceleration for supported operators. - - :: - - model = model.to(memory_format=torch.channels_last) - with torch.cpu.amp.autocast(): - output = model(input) - -.. note:: Use ``torch.channels_last`` memory format to get better performance. - -- Quantization: - - - Applying quantization would utilize AMX acceleration for supported operators. - -- torch.compile: - - - When the generated graph model runs into oneDNN implementations with the supported operators, AMX accelerations will be activated. - -.. note:: When using PyTorch on CPUs that support AMX, the framework will automatically enable AMX usage by default. This means that PyTorch will attempt to leverage the AMX feature whenever possible to speed up matrix multiplication operations. 
However, it's important to note that the decision to dispatch to the AMX kernel ultimately depends on the internal optimization strategy of the oneDNN library and the quantization backend, which PyTorch relies on for performance enhancements. The specific details of how AMX utilization is handled internally by PyTorch and the oneDNN library may be subject to change with updates and improvements to the framework. - - -CPU operators that can leverage AMX: ------------------------------------- - -BF16 CPU ops that can leverage AMX: - -- ``conv1d`` -- ``conv2d`` -- ``conv3d`` -- ``conv_transpose1d`` -- ``conv_transpose2d`` -- ``conv_transpose3d`` -- ``bmm`` -- ``mm`` -- ``baddbmm`` -- ``addmm`` -- ``addbmm`` -- ``linear`` -- ``matmul`` - -Quantization CPU ops that can leverage AMX: - -- ``conv1d`` -- ``conv2d`` -- ``conv3d`` -- ``conv_transpose1d`` -- ``conv_transpose2d`` -- ``conv_transpose3d`` -- ``linear`` - - - -Confirm AMX is being utilized ------------------------------- - -Set environment variable ``export ONEDNN_VERBOSE=1``, or use ``torch.backends.mkldnn.verbose`` to enable oneDNN to dump verbose messages. - -:: - - with torch.backends.mkldnn.verbose(torch.backends.mkldnn.VERBOSE_ON): - with torch.cpu.amp.autocast(): - model(input) - -For example, get oneDNN verbose: - -:: - - onednn_verbose,info,oneDNN v2.7.3 (commit 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e) - onednn_verbose,info,cpu,runtime:OpenMP,nthr:128 - onednn_verbose,info,cpu,isa:Intel AVX-512 with float16, Intel DL Boost and bfloat16 support and Intel AMX with bfloat16 and 8-bit integer support - onednn_verbose,info,gpu,runtime:none - onednn_verbose,info,prim_template:operation,engine,primitive,implementation,prop_kind,memory_descriptors,attributes,auxiliary,problem_desc,exec_time - onednn_verbose,exec,cpu,reorder,simple:any,undef,src_f32::blocked:a:f0 dst_f32::blocked:a:f0,attr-scratchpad:user ,,2,5.2561 - ... - onednn_verbose,exec,cpu,convolution,jit:avx512_core_amx_bf16,forward_training,src_bf16::blocked:acdb:f0 wei_bf16:p:blocked:ABcd16b16a2b:f0 bia_f32::blocked:a:f0 dst_bf16::blocked:acdb:f0,attr-scratchpad:user ,alg:convolution_direct,mb7_ic2oc1_ih224oh111kh3sh2dh1ph1_iw224ow111kw3sw2dw1pw1,0.628906 - ... - onednn_verbose,exec,cpu,matmul,brg:avx512_core_amx_int8,undef,src_s8::blocked:ab:f0 wei_s8:p:blocked:BA16a64b4a:f0 dst_s8::blocked:ab:f0,attr-scratchpad:user ,,1x30522:30522x768:1x768,7.66382 - ... - -If you get the verbose of ``avx512_core_amx_bf16`` for BFloat16 or ``avx512_core_amx_int8`` for quantization with INT8, it indicates that AMX is activated. - - -Conclusion ----------- - - -In this tutorial, we briefly introduced AMX, how to utilize AMX in PyTorch to accelerate workloads, and how to confirm that AMX is being utilized. - -With the improvements and updates of PyTorch and oneDNN, the utilization of AMX may be subject to change accordingly. - -As always, if you run into any problems or have any questions, you can use -`forum `_ or `GitHub issues -`_ to get in touch. - - -.. _Accelerate AI Workloads with Intel® AMX: https://www.intel.com/content/www/us/en/products/docs/accelerator-engines/advanced-matrix-extensions/ai-solution-brief.html - -.. _Intel® AMX Overview: https://www.intel.com/content/www/us/en/products/docs/accelerator-engines/advanced-matrix-extensions/overview.html - -.. 
_oneDNN: https://oneapi-src.github.io/oneDNN/index.html diff --git a/recipes_source/android_native_app_with_custom_op.rst b/recipes_source/android_native_app_with_custom_op.rst index c03940b21ff..c9dbc093b21 100644 --- a/recipes_source/android_native_app_with_custom_op.rst +++ b/recipes_source/android_native_app_with_custom_op.rst @@ -1,735 +1,10 @@ Making Native Android Application that uses PyTorch prebuilt libraries ====================================================================== -**Author**: `Ivan Kobzarev `_ +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -In this recipe, you will learn: +Redirecting in 3 seconds... - - How to make an Android Application that uses LibTorch API from native code (C++). +.. raw:: html - - How to use within this application TorchScript models with custom operators. - -The full setup of this app you can find in `PyTorch Android Demo Application Repository `_. - - -Setup -~~~~~ - -You will need a Python 3 environment with the following packages (and their dependencies) installed: - -- PyTorch 1.6 - -For Android development, you will need to install: - -- Android NDK - -:: - - wget https://dl.google.com/android/repository/android-ndk-r19c-linux-x86_64.zip - unzip android-ndk-r19c-linux-x86_64.zip - export ANDROID_NDK=$(pwd)/android-ndk-r19c - - -- Android SDK - -:: - - wget https://dl.google.com/android/repository/sdk-tools-linux-3859397.zip - unzip sdk-tools-linux-3859397.zip -d android_sdk - export ANDROID_HOME=$(pwd)/android_sdk - - - -- Gradle 4.10.3 - -Gradle is the most widely used build system for android applications, and we will need it to build our application. Download it and add to the path to use ``gradle`` in the command line. - -.. code-block:: shell - - wget https://services.gradle.org/distributions/gradle-4.10.3-bin.zip - unzip gradle-4.10.3-bin.zip - export GRADLE_HOME=$(pwd)/gradle-4.10.3 - export PATH="${GRADLE_HOME}/bin/:${PATH}" - -- JDK - -Gradle requires JDK, you need to install it and set environment variable ``JAVA_HOME`` to point to it. -For example you can install OpenJDK, following `instructions `_. - -- OpenCV SDK for Android - -Our custom operator will be implemented using the OpenCV library. To use it for Android, we need to download OpenCV SDK for Android with prebuilt libraries. -Download from `OpenCV releases page `_. Unzip it and set the environment variable ``OPENCV_ANDROID_SDK`` to it. - - -Preparing TorchScript Model With Custom C++ Operator -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -TorchScript allows using custom C++ operators, to read about it with details you can read -`the dedicated tutorial `_. - -As a result, you can script the model that uses custom op, that uses OpenCV ``cv::warpPerspective`` function. - -.. 
code-block:: python - - import torch - import torch.utils.cpp_extension - - print(torch.version.__version__) - op_source = """ - #include - #include - - torch::Tensor warp_perspective(torch::Tensor image, torch::Tensor warp) { - cv::Mat image_mat(/*rows=*/image.size(0), - /*cols=*/image.size(1), - /*type=*/CV_32FC1, - /*data=*/image.data_ptr()); - cv::Mat warp_mat(/*rows=*/warp.size(0), - /*cols=*/warp.size(1), - /*type=*/CV_32FC1, - /*data=*/warp.data_ptr()); - - cv::Mat output_mat; - cv::warpPerspective(image_mat, output_mat, warp_mat, /*dsize=*/{64, 64}); - - torch::Tensor output = - torch::from_blob(output_mat.ptr(), /*sizes=*/{64, 64}); - return output.clone(); - } - - static auto registry = - torch::RegisterOperators("my_ops::warp_perspective", &warp_perspective); - """ - - torch.utils.cpp_extension.load_inline( - name="warp_perspective", - cpp_sources=op_source, - extra_ldflags=["-lopencv_core", "-lopencv_imgproc"], - is_python_module=False, - verbose=True, - ) - - print(torch.ops.my_ops.warp_perspective) - - - @torch.jit.script - def compute(x, y): - if bool(x[0][0] == 42): - z = 5 - else: - z = 10 - x = torch.ops.my_ops.warp_perspective(x, torch.eye(3)) - return x.matmul(y) + z - - - compute.save("compute.pt") - - -This snippet generates ``compute.pt`` file which is TorchScript model that uses custom op ``my_ops.warp_perspective``. - -You need to have installed OpenCV for development to run it. -For Linux systems that can be done using the next commands: -CentOS: - -.. code-block:: shell - - yum install opencv-devel - -Ubuntu: - -.. code-block:: shell - - apt-get install libopencv-dev - -Making Android Application -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -After we succeeded in having ``compute.pt``, we want to use this TorchScript model within Android application. Using general TorchScript models (without custom operators) on Android, using Java API, you can find `here `_. We can not use this approach for our case, as our model uses a custom operator(``my_ops.warp_perspective``), default TorchScript execution will fail to find it. - -Registration of ops is not exposed to PyTorch Java API, thus we need to build Android Application with native part (C++) and using LibTorch C++ API to implement and register the same custom operator for Android. As our operator uses the OpenCV library - we will use prebuilt OpenCV Android libraries and use the same functions from OpenCV. - -Let's start creating Android application in ``NativeApp`` folder. - -.. code-block:: shell - - mkdir NativeApp - cd NativeApp - -Android Application Build Setup -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Android Application build consists of the main gradle part and native build CMake part. -All the listings here are the full file listing, that if to recreate the whole structure, -you will be able to build and install the result Android Application without any code additions. - -Gradle Build Setup ------------------- -We will need to add gradle setup files: build.gradle, gradle.properties, settings.gradle. -More about Android Gradle build configurations you can find `here `_. - -``NativeApp/settings.gradle`` - -.. code-block:: gradle - - include ':app' - - -``NativeApp/gradle.properties`` - -.. code-block:: gradle - - android.useAndroidX=true - android.enableJetifier=true - - -``NativeApp/build.gradle`` - -.. 
code-block:: gradle - - buildscript { - repositories { - google() - jcenter() - } - dependencies { - classpath 'com.android.tools.build:gradle:3.5.0' - } - } - - allprojects { - repositories { - google() - jcenter() - } - } - - -In ``NativeApp/build.gradle`` we specify Android gradle plugin version `3.5.0`. This version is not recent. Still, we use it as PyTorch android gradle builds use this version. - -``NativeApp/settings.gradle`` shows that out project contains only one module - ``app``, which will be our Android Application. - -.. code-block:: shell - - mkdir app - cd app - - -``NativeApp/app/build.gradle`` - -.. code-block:: gradle - - apply plugin: 'com.android.application' - - repositories { - jcenter() - maven { - url "https://oss.sonatype.org/content/repositories/snapshots" - } - } - - android { - configurations { - extractForNativeBuild - } - compileSdkVersion 28 - buildToolsVersion "29.0.2" - defaultConfig { - applicationId "org.pytorch.nativeapp" - minSdkVersion 21 - targetSdkVersion 28 - versionCode 1 - versionName "1.0" - externalNativeBuild { - cmake { - arguments "-DANDROID_STL=c++_shared" - } - } - } - buildTypes { - release { - minifyEnabled false - } - } - externalNativeBuild { - cmake { - path "CMakeLists.txt" - } - } - sourceSets { - main { - jniLibs.srcDirs = ['src/main/jniLibs'] - } - } - } - - dependencies { - implementation 'com.android.support:appcompat-v7:28.0.0' - - implementation 'org.pytorch:pytorch_android:1.6.0-SNAPSHOT' - extractForNativeBuild 'org.pytorch:pytorch_android:1.6.0-SNAPSHOT' - } - - task extractAARForNativeBuild { - doLast { - configurations.extractForNativeBuild.files.each { - def file = it.absoluteFile - copy { - from zipTree(file) - into "$buildDir/$file.name" - include "headers/**" - include "jni/**" - } - } - } - } - - tasks.whenTaskAdded { task -> - if (task.name.contains('externalNativeBuild')) { - task.dependsOn(extractAARForNativeBuild) - } - } - -This gradle build script registers dependencies on pytorch_android snapshots, -that are published on nightly channels. - -As they are published to nexus sonatype repository - we need to register that repository: -``https://oss.sonatype.org/content/repositories/snapshots``. - -In our application we need to use LibTorch C++ API in our application native build part. For this, we need access to prebuilt binaries and headers. They are prepacked in PyTorch Android builds, which is published in Maven repositories. - -To use PyTorch Android prebuilt libraries from gradle dependencies (which is aar files) - -we should add registration for configuration ``extractForNativeBuild``, -add this configuration in dependencies and put its definition in the end. - -``extractForNativeBuild`` task will call ``extractAARForNativeBuild`` task that unpacks pytorch_android aar -to gradle build directory. - -Pytorch_android aar contains LibTorch headers in ``headers`` folder -and prebuilt libraries for different Android abis in ``jni`` folder: -``$ANDROID_ABI/libpytorch_jni.so``, ``$ANDROID_ABI/libfbjni.so``. -We will use them for our native build. - -The native build is registered in this ``build.gradle`` with lines: - -.. code-block:: gradle - - android { - ... - externalNativeBuild { - cmake { - path "CMakeLists.txt" - } - } - ... - defaultConfig { - externalNativeBuild { - cmake { - arguments "-DANDROID_STL=c++_shared" - } - } - } - -We will use ``CMake`` configuration for a native build. Here we also specify that we will dynamically link with STL, as we have several libraries. 
More about this, you can find `here `_. - - -Native Build CMake Setup ------------------------- - -The native build will be configured in ``NativeApp/app/CMakeLists.txt``: - -.. code-block:: cmake - - cmake_minimum_required(VERSION 3.4.1) - set(TARGET pytorch_nativeapp) - project(${TARGET} CXX) - set(CMAKE_CXX_STANDARD 14) - - set(build_DIR ${CMAKE_SOURCE_DIR}/build) - - set(pytorch_testapp_cpp_DIR ${CMAKE_CURRENT_LIST_DIR}/src/main/cpp) - file(GLOB pytorch_testapp_SOURCES - ${pytorch_testapp_cpp_DIR}/pytorch_nativeapp.cpp - ) - - add_library(${TARGET} SHARED - ${pytorch_testapp_SOURCES} - ) - - file(GLOB PYTORCH_INCLUDE_DIRS "${build_DIR}/pytorch_android*.aar/headers") - file(GLOB PYTORCH_LINK_DIRS "${build_DIR}/pytorch_android*.aar/jni/${ANDROID_ABI}") - - target_compile_options(${TARGET} PRIVATE - -fexceptions - ) - - set(BUILD_SUBDIR ${ANDROID_ABI}) - - find_library(PYTORCH_LIBRARY pytorch_jni - PATHS ${PYTORCH_LINK_DIRS} - NO_CMAKE_FIND_ROOT_PATH) - find_library(FBJNI_LIBRARY fbjni - PATHS ${PYTORCH_LINK_DIRS} - NO_CMAKE_FIND_ROOT_PATH) - - # OpenCV - if(NOT DEFINED ENV{OPENCV_ANDROID_SDK}) - message(FATAL_ERROR "Environment var OPENCV_ANDROID_SDK is not set") - endif() - - set(OPENCV_INCLUDE_DIR "$ENV{OPENCV_ANDROID_SDK}/sdk/native/jni/include") - - target_include_directories(${TARGET} PRIVATE - "${OPENCV_INCLUDE_DIR}" - ${PYTORCH_INCLUDE_DIRS}) - - set(OPENCV_LIB_DIR "$ENV{OPENCV_ANDROID_SDK}/sdk/native/libs/${ANDROID_ABI}") - - find_library(OPENCV_LIBRARY opencv_java4 - PATHS ${OPENCV_LIB_DIR} - NO_CMAKE_FIND_ROOT_PATH) - - target_link_libraries(${TARGET} - ${PYTORCH_LIBRARY} - ${FBJNI_LIBRARY} - ${OPENCV_LIBRARY} - log) - -Here we register only one source file ``pytorch_nativeapp.cpp``. - -On the previous step in ``NativeApp/app/build.gradle``, the task ``extractAARForNativeBuild`` extracts headers and native libraries to build directory. We set ``PYTORCH_INCLUDE_DIRS`` and ``PYTORCH_LINK_DIRS`` to them. - -After that, we find libraries ``libpytorch_jni.so`` and ``libfbjni.so`` and add them to the linking of our target. - -As we plan to use OpenCV functions to implement our custom operator ``my_ops::warp_perspective`` - we need to link to ``libopencv_java4.so``. It is packaged in OpenCV SDK for Android, that was downloaded on the Setup step. -In this configuration, we find it by environment variable ``OPENCV_ANDROID_SDK``. - -We also link with ``log`` library to be able to log our results to Android LogCat. - -As we link to OpenCV Android SDK's ``libopencv_java4.so``, we should copy it to ``NativeApp/app/src/main/jniLibs/${ANDROID_ABI}`` - -.. code-block:: shell - - cp -R $OPENCV_ANDROID_SDK/sdk/native/libs/* NativeApp/app/src/main/jniLibs/ - - -Adding the model file to the application ----------------------------------------- - -To package the TorschScript model ``compute.pt`` within our application we should copy it to assets folder: - -.. code-block:: shell - - mkdir -p NativeApp/app/src/main/assets - cp compute.pt NativeApp/app/src/main/assets - - -Android Application Manifest ----------------------------- - -Every Android application has a manifest. -Here we specify the application name, package, main activity. - -``NativeApp/app/src/main/AndroidManifest.xml``: - -.. code-block:: default - - - - - - - - - - - - - - - - -Sources -------- - -Java Code ---------- - -Now we are ready to implement our MainActivity in - -``NativeApp/app/src/main/java/org/pytorch/nativeapp/MainActivity.java``: - -.. 
code-block:: java - - package org.pytorch.nativeapp; - - import android.content.Context; - import android.os.Bundle; - import android.util.Log; - import androidx.appcompat.app.AppCompatActivity; - import java.io.File; - import java.io.FileOutputStream; - import java.io.IOException; - import java.io.InputStream; - import java.io.OutputStream; - - public class MainActivity extends AppCompatActivity { - - private static final String TAG = "PyTorchNativeApp"; - - public static String assetFilePath(Context context, String assetName) { - File file = new File(context.getFilesDir(), assetName); - if (file.exists() && file.length() > 0) { - return file.getAbsolutePath(); - } - - try (InputStream is = context.getAssets().open(assetName)) { - try (OutputStream os = new FileOutputStream(file)) { - byte[] buffer = new byte[4 * 1024]; - int read; - while ((read = is.read(buffer)) != -1) { - os.write(buffer, 0, read); - } - os.flush(); - } - return file.getAbsolutePath(); - } catch (IOException e) { - Log.e(TAG, "Error process asset " + assetName + " to file path"); - } - return null; - } - - @Override - protected void onCreate(Bundle savedInstanceState) { - super.onCreate(savedInstanceState); - final String modelFileAbsoluteFilePath = - new File(assetFilePath(this, "compute.pt")).getAbsolutePath(); - NativeClient.loadAndForwardModel(modelFileAbsoluteFilePath); - } - } - - -In the previous step, when we copied our ``compute.pt`` to ``NativeApp/app/src/main/assets`` that file became an Android application asset, which will be packed in application. Android system provides only stream access to it. -To use this module from LibTorch, we need to materialize it as a file on the disk. ``assetFilePath`` function copies data from the asset input stream, writes it on the disk, and returns absolute file path for it. - -``OnCreate`` method of Activity is called just after Activity creation. In this method, we call ``assertFilePath`` and call ``NativeClient`` class that will dispatch it to native code through JNI call. - -``NativeClient`` is a helper class with an internal private class ``NativePeer``, which is responsible for working with the native part of our application. It has a static block that will load ``libpytorch_nativeapp.so``, that is build with ``CMakeLists.txt`` that we added on the previous step. The static block will be executed with the first reference of ``NativePeer`` class. It happens in ``NativeClient#loadAndForwardModel``. - -``NativeApp/app/src/main/java/org/pytorch/nativeapp/NativeClient.java``: - -.. code-block:: java - - package org.pytorch.nativeapp; - - public final class NativeClient { - - public static void loadAndForwardModel(final String modelPath) { - NativePeer.loadAndForwardModel(modelPath); - } - - private static class NativePeer { - static { - System.loadLibrary("pytorch_nativeapp"); - } - - private static native void loadAndForwardModel(final String modelPath); - } - } - -``NativePeer#loadAndForwardModel`` is declared as ``native``, it does not have definition in Java. Call to this method will be re-dispatched through JNI to C++ method in our ``libpytorch_nativeapp.so``, in ``NativeApp/app/src/main/cpp/pytorch_nativeapp.cpp``. - -Native code ------------ - -Now we are ready to write a native part of our application. - -``NativeApp/app/src/main/cpp/pytorch_nativeapp.cpp``: - -.. code-block:: cpp - - #include - #include - #include - #include - #include - #include - #define ALOGI(...) \ - __android_log_print(ANDROID_LOG_INFO, "PyTorchNativeApp", __VA_ARGS__) - #define ALOGE(...) 
\ - __android_log_print(ANDROID_LOG_ERROR, "PyTorchNativeApp", __VA_ARGS__) - - #include "jni.h" - - #include - #include - - namespace pytorch_nativeapp { - namespace { - torch::Tensor warp_perspective(torch::Tensor image, torch::Tensor warp) { - cv::Mat image_mat(/*rows=*/image.size(0), - /*cols=*/image.size(1), - /*type=*/CV_32FC1, - /*data=*/image.data_ptr()); - cv::Mat warp_mat(/*rows=*/warp.size(0), - /*cols=*/warp.size(1), - /*type=*/CV_32FC1, - /*data=*/warp.data_ptr()); - - cv::Mat output_mat; - cv::warpPerspective(image_mat, output_mat, warp_mat, /*dsize=*/{8, 8}); - - torch::Tensor output = - torch::from_blob(output_mat.ptr(), /*sizes=*/{8, 8}); - return output.clone(); - } - - static auto registry = - torch::RegisterOperators("my_ops::warp_perspective", &warp_perspective); - - template void log(const char *m, T t) { - std::ostringstream os; - os << t << std::endl; - ALOGI("%s %s", m, os.str().c_str()); - } - - struct JITCallGuard { - torch::autograd::AutoGradMode no_autograd_guard{false}; - torch::AutoNonVariableTypeMode non_var_guard{true}; - torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard{false}; - }; - } // namespace - - static void loadAndForwardModel(JNIEnv *env, jclass, jstring jModelPath) { - const char *modelPath = env->GetStringUTFChars(jModelPath, 0); - assert(modelPath); - JITCallGuard guard; - torch::jit::Module module = torch::jit::load(modelPath); - module.eval(); - torch::Tensor x = torch::randn({4, 8}); - torch::Tensor y = torch::randn({8, 5}); - log("x:", x); - log("y:", y); - c10::IValue t_out = module.forward({x, y}); - log("result:", t_out); - env->ReleaseStringUTFChars(jModelPath, modelPath); - } - } // namespace pytorch_nativeapp - - JNIEXPORT jint JNI_OnLoad(JavaVM *vm, void *) { - JNIEnv *env; - if (vm->GetEnv(reinterpret_cast(&env), JNI_VERSION_1_6) != JNI_OK) { - return JNI_ERR; - } - - jclass c = env->FindClass("org/pytorch/nativeapp/NativeClient$NativePeer"); - if (c == nullptr) { - return JNI_ERR; - } - - static const JNINativeMethod methods[] = { - {"loadAndForwardModel", "(Ljava/lang/String;)V", - (void *)pytorch_nativeapp::loadAndForwardModel}, - }; - int rc = env->RegisterNatives(c, methods, - sizeof(methods) / sizeof(JNINativeMethod)); - - if (rc != JNI_OK) { - return rc; - } - - return JNI_VERSION_1_6; - } - - -This listing is quite long, and a few things intermixed here, we will follow control flow to understand how this code works. -The first place where the control flow arrives is ``JNI_OnLoad``. -This function is called after loading the library. It is responsible for registering native method, which is called when ``NativePeer#loadAndForwardModel`` called, here it is ``pytorch_nativeapp::loadAndForwardModel`` function. - -``pytorch_nativeapp::loadAndForwardModel`` takes as an argument model path. -First, we extract its ``const char*`` value and loading the module with ``torch::jit::load``. - -To load TorchScript model for mobile, we need to set these guards, because mobile build doesn't support -features like autograd for smaller build size, placed in ``struct JITCallGuard`` in this example. -It may change in the future. You can track the latest changes keeping an eye on the -`source in PyTorch GitHub `_. - -Implementation of method ``warp_perspective`` and registration of it is entirely the same as -in `tutorial for desktop build `_. - -Building the app ----------------- - -To specify to gradle where is Android SDK and Android NDK, we need to fill ``NativeApp/local.properties``. - -.. 
code-block:: shell - - cd NativeApp - echo "sdk.dir=$ANDROID_HOME" >> NativeApp/local.properties - echo "ndk.dir=$ANDROID_NDK" >> NativeApp/local.properties - - -To build the result ``apk`` file we run: - -.. code-block:: shell - - cd NativeApp - gradle app:assembleDebug - -To install the app on the connected device: - -.. code-block:: shell - - cd NativeApp - gradle app::installDebug - -After that, you can run the app on the device by clicking on PyTorchNativeApp icon. -Or you can do it from the command line: - -.. code-block:: shell - - adb shell am start -n org.pytorch.nativeapp/.MainActivity - -If you check the android logcat: - -.. code-block:: shell - - adb logcat -v brief | grep PyTorchNativeApp - - -You should see logs with tag 'PyTorchNativeApp' that prints x, y, and the result of the model forward, which we print with ``log`` function in ``NativeApp/app/src/main/cpp/pytorch_nativeapp.cpp``. - -:: - - I/PyTorchNativeApp(26968): x: -0.9484 -1.1757 -0.5832 0.9144 0.8867 1.0933 -0.4004 -0.3389 - I/PyTorchNativeApp(26968): -1.0343 1.5200 -0.7625 -1.5724 -1.2073 0.4613 0.2730 -0.6789 - I/PyTorchNativeApp(26968): -0.2247 -1.2790 1.0067 -0.9266 0.6034 -0.1941 0.7021 -1.5368 - I/PyTorchNativeApp(26968): -0.3803 -0.0188 0.2021 -0.7412 -0.2257 0.5044 0.6592 0.0826 - I/PyTorchNativeApp(26968): [ CPUFloatType{4,8} ] - I/PyTorchNativeApp(26968): y: -1.0084 1.8733 0.5435 0.1087 -1.1066 - I/PyTorchNativeApp(26968): -1.9926 1.1047 0.5311 -0.4944 1.9178 - I/PyTorchNativeApp(26968): -1.5451 0.8867 1.0473 -1.7571 0.3909 - I/PyTorchNativeApp(26968): 0.4039 0.5085 -0.2776 0.4080 0.9203 - I/PyTorchNativeApp(26968): 0.3655 1.4395 -1.4467 -0.9837 0.3335 - I/PyTorchNativeApp(26968): -0.0445 0.8039 -0.2512 -1.3122 0.6543 - I/PyTorchNativeApp(26968): -1.5819 0.0525 1.5680 -0.6442 -1.3090 - I/PyTorchNativeApp(26968): -1.6197 -0.0773 -0.5967 -0.1105 -0.3122 - I/PyTorchNativeApp(26968): [ CPUFloatType{8,5} ] - I/PyTorchNativeApp(26968): result: 16.0274 9.0330 6.0124 9.8644 11.0493 - I/PyTorchNativeApp(26968): 8.7633 6.9657 12.3469 10.3159 12.0683 - I/PyTorchNativeApp(26968): 12.4529 9.4559 11.7038 7.8396 6.9716 - I/PyTorchNativeApp(26968): 8.5279 9.1780 11.3849 8.4368 9.1480 - I/PyTorchNativeApp(26968): 10.0000 10.0000 10.0000 10.0000 10.0000 - I/PyTorchNativeApp(26968): 10.0000 10.0000 10.0000 10.0000 10.0000 - I/PyTorchNativeApp(26968): 10.0000 10.0000 10.0000 10.0000 10.0000 - I/PyTorchNativeApp(26968): 10.0000 10.0000 10.0000 10.0000 10.0000 - I/PyTorchNativeApp(26968): [ CPUFloatType{8,5} ] - - - -The full setup of this app you can find in `PyTorch Android Demo Application Repository `_. + diff --git a/recipes_source/bundled_inputs.rst b/recipes_source/bundled_inputs.rst deleted file mode 100644 index 1bdf5c7b7d2..00000000000 --- a/recipes_source/bundled_inputs.rst +++ /dev/null @@ -1,204 +0,0 @@ -(beta) Bundling inputs to PyTorch Models -================================================================== - -**Author**: `Jacob Szwejbka `_ - -Introduction ------------- - -This tutorial introduces the steps to use PyTorch's utility to bundle example or trivial inputs directly into your TorchScript Module. - -The interface of the model remains unchanged (other than adding a few methods), so it can still be safely deployed to production. The advantage of this standardized interface is that tools that run models can use it instead of having some sort of external file (or worse, document) that tells you how to run the model properly. 
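To make that concrete, here is a minimal sketch of what a generic model-running tool could do once inputs have been bundled; the file name is a placeholder for whatever module you saved:

.. code:: python

    import torch

    # Load a TorchScript module that had example inputs bundled before saving
    loaded = torch.jit.load("model_with_bundled_inputs.pt")

    # The bundling utility attaches this accessor method to the module
    for args in loaded.get_all_bundled_inputs():
        # Each bundled input is a tuple of arguments for forward()
        print(loaded(*args))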
- -Common case -------------------- - -One of the common cases—bundling an input to a model that only uses 'forward' for inference. - -1. **Prepare model**: Convert your model to TorchScript through either tracing or scripting - -.. code:: python - - import torch - import torch.jit - import torch.utils - import torch.utils.bundled_inputs - - class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.lin = nn.Linear(10, 1) - - def forward(self, x): - return self.lin(x) - - model = Net() - scripted_module = torch.jit.script(model) - -2. **Create example input and attach to model** - -.. code:: python - - # For each method create a list of inputs and each input is a tuple of arguments - sample_input = [(torch.zeros(1,10),)] - - # Create model with bundled inputs, if type(input) is list then the input is bundled to 'forward' - bundled_model = bundle_inputs(scripted_module, sample_input) - - -3. **Run model with input as arguments** - -.. code:: python - - sample_inputs = bundled_model.get_all_bundled_inputs() - - print(bundled_model(*sample_inputs[0])) - - -Uncommon case --------------- - -An uncommon case would be bundling and retrieving inputs for functions beyond 'forward'. - -1. **Prepare model**: Convert your model to TorchScript through either tracing or scripting - -.. code:: python - - import torch - import torch.jit - import torch.utils - import torch.utils.bundled_inputs - from typing import Dict - - class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.lin = nn.Linear(10, 1) - - def forward(self, x): - return self.lin(x) - - @torch.jit.export - def foo(self, x: Dict[String, int]): - return x['a'] + x['b'] - - - model = Net() - scripted_module = torch.jit.script(model) - -2. **Create example input and attach to model** - -.. code:: python - - # For each method create a list of inputs and each input is a tuple of arguments - example_dict = {'a' : 1, 'b' : 2} - sample_input = { - scripted_module.forward : [(torch.zeros(1,10),)], - scripted_module.foo : [(example_dict,)] - } - - # Create model with bundled inputs, if type(sample_input) is Dict then each callable key is mapped to its corresponding bundled input - bundled_model = bundle_inputs(scripted_module, sample_input) - - -3. **Retrieve inputs and run model on them** - -.. code:: python - - all_info = bundled_model.get_bundled_inputs_functions_and_info() - - # The return type for get_bundled_inputs_functions_and_info is complex, but essentially we are retrieving the name - # of a function we can use to get the bundled input for our models method - for func_name in all_info.keys(): - input_func_name = all_info[func_name]['get_inputs_function_name'][0] - func_to_run = getattr(bundled_model, input_func_name) - # retrieve input - sample_input = func_to_run() - model_function = getattr(bundled_model, func_name) - for i in range(len(sample_input)): - print(model_function(*sample_input[i])) - -Inflatable args -------------------- -Attaching inputs to models can result in nontrivial size increases. Inflatable args are a way to compress and decompress inputs to minimize this impact. - -.. note:: Any automatic compression, or parsing of inflatable args only happens to top level arguments in the input tuple. - - - ie if your model takes in a List type of inputs you would need to create an inflatable arg that returned a list not create a list of inflatable args. - -1. 
**Existing Inflatable args** - -The following input types are compressed automatically without requiring an explicit inflatable arg: - - Small contiguous tensors are cloned to have small storage. - - Inputs from torch.zeros, torch.ones, or torch.full are moved to their compact representations. - -.. code:: python - - # bundle_randn will generate a random tensor when the model is asked for bundled inputs - sample_inputs = [(torch.utils.bundled_inputs.bundle_randn((1,10)),)] - bundled_model = bundle_inputs(scripted_module, sample_inputs) - print(bundled_model.get_all_bundled_inputs()) - -2. **Creating your own** - -Inflatable args are composed of 2 parts, the deflated (compressed) argument, and an expression or function definition to inflate them. - -.. code:: python - - def create_example(*size, dtype=None): - """Generate a tuple of 2 random tensors both of the specified size""" - - deflated_input = (torch.zeros(1, dtype=dtype).expand(*size), torch.zeros(1, dtype=dtype).expand(*size)) - - # {0} is how you access your deflated value in the inflation expression - return torch.utils.bundled_inputs.InflatableArg( - value=stub, - fmt="(torch.randn_like({0}[0]), torch.randn_like({0}[1]))", - ) - -3. **Using a function instead** - If you need to create a more complicated input providing a function is an easy alternative - -.. code:: python - - sample = dict( - a=torch.zeros([10, 20]), - b=torch.zeros([1, 1]), - c=torch.zeros([10, 20]), - ) - - def condensed(t): - ret = torch.empty_like(t).flatten()[0].clone().expand(t.shape) - assert ret.storage().size() == 1 - return ret - - # An example of how to create an inflatable arg for a complex model input like Optional[Dict[str, Tensor]] - # here we take in a normal input, deflate it, and define an inflater function that converts the mapped tensors to random values - def bundle_optional_dict_of_randn(template: Optional[Dict[str, Tensor]]): - return torch.utils.bundled_inputs.InflatableArg( - value=( - None - if template is None - else {k: condensed(v) for (k, v) in template.items()} - ), - fmt="{}", - fmt_fn=""" - def {}(self, value: Optional[Dict[str, Tensor]]): - if value is not None: - output = {{}} - for k, v in value.items(): - output[k] = torch.randn_like(v) - return output - else: - return None - """, - ) - - sample_inputs = ( - bundle_optional_dict_of_randn(sample), - ) - - -Learn More ----------- -- To learn more about PyTorch Mobile, please refer to `PyTorch Mobile Home Page `_ diff --git a/recipes_source/cuda_rpc.rst b/recipes_source/cuda_rpc.rst deleted file mode 100644 index 0114664d53a..00000000000 --- a/recipes_source/cuda_rpc.rst +++ /dev/null @@ -1,147 +0,0 @@ -Direct Device-to-Device Communication with TensorPipe CUDA RPC -============================================================== - -.. note:: Direct device-to-device RPC (CUDA RPC) is introduced in PyTorch 1.8 - as a prototype feature. This API is subject to change. - -In this recipe, you will learn: - -- The high-level idea of CUDA RPC. -- How to use CUDA RPC. - - -Requirements ------------- - -- PyTorch 1.8+ -- `Getting Started With Distributed RPC Framework `_ - - -What is CUDA RPC? ------------------------------------- - -CUDA RPC supports directly sending Tensors from local CUDA memory to remote -CUDA memory. Prior to v1.8 release, PyTorch RPC only accepts CPU Tensors. 
As a -result, when an application needs to send a CUDA Tensor through RPC, it has -to first move the Tensor to CPU on the caller, send it via RPC, and then move -it to the destination device on the callee, which incurs both unnecessary -synchronizations and D2H and H2D copies. Since v1.8, RPC allows users to -configure a per-process global device map using the -`set_device_map `_ -API, specifying how to map local devices to remote devices. More specifically, -if ``worker0``'s device map has an entry ``"worker1" : {"cuda:0" : "cuda:1"}``, -all RPC arguments on ``"cuda:0"`` from ``worker0`` will be directly sent to -``"cuda:1"`` on ``worker1``. The response of an RPC will use the inverse of -the caller device map, i.e., if ``worker1`` returns a Tensor on ``"cuda:1"``, -it will be directly sent to ``"cuda:0"`` on ``worker0``. All intended -device-to-device direct communication must be specified in the per-process -device map. Otherwise, only CPU tensors are allowed. - -Under the hood, PyTorch RPC relies on `TensorPipe `_ -as the communication backend. PyTorch RPC extracts all Tensors from each -request or response into a list and packs everything else into a binary -payload. Then, TensorPipe will automatically choose a communication channel -for each Tensor based on Tensor device type and channel availability on both -the caller and the callee. Existing TensorPipe channels cover NVLink, InfiniBand, -SHM, CMA, TCP, etc. - -How to use CUDA RPC? ---------------------------------------- - -The code below shows how to use CUDA RPC. The model contains two linear layers -and is split into two shards. The two shards are placed on ``worker0`` and -``worker1`` respectively, and ``worker0`` serves as the master that drives the -forward and backward passes. Note that we intentionally skipped -`DistributedOptimizer `_ -to highlight the performance improvements when using CUDA RPC. The experiment -repeats the forward and backward passes 10 times and measures the total -execution time. It compares using CUDA RPC against manually staging to CPU -memory and using CPU RPC. 
- - -:: - - import torch - import torch.distributed.autograd as autograd - import torch.distributed.rpc as rpc - import torch.multiprocessing as mp - import torch.nn as nn - - import os - import time - - - class MyModule(nn.Module): - def __init__(self, device, comm_mode): - super().__init__() - self.device = device - self.linear = nn.Linear(1000, 1000).to(device) - self.comm_mode = comm_mode - - def forward(self, x): - # x.to() is a no-op if x is already on self.device - y = self.linear(x.to(self.device)) - return y.cpu() if self.comm_mode == "cpu" else y - - def parameter_rrefs(self): - return [rpc.RRef(p) for p in self.parameters()] - - - def measure(comm_mode): - # local module on "worker0/cuda:0" - lm = MyModule("cuda:0", comm_mode) - # remote module on "worker1/cuda:1" - rm = rpc.remote("worker1", MyModule, args=("cuda:1", comm_mode)) - # prepare random inputs - x = torch.randn(1000, 1000).cuda(0) - - tik = time.time() - for _ in range(10): - with autograd.context() as ctx: - y = rm.rpc_sync().forward(lm(x)) - autograd.backward(ctx, [y.sum()]) - # synchronize on "cuda:0" to make sure that all pending CUDA ops are - # included in the measurements - torch.cuda.current_stream("cuda:0").synchronize() - tok = time.time() - print(f"{comm_mode} RPC total execution time: {tok - tik}") - - - def run_worker(rank): - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '29500' - options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=128) - - if rank == 0: - options.set_device_map("worker1", {0: 1}) - rpc.init_rpc( - f"worker{rank}", - rank=rank, - world_size=2, - rpc_backend_options=options - ) - measure(comm_mode="cpu") - measure(comm_mode="cuda") - else: - rpc.init_rpc( - f"worker{rank}", - rank=rank, - world_size=2, - rpc_backend_options=options - ) - - # block until all rpcs finish - rpc.shutdown() - - - if __name__=="__main__": - world_size = 2 - mp.spawn(run_worker, nprocs=world_size, join=True) - -Outputs are displayed below, which shows that CUDA RPC can help to achieve -34X speed up compared to CPU RPC in this experiment. - -:: - - cpu RPC total execution time: 2.3145179748535156 Seconds - cuda RPC total execution time: 0.06867480278015137 Seconds diff --git a/recipes_source/deployment_with_flask.rst b/recipes_source/deployment_with_flask.rst deleted file mode 100644 index 213a326429c..00000000000 --- a/recipes_source/deployment_with_flask.rst +++ /dev/null @@ -1,284 +0,0 @@ -Deploying with Flask -==================== - -In this recipe, you will learn: - -- How to wrap your trained PyTorch model in a Flask container to expose - it via a web API -- How to translate incoming web requests into PyTorch tensors for your - model -- How to package your model’s output for an HTTP response - -Requirements ------------- - -You will need a Python 3 environment with the following packages (and -their dependencies) installed: - -- PyTorch 1.5 -- TorchVision 0.6.0 -- Flask 1.1 - -Optionally, to get some of the supporting files, you'll need git. - -The instructions for installing PyTorch and TorchVision are available at -`pytorch.org`_. Instructions for installing Flask are available on `the -Flask site`_. - -What is Flask? --------------- - -Flask is a lightweight web server written in Python. It provides a -convenient way for you to quickly set up a web API for predictions from -your trained PyTorch model, either for direct use, or as a web service -within a larger system. 
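To show how little boilerplate Flask needs before any PyTorch code is involved, here is a minimal standalone sketch; the real prediction service is assembled step by step below:

::

    from flask import Flask, jsonify

    app = Flask(__name__)

    @app.route('/', methods=['GET'])
    def root():
        # Trivial JSON response; the /predict endpoint added later does the real work
        return jsonify({'msg': 'Flask is up and running'})

    if __name__ == '__main__':
        app.run()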
- -Setup and Supporting Files --------------------------- - -We're going to create a web service that takes in images, and maps them -to one of the 1000 classes of the ImageNet dataset. To do this, you'll -need an image file for testing. Optionally, you can also get a file that -will map the class index output by the model to a human-readable class -name. - -Option 1: To Get Both Files Quickly -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You can pull both of the supporting files quickly by checking out the -TorchServe repository and copying them to your working folder. *(NB: -There is no dependency on TorchServe for this tutorial - it's just a -quick way to get the files.)* Issue the following commands from your -shell prompt: - -:: - - git clone https://github.com/pytorch/serve - cp serve/examples/image_classifier/kitten.jpg . - cp serve/examples/image_classifier/index_to_name.json . - -And you've got them! - -Option 2: Bring Your Own Image -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The ``index_to_name.json`` file is optional in the Flask service below. -You can test your service with your own image - just make sure it's a -3-color JPEG. - -Building Your Flask Service ---------------------------- - -The full Python script for the Flask service is shown at the end of this -recipe; you can copy and paste that into your own ``app.py`` file. Below -we'll look at individual sections to make their functions clear. - -Imports -~~~~~~~ - -:: - - import torchvision.models as models - import torchvision.transforms as transforms - from PIL import Image - from flask import Flask, jsonify, request - -In order: - -- We'll be using a pre-trained DenseNet model from - ``torchvision.models`` -- ``torchvision.transforms`` contains tools for manipulating your image - data -- Pillow (``PIL``) is what we'll use to load the image file initially -- And of course we'll need classes from ``flask`` - -Pre-Processing -~~~~~~~~~~~~~~ - -:: - - def transform_image(infile): - input_transforms = [transforms.Resize(255), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize([0.485, 0.456, 0.406], - [0.229, 0.224, 0.225])] - my_transforms = transforms.Compose(input_transforms) - image = Image.open(infile) - timg = my_transforms(image) - timg.unsqueeze_(0) - return timg - -The web request gave us an image file, but our model expects a PyTorch -tensor of shape (N, 3, 224, 224) where *N* is the number of items in the -input batch. (We will just have a batch size of 1.) The first thing we -do is compose a set of TorchVision transforms that resize and crop the -image, convert it to a tensor, then normalize the values in the tensor. -(For more information on this normalization, see the documentation for -``torchvision.models_``.) - -After that, we open the file and apply the transforms. The transforms -return a tensor of shape (3, 224, 224) - the 3 color channels of a -224x224 image. Because we need to make this single image a batch, we use -the ``unsqueeze_(0)`` call to modify the tensor in place by adding a new -first dimension. The tensor contains the same data, but now has shape -(1, 3, 224, 224). - -In general, even if you're not working with image data, you will need to -transform the input from your HTTP request into a tensor that PyTorch -can consume. 
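As a hedged illustration of that last point, a non-image endpoint might convert a JSON payload into a tensor along these lines (the field name, dtype, and helper name are assumptions, not part of this recipe):

::

    import torch

    def transform_numeric_request(req):
        # req is a flask.Request carrying JSON such as {"values": [0.1, 0.2, 0.3]}
        payload = req.get_json()
        tensor = torch.tensor(payload["values"], dtype=torch.float32)
        return tensor.unsqueeze(0)  # add a batch dimension of 1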
- -Inference -~~~~~~~~~ - -:: - - def get_prediction(input_tensor): - outputs = model.forward(input_tensor) - _, y_hat = outputs.max(1) - prediction = y_hat.item() - return prediction - -The inference itself is the simplest part: When we pass the input tensor -to them model, we get back a tensor of values that represent the model's -estimated likelihood that the image belongs to a particular class. The -``max()`` call finds the class with the maximum likelihood value, and -returns that value with the ImageNet class index. Finally, we extract -that class index from the tensor containing it with the ``item()`` call, and -return it. - -Post-Processing -~~~~~~~~~~~~~~~ - -:: - - def render_prediction(prediction_idx): - stridx = str(prediction_idx) - class_name = 'Unknown' - if img_class_map is not None: - if stridx in img_class_map is not None: - class_name = img_class_map[stridx][1] - - return prediction_idx, class_name - -The ``render_prediction()`` method maps the predicted class index to a -human-readable class label. It's typical, after getting the prediction -from your model, to perform post-processing to make the prediction ready -for either human consumption, or for another piece of software. - -Running The Full Flask App --------------------------- - -Paste the following into a file called ``app.py``: - -:: - - import io - import json - import os - - import torchvision.models as models - import torchvision.transforms as transforms - from PIL import Image - from flask import Flask, jsonify, request - - - app = Flask(__name__) - model = models.densenet121(pretrained=True) # Trained on 1000 classes from ImageNet - model.eval() # Turns off autograd - - - - img_class_map = None - mapping_file_path = 'index_to_name.json' # Human-readable names for Imagenet classes - if os.path.isfile(mapping_file_path): - with open (mapping_file_path) as f: - img_class_map = json.load(f) - - - - # Transform input into the form our model expects - def transform_image(infile): - input_transforms = [transforms.Resize(255), # We use multiple TorchVision transforms to ready the image - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize([0.485, 0.456, 0.406], # Standard normalization for ImageNet model input - [0.229, 0.224, 0.225])] - my_transforms = transforms.Compose(input_transforms) - image = Image.open(infile) # Open the image file - timg = my_transforms(image) # Transform PIL image to appropriately-shaped PyTorch tensor - timg.unsqueeze_(0) # PyTorch models expect batched input; create a batch of 1 - return timg - - - # Get a prediction - def get_prediction(input_tensor): - outputs = model.forward(input_tensor) # Get likelihoods for all ImageNet classes - _, y_hat = outputs.max(1) # Extract the most likely class - prediction = y_hat.item() # Extract the int value from the PyTorch tensor - return prediction - - # Make the prediction human-readable - def render_prediction(prediction_idx): - stridx = str(prediction_idx) - class_name = 'Unknown' - if img_class_map is not None: - if stridx in img_class_map is not None: - class_name = img_class_map[stridx][1] - - return prediction_idx, class_name - - - @app.route('/', methods=['GET']) - def root(): - return jsonify({'msg' : 'Try POSTing to the /predict endpoint with an RGB image attachment'}) - - - @app.route('/predict', methods=['POST']) - def predict(): - if request.method == 'POST': - file = request.files['file'] - if file is not None: - input_tensor = transform_image(file) - prediction_idx = get_prediction(input_tensor) - class_id, 
class_name = render_prediction(prediction_idx) - return jsonify({'class_id': class_id, 'class_name': class_name}) - - - if __name__ == '__main__': - app.run() - -To start the server from your shell prompt, issue the following command: - -:: - - FLASK_APP=app.py flask run - -By default, your Flask server is listening on port 5000. Once the server -is running, open another terminal window, and test your new inference -server: - -:: - - curl -X POST -H "Content-Type: multipart/form-data" http://localhost:5000/predict -F "file=@kitten.jpg" - -If everything is set up correctly, you should recevie a response similar -to the following: - -:: - - {"class_id":285,"class_name":"Egyptian_cat"} - -Important Resources -------------------- - -- `pytorch.org`_ for installation instructions, and more documentation - and tutorials -- The `Flask site`_ has a `Quick Start guide`_ that goes into more - detail on setting up a simple Flask service - -.. _pytorch.org: https://pytorch.org -.. _Flask site: https://flask.palletsprojects.com/en/1.1.x/ -.. _Quick Start guide: https://flask.palletsprojects.com/en/1.1.x/quickstart/ -.. _torchvision.models: https://pytorch.org/vision/stable/models.html -.. _the Flask site: https://flask.palletsprojects.com/en/1.1.x/installation/ diff --git a/recipes_source/distributed_async_checkpoint_recipe.rst b/recipes_source/distributed_async_checkpoint_recipe.rst new file mode 100644 index 00000000000..e959883a25b --- /dev/null +++ b/recipes_source/distributed_async_checkpoint_recipe.rst @@ -0,0 +1,289 @@ +Asynchronous Saving with Distributed Checkpoint (DCP) +===================================================== + +**Author:** `Lucas Pasqualin `__, `Iris Zhang `__, `Rodrigo Kumpera `__, `Chien-Chin Huang `__ + +Checkpointing is often a bottle-neck in the critical path for distributed training workloads, incurring larger and larger costs as both model and world sizes grow. +One excellent strategy for offsetting this cost is to checkpoint in parallel, asynchronously. Below, we expand the save example +from the `Getting Started with Distributed Checkpoint Tutorial `__ +to show how this can be integrated quite easily with ``torch.distributed.checkpoint.async_save``. + + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How to use DCP to generate checkpoints in parallel + * Effective strategies to optimize performance + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch v2.4.0 or later + * `Getting Started with Distributed Checkpoint Tutorial `__ + + +Asynchronous Checkpointing Overview +------------------------------------ +Before getting started with Asynchronous Checkpointing, it's important to understand it's differences and limitations as compared to synchronous checkpointing. +Specifically: + +* Memory requirements - Asynchronous checkpointing works by first copying models into internal CPU-buffers. + This is helpful since it ensures model and optimizer weights are not changing while the model is still checkpointing, + but does raise CPU memory by a factor of ``checkpoint_size_per_rank X number_of_ranks``. Additionally, users should take care to understand + the memory constraints of their systems. Specifically, pinned memory implies the usage of ``page-lock`` memory, which can be scarce as compared to + ``pageable`` memory. 
+ +* Checkpoint Management - Since checkpointing is asynchronous, it is up to the user to manage concurrently run checkpoints. In general, users can + employ their own management strategies by handling the future object returned form ``async_save``. For most users, we recommend limiting + checkpoints to one asynchronous request at a time, avoiding additional memory pressure per request. + + + +.. code-block:: python + + import os + + import torch + import torch.distributed as dist + import torch.distributed.checkpoint as dcp + import torch.multiprocessing as mp + import torch.nn as nn + + from torch.distributed.fsdp import fully_shard + from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict + from torch.distributed.checkpoint.stateful import Stateful + + CHECKPOINT_DIR = "checkpoint" + + + class AppState(Stateful): + """This is a useful wrapper for checkpointing the Application State. Since this object is compliant + with the Stateful protocol, DCP will automatically call state_dict/load_stat_dict as needed in the + dcp.save/load APIs. + + Note: We take advantage of this wrapper to hande calling distributed state dict methods on the model + and optimizer. + """ + + def __init__(self, model, optimizer=None): + self.model = model + self.optimizer = optimizer + + def state_dict(self): + # this line automatically manages FSDP FQN's, as well as sets the default state dict type to FSDP.SHARDED_STATE_DICT + model_state_dict, optimizer_state_dict = get_state_dict(self.model, self.optimizer) + return { + "model": model_state_dict, + "optim": optimizer_state_dict + } + + def load_state_dict(self, state_dict): + # sets our state dicts on the model and optimizer, now that we've loaded + set_state_dict( + self.model, + self.optimizer, + model_state_dict=state_dict["model"], + optim_state_dict=state_dict["optim"] + ) + + class ToyModel(nn.Module): + def __init__(self): + super(ToyModel, self).__init__() + self.net1 = nn.Linear(16, 16) + self.relu = nn.ReLU() + self.net2 = nn.Linear(16, 8) + + def forward(self, x): + return self.net2(self.relu(self.net1(x))) + + + def setup(rank, world_size): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355 " + + # initialize the process group + dist.init_process_group("gloo", rank=rank, world_size=world_size) + torch.cuda.set_device(rank) + + + def cleanup(): + dist.destroy_process_group() + + + def run_fsdp_checkpoint_save_example(rank, world_size): + print(f"Running basic FSDP checkpoint saving example on rank {rank}.") + setup(rank, world_size) + + # create a model and move it to GPU with id rank + model = ToyModel().to(rank) + model = fully_shard(model) + + loss_fn = nn.MSELoss() + optimizer = torch.optim.Adam(model.parameters(), lr=0.1) + + checkpoint_future = None + for step in range(10): + optimizer.zero_grad() + model(torch.rand(8, 16, device="cuda")).sum().backward() + optimizer.step() + + # waits for checkpointing to finish if one exists, avoiding queuing more then one checkpoint request at a time + if checkpoint_future is not None: + checkpoint_future.result() + + state_dict = { "app": AppState(model, optimizer) } + checkpoint_future = dcp.async_save(state_dict, checkpoint_id=f"{CHECKPOINT_DIR}_step{step}") + + cleanup() + + + if __name__ == "__main__": + world_size = torch.cuda.device_count() + print(f"Running async checkpoint example on {world_size} devices.") + mp.spawn( + run_fsdp_checkpoint_save_example, + args=(world_size,), + nprocs=world_size, + join=True, + ) + + +Even more performance with 
Pinned Memory +----------------------------------------- +If the above optimization is still not performant enough, you can take advantage of an additional optimization for GPU models which utilizes a pinned memory buffer for checkpoint staging. +Specifically, this optimization attacks the main overhead of asynchronous checkpointing, which is the in-memory copying to checkpointing buffers. By maintaining a pinned memory buffer between +checkpoint requests users can take advantage of direct memory access to speed up this copy. + +.. note:: + The main drawback of this optimization is the persistence of the buffer in between checkpointing steps. Without + the pinned memory optimization (as demonstrated above), any checkpointing buffers are released as soon as + checkpointing is finished. With the pinned memory implementation, this buffer is maintained between steps, + leading to the same + peak memory pressure being sustained through the application life. + + +.. code-block:: python + + import os + + import torch + import torch.distributed as dist + import torch.distributed.checkpoint as dcp + import torch.multiprocessing as mp + import torch.nn as nn + + from torch.distributed.fsdp import fully_shard + from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict + from torch.distributed.checkpoint.stateful import Stateful + from torch.distributed.checkpoint import FileSystemWriter as StorageWriter + + CHECKPOINT_DIR = "checkpoint" + + + class AppState(Stateful): + """This is a useful wrapper for checkpointing the Application State. Since this object is compliant + with the Stateful protocol, DCP will automatically call state_dict/load_stat_dict as needed in the + dcp.save/load APIs. + + Note: We take advantage of this wrapper to hande calling distributed state dict methods on the model + and optimizer. 
+ """ + + def __init__(self, model, optimizer=None): + self.model = model + self.optimizer = optimizer + + def state_dict(self): + # this line automatically manages FSDP FQN's, as well as sets the default state dict type to FSDP.SHARDED_STATE_DICT + model_state_dict, optimizer_state_dict = get_state_dict(self.model, self.optimizer) + return { + "model": model_state_dict, + "optim": optimizer_state_dict + } + + def load_state_dict(self, state_dict): + # sets our state dicts on the model and optimizer, now that we've loaded + set_state_dict( + self.model, + self.optimizer, + model_state_dict=state_dict["model"], + optim_state_dict=state_dict["optim"] + ) + + class ToyModel(nn.Module): + def __init__(self): + super(ToyModel, self).__init__() + self.net1 = nn.Linear(16, 16) + self.relu = nn.ReLU() + self.net2 = nn.Linear(16, 8) + + def forward(self, x): + return self.net2(self.relu(self.net1(x))) + + + def setup(rank, world_size): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355 " + + # initialize the process group + dist.init_process_group("gloo", rank=rank, world_size=world_size) + torch.cuda.set_device(rank) + + + def cleanup(): + dist.destroy_process_group() + + + def run_fsdp_checkpoint_save_example(rank, world_size): + print(f"Running basic FSDP checkpoint saving example on rank {rank}.") + setup(rank, world_size) + + # create a model and move it to GPU with id rank + model = ToyModel().to(rank) + model = fully_shard(model) + + loss_fn = nn.MSELoss() + optimizer = torch.optim.Adam(model.parameters(), lr=0.1) + + # The storage writer defines our 'staging' strategy, where staging is considered the process of copying + # checkpoints to in-memory buffers. By setting `cached_state_dict=True`, we enable efficient memory copying + # into a persistent buffer with pinned memory enabled. + # Note: It's important that the writer persists in between checkpointing requests, since it maintains the + # pinned memory buffer. + writer = StorageWriter(cache_staged_state_dict=True, path=CHECKPOINT_DIR) + checkpoint_future = None + for step in range(10): + optimizer.zero_grad() + model(torch.rand(8, 16, device="cuda")).sum().backward() + optimizer.step() + + state_dict = { "app": AppState(model, optimizer) } + if checkpoint_future is not None: + # waits for checkpointing to finish, avoiding queuing more then one checkpoint request at a time + checkpoint_future.result() + checkpoint_future = dcp.async_save(state_dict, storage_writer=writer, checkpoint_id=f"{CHECKPOINT_DIR}_step{step}") + + cleanup() + + + if __name__ == "__main__": + world_size = torch.cuda.device_count() + print(f"Running fsdp checkpoint example on {world_size} devices.") + mp.spawn( + run_fsdp_checkpoint_save_example, + args=(world_size,), + nprocs=world_size, + join=True, + ) + + +Conclusion +---------- +In conclusion, we have learned how to use DCP's :func:`async_save` API to generate checkpoints off the critical training path. We've also learned about the +additional memory and concurrency overhead introduced by using this API, as well as additional optimizations which utilize pinned memory to speed things up +even further. 
+ +- `Saving and loading models tutorial `__ +- `Getting started with FullyShardedDataParallel tutorial `__ diff --git a/recipes_source/distributed_checkpoint_recipe.rst b/recipes_source/distributed_checkpoint_recipe.rst index 6a70bb02b0b..de31d430402 100644 --- a/recipes_source/distributed_checkpoint_recipe.rst +++ b/recipes_source/distributed_checkpoint_recipe.rst @@ -33,6 +33,7 @@ DCP is different from :func:`torch.save` and :func:`torch.load` in a few signifi * It produces multiple files per checkpoint, with at least one per rank. * It operates in place, meaning that the model should allocate its data first and DCP uses that storage instead. +* DCP offers special handling of Stateful objects (formally defined in `torch.distributed.checkpoint.stateful`), automatically calling both `state_dict` and `load_state_dict` methods if they are defined. .. note:: The code in this tutorial runs on an 8-GPU server, but it can be easily @@ -58,13 +59,43 @@ Now, let's create a toy module, wrap it with FSDP, feed it with some dummy input import torch.multiprocessing as mp import torch.nn as nn - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - from torch.distributed.checkpoint.state_dict import get_state_dict - from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType + from torch.distributed.fsdp import fully_shard + from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict + from torch.distributed.checkpoint.stateful import Stateful CHECKPOINT_DIR = "checkpoint" + class AppState(Stateful): + """This is a useful wrapper for checkpointing the Application State. Since this object is compliant + with the Stateful protocol, DCP will automatically call state_dict/load_stat_dict as needed in the + dcp.save/load APIs. + + Note: We take advantage of this wrapper to hande calling distributed state dict methods on the model + and optimizer. 
+ """ + + def __init__(self, model, optimizer=None): + self.model = model + self.optimizer = optimizer + + def state_dict(self): + # this line automatically manages FSDP FQN's, as well as sets the default state dict type to FSDP.SHARDED_STATE_DICT + model_state_dict, optimizer_state_dict = get_state_dict(self.model, self.optimizer) + return { + "model": model_state_dict, + "optim": optimizer_state_dict + } + + def load_state_dict(self, state_dict): + # sets our state dicts on the model and optimizer, now that we've loaded + set_state_dict( + self.model, + self.optimizer, + model_state_dict=state_dict["model"], + optim_state_dict=state_dict["optim"] + ) + class ToyModel(nn.Module): def __init__(self): super(ToyModel, self).__init__() @@ -95,7 +126,7 @@ Now, let's create a toy module, wrap it with FSDP, feed it with some dummy input # create a model and move it to GPU with id rank model = ToyModel().to(rank) - model = FSDP(model) + model = fully_shard(model) loss_fn = nn.MSELoss() optimizer = torch.optim.Adam(model.parameters(), lr=0.1) @@ -104,14 +135,8 @@ Now, let's create a toy module, wrap it with FSDP, feed it with some dummy input model(torch.rand(8, 16, device="cuda")).sum().backward() optimizer.step() - # this line automatically manages FSDP FQN's, as well as sets the default state dict type to FSDP.SHARDED_STATE_DICT - model_state_dict, optimizer_state_dict = get_state_dict(model, optimizer) - state_dict = { - "model": model_state_dict, - "optimizer": optimizer_state_dict - } - dcp.save(state_dict,checkpoint_id=CHECKPOINT_DIR) - + state_dict = { "app": AppState(model, optimizer) } + dcp.save(state_dict, checkpoint_id=CHECKPOINT_DIR) cleanup() @@ -126,7 +151,7 @@ Now, let's create a toy module, wrap it with FSDP, feed it with some dummy input join=True, ) -Please go ahead and check the `checkpoint` directory. You should see 8 checkpoint files as shown below. +Please go ahead and check the `checkpoint` directory. You should see checkpoint files corresponding to the number of files as shown below. For example, if you have 8 devices, you should see 8 files. .. figure:: /_static/img/distributed/distributed_checkpoint_generated_files.png :width: 100% @@ -152,15 +177,46 @@ The reason that we need the ``state_dict`` prior to loading is: import torch import torch.distributed as dist import torch.distributed.checkpoint as dcp + from torch.distributed.checkpoint.stateful import Stateful from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict import torch.multiprocessing as mp import torch.nn as nn - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + from torch.distributed.fsdp import fully_shard CHECKPOINT_DIR = "checkpoint" + class AppState(Stateful): + """This is a useful wrapper for checkpointing the Application State. Since this object is compliant + with the Stateful protocol, DCP will automatically call state_dict/load_stat_dict as needed in the + dcp.save/load APIs. + + Note: We take advantage of this wrapper to hande calling distributed state dict methods on the model + and optimizer. 
+ """ + + def __init__(self, model, optimizer=None): + self.model = model + self.optimizer = optimizer + + def state_dict(self): + # this line automatically manages FSDP FQN's, as well as sets the default state dict type to FSDP.SHARDED_STATE_DICT + model_state_dict, optimizer_state_dict = get_state_dict(self.model, self.optimizer) + return { + "model": model_state_dict, + "optim": optimizer_state_dict + } + + def load_state_dict(self, state_dict): + # sets our state dicts on the model and optimizer, now that we've loaded + set_state_dict( + self.model, + self.optimizer, + model_state_dict=state_dict["model"], + optim_state_dict=state_dict["optim"] + ) + class ToyModel(nn.Module): def __init__(self): super(ToyModel, self).__init__() @@ -191,26 +247,15 @@ The reason that we need the ``state_dict`` prior to loading is: # create a model and move it to GPU with id rank model = ToyModel().to(rank) - model = FSDP(model) + model = fully_shard(model) optimizer = torch.optim.Adam(model.parameters(), lr=0.1) - # generates the state dict we will load into - model_state_dict, optimizer_state_dict = get_state_dict(model, optimizer) - state_dict = { - "model": model_state_dict, - "optimizer": optimizer_state_dict - } + + state_dict = { "app": AppState(model, optimizer)} dcp.load( state_dict=state_dict, checkpoint_id=CHECKPOINT_DIR, ) - # sets our state dicts on the model and optimizer, now that we've loaded - set_state_dict( - model, - optimizer, - model_state_dict=model_state_dict, - optim_state_dict=optimizer_state_dict - ) cleanup() @@ -237,7 +282,7 @@ the intent is to save or load in "non-distributed" style, meaning entirely in th import os import torch - import torch.distributed.checkpoint as DCP + import torch.distributed.checkpoint as dcp import torch.nn as nn @@ -274,6 +319,42 @@ the intent is to save or load in "non-distributed" style, meaning entirely in th run_checkpoint_load_example() +Formats +---------- +One drawback not yet mentioned is that DCP saves checkpoints in a format which is inherently different then those generated using torch.save. +Since this can be an issue when users wish to share models with users used to the torch.save format, or in general just want to add format flexibility +to their applications. For this case, we provide the ``format_utils`` module in ``torch.distributed.checkpoint.format_utils``. + +A command line utility is provided for the users convenience, which follows the following format: + +.. code-block:: bash + + python -m torch.distributed.checkpoint.format_utils + +In the command above, ``mode`` is one of ``torch_to_dcp`` or ``dcp_to_torch``. + + +Alternatively, methods are also provided for users who may wish to convert checkpoints directly. + +.. code-block:: python + + import os + + import torch + import torch.distributed.checkpoint as DCP + from torch.distributed.checkpoint.format_utils import dcp_to_torch_save, torch_save_to_dcp + + CHECKPOINT_DIR = "checkpoint" + TORCH_SAVE_CHECKPOINT_DIR = "torch_save_checkpoint.pth" + + # convert dcp model to torch.save (assumes checkpoint was generated as above) + dcp_to_torch_save(CHECKPOINT_DIR, TORCH_SAVE_CHECKPOINT_DIR) + + # converts the torch.save model back to DCP + torch_save_to_dcp(TORCH_SAVE_CHECKPOINT_DIR, f"{CHECKPOINT_DIR}_new") + + + Conclusion ---------- In conclusion, we have learned how to use DCP's :func:`save` and :func:`load` APIs, as well as how they are different form :func:`torch.save` and :func:`torch.load`. 
diff --git a/recipes_source/distributed_comm_debug_mode.rst b/recipes_source/distributed_comm_debug_mode.rst new file mode 100644 index 00000000000..dc1a6e3e565 --- /dev/null +++ b/recipes_source/distributed_comm_debug_mode.rst @@ -0,0 +1,210 @@ +Getting Started with ``CommDebugMode`` +===================================================== + +**Author**: `Anshul Sinha `__ + + +In this tutorial, we will explore how to use ``CommDebugMode`` with PyTorch's +DistributedTensor (DTensor) for debugging by tracking collective operations in distributed training environments. + +Prerequisites +--------------------- + +* Python 3.8 - 3.11 +* PyTorch 2.2 or later + + +What is ``CommDebugMode`` and why is it useful +---------------------------------------------------- +As the size of models continues to increase, users are seeking to leverage various combinations +of parallel strategies to scale up distributed training. However, the lack of interoperability +between existing solutions poses a significant challenge, primarily due to the absence of a +unified abstraction that can bridge these different parallelism strategies. To address this +issue, PyTorch has proposed `DistributedTensor(DTensor) +`_ +which abstracts away the complexities of tensor communication in distributed training, +providing a seamless user experience. However, when dealing with existing parallelism solutions and +developing parallelism solutions using the unified abstraction like DTensor, the lack of transparency +about what and when the collective communications happens under the hood could make it challenging +for advanced users to identify and resolve issues. To address this challenge, ``CommDebugMode``, a +Python context manager will serve as one of the primary debugging tools for DTensors, enabling +users to view when and why collective operations are happening when using DTensors, effectively +addressing this issue. + + +Using ``CommDebugMode`` +------------------------ + +Here is how you can use ``CommDebugMode``: + +.. code-block:: python + + # The model used in this example is a MLPModule applying Tensor Parallel + comm_mode = CommDebugMode() + with comm_mode: + output = model(inp) + + # print the operation level collective tracing information + print(comm_mode.generate_comm_debug_tracing_table(noise_level=0)) + + # log the operation level collective tracing information to a file + comm_mode.log_comm_debug_tracing_table_to_file( + noise_level=1, file_name="transformer_operation_log.txt" + ) + + # dump the operation level collective tracing information to json file, + # used in the visual browser below + comm_mode.generate_json_dump(noise_level=2) + +This is what the output looks like for a MLPModule at noise level 0: + +.. code-block:: python + + Expected Output: + Global + FORWARD PASS + *c10d_functional.all_reduce: 1 + MLPModule + FORWARD PASS + *c10d_functional.all_reduce: 1 + MLPModule.net1 + MLPModule.relu + MLPModule.net2 + FORWARD PASS + *c10d_functional.all_reduce: 1 + +To use ``CommDebugMode``, you must wrap the code running the model in ``CommDebugMode`` and call the API that +you want to use to display the data. You can also use a ``noise_level`` argument to control the verbosity +level of displayed information. Here is what each noise level displays: + +| 0. Prints module-level collective counts +| 1. Prints DTensor operations (not including trivial operations), module sharding information +| 2. Prints tensor operations (not including trivial operations) +| 3. 
Prints all operations + +In the example above, you can see that the collective operation, all_reduce, occurs once in the forward pass +of the ``MLPModule``. Furthermore, you can use ``CommDebugMode`` to pinpoint that the all-reduce operation happens +in the second linear layer of the ``MLPModule``. + + +Below is the interactive module tree visualization that you can use to upload your own JSON dump: + +.. raw:: html + + + + + + + CommDebugMode Module Tree + + + +
+ (interactive visualizer markup collapsed: the raw HTML/JavaScript embeds a drag-and-drop area, labeled "Drag file here", for loading the JSON dump into the module tree viewer)
+ + + + +Conclusion +------------------------------------------ + +In this recipe, we have learned how to use ``CommDebugMode`` to debug Distributed Tensors and +parallelism solutions that uses communication collectives with PyTorch. You can use your own +JSON outputs in the embedded visual browser. + +For more detailed information about ``CommDebugMode``, see +`comm_mode_features_example.py +`_ diff --git a/recipes_source/distributed_device_mesh.rst b/recipes_source/distributed_device_mesh.rst index d41d6c1df18..3a04b8de4bf 100644 --- a/recipes_source/distributed_device_mesh.rst +++ b/recipes_source/distributed_device_mesh.rst @@ -31,7 +31,7 @@ Users can also easily manage the underlying process_groups/devices for multi-dim Why DeviceMesh is Useful ------------------------ DeviceMesh is useful when working with multi-dimensional parallelism (i.e. 3-D parallel) where parallelism composability is required. For example, when your parallelism solutions require both communication across hosts and within each host. -The image above shows that we can create a 2D mesh that connects the devices within each host, and connects each device with its counterpart on the other hosts in a homogenous setup. +The image above shows that we can create a 2D mesh that connects the devices within each host, and connects each device with its counterpart on the other hosts in a homogeneous setup. Without DeviceMesh, users would need to manually set up NCCL communicators, cuda devices on each process before applying any parallelism, which could be quite complicated. The following code snippet illustrates a hybrid sharding 2-D Parallel pattern setup without :class:`DeviceMesh`. @@ -121,7 +121,7 @@ users would not need to manually create and manage shard group and replicate gro import torch.nn as nn from torch.distributed.device_mesh import init_device_mesh - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy + from torch.distributed.fsdp import fully_shard as FSDP class ToyModel(nn.Module): @@ -136,9 +136,9 @@ users would not need to manually create and manage shard group and replicate gro # HSDP: MeshShape(2, 4) - mesh_2d = init_device_mesh("cuda", (2, 4)) + mesh_2d = init_device_mesh("cuda", (2, 4), mesh_dim_names=("dp_replicate", "dp_shard")) model = FSDP( - ToyModel(), device_mesh=mesh_2d, sharding_strategy=ShardingStrategy.HYBRID_SHARD + ToyModel(), device_mesh=mesh_2d ) Let's create a file named ``hsdp.py``. @@ -148,6 +148,26 @@ Then, run the following `torch elastic/torchrun `__ +- `2D parallel combining Tensor/Sequence Parallel with FSDP `__ - `Composable PyTorch Distributed with PT2 `__ diff --git a/recipes_source/distributed_optim_torchscript.rst b/recipes_source/distributed_optim_torchscript.rst index c5bac179f61..01bc497d38e 100644 --- a/recipes_source/distributed_optim_torchscript.rst +++ b/recipes_source/distributed_optim_torchscript.rst @@ -1,214 +1,6 @@ -Distributed Optimizer with TorchScript support -============================================================== +.. + TODO(gmagogsfm): Replace/delete this document by 2.9 release. https://github.com/pytorch/tutorials/issues/3456 -.. note:: Distributed Optimizer with TorchScript support is introduced in PyTorch 1.8 - as a beta feature. This API is subject to change. 
- -In this recipe, you will learn: - -- The high-level idea of distributed optimizer with TorchScript support and what this feature brings -- How to write customized distributed optimizer that enables TorchScript support - - -Requirements ------------- - -- PyTorch 1.8+ -- `Getting Started With Distributed RPC Framework `_ - - -What is Distributed Optimizer? ------------------------------------- - -`DistributedOptimizer `_ takes a list of remote -parameters (RRef) and runs the optimizer locally on the workers where the parameters live, which is commonly used together -with Distributed RPC/Autograd to do model parallel training. It could use any of the local optimizer algorithms (either -pre-defined algorithms provided in ``torch.optim`` or custom defined ones) to apply the gradients on each worker. - - -What is Distributed Optimizer with TorchScript support? -------------------------------------------------------- - -Distributed Optimizer are widely used in distributed model parallel training, and in some -common use cases, training need to be done in multithreaded manner instead of multiprocess -due to performance concern and resource utilizations (or at least partially multithreaded, -i.e. Parameter Server hosting part of the model and parameters, with new thread updating the -parameters per request). PyTorch itself does not support multithreaded training natively as -it suffers from the Python's Global Interpreter Lock (GIL), but it could leverage -`TorchScript `_ to get rid of GIL and run the -model in a multithreaded way. - -For critical model training workloads, improving the training performance is an -important topic. Researchers often would like to implement different optimization strategies -with the graph representation (i.e. via operator fusion) or implement custom operator kernels -in order to speed up training. - -Distributed Optimizer with TorchScript support could help getting rid of GIL, thus improve -PyTorch's training performance in the multithreaded environment, it also unlocks the potential -to further enhance the performance by using advanced compiler technologies that TorchScript -offers (i.e. CPU/GPU fusion). - - -How to write a customized distributed optimizer with TorchScript support? -------------------------------------------------------------------------- - -The code below shows how to write a customized distributed optimizer given an existing local -optimizer implementation, which unlocks the TorchScript benefits including GIL removal and -performance improvement opportunities. - -Suppose that you already have a local optimizer that is currently used during training, -In this case we will use `quasi-hyperbolic momentum (QHM) `_ -as an example to show how to enable the TorchScript support, note that it also applies -to any custom optimizers that inherits from ``torch.optim.Optimizer``. - -First, we need to separate the computation and state management from the optimizer implementation, -this is so that we could extract the computation part and make it a free function, which is -TorchScript friendly. It has two benefits: 1. The computation logic becomes easier to inspect, -it allows us to quickly turn the parameter update/computation part into TorchScript, and utilize -TorchScript IR to do further optimizations (operator fusion, etc.) 2. Distributed Optimizer -underlying is using a different mechanisms to get gradients and update parameters (we store -gradients separately instead of directly populating the ``param.grad`` field during backward). 
-Separating the computation allows distributed optimizer to enable the possibility of optimizer -update in multithreaded mode, as it eliminates the possible race condition to ``param.grad``. - - -:: - - import torch - from torch import Tensor - from typing import List - - - def qhm_update(params: List[Tensor], - dp_list: List[Tensor], - momentum_buffer_list: List[Tensor], - lr: float, - nu: float, - weight_decay: float, - weight_decay_type: str, - momentum: float): - - for p, d_p, momentum_buffer in zip(params, dp_list, momentum_buffer_list): - if weight_decay != 0: - if weight_decay_type == "grad": - d_p.add_(weight_decay, p) - elif weight_decay_type == "direct": - p.mul_(1.0 - lr * weight_decay) - else: - raise ValueError("Invalid weight decay type provided") - - momentum_buffer.mul_(momentum).add_(1.0 - momentum, d_p) - - p.data.add_(-lr * nu, momentum_buffer) - p.data.add_(-lr * (1.0 - nu), d_p) - - - -Next we will define a distributed functional optimizer with TorchScript compatability to manage -the optimizer states and calls into the TorchScript compatible update function we defined above. -Note that a few conventions are different from normal custom optimizers: 1. We don't inherit -``torch.optim.Optimizer`` as TorchScript does not support polymorphism 2. ``step`` takes gradients -list instead of the loss closure. - -:: - - import torch - from torch import Tensor - from typing import List, Optional, Dict - - # define this as a TorchScript class - @torch.jit.script - class FunctionalQHM(object): - def __init__(self, - params: List[Tensor], - lr: float, - momentum: float, - nu: float, - weight_decay: float = 0.0, - weight_decay_type: str = "grad"): - if lr < 0.0: - raise ValueError("Invalid learning rate: {}".format(lr)) - if momentum < 0.0: - raise ValueError("Invalid momentum value: {}".format(momentum)) - if weight_decay < 0.0: - raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) - if weight_decay_type not in ("grad", "direct"): - raise ValueError("Invalid weight_decay_type value: {}".format(weight_decay_type)) - - self.defaults = { - "lr": lr, - "momentum": momentum, - "nu": nu, - "weight_decay": weight_decay, - } - self.weight_decay_type = weight_decay_type - - # NOTE: we only have one param_group here and don't allow user to add additional - # param group as it's not a common use case. - self.param_group = {"params": params} - - self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {}) - - def step(self, gradients: List[Optional[Tensor]]): - params = self.param_group['params'] - params_with_grad = [] - grads = [] - momentum_buffer_list: List[Tensor] = [] - - if len(params) != len(gradients): - raise ValueError( - "the gradients passed in does not equal to the size of the parameters!" - + f"Params length: {len(params)}. 
" - + f"Gradients length: {len(gradients)}" - ) - - for param, gradient in zip(self.param_group['params'], gradients): - if gradient is not None: - params_with_grad.append(param) - grads.append(gradient) - state = self.state[param] - state['momentum_buffer'] = torch.zeros_like(param, memory_format=torch.preserve_format) - momentum_buffer_list.append(state['momentum_buffer']) - - # calls into the update function we just defined - with torch.no_grad(): - qhm_update(params_with_grad, - grads, - momentum_buffer_list, - self.defaults['lr'], - self.defaults['nu'], - self.defaults['weight_decay'], - self.weight_decay_type, - self.defaults['momentum']) - - - -Finally, we register our newly defined distributed functional optimizer into the ``functional_optim_map`` -This is so that the ``DistributedOptimizer`` will try to pick up our custom implementation instead of the -pre-defined default ones. - -:: - - from torch.distributed.optim import DistributedOptimizer - - DistributedOptimizer.functional_optim_map[QHM] = FunctionalQHM - -Now you can use the ``QHM`` optimizer as normal in distributed training by passing it to -`DistributedOptimizer `_ - - -:: - - ... - remote_params_list = [...] - dist_optim = DistributedOptimizer( - QHM, remote_params_list, *args, **kwargs - ) - -DistributedOptimizer will automatically transform the QHM optimizer into the ``FunctionalQHM`` under the hood, -and enable the TorchScript support. This will unlock the performance that boosted by multithreaded training -and also give more potentials for further improvements (i.e. TorchScript fusion, etc.) - -Note that majority of PyTorch built-in optimizers are already using this methodology to speed up distributed -training. If you see warning about some optimizers haven't been converted yet, you can write your own conversion -by following this recipe. +.. warning:: + TorchScript is deprecated, please use + `torch.export `__ instead. \ No newline at end of file diff --git a/recipes_source/foreach_map.py b/recipes_source/foreach_map.py new file mode 100644 index 00000000000..0225a77e279 --- /dev/null +++ b/recipes_source/foreach_map.py @@ -0,0 +1,204 @@ +""" +Explicit horizontal fusion with foreach_map and torch.compile +=============================================================== + +**Author:** `Michael Lazos `_ +""" + +######################################################### +# Horizontal fusion is a key optimization in ML compilers. In eager, +# this is typically expressed using the torch._foreach* ops which parallelizes +# operations across a list of tensors. However, supporting all possible permutations +# of arguments is quite difficult (e.g. mixtures of scalars and lists). Foreach_map +# allows conversion of any pointwise op in ``torch`` to a horiztonally fused foreach +# variant. In this tutorial, we will demonstrate how to implement the Adam optimizer +# with ``foreach_map`` to generate a fully fused kernel. +# +# .. note:: +# +# This recipe describes a prototype feature. Prototype features are typically +# at an early stage for feedback and testing and are subject to change. +# +# Prerequisites +# ------------- +# +# * PyTorch v2.7.0 or later +# + +##################################################################### +# Model Setup +# ~~~~~~~~~~~~~~~~~~~~~ +# For this example, we'll use a simple sequence of linear layers. +# We instantiate an independent copy to compare the two optimizer implementations. 
+# +import torch + +# exit cleanly if we are on a device that doesn't support ``torch.compile`` +if torch.cuda.get_device_capability() < (7, 0): + print("Exiting because torch.compile is not supported on this device.") + import sys + sys.exit(0) + +# Create simple model +model = torch.nn.Sequential( + *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)] +) +model_copy = torch.nn.Sequential( + *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)] +) +input = torch.rand(1024, device="cuda") + +# run forward pass +output = model(input) +output_copy = model_copy(input) + +# run backward to populate the grads for our optimizer below +output.sum().backward() +output_copy.sum().backward() + +##################################################################### +# Helper functions for foreach_map implementation +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# In this section, we'll begin our implementation of the Adam optimizer. +# +from torch._higher_order_ops.foreach_map import foreach_map + +# Helper function to extract optimizer states from a torch.optim.Adam instance +def get_inputs(optim): + steps = [] + params = [] + grads = [] + exp_avgs = [] + exp_avg_sqs = [] + for group in optim.param_groups: + for p in group["params"]: + params.append(p) + grads.append(p.grad) + state = optim.state[p] + exp_avgs.append(state["exp_avg"]) + exp_avg_sqs.append(state["exp_avg_sq"]) + steps.append(state["step"]) + + return steps, params, exp_avgs, exp_avg_sqs + + +# Functions to update the different optimizer states +def update_exp_avg_sq(exp_avg_sq, grad, beta2): + return exp_avg_sq.mul(beta2).addcmul(grad, grad, value=1 - beta2) + +def update_param(param, step, exp_avg, exp_avg_sq, beta1, beta2, lr, eps): + bias_correction1 = 1 - torch.pow(beta1, step) + bias_correction2 = (1 - torch.pow(beta2, step)).sqrt() + step_size = (lr / bias_correction1).neg() + denom = (exp_avg_sq.sqrt() / (bias_correction2 * step_size)).add(eps / step_size) + return torch.add(param, torch.div(exp_avg, denom)) + +# Our full Adam implementation +def foreach_map_adam( + steps, + params, + exp_avgs, + exp_avg_sqs, + weight_decay=0, + beta1=0.9, + beta2=0.999, + lr=1e-3, + eps=1e-8, +): + with torch.no_grad(): + grads = [param.grad for param in params] + # update step + updated_steps = foreach_map(lambda x: x + 1, steps) + torch._foreach_copy_(steps, updated_steps) + + if weight_decay != 0: + foreach_map(torch.add, (grads,), alpha=weight_decay) + + # Higher-order operators (HOPs) cannot have multiple outputs at the moment + # need to call foreach_map once for each output + exp_avgs_updated = foreach_map(torch.lerp, exp_avgs, grads, 1 - beta1) + exp_avgs_sq_updated = foreach_map(update_exp_avg_sq, exp_avg_sqs, grads, beta2) + params_updated = foreach_map( + update_param, + params, + steps, + exp_avgs_updated, + exp_avgs_sq_updated, + beta1, + beta2, + lr, + eps, + ) + # Higher-order operators (HOPs) don't support input mutation today + # so manually update the states in-place + torch._foreach_copy_(exp_avgs, exp_avgs_updated) + torch._foreach_copy_(exp_avg_sqs, exp_avgs_sq_updated) + torch._foreach_copy_(params, params_updated) + return + +##################################################################### +# Setting up and running the compiled kernel +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# In this section, we'll run our Adam optimizer +# and compare the results +# +# .. 
note:: +# +# ``torch.compile`` is only supported on CUDA devices that have a compute capability of 7.0 or higher. +opt_eager = torch.optim.Adam(model.parameters(), lr=torch.tensor(0.01)) +opt_eager_copy = torch.optim.Adam(model_copy.parameters(), lr=torch.tensor(0.01)) + +# warm up the optimizer state dict +opt_eager.step() +opt_eager_copy.step() + +inputs = get_inputs(opt_eager_copy) +compiled_adam = torch.compile(foreach_map_adam) + +# optionally view the output code +torch._logging.set_logs(output_code=True) + +# Warmup runs to compile the function +for _ in range(5): + opt_eager.step() + compiled_adam(*inputs) + +for eager_p, compile_p in zip(opt_eager.param_groups[0]["params"], opt_eager_copy.param_groups[0]["params"]): + torch.allclose(eager_p, compile_p) + +# Benchmark performance + + # Let's define a helpful benchmarking function: +import torch.utils.benchmark as benchmark + +def benchmark_torch_function_in_microseconds(f, *args, **kwargs): + t0 = benchmark.Timer( + stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f} + ) + return t0.blocked_autorange().mean * 1e6 + +eager_runtime = benchmark_torch_function_in_microseconds(opt_eager.step) +compiled_runtime = benchmark_torch_function_in_microseconds(lambda: compiled_adam(*inputs)) + +assert eager_runtime > compiled_runtime + +print(f"eager runtime: {eager_runtime}us") +print(f"compiled runtime: {compiled_runtime}us") + + + +###################################################################### +# Conclusion +# ~~~~~~~~~~ +# In this tutorial, we successfully implemented a custom fully-fused Adam optimizer using foreach_map. +# By leveraging the power of foreach_map and torch.compile, we were able to create an optimized version of the Adam +# optimizer that can be used in various machine learning applications. This tutorial provides a comprehensive guide +# on how to use foreach_map and torch.compile to optimize machine learning models, and serves as a +# valuable resource for developers looking to improve the performance of their models with horizontal fusion. +# +# See also: +# +# * `Compiled optimizer tutorial `__ - an intro into the compiled optimizer. +# * `Compiling the optimizer with PT2 `__ - deeper technical details on the compiled optimizer. diff --git a/recipes_source/fuse.rst b/recipes_source/fuse.rst deleted file mode 100644 index c6c69762962..00000000000 --- a/recipes_source/fuse.rst +++ /dev/null @@ -1,157 +0,0 @@ -Fuse Modules Recipe -===================================== - -This recipe demonstrates how to fuse a list of PyTorch modules into a single module and how to do the performance test to compare the fused model with its non-fused version. - -Introduction ------------- - -Before quantization is applied to a model to reduce its size and memory footprint (see `Quantization Recipe `_ for details on quantization), the list of modules in the model may be fused first into a single module. Fusion is optional, but it may save on memory access, make the model run faster, and improve its accuracy. - - -Pre-requisites --------------- - -PyTorch 1.6.0 or 1.7.0 - -Steps --------------- - -Follow the steps below to fuse an example model, quantize it, script it, optimize it for mobile, save it and test it with the Android benchmark tool. - -1. 
Define the Example Model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Use the same example model defined in the `PyTorch Mobile Performance Recipes `_: - -:: - - import torch - from torch.utils.mobile_optimizer import optimize_for_mobile - - class AnnotatedConvBnReLUModel(torch.nn.Module): - def __init__(self): - super(AnnotatedConvBnReLUModel, self).__init__() - self.conv = torch.nn.Conv2d(3, 5, 3, bias=False).to(dtype=torch.float) - self.bn = torch.nn.BatchNorm2d(5).to(dtype=torch.float) - self.relu = torch.nn.ReLU(inplace=True) - self.quant = torch.quantization.QuantStub() - self.dequant = torch.quantization.DeQuantStub() - - def forward(self, x): - x = x.contiguous(memory_format=torch.channels_last) - x = self.quant(x) - x = self.conv(x) - x = self.bn(x) - x = self.relu(x) - x = self.dequant(x) - return x - - -2. Generate Two Models with and without `fuse_modules` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Add the following code below the model definition above and run the script: - -:: - - model = AnnotatedConvBnReLUModel() - print(model) - - def prepare_save(model, fused): - model.qconfig = torch.quantization.get_default_qconfig('qnnpack') - torch.quantization.prepare(model, inplace=True) - torch.quantization.convert(model, inplace=True) - torchscript_model = torch.jit.script(model) - torchscript_model_optimized = optimize_for_mobile(torchscript_model) - torch.jit.save(torchscript_model_optimized, "model.pt" if not fused else "model_fused.pt") - - prepare_save(model, False) - - model = AnnotatedConvBnReLUModel() - model_fused = torch.quantization.fuse_modules(model, [['bn', 'relu']], inplace=False) - print(model_fused) - - prepare_save(model_fused, True) - - -The graphs of the original model and its fused version will be printed as follows: - -:: - - AnnotatedConvBnReLUModel( - (conv): Conv2d(3, 5, kernel_size=(3, 3), stride=(1, 1), bias=False) - (bn): BatchNorm2d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - (quant): QuantStub() - (dequant): DeQuantStub() - ) - - AnnotatedConvBnReLUModel( - (conv): Conv2d(3, 5, kernel_size=(3, 3), stride=(1, 1), bias=False) - (bn): BNReLU2d( - (0): BatchNorm2d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (1): ReLU(inplace=True) - ) - (relu): Identity() - (quant): QuantStub() - (dequant): DeQuantStub() - ) - -In the second fused model output, the first item `bn` in the list is replaced with the fused module, and the rest of the modules (`relu` in this example) is replaced with identity. In addition, the non-fused and fused versions of the model `model.pt` and `model_fused.pt` are generated. - -3. Build the Android benchmark Tool -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Get the PyTorch source and build the Android benchmark tool as follows: - -:: - - git clone --recursive https://github.com/pytorch/pytorch - cd pytorch - git submodule update --init --recursive - BUILD_PYTORCH_MOBILE=1 ANDROID_ABI=arm64-v8a ./scripts/build_android.sh -DBUILD_BINARY=ON - - -This will generate the Android benchmark binary `speed_benchmark_torch` in the `build_android/bin` folder. - -4. 
Test Compare the Fused and Non-Fused Models -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Connect your Android device, then copy `speed_benchmark_torch` and the model files and run the benchmark tool on them: - -:: - - adb push build_android/bin/speed_benchmark_torch /data/local/tmp - adb push model.pt /data/local/tmp - adb push model_fused.pt /data/local/tmp - adb shell "/data/local/tmp/speed_benchmark_torch --model=/data/local/tmp/model.pt" --input_dims="1,3,224,224" --input_type="float" - adb shell "/data/local/tmp/speed_benchmark_torch --model=/data/local/tmp/model_fused.pt" --input_dims="1,3,224,224" --input_type="float" - - -The results from the last two commands should be like: - -:: - - Main run finished. Microseconds per iter: 6189.07. Iters per second: 161.575 - -and - -:: - - Main run finished. Microseconds per iter: 6216.65. Iters per second: 160.858 - -For this example model, there is no much performance difference between the fused and non-fused models. But the similar steps can be used to fuse and prepare a real deep model and test to see the performance improvement. Keep in mind that currently `torch.quantization.fuse_modules` only fuses the following sequence of modules: - -* conv, bn -* conv, bn, relu -* conv, relu -* linear, relu -* bn, relu - -If any other sequence list is provided to the `fuse_modules` call, it will simply be ignored. - -Learn More ---------------- - -See `here `_ for the official documentation of `torch.quantization.fuse_modules`. diff --git a/recipes_source/inference_tuning_on_aws_graviton.rst b/recipes_source/inference_tuning_on_aws_graviton.rst deleted file mode 100644 index 08d3515ce9a..00000000000 --- a/recipes_source/inference_tuning_on_aws_graviton.rst +++ /dev/null @@ -1,368 +0,0 @@ -(Beta) PyTorch Inference Performance Tuning on AWS Graviton Processors -====================================================================== - -**Author**: `Sunita Nadampalli `_ - -`AWS Graviton `_ is a series of ARM-based processors designed by AWS. AWS Graviton3 processors are optimized for Machine Learning (ML) workloads, including support for ``bfloat16``, Scalable Vector Extension (SVE) and twice the Single Instruction Multiple Data (SIMD) bandwidth compared to Graviton2. - -PyTorch provides native reference ATen kernels for the machine learning operators like convolutions, matmul, relu, etc. These operators can be accelerated with platform specific kernel implementations from Basic Linear Algebra (BLAS) libraries. On AWS Graviton CPUs, MKLDNN with Arm Compute Library (`ACL `_) and `OpenBLAS `_ libraries provide optimized implementations for a subset of the operators. Both these libraries are integrated into PyTorch with PyTorch 2.0 version. - -In this tutorial we will cover how to achieve the best inference performance for linear layer neural network on AWS Graviton3 CPUs (`AWS c7g instance `_) with ``bfloa16`` kernels and with the right backend selection. - -Contents --------- -1. Basic Usage -2. Speed up inference with Bfloat16 fast math kernels -3. Improve inference performance with OpenBLAS for smaller batch dimensions -4. Optimize memory allocation overhead with Linux Transparent huge pages -5. Conclusion - -.. note:: - To successfully run this tutorial and reproduce the speedup numbers shown below, you need an instance from the Graviton3 family (``c7g/r7g/m7g``) of hardware. For this tutorial, we used the `c7g.xl (4vcpu) instance `_ . 
- -Basic Usage ---------------- - -PyTorch natively supports AWS Graviton3 optimizations starting with PyTorch 2.0 version. -Please refer to this `blog `_ for more details on the optimizations. - -1. Install PyTorch by running the following command: - - .. code-block:: - - python3 -m pip install torch - -2. We will start by importing the required dependencies and defining the device will run on: - -.. code-block:: python - - import torch - import torch.nn as nn - from torch.profiler import profile, record_function, ProfilerActivity - - # AWS Graviton3 cpu - device = ("cpu") - print(f"Using {device} device") - - -3. Given linear layers are at the heart of several neural networks, including transformers, we take a linear layer for this demo. We define our neural network by subclassing ``nn.Module``, and initializing the layers in ``__init__``. We construct the network with a typical large language model parameters to match the real world scenario: - -.. code-block:: python - - class MyNeuralNetwork(nn.Module): - def __init__(self): - super().__init__() - self.flatten = nn.Flatten() - self.linear_relu_stack = nn.Sequential( - nn.Linear(4096, 4096), - nn.ReLU(), - nn.Linear(4096, 11008), - nn.ReLU(), - nn.Linear(11008, 10), - ) - - def forward(self, x): - x = self.flatten(x) - logits = self.linear_relu_stack(x) - return logits - -4. Let's create an instance of ``MyNeuralNetwork``, and move it to the device: - -.. code-block:: python - - model = MyNeuralNetwork().to(device) - print(model) - -Next, let's get the prediction probabilities by passing them through an instance of the ``nn.Softmax`` module: - -.. code-block:: python - - X = torch.rand(1, 64, 64, device=device) - logits = model(X) - pred_probab = nn.Softmax(dim=1)(logits) - y_pred = pred_probab.argmax(1) - print(f"Predicted class: {y_pred}") - -output: - -.. code-block:: - - Predicted class: tensor([2]) - -Our network functionality is verified. Next, we will profile the performance. Lets' check two different scenarios: small and large batch dimensions. - -**Scenario 1:** A larger batch dimension, for example 256: - -.. code-block:: python - - # warm it up first and loop over multiple times to have enough execution time - - X = torch.rand(256, 64, 64, device=device) - - with torch.set_grad_enabled(False): - for _ in range(50): - model(X) #Warmup - with profile(activities=[ProfilerActivity.CPU]) as prof: - with record_function("mymodel_inference"): - for _ in range(100): - model(X) - - print(prof.key_averages().table(sort_by="self_cpu_time_total")) - - -Following is the profiler output with the default PyTorch configuration: - -.. 
table:: - :widths: auto - - ====================== ============ =========== ============= =========== ============ ============ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls - ====================== ============ =========== ============= =========== ============ ============ - aten::addmm 97.61% 15.813s 98.61% 15.977s 53.255ms 300 - aten::clamp_min 1.09% 177.032ms 1.09% 177.032ms 885.160us 200 - aten::copy 1.00% 162.054ms 1.00% 162.054ms 540.180us 300 - mymodel_inference 0.22% 35.738ms 100.00% 16.201s 16.201s 1 - aten::linear 0.02% 2.955ms 98.66% 15.985s 53.282ms 300 - aten::t 0.01% 2.421ms 0.03% 5.043ms 16.810us 300 - aten::relu 0.01% 2.356ms 1.11% 179.388ms 896.940us 200 - ====================== ============ =========== ============= =========== ============ ============ - -**Self CPU time total:** 16.201s - - -Speed up Inference with ``bfloat16`` Fast Math Kernels ----------------------------------------------------------- - -AWS Graviton3 processors support `bfloat16 MMLA instructions `_. Arm Compute Library (`ACL `_) provides optimized ``bfloat16`` General Matrix Multiplication (GEMM) kernels for AWS Graviton processors, and are integrated into PyTorch via MKLDNN backend starting with PyTorch 2.0. The inference performance can be optimized with the fast math GEMM kernels. The fast math mode is not enabled by default because these kernels perform GEMM in ``bfloat16`` precision instead of ``float``, and hence results in a slight drop in the model inference accuracy. However, the accuracy drop is within the ``cosine similarity`` threshold defined for ``bfloat16`` backend in ``torchbench`` test suite, and hence acceptable for majority of the applications. To enable the fast math GEMM kernels, set the following environment variable: - -.. code-block:: bash - - $ export DNNL_DEFAULT_FPMATH_MODE=BF16 - - -When you run the above inference script, you should see the following profiler output with the MKLDNN fast math mode enabled: - -.. table:: - :widths: auto - - ====================== ============ ============ ============ ============ ============ ============ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls - ====================== ============ ============ ============ ============ ============ ============ - aten::addmm 95.61% 6.943s 97.10% 7.052s 23.507ms 300 - aten::clamp_min 2.31% 167.653ms 2.31% 167.653ms 838.265us 200 - aten::copy 1.48% 107.593ms 1.48% 107.593ms 358.643us 300 - mymodel_inference 0.43% 31.167ms 100.00% 7.262s 7.262s 1 - aten::linear 0.04% 2.911ms 97.21% 7.060s 23.533ms 300 - aten::t 0.03% 2.414ms 0.07% 4.892ms 16.307us 300 - aten::relu 0.03% 2.281ms 2.34% 169.934ms 849.670us 200 - ====================== ============ ============ ============ ============ ============ ============ - -**Self CPU time total:** 7.262s - - -This is around ``2x (7.262s vs 16.201s)`` performance improvement with the ``bfloat16`` fastmath kernels. Next, let’s look at the smaller batch dimension scenario. - -**Scenario 2:** A smaller batch dimension, for example, 32: - -.. code-block:: python - - X = torch.rand(32, 64, 64, device=device) - with torch.set_grad_enabled(False): - for _ in range(50): - model(X) #Warmup - with profile(activities=[ProfilerActivity.CPU]) as prof: - with record_function("mymodel_inference"): - for _ in range(100): - model(X) - - print(prof.key_averages().table(sort_by="self_cpu_time_total")) - - -You should see the following profiler output when the above script is run with the PyTorch default configuration: - -.. 
table:: - :widths: auto - - ====================== ============= ============ ============ ============ ============ ============ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls - ====================== ============= ============ ============ ============ ============ ============ - aten::addmm 95.51% 5.821s 97.04% 5.914s 19.713ms 300 - aten::clamp_min 2.33% 142.244ms 2.33% 142.244ms 711.220us 200 - aten::copy 1.51% 92.322ms 1.51% 92.322ms 307.740us 300 - mymodel_inference 0.45% 27.713ms 100.00% 6.094s 6.094s 1 - aten::linear 0.04% 2.495ms 97.16% 5.921s 19.736ms 300 - aten::t 0.03% 2.131ms 0.07% 4.441ms 14.803us 300 - aten::relu 0.03% 1.942ms 2.37% 144.186ms 720.930us 200 - ====================== ============= ============ ============ ============ ============ ============ - -**Self CPU time total:** 6.094s - - -The following output is the profiler output when run with the MKLDNN fast math mode enabled: - -.. code-block:: bash - - $ export DNNL_DEFAULT_FPMATH_MODE=BF16 - -.. table:: - :widths: auto - - ====================== ============ ============ ============ ============ ============ ============= - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls - ====================== ============ ============ ============ ============ ============ ============= - aten::addmm 93.31% 3.848s 95.66% 3.944s 13.148ms 300 - aten::clamp_min 3.43% 141.309ms 3.43% 141.309ms 706.545us 200 - aten::copy 2.33% 95.916ms 2.33% 95.916ms 319.720us 300 - mymodel_inference 0.67% 27.431ms 100.00% 4.123s 4.123s 1 - aten::linear 0.06% 2.471ms 95.83% 3.951s 13.170ms 300 - aten::t 0.05% 2.027ms 0.10% 4.243ms 14.143us 300 - aten::relu 0.05% 1.928ms 3.47% 143.237ms 716.185us 200 - ====================== ============ ============ ============ ============ ============ ============= - -**Self CPU time total:** 4.123s - -The MKLDNN fast math mode yields approximately a **1.47x (4.123s vs 6.094s)** performance improvement for smaller batch dimensions. Although this improvement is noteworthy, the overall performance still leaves room for improvement. This is because of the runtime overhead (weights reorders and kernel launch time) from oneDNN and ACL backend outweighing the compute benefits from the ACL GEMM kernels for the smaller batch compute. - - -Improve Inference Performance with OpenBLAS for Smaller Batch Dimensions ------------------------------------------------------------------------- - -The inference performance for smaller batch dimensions can be improved by offloading the smaller shapes from MKLDNN to OpenBLAS backend. We are working on making the backend selection automatic, with robust heuristics, for the future releases. Till the heuristics are implemented, the smaller shapes can be offloaded to OpenBLAS by increasing the threshold for MKLDNN backend selection. In the following example, we use ``64`` as the threshold, so that input with ``batch dimension of 32`` is not dispatched to MKLDNN. Instead, it is dispatched to OpenBLAS. - -.. code-block:: bash - - $ export TORCH_MKLDNN_MATMUL_MIN_DIM=64 - -Here is the profiler output with OpenBLAS backend: - -.. 
table:: - :widths: auto - - ====================== ============ ============ ============ ============= ============ ============= - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls - ====================== ============ ============ ============ ============= ============ ============= - aten::addmm 96.25% 1.958s 97.51% 1.984s 6.612ms 300 - aten::clamp_min 1.28% 26.124ms 1.28% 26.124ms 130.620us 200 - aten::copy 1.23% 24.951ms 1.23% 24.951ms 83.170us 300 - mymodel_inference 0.86% 17.423ms 100.00% 2.034s 2.034s 1 - aten::linear 0.08% 1.691ms 97.74% 1.988s 6.628ms 300 - aten::t 0.07% 1.520ms 0.14% 2.945ms 9.817us 300 - aten::relu 0.06% 1.258ms 1.35% 27.382ms 136.910us 200 - ====================== ============ ============ ============ ============= ============ ============= - -**Self CPU time total:** 2.034s - - -As you can see above, switching to OpenBLAS doubled the performance **(2.034s vs 4.123s)** compared to the default MKLDNN backend configuration. This becomes significant for even smaller batch dimensions, for example, for a batch dimension of 10: - -.. code-block:: python - - X = torch.rand(10, 64, 64, device=device) - with torch.set_grad_enabled(False): - for _ in range(50): - model(X) #Warmup - with profile(activities=[ProfilerActivity.CPU]) as prof: - with record_function("mymodel_inference"): - for _ in range(100): - model(X) - - print(prof.key_averages().table(sort_by="self_cpu_time_total")) - - -The following is the profiler output with MKLDNN fast math mode: - -.. table:: - :widths: auto - - ====================== ============ ============ ============ ============ ============= ============= - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls - ====================== ============ ============ ============ ============ ============= ============= - aten::addmm 87.81% 3.613s 91.90% 3.781s 12.604ms 300 - aten::clamp_min 7.18% 295.437ms 7.18% 295.437ms 1.477ms 200 - aten::copy 4.07% 167.516ms 4.07% 167.516ms 558.387us 300 - mymodel_inference 0.67% 27.708ms 100.00% 4.115s 4.115s 1 - aten::linear 0.06% 2.499ms 92.06% 3.788s 12.627ms 300 - aten::t 0.05% 1.982ms 0.11% 4.385ms 14.617us 300 - aten::relu 0.05% 1.932ms 7.23% 297.369ms 1.487ms 200 - ====================== ============ ============ ============ ============ ============= ============= - -**Self CPU time total:** 4.115s - - -and the following is the profiler output with the OpenBLAS backend: - -.. code-block:: bash - - $ export TORCH_MKLDNN_MATMUL_MIN_DIM=64 - -.. table:: - :widths: auto - - ====================== ============= ============ ============ ============ ============= ============ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls - ====================== ============= ============ ============ ============ ============= ============ - aten::addmm 92.66% 1.179s 95.23% 1.211s 4.038ms 300 - aten::clamp_min 2.83% 36.060ms 2.83% 36.060ms 180.300us 200 - aten::copy 2.52% 32.013ms 2.52% 32.013ms 106.710us 300 - mymodel_inference 1.38% 17.521ms 100.00% 1.272s 1.272s 1 - aten::linear 0.14% 1.750ms 95.60% 1.216s 4.054ms 300 - aten::t 0.12% 1.475ms 0.24% 3.033ms 10.110us 300 - aten::relu 0.10% 1.285ms 2.94% 37.345ms 186.725us 200 - ====================== ============= ============ ============ ============ ============= ============ - -**Self CPU time total:** 1.272s - - -Here we observed **3.2x (1.272s vs 4.115s)** performance improvement by tuning the backend thresholds appropriately. 
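To recap the backend-selection experiments above, the following is a minimal sketch of how the two tuning knobs could be combined in a single shell session. It assumes the profiling code from the previous steps has been saved to a script (the filename ``inference_profile.py`` is hypothetical); the environment variable settings themselves are the ones already used above.

.. code-block:: bash

    # Baseline: default PyTorch configuration
    $ python3 inference_profile.py

    # Enable the bfloat16 fast math GEMM kernels in the MKLDNN/ACL backend
    $ export DNNL_DEFAULT_FPMATH_MODE=BF16
    $ python3 inference_profile.py

    # Additionally raise the MKLDNN dispatch threshold so that small batch
    # dimensions (for example, 32 or 10 as in the scenarios above) are
    # offloaded to the OpenBLAS backend
    $ export TORCH_MKLDNN_MATMUL_MIN_DIM=64
    $ python3 inference_profile.py

Running the three configurations back to back makes it straightforward to compare the ``Self CPU time total`` reported by the profiler for each backend choice on your own Graviton3 instance.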
- - -Optimize Memory Allocation Overhead with Linux Transparent Huge Pages (THP) ---------------------------------------------------------------------------- - -We also observed that for these larger networks, tensor memory allocations take significant portion of the inference latency. This can be optimized by enabling Linux transparent huge page allocations from PyTorch C10 memory allocator. Currently the feature is not enabled by default because it will increase the memory footprint marginally. Set the following environment variable to enable it: - -.. code-block:: bash - - $ export THP_MEM_ALLOC_ENABLE=1 - -For the batch dimension of 256 and with MKLDNN fast math mode: - -.. code-block:: python - - X = torch.rand(256, 64, 64, device=device) - with torch.set_grad_enabled(False): - for _ in range(50): - model(X) #Warmup - with profile(activities=[ProfilerActivity.CPU]) as prof: - with record_function("mymodel_inference"): - for _ in range(100): - model(X) - - print(prof.key_averages().table(sort_by="self_cpu_time_total")) - - -The following is the profiler output with THP memory allocations enabled: - -.. table:: - :widths: auto - - ====================== ============ ============ ============ ============ ============== ============ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls - ====================== ============ ============ ============ ============ ============== ============ - aten::addmm 91.31% 6.115s 94.39% 6.321s 21.069ms 300 - aten::clamp_min 4.82% 322.568ms 4.82% 322.568ms 1.613ms 200 - aten::copy 3.06% 204.602ms 3.06% 204.602ms 682.007us 300 - mymodel_inference 0.61% 40.777ms 100.00% 6.697s 6.697s 1 - aten::linear 0.05% 3.082ms 94.51% 6.329s 21.097ms 300 - aten::relu 0.04% 2.547ms 4.85% 325.115ms 1.626ms 200 - ====================== ============ ============ ============ ============ ============== ============ - -**Self CPU time total:** 6.697s - -This is an additional **1.08x or 8% (6.697s vs 7.262s)** improvement on top of the already optimized MKLDNN fast math mode measured above. - - -Conclusion ------------- - -In this tutorial, we covered PyTorch inference on AWS Graviton3 instances by covering the basic usage, demonstrating speedups with fast math kernels, comparing different backends for different batch dimensions, and how to optimize tensor memory allocation latencies with Linux transparent huge pages. The recommendation is to use MKLDNN backend with Bfloat16 fastmath mode and THP memory allocations for larger tensor shapes and to use OpenBLAS backend for smaller tensor shapes. We hope that you will give it a try! diff --git a/recipes_source/intel_extension_for_pytorch.rst b/recipes_source/intel_extension_for_pytorch.rst deleted file mode 100644 index 03416102d2b..00000000000 --- a/recipes_source/intel_extension_for_pytorch.rst +++ /dev/null @@ -1,749 +0,0 @@ -Intel® Extension for PyTorch* -============================= - -Intel® Extension for PyTorch* extends PyTorch* with up-to-date features -optimizations for an extra performance boost on Intel hardware. Optimizations -take advantage of AVX-512 Vector Neural Network Instructions (AVX512 VNNI) and -Intel® Advanced Matrix Extensions (Intel® AMX) on Intel CPUs as well as Intel -X\ :sup:`e`\ Matrix Extensions (XMX) AI engines on Intel discrete GPUs. -Moreover, through PyTorch* `xpu` device, Intel® Extension for PyTorch* provides -easy GPU acceleration for Intel discrete GPUs with PyTorch*. - -Intel® Extension for PyTorch* has been released as an open–source project -at `Github `_. 
- -- Source code for CPU is available at `master branch `_. -- Source code for GPU is available at `xpu-master branch `_. - -Features --------- - -Intel® Extension for PyTorch* shares most of features for CPU and GPU. - -- **Ease-of-use Python API:** Intel® Extension for PyTorch* provides simple - frontend Python APIs and utilities for users to get performance optimizations - such as graph optimization and operator optimization with minor code changes. - Typically, only 2 to 3 clauses are required to be added to the original code. -- **Channels Last:** Comparing to the default NCHW memory format, channels_last - (NHWC) memory format could further accelerate convolutional neural networks. - In Intel® Extension for PyTorch*, NHWC memory format has been enabled for - most key CPU operators, though not all of them have been merged to PyTorch - master branch yet. They are expected to be fully landed in PyTorch upstream - soon. -- **Auto Mixed Precision (AMP):** Low precision data type BFloat16 has been - natively supported on the 3rd Generation Xeon scalable Servers (aka Cooper - Lake) with AVX512 instruction set and will be supported on the next - generation of Intel® Xeon® Scalable Processors with Intel® Advanced Matrix - Extensions (Intel® AMX) instruction set with further boosted performance. The - support of Auto Mixed Precision (AMP) with BFloat16 for CPU and BFloat16 - optimization of operators have been massively enabled in Intel® Extension - for PyTorch*, and partially upstreamed to PyTorch master branch. Most of - these optimizations will be landed in PyTorch master through PRs that are - being submitted and reviewed. Auto Mixed Precision (AMP) with both BFloat16 - and Float16 have been enabled for Intel discrete GPUs. -- **Graph Optimization:** To optimize performance further with torchscript, - Intel® Extension for PyTorch* supports fusion of frequently used operator - patterns, like Conv2D+ReLU, Linear+ReLU, etc. The benefit of the fusions are - delivered to users in a transparent fashion. Detailed fusion patterns - supported can be found `here `_. - The graph optimization will be up-streamed to PyTorch with the introduction - of oneDNN Graph API. -- **Operator Optimization:** Intel® Extension for PyTorch* also optimizes - operators and implements several customized operators for performance. A few - ATen operators are replaced by their optimized counterparts in Intel® - Extension for PyTorch* via ATen registration mechanism. Moreover, some - customized operators are implemented for several popular topologies. For - instance, ROIAlign and NMS are defined in Mask R-CNN. To improve performance - of these topologies, Intel® Extension for PyTorch* also optimized these - customized operators. - -Getting Started ---------------- - -Minor code changes are required for users to get start with Intel® Extension -for PyTorch*. Both PyTorch imperative mode and TorchScript mode are -supported. This section introduces usage of Intel® Extension for PyTorch* API -functions for both imperative mode and TorchScript mode, covering data type -Float32 and BFloat16. C++ usage will also be introduced at the end. - -You just need to import Intel® Extension for PyTorch* package and apply its -optimize function against the model object. If it is a training workload, the -optimize function also needs to be applied against the optimizer object. - -For training and inference with BFloat16 data type, `torch.cpu.amp` has been -enabled in PyTorch upstream to support mixed precision with convenience. 
-BFloat16 datatype has been enabled excessively for CPU operators in PyTorch -upstream and Intel® Extension for PyTorch*. Meanwhile `torch.xpu.amp`, -registered by Intel® Extension for PyTorch*, enables easy usage of BFloat16 -and Float16 data types on Intel discrete GPUs. Either `torch.cpu.amp` or -`torch.xpu.amp` matches each operator to its appropriate datatype automatically -and returns the best possible performance. - -Examples -- CPU ---------------- - -This section shows examples of training and inference on CPU with Intel® -Extension for PyTorch* - -The code changes that are required for Intel® Extension for PyTorch* are -highlighted. - -Training -~~~~~~~~ - -Float32 -^^^^^^^ - -.. code:: python3 - - import torch - import torchvision - import intel_extension_for_pytorch as ipex - - LR = 0.001 - DOWNLOAD = True - DATA = 'datasets/cifar10/' - - transform = torchvision.transforms.Compose([ - torchvision.transforms.Resize((224, 224)), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) - ]) - train_dataset = torchvision.datasets.CIFAR10( - root=DATA, - train=True, - transform=transform, - download=DOWNLOAD, - ) - train_loader = torch.utils.data.DataLoader( - dataset=train_dataset, - batch_size=128 - ) - - model = torchvision.models.resnet50() - criterion = torch.nn.CrossEntropyLoss() - optimizer = torch.optim.SGD(model.parameters(), lr = LR, momentum=0.9) - model.train() - model, optimizer = ipex.optimize(model, optimizer=optimizer) - - for batch_idx, (data, target) in enumerate(train_loader): - optimizer.zero_grad() - output = model(data) - loss = criterion(output, target) - loss.backward() - optimizer.step() - print(batch_idx) - torch.save({ - 'model_state_dict': model.state_dict(), - 'optimizer_state_dict': optimizer.state_dict(), - }, 'checkpoint.pth') - -BFloat16 -^^^^^^^^ - -.. code:: python3 - - import torch - import torchvision - import intel_extension_for_pytorch as ipex - - LR = 0.001 - DOWNLOAD = True - DATA = 'datasets/cifar10/' - - transform = torchvision.transforms.Compose([ - torchvision.transforms.Resize((224, 224)), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) - ]) - train_dataset = torchvision.datasets.CIFAR10( - root=DATA, - train=True, - transform=transform, - download=DOWNLOAD, - ) - train_loader = torch.utils.data.DataLoader( - dataset=train_dataset, - batch_size=128 - ) - - model = torchvision.models.resnet50() - criterion = torch.nn.CrossEntropyLoss() - optimizer = torch.optim.SGD(model.parameters(), lr = LR, momentum=0.9) - model.train() - model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.bfloat16) - - for batch_idx, (data, target) in enumerate(train_loader): - optimizer.zero_grad() - with torch.cpu.amp.autocast(): - output = model(data) - loss = criterion(output, target) - loss.backward() - optimizer.step() - print(batch_idx) - torch.save({ - 'model_state_dict': model.state_dict(), - 'optimizer_state_dict': optimizer.state_dict(), - }, 'checkpoint.pth') - -Inference - Imperative Mode -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Float32 -^^^^^^^ - -.. 
code:: python3 - - import torch - import torchvision.models as models - - model = models.resnet50(pretrained=True) - model.eval() - data = torch.rand(1, 3, 224, 224) - - #################### code changes #################### - import intel_extension_for_pytorch as ipex - model = ipex.optimize(model) - ###################################################### - - with torch.no_grad(): - model(data) - -BFloat16 -^^^^^^^^ - -.. code:: python3 - - import torch - from transformers import BertModel - - model = BertModel.from_pretrained(args.model_name) - model.eval() - - vocab_size = model.config.vocab_size - batch_size = 1 - seq_length = 512 - data = torch.randint(vocab_size, size=[batch_size, seq_length]) - - #################### code changes #################### - import intel_extension_for_pytorch as ipex - model = ipex.optimize(model, dtype=torch.bfloat16) - ###################################################### - - with torch.no_grad(): - with torch.cpu.amp.autocast(): - model(data) - -Inference - TorchScript Mode -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -TorchScript mode makes graph optimization possible, hence improves -performance for some topologies. Intel® Extension for PyTorch* enables most -commonly used operator pattern fusion, and users can get the performance -benefit without additional code changes. - -Float32 -^^^^^^^ - -.. code:: python3 - - import torch - import torchvision.models as models - - model = models.resnet50(pretrained=True) - model.eval() - data = torch.rand(1, 3, 224, 224) - - #################### code changes #################### - import intel_extension_for_pytorch as ipex - model = ipex.optimize(model) - ###################################################### - - with torch.no_grad(): - d = torch.rand(1, 3, 224, 224) - model = torch.jit.trace(model, d) - model = torch.jit.freeze(model) - - model(data) - -BFloat16 -^^^^^^^^ - -.. code:: python3 - - import torch - from transformers import BertModel - - model = BertModel.from_pretrained(args.model_name) - model.eval() - - vocab_size = model.config.vocab_size - batch_size = 1 - seq_length = 512 - data = torch.randint(vocab_size, size=[batch_size, seq_length]) - - #################### code changes #################### - import intel_extension_for_pytorch as ipex - model = ipex.optimize(model, dtype=torch.bfloat16) - ###################################################### - - with torch.no_grad(): - with torch.cpu.amp.autocast(): - d = torch.randint(vocab_size, size=[batch_size, seq_length]) - model = torch.jit.trace(model, (d,), check_trace=False, strict=False) - model = torch.jit.freeze(model) - - model(data) - -Examples -- GPU ---------------- - -This section shows examples of training and inference on GPU with Intel® -Extension for PyTorch* - -The code changes that are required for Intel® Extension for PyTorch* are -highlighted with comments in a line above. - -Training -~~~~~~~~ - -Float32 -^^^^^^^ - -.. 
code:: python3 - - import torch - import torchvision - ############# code changes ############### - import intel_extension_for_pytorch as ipex - ############# code changes ############### - - LR = 0.001 - DOWNLOAD = True - DATA = 'datasets/cifar10/' - - transform = torchvision.transforms.Compose([ - torchvision.transforms.Resize((224, 224)), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) - ]) - train_dataset = torchvision.datasets.CIFAR10( - root=DATA, - train=True, - transform=transform, - download=DOWNLOAD, - ) - train_loader = torch.utils.data.DataLoader( - dataset=train_dataset, - batch_size=128 - ) - - model = torchvision.models.resnet50() - criterion = torch.nn.CrossEntropyLoss() - optimizer = torch.optim.SGD(model.parameters(), lr = LR, momentum=0.9) - model.train() - #################################### code changes ################################ - model = model.to("xpu") - model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.float32) - #################################### code changes ################################ - - for batch_idx, (data, target) in enumerate(train_loader): - ########## code changes ########## - data = data.to("xpu") - target = target.to("xpu") - ########## code changes ########## - optimizer.zero_grad() - output = model(data) - loss = criterion(output, target) - loss.backward() - optimizer.step() - print(batch_idx) - torch.save({ - 'model_state_dict': model.state_dict(), - 'optimizer_state_dict': optimizer.state_dict(), - }, 'checkpoint.pth') - -BFloat16 -^^^^^^^^ - -.. code:: python3 - - import torch - import torchvision - ############# code changes ############### - import intel_extension_for_pytorch as ipex - ############# code changes ############### - - LR = 0.001 - DOWNLOAD = True - DATA = 'datasets/cifar10/' - - transform = torchvision.transforms.Compose([ - torchvision.transforms.Resize((224, 224)), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) - ]) - train_dataset = torchvision.datasets.CIFAR10( - root=DATA, - train=True, - transform=transform, - download=DOWNLOAD, - ) - train_loader = torch.utils.data.DataLoader( - dataset=train_dataset, - batch_size=128 - ) - - model = torchvision.models.resnet50() - criterion = torch.nn.CrossEntropyLoss() - optimizer = torch.optim.SGD(model.parameters(), lr = LR, momentum=0.9) - model.train() - ##################################### code changes ################################ - model = model.to("xpu") - model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.bfloat16) - ##################################### code changes ################################ - - for batch_idx, (data, target) in enumerate(train_loader): - optimizer.zero_grad() - ######################### code changes ######################### - data = data.to("xpu") - target = target.to("xpu") - with torch.xpu.amp.autocast(enabled=True, dtype=torch.bfloat16): - ######################### code changes ######################### - output = model(data) - loss = criterion(output, target) - loss.backward() - optimizer.step() - print(batch_idx) - torch.save({ - 'model_state_dict': model.state_dict(), - 'optimizer_state_dict': optimizer.state_dict(), - }, 'checkpoint.pth') - -Inference - Imperative Mode -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Float32 -^^^^^^^ - -.. 
code:: python3 - - import torch - import torchvision.models as models - ############# code changes ############### - import intel_extension_for_pytorch as ipex - ############# code changes ############### - - model = models.resnet50(pretrained=True) - model.eval() - data = torch.rand(1, 3, 224, 224) - - model = model.to(memory_format=torch.channels_last) - data = data.to(memory_format=torch.channels_last) - - #################### code changes ################ - model = model.to("xpu") - data = data.to("xpu") - model = ipex.optimize(model, dtype=torch.float32) - #################### code changes ################ - - with torch.no_grad(): - model(data) - -BFloat16 -^^^^^^^^ - -.. code:: python3 - - import torch - import torchvision.models as models - ############# code changes ############### - import intel_extension_for_pytorch as ipex - ############# code changes ############### - - model = models.resnet50(pretrained=True) - model.eval() - data = torch.rand(1, 3, 224, 224) - - model = model.to(memory_format=torch.channels_last) - data = data.to(memory_format=torch.channels_last) - - #################### code changes ################# - model = model.to("xpu") - data = data.to("xpu") - model = ipex.optimize(model, dtype=torch.bfloat16) - #################### code changes ################# - - with torch.no_grad(): - ################################# code changes ###################################### - with torch.xpu.amp.autocast(enabled=True, dtype=torch.bfloat16, cache_enabled=False): - ################################# code changes ###################################### - model(data) - -Float16 -^^^^^^^ - -.. code:: python3 - - import torch - import torchvision.models as models - ############# code changes ############### - import intel_extension_for_pytorch as ipex - ############# code changes ############### - - model = models.resnet50(pretrained=True) - model.eval() - data = torch.rand(1, 3, 224, 224) - - model = model.to(memory_format=torch.channels_last) - data = data.to(memory_format=torch.channels_last) - - #################### code changes ################ - model = model.to("xpu") - data = data.to("xpu") - model = ipex.optimize(model, dtype=torch.float16) - #################### code changes ################ - - with torch.no_grad(): - ################################# code changes ###################################### - with torch.xpu.amp.autocast(enabled=True, dtype=torch.float16, cache_enabled=False): - ################################# code changes ###################################### - model(data) - -Inference - TorchScript Mode -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -TorchScript mode makes graph optimization possible, hence improves -performance for some topologies. Intel® Extension for PyTorch* enables most -commonly used operator pattern fusion, and users can get the performance -benefit without additional code changes. - -Float32 -^^^^^^^ - -.. 
code:: python3 - - import torch - from transformers import BertModel - ############# code changes ############### - import intel_extension_for_pytorch as ipex - ############# code changes ############### - - model = BertModel.from_pretrained(args.model_name) - model.eval() - - vocab_size = model.config.vocab_size - batch_size = 1 - seq_length = 512 - data = torch.randint(vocab_size, size=[batch_size, seq_length]) - - #################### code changes ################ - model = model.to("xpu") - data = data.to("xpu") - model = ipex.optimize(model, dtype=torch.float32) - #################### code changes ################ - - with torch.no_grad(): - d = torch.randint(vocab_size, size=[batch_size, seq_length]) - ##### code changes ##### - d = d.to("xpu") - ##### code changes ##### - model = torch.jit.trace(model, (d,), check_trace=False, strict=False) - model = torch.jit.freeze(model) - - model(data) - -BFloat16 -^^^^^^^^ - -.. code:: python3 - - import torch - from transformers import BertModel - ############# code changes ############### - import intel_extension_for_pytorch as ipex - ############# code changes ############### - - model = BertModel.from_pretrained(args.model_name) - model.eval() - - vocab_size = model.config.vocab_size - batch_size = 1 - seq_length = 512 - data = torch.randint(vocab_size, size=[batch_size, seq_length]) - - #################### code changes ################# - model = model.to("xpu") - data = data.to("xpu") - model = ipex.optimize(model, dtype=torch.bfloat16) - #################### code changes ################# - - with torch.no_grad(): - d = torch.randint(vocab_size, size=[batch_size, seq_length]) - ################################# code changes ###################################### - d = d.to("xpu") - with torch.xpu.amp.autocast(enabled=True, dtype=torch.bfloat16, cache_enabled=False): - ################################# code changes ###################################### - model = torch.jit.trace(model, (d,), check_trace=False, strict=False) - model = torch.jit.freeze(model) - - model(data) - -Float16 -^^^^^^^ - -.. code:: python3 - - import torch - from transformers import BertModel - ############# code changes ############### - import intel_extension_for_pytorch as ipex - ############# code changes ############### - - model = BertModel.from_pretrained(args.model_name) - model.eval() - - vocab_size = model.config.vocab_size - batch_size = 1 - seq_length = 512 - data = torch.randint(vocab_size, size=[batch_size, seq_length]) - - #################### code changes ################ - model = model.to("xpu") - data = data.to("xpu") - model = ipex.optimize(model, dtype=torch.float16) - #################### code changes ################ - - with torch.no_grad(): - d = torch.randint(vocab_size, size=[batch_size, seq_length]) - ################################# code changes ###################################### - d = d.to("xpu") - with torch.xpu.amp.autocast(enabled=True, dtype=torch.float16, cache_enabled=False): - ################################# code changes ###################################### - model = torch.jit.trace(model, (d,), check_trace=False, strict=False) - model = torch.jit.freeze(model) - - model(data) - -C++ (CPU only) -~~~~~~~~~~~~~~ - -To work with libtorch, C++ library of PyTorch, Intel® Extension for PyTorch* -provides its C++ dynamic library as well. The C++ library is supposed to handle -inference workload only, such as service deployment. For regular development, -please use Python interface. 
Comparing to usage of libtorch, no specific code -changes are required, except for converting input data into channels last data -format. Compilation follows the recommended methodology with CMake. Detailed -instructions can be found in `PyTorch tutorial `_. -During compilation, Intel optimizations will be activated automatically -once C++ dynamic library of Intel® Extension for PyTorch* is linked. - -**example-app.cpp** - -.. code:: cpp - - #include - #include - #include - - int main(int argc, const char* argv[]) { - torch::jit::script::Module module; - try { - module = torch::jit::load(argv[1]); - } - catch (const c10::Error& e) { - std::cerr << "error loading the model\n"; - return -1; - } - std::vector inputs; - // make sure input data are converted to channels last format - inputs.push_back(torch::ones({1, 3, 224, 224}).to(c10::MemoryFormat::ChannelsLast)); - - at::Tensor output = module.forward(inputs).toTensor(); - - return 0; - } - -**CMakeLists.txt** - -:: - - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) - project(example-app) - - find_package(intel_ext_pt_cpu REQUIRED) - - add_executable(example-app example-app.cpp) - target_link_libraries(example-app "${TORCH_LIBRARIES}") - - set_property(TARGET example-app PROPERTY CXX_STANDARD 14) - -**Command for compilation** - -:: - - $ cmake -DCMAKE_PREFIX_PATH= .. - $ make - -If `Found INTEL_EXT_PT_CPU` is shown as `TRUE`, the extension had been linked -into the binary. This can be verified with the Linux command `ldd`. - -:: - - $ cmake -DCMAKE_PREFIX_PATH=/workspace/libtorch .. - -- The C compiler identification is GNU 9.3.0 - -- The CXX compiler identification is GNU 9.3.0 - -- Check for working C compiler: /usr/bin/cc - -- Check for working C compiler: /usr/bin/cc -- works - -- Detecting C compiler ABI info - -- Detecting C compiler ABI info - done - -- Detecting C compile features - -- Detecting C compile features - done - -- Check for working CXX compiler: /usr/bin/c++ - -- Check for working CXX compiler: /usr/bin/c++ -- works - -- Detecting CXX compiler ABI info - -- Detecting CXX compiler ABI info - done - -- Detecting CXX compile features - -- Detecting CXX compile features - done - -- Looking for pthread.h - -- Looking for pthread.h - found - -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed - -- Looking for pthread_create in pthreads - -- Looking for pthread_create in pthreads - not found - -- Looking for pthread_create in pthread - -- Looking for pthread_create in pthread - found - -- Found Threads: TRUE - -- Found Torch: /workspace/libtorch/lib/libtorch.so - -- Found INTEL_EXT_PT_CPU: TRUE - -- Configuring done - -- Generating done - -- Build files have been written to: /workspace/build - - $ ldd example-app - ... - libtorch.so => /workspace/libtorch/lib/libtorch.so (0x00007f3cf98e0000) - libc10.so => /workspace/libtorch/lib/libc10.so (0x00007f3cf985a000) - libintel-ext-pt-cpu.so => /workspace/libtorch/lib/libintel-ext-pt-cpu.so (0x00007f3cf70fc000) - libtorch_cpu.so => /workspace/libtorch/lib/libtorch_cpu.so (0x00007f3ce16ac000) - ... - libdnnl_graph.so.0 => /workspace/libtorch/lib/libdnnl_graph.so.0 (0x00007f3cde954000) - ... - -Model Zoo (CPU only) --------------------- - -Use cases that had already been optimized by Intel engineers are available at -`Model Zoo for Intel® Architecture `_ (with -the branch name in format of `pytorch-r-models`). Many PyTorch use -cases for benchmarking are also available on the GitHub page. 
You can get -performance benefits out-of-the-box by simply running scripts in the Model Zoo. - -Tutorials ---------- - -More detailed tutorials are available in the official Intel® Extension -for PyTorch* Documentation: - -- `CPU `_ -- `GPU `_ diff --git a/recipes_source/intel_neural_compressor_for_pytorch.rst b/recipes_source/intel_neural_compressor_for_pytorch.rst index 67f1a7f333e..ee569382343 100755 --- a/recipes_source/intel_neural_compressor_for_pytorch.rst +++ b/recipes_source/intel_neural_compressor_for_pytorch.rst @@ -4,37 +4,20 @@ Ease-of-use quantization for PyTorch with Intel® Neural Compressor Overview -------- -Most deep learning applications are using 32-bits of floating-point precision -for inference. But low precision data types, especially int8, are getting more -focus due to significant performance boost. One of the essential concerns on -adopting low precision is how to easily mitigate the possible accuracy loss -and reach predefined accuracy requirement. +Most deep learning applications are using 32-bits of floating-point precision for inference. But low precision data types, such as fp8, are getting more focus due to significant performance boost. A key concern in adopting low precision is mitigating accuracy loss while meeting predefined requirements. -Intel® Neural Compressor aims to address the aforementioned concern by extending -PyTorch with accuracy-driven automatic tuning strategies to help user quickly find -out the best quantized model on Intel hardware, including Intel Deep Learning -Boost (`Intel DL Boost `_) -and Intel Advanced Matrix Extensions (`Intel AMX `_). +Intel® Neural Compressor aims to address the aforementioned concern by extending PyTorch with accuracy-driven automatic tuning strategies to help user quickly find out the best quantized model on Intel hardware. -Intel® Neural Compressor has been released as an open-source project -at `Github `_. +Intel® Neural Compressor is an open-source project at `Github `_. Features -------- -- **Ease-of-use Python API:** Intel® Neural Compressor provides simple frontend - Python APIs and utilities for users to do neural network compression with few - line code changes. - Typically, only 5 to 6 clauses are required to be added to the original code. +- **Ease-of-use API:** Intel® Neural Compressor is re-using the PyTorch ``prepare``, ``convert`` API for user usage. -- **Quantization:** Intel® Neural Compressor supports accuracy-driven automatic - tuning process on post-training static quantization, post-training dynamic - quantization, and quantization-aware training on PyTorch fx graph mode and - eager model. +- **Accuracy-driven Tuning:** Intel® Neural Compressor supports accuracy-driven automatic tuning process, provides ``autotune`` API for user usage. -*This tutorial mainly focuses on the quantization part. As for how to use Intel® -Neural Compressor to do pruning and distillation, please refer to corresponding -documents in the Intel® Neural Compressor github repo.* +- **Kinds of Quantization:** Intel® Neural Compressor supports a variety of quantization methods, including classic INT8 quantization, weight-only quantization and the popular FP8 quantization. Neural compressor also provides the latest research in simulation work, such as MX data type emulation quantization. For more details, please refer to `Supported Matrix `_. Getting Started --------------- @@ -45,329 +28,134 @@ Installation .. 
code:: bash # install stable version from pip - pip install neural-compressor + pip install neural-compressor-pt +.. - # install nightly version from pip - pip install -i https://test.pypi.org/simple/ neural-compressor +**Note**: Neural Compressor provides automatic accelerator detection, including HPU, Intel GPU, CUDA, and CPU. To specify the target device, ``INC_TARGET_DEVICE`` is suggested, e.g., ``export INC_TARGET_DEVICE=cpu``. - # install stable version from from conda - conda install neural-compressor -c conda-forge -c intel -*Supported python versions are 3.6 or 3.7 or 3.8 or 3.9* +Examples +~~~~~~~~~~~~ + +This section shows examples of kinds of quantization with Intel® Neural compressor -Usages -~~~~~~ +FP8 Quantization +^^^^^^^^^^^^^^^^ -Minor code changes are required for users to get started with Intel® Neural Compressor -quantization API. Both PyTorch fx graph mode and eager mode are supported. +**FP8 Quantization** is supported by Intel® Gaudi®2&3 AI Accelerator (HPU). To prepare the environment, please refer to `Intel® Gaudi® Documentation `_. -Intel® Neural Compressor takes a FP32 model and a yaml configuration file as inputs. -To construct the quantization process, users can either specify the below settings via -the yaml configuration file or python APIs: +Run the example, -1. Calibration Dataloader (Needed for static quantization) -2. Evaluation Dataloader -3. Evaluation Metric +.. code-block:: python -Intel® Neural Compressor supports some popular dataloaders and evaluation metrics. For -how to configure them in yaml configuration file, user could refer to `Built-in Datasets -`_. + # FP8 Quantization Example + from neural_compressor.torch.quantization import ( + FP8Config, + prepare, + convert, + ) -If users want to use a self-developed dataloader or evaluation metric, Intel® Neural -Compressor supports this by the registration of customized dataloader/metric using python code. + import torch + import torchvision.models as models -For the yaml configuration file format please refer to `yaml template -`_. + # Load a pre-trained ResNet18 model + model = models.resnet18() -The code changes that are required for *Intel® Neural Compressor* are highlighted with -comments in the line above. + # Configure FP8 quantization + qconfig = FP8Config(fp8_config="E4M3") + model = prepare(model, qconfig) -Model -^^^^^ + # Perform calibration (replace with actual calibration data) + calibration_data = torch.randn(1, 3, 224, 224).to("hpu") + model(calibration_data) -In this tutorial, the LeNet model is used to demonstrate how to deal with *Intel® Neural Compressor*. + # Convert the model to FP8 + model = convert(model) -.. 
code-block:: python3 + # Perform inference + input_data = torch.randn(1, 3, 224, 224).to("hpu") + output = model(input_data).to("cpu") + print(output) - # main.py - import torch - import torch.nn as nn - import torch.nn.functional as F - - # LeNet Model definition - class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 10, kernel_size=5) - self.conv2 = nn.Conv2d(10, 20, kernel_size=5) - self.conv2_drop = nn.Dropout2d() - self.fc1 = nn.Linear(320, 50) - self.fc1_drop = nn.Dropout() - self.fc2 = nn.Linear(50, 10) - - def forward(self, x): - x = F.relu(F.max_pool2d(self.conv1(x), 2)) - x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.reshape(-1, 320) - x = F.relu(self.fc1(x)) - x = self.fc1_drop(x) - x = self.fc2(x) - return F.log_softmax(x, dim=1) - - model = Net() - model.load_state_dict(torch.load('./lenet_mnist_model.pth')) - -The pretrained model weight `lenet_mnist_model.pth` comes from -`here `_. - -Accuracy driven quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Intel® Neural Compressor supports accuracy-driven automatic tuning to generate the optimal -int8 model which meets a predefined accuracy goal. - -Below is an example of how to quantize a simple network on PyTorch -`FX graph mode `_ by auto-tuning. - -.. code-block:: yaml - - # conf.yaml - model: - name: LeNet - framework: pytorch_fx - - evaluation: - accuracy: - metric: - topk: 1 - - tuning: - accuracy_criterion: - relative: 0.01 - -.. code-block:: python3 - - # main.py - model.eval() - - from torchvision import datasets, transforms - test_loader = torch.utils.data.DataLoader( - datasets.MNIST('./data', train=False, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - ])), - batch_size=1) - - # launch code for Intel® Neural Compressor - from neural_compressor.experimental import Quantization - quantizer = Quantization("./conf.yaml") - quantizer.model = model - quantizer.calib_dataloader = test_loader - quantizer.eval_dataloader = test_loader - q_model = quantizer() - q_model.save('./output') - -In the `conf.yaml` file, the built-in metric `top1` of Intel® Neural Compressor is specified as -the evaluation method, and `1%` relative accuracy loss is set as the accuracy target for auto-tuning. -Intel® Neural Compressor will traverse all possible quantization config combinations on per-op level -to find out the optimal int8 model that reaches the predefined accuracy target. - -Besides those built-in metrics, Intel® Neural Compressor also supports customized metric through -python code: - -.. code-block:: yaml - - # conf.yaml - model: - name: LeNet - framework: pytorch_fx - - tuning: - accuracy_criterion: - relative: 0.01 - -.. code-block:: python3 - - # main.py - model.eval() - - from torchvision import datasets, transforms - test_loader = torch.utils.data.DataLoader( - datasets.MNIST('./data', train=False, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - ])), - batch_size=1) - - # define a customized metric - class Top1Metric(object): - def __init__(self): - self.correct = 0 - def update(self, output, label): - pred = output.argmax(dim=1, keepdim=True) - self.correct += pred.eq(label.view_as(pred)).sum().item() - def reset(self): - self.correct = 0 - def result(self): - return 100. 
* self.correct / len(test_loader.dataset) - - # launch code for Intel® Neural Compressor - from neural_compressor.experimental import Quantization - quantizer = Quantization("./conf.yaml") - quantizer.model = model - quantizer.calib_dataloader = test_loader - quantizer.eval_dataloader = test_loader - quantizer.metric = Top1Metric() - q_model = quantizer() - q_model.save('./output') - -In the above example, a `class` which contains `update()` and `result()` function is implemented -to record per mini-batch result and calculate final accuracy at the end. - -Quantization aware training -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Besides post-training static quantization and post-training dynamic quantization, Intel® Neural -Compressor supports quantization-aware training with an accuracy-driven automatic tuning mechanism. - -Below is an example of how to do quantization aware training on a simple network on PyTorch -`FX graph mode `_. - -.. code-block:: yaml - - # conf.yaml - model: - name: LeNet - framework: pytorch_fx - - quantization: - approach: quant_aware_training - - evaluation: - accuracy: - metric: - topk: 1 - - tuning: - accuracy_criterion: - relative: 0.01 - -.. code-block:: python3 - - # main.py - model.eval() - - from torchvision import datasets, transforms - train_loader = torch.utils.data.DataLoader( - datasets.MNIST('./data', train=True, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])), - batch_size=64, shuffle=True) - test_loader = torch.utils.data.DataLoader( - datasets.MNIST('./data', train=False, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])), - batch_size=1) - - import torch.optim as optim - optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.1) - - def training_func(model): - model.train() - for epoch in range(1, 3): - for batch_idx, (data, target) in enumerate(train_loader): - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - optimizer.step() - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_loader.dataset), - 100. * batch_idx / len(train_loader), loss.item())) - - # launch code for Intel® Neural Compressor - from neural_compressor.experimental import Quantization - quantizer = Quantization("./conf.yaml") - quantizer.model = model - quantizer.q_func = training_func - quantizer.eval_dataloader = test_loader - q_model = quantizer() - q_model.save('./output') - -Performance only quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Intel® Neural Compressor supports directly yielding int8 model with dummy dataset for the -performance benchmarking purpose. - -Below is an example of how to quantize a simple network on PyTorch -`FX graph mode `_ with a dummy dataset. - -.. code-block:: yaml - - # conf.yaml - model: - name: lenet - framework: pytorch_fx - -.. 
code-block:: python3 - - # main.py - model.eval() - - # launch code for Intel® Neural Compressor - from neural_compressor.experimental import Quantization, common - from neural_compressor.experimental.data.datasets.dummy_dataset import DummyDataset - quantizer = Quantization("./conf.yaml") - quantizer.model = model - quantizer.calib_dataloader = common.DataLoader(DummyDataset([(1, 1, 28, 28)])) - q_model = quantizer() - q_model.save('./output') - -Quantization outputs -~~~~~~~~~~~~~~~~~~~~ - -Users could know how many ops get quantized from log printed by Intel® Neural Compressor -like below: - -:: - - 2021-12-08 14:58:35 [INFO] |********Mixed Precision Statistics*******| - 2021-12-08 14:58:35 [INFO] +------------------------+--------+-------+ - 2021-12-08 14:58:35 [INFO] | Op Type | Total | INT8 | - 2021-12-08 14:58:35 [INFO] +------------------------+--------+-------+ - 2021-12-08 14:58:35 [INFO] | quantize_per_tensor | 2 | 2 | - 2021-12-08 14:58:35 [INFO] | Conv2d | 2 | 2 | - 2021-12-08 14:58:35 [INFO] | max_pool2d | 1 | 1 | - 2021-12-08 14:58:35 [INFO] | relu | 1 | 1 | - 2021-12-08 14:58:35 [INFO] | dequantize | 2 | 2 | - 2021-12-08 14:58:35 [INFO] | LinearReLU | 1 | 1 | - 2021-12-08 14:58:35 [INFO] | Linear | 1 | 1 | - 2021-12-08 14:58:35 [INFO] +------------------------+--------+-------+ - -The quantized model will be generated under `./output` directory, in which there are two files: -1. best_configure.yaml -2. best_model_weights.pt - -The first file contains the quantization configurations of each op, the second file contains -int8 weights and zero point and scale info of activations. - -Deployment -~~~~~~~~~~ - -Users could use the below code to load quantized model and then do inference or performance benchmark. - -.. code-block:: python3 - - from neural_compressor.utils.pytorch import load - int8_model = load('./output', model) +.. + +Weight-only Quantization +^^^^^^^^^^^^^^^^^^^^^^^^ + +**Weight-only Quantization** is also supported on Intel® Gaudi®2&3 AI Accelerator. The quantized model could be loaded as below. + +.. code-block:: python + + from neural_compressor.torch.quantization import load + + # The model name comes from HuggingFace Model Hub. + model_name = "TheBloke/Llama-2-7B-GPTQ" + model = load( + model_name_or_path=model_name, + format="huggingface", + device="hpu", + torch_dtype=torch.bfloat16, + ) +.. + +**Note:** Intel Neural Compressor will convert the model format from auto-gptq to hpu format on the first load and save hpu_model.safetensors to the local cache directory for the next load. So it may take a while to load for the first time. + +Static Quantization with PT2E Backend +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The PT2E path uses ``torch.dynamo`` to capture the eager model into an FX graph model, and then inserts the observers and Q/QD pairs on it. Finally it uses the ``torch.compile`` to perform the pattern matching and replace the Q/DQ pairs with optimized quantized operators. + +There are four steps to perform W8A8 static quantization with PT2E backend: ``export``, ``prepare``, ``convert`` and ``compile``. + +.. code-block:: python + + import torch + from neural_compressor.torch.export import export + from neural_compressor.torch.quantization import StaticQuantConfig, prepare, convert + + # Prepare the float model and example inputs for export model + model = UserFloatModel() + example_inputs = ... 
+ + # Export eager model into FX graph model + exported_model = export(model=model, example_inputs=example_inputs) + # Quantize the model + quant_config = StaticQuantConfig() + prepared_model = prepare(exported_model, quant_config=quant_config) + # Calibrate + run_fn(prepared_model) + q_model = convert(prepared_model) + # Compile the quantized model and replace the Q/DQ pattern with Q-operator + from torch._inductor import config + + config.freezing = True + opt_model = torch.compile(q_model) +.. + +Accuracy-driven Tuning +^^^^^^^^^^^^^^^^^^^^^^ + +To leverage accuracy-driven automatic tuning, a specified tuning space is necessary. The ``autotune`` iterates the tuning space and applies the configuration on given high-precision model then records and compares its evaluation result with the baseline. The tuning process stops when meeting the exit policy. + + +.. code-block:: python + + from neural_compressor.torch.quantization import RTNConfig, TuningConfig, autotune + + + def eval_fn(model) -> float: + return ... + + + tune_config = TuningConfig( + config_set=RTNConfig(use_sym=[False, True], group_size=[32, 128]), + tolerable_loss=0.2, + max_trials=10, + ) + q_model = autotune(model, tune_config=tune_config, eval_fn=eval_fn) +.. Tutorials --------- -Please visit `Intel® Neural Compressor Github repo `_ -for more tutorials. +More detailed tutorials are available in the official Intel® Neural Compressor `doc `_. diff --git a/recipes_source/mobile_interpreter.rst b/recipes_source/mobile_interpreter.rst index dda1dd92435..e6d2056e1a6 100644 --- a/recipes_source/mobile_interpreter.rst +++ b/recipes_source/mobile_interpreter.rst @@ -1,198 +1,10 @@ (beta) Efficient mobile interpreter in Android and iOS ================================================================== -**Author**: `Chen Lai `_, `Martin Yuan `_ +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -Introduction ------------- +Redirecting in 3 seconds... -This tutorial introduces the steps to use PyTorch's efficient interpreter on iOS and Android. We will be using an Image Segmentation demo application as an example. +.. raw:: html -This application will take advantage of the pre-built interpreter libraries available for Android and iOS, which can be used directly with Maven (Android) and CocoaPods (iOS). It is important to note that the pre-built libraries are the available for simplicity, but further size optimization can be achieved with by utilizing PyTorch's custom build capabilities. - -.. note:: If you see the error message: `PytorchStreamReader failed locating file bytecode.pkl: file not found ()`, likely you are using a torch script model that requires the use of the PyTorch JIT interpreter (a version of our PyTorch interpreter that is not as size-efficient). In order to leverage our efficient interpreter, please regenerate the model by running: `module._save_for_lite_interpreter(${model_path})`. - - - If `bytecode.pkl` is missing, likely the model is generated with the api: `module.save(${model_psth})`. - - The api `_load_for_lite_interpreter(${model_psth})` can be helpful to validate model with the efficient mobile interpreter. - -Android -------------------- -Get the Image Segmentation demo app in Android: https://github.com/pytorch/android-demo-app/tree/master/ImageSegmentation - -1. **Prepare model**: Prepare the mobile interpreter version of model by run the script below to generate the scripted model `deeplabv3_scripted.pt` and `deeplabv3_scripted.ptl` - -.. 
code:: python - - import torch - from torch.utils.mobile_optimizer import optimize_for_mobile - model = torch.hub.load('pytorch/vision:v0.7.0', 'deeplabv3_resnet50', pretrained=True) - model.eval() - - scripted_module = torch.jit.script(model) - # Export full jit version model (not compatible mobile interpreter), leave it here for comparison - scripted_module.save("deeplabv3_scripted.pt") - # Export mobile interpreter version model (compatible with mobile interpreter) - optimized_scripted_module = optimize_for_mobile(scripted_module) - optimized_scripted_module._save_for_lite_interpreter("deeplabv3_scripted.ptl") - -2. **Use the PyTorch Android library in the ImageSegmentation app**: Update the `dependencies` part of ``ImageSegmentation/app/build.gradle`` to - -.. code:: gradle - - repositories { - maven { - url "https://oss.sonatype.org/content/repositories/snapshots" - } - } - - dependencies { - implementation 'androidx.appcompat:appcompat:1.2.0' - implementation 'androidx.constraintlayout:constraintlayout:2.0.2' - testImplementation 'junit:junit:4.12' - androidTestImplementation 'androidx.test.ext:junit:1.1.2' - androidTestImplementation 'androidx.test.espresso:espresso-core:3.3.0' - implementation 'org.pytorch:pytorch_android_lite:1.9.0' - implementation 'org.pytorch:pytorch_android_torchvision:1.9.0' - - implementation 'com.facebook.fbjni:fbjni-java-only:0.0.3' - } - - - -3. **Update model loader api**: Update ``ImageSegmentation/app/src/main/java/org/pytorch/imagesegmentation/MainActivity.java`` by - - 4.1 Add new import: `import org.pytorch.LiteModuleLoader` - - 4.2 Replace the way to load pytorch lite model - -.. code:: java - - // mModule = Module.load(MainActivity.assetFilePath(getApplicationContext(), "deeplabv3_scripted.pt")); - mModule = LiteModuleLoader.load(MainActivity.assetFilePath(getApplicationContext(), "deeplabv3_scripted.ptl")); - -4. **Test app**: Build and run the `ImageSegmentation` app in Android Studio - -iOS -------------------- -Get ImageSegmentation demo app in iOS: https://github.com/pytorch/ios-demo-app/tree/master/ImageSegmentation - -1. **Prepare model**: Same as Android. - -2. **Build the project with Cocoapods and prebuilt interpreter** Update the `PodFile` and run `pod install`: - -.. code-block:: podfile - - target 'ImageSegmentation' do - # Comment the next line if you don't want to use dynamic frameworks - use_frameworks! - - # Pods for ImageSegmentation - pod 'LibTorch_Lite', '~>1.9.0' - end - -3. **Update library and API** - - 3.1 Update ``TorchModule.mm``: To use the custom built libraries project, use `` (in ``TorchModule.mm``): - -.. code-block:: swift - - #import - // If it's built from source with xcode, comment out the line above - // and use following headers - // #include - // #include - // #include - -.. code-block:: swift - - @implementation TorchModule { - @protected - // torch::jit::script::Module _impl; - torch::jit::mobile::Module _impl; - } - - - (nullable instancetype)initWithFileAtPath:(NSString*)filePath { - self = [super init]; - if (self) { - try { - _impl = torch::jit::_load_for_mobile(filePath.UTF8String); - // _impl = torch::jit::load(filePath.UTF8String); - // _impl.eval(); - } catch (const std::exception& exception) { - NSLog(@"%s", exception.what()); - return nil; - } - } - return self; - } - -3.2 Update ``ViewController.swift`` - -.. 
code-block:: swift - - // if let filePath = Bundle.main.path(forResource: - // "deeplabv3_scripted", ofType: "pt"), - // let module = TorchModule(fileAtPath: filePath) { - // return module - // } else { - // fatalError("Can't find the model file!") - // } - if let filePath = Bundle.main.path(forResource: - "deeplabv3_scripted", ofType: "ptl"), - let module = TorchModule(fileAtPath: filePath) { - return module - } else { - fatalError("Can't find the model file!") - } - -4. Build and test the app in Xcode. - -How to use mobile interpreter + custom build ---------------------------------------------- -A custom PyTorch interpreter library can be created to reduce binary size, by only containing the operators needed by the model. In order to do that follow these steps: - -1. To dump the operators in your model, say `deeplabv3_scripted`, run the following lines of Python code: - -.. code-block:: python - - # Dump list of operators used by deeplabv3_scripted: - import torch, yaml - model = torch.jit.load('deeplabv3_scripted.ptl') - ops = torch.jit.export_opnames(model) - with open('deeplabv3_scripted.yaml', 'w') as output: - yaml.dump(ops, output) - -In the snippet above, you first need to load the ScriptModule. Then, use export_opnames to return a list of operator names of the ScriptModule and its submodules. Lastly, save the result in a yaml file. The yaml file can be generated for any PyTorch 1.4.0 or above version. You can do that by checking the value of `torch.__version__`. - -2. To run the build script locally with the prepared yaml list of operators, pass in the yaml file generate from the last step into the environment variable SELECTED_OP_LIST. Also in the arguments, specify BUILD_PYTORCH_MOBILE=1 as well as the platform/architechture type. - -**iOS**: Take the simulator build for example, the command should be: - -.. code-block:: bash - - SELECTED_OP_LIST=deeplabv3_scripted.yaml BUILD_PYTORCH_MOBILE=1 IOS_PLATFORM=SIMULATOR ./scripts/build_ios.sh - -**Android**: Take the x86 build for example, the command should be: - -.. code-block:: bash - - SELECTED_OP_LIST=deeplabv3_scripted.yaml ./scripts/build_pytorch_android.sh x86 - - - -Conclusion ----------- - -In this tutorial, we demonstrated how to use PyTorch's efficient mobile interpreter, in an Android and iOS app. - -We walked through an Image Segmentation example to show how to dump the model, build a custom torch library from source and use the new api to run model. - -Our efficient mobile interpreter is still under development, and we will continue improving its size in the future. Note, however, that the APIs are subject to change in future versions. - -Thanks for reading! As always, we welcome any feedback, so please create an issue `here ` - if you have any. - -Learn More ----------- - -- To learn more about PyTorch Mobile, please refer to `PyTorch Mobile Home Page `_ -- To learn more about Image Segmentation, please refer to the `Image Segmentation DeepLabV3 on Android Recipe `_ + diff --git a/recipes_source/mobile_perf.rst b/recipes_source/mobile_perf.rst index aae1447cbf8..8835ddecc6d 100644 --- a/recipes_source/mobile_perf.rst +++ b/recipes_source/mobile_perf.rst @@ -1,356 +1,10 @@ Pytorch Mobile Performance Recipes ================================== -Introduction ----------------- -Performance (aka latency) is crucial to most, if not all, -applications and use-cases of ML model inference on mobile devices. +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. 
-Today, PyTorch executes the models on the CPU backend pending availability -of other hardware backends such as GPU, DSP, and NPU. +Redirecting in 3 seconds... -In this recipe, you will learn: +.. raw:: html -- How to optimize your model to help decrease execution time (higher performance, lower latency) on the mobile device. -- How to benchmark (to check if optimizations helped your use case). - - -Model preparation ------------------ - -We will start with preparing to optimize your model to help decrease execution time -(higher performance, lower latency) on the mobile device. - - -Setup -^^^^^^^ - -First we need to installed pytorch using conda or pip with version at least 1.5.0. - -:: - - conda install pytorch torchvision -c pytorch - -or - -:: - - pip install torch torchvision - -Code your model: - -:: - - import torch - from torch.utils.mobile_optimizer import optimize_for_mobile - - class AnnotatedConvBnReLUModel(torch.nn.Module): - def __init__(self): - super(AnnotatedConvBnReLUModel, self).__init__() - self.conv = torch.nn.Conv2d(3, 5, 3, bias=False).to(dtype=torch.float) - self.bn = torch.nn.BatchNorm2d(5).to(dtype=torch.float) - self.relu = torch.nn.ReLU(inplace=True) - self.quant = torch.quantization.QuantStub() - self.dequant = torch.quantization.DeQuantStub() - - def forward(self, x): - x = x.contiguous(memory_format=torch.channels_last) - x = self.quant(x) - x = self.conv(x) - x = self.bn(x) - x = self.relu(x) - x = self.dequant(x) - return x - - model = AnnotatedConvBnReLUModel() - - -``torch.quantization.QuantStub`` and ``torch.quantization.DeQuantStub()`` are no-op stubs, which will be used for quantization step. - - -1. Fuse operators using ``torch.quantization.fuse_modules`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Do not be confused that fuse_modules is in the quantization package. -It works for all ``torch.nn.Module``. - -``torch.quantization.fuse_modules`` fuses a list of modules into a single module. -It fuses only the following sequence of modules: - -- Convolution, Batch normalization -- Convolution, Batch normalization, Relu -- Convolution, Relu -- Linear, Relu - -This script will fuse Convolution, Batch Normalization and Relu in previously declared model. - -:: - - torch.quantization.fuse_modules(model, [['conv', 'bn', 'relu']], inplace=True) - - -2. Quantize your model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -You can find more about PyTorch quantization in -`the dedicated tutorial `_. - -Quantization of the model not only moves computation to int8, -but also reduces the size of your model on a disk. -That size reduction helps to reduce disk read operations during the first load of the model and decreases the amount of RAM. -Both of those resources can be crucial for the performance of mobile applications. -This code does quantization, using stub for model calibration function, you can find more about it `here `__. - -:: - - model.qconfig = torch.quantization.get_default_qconfig('qnnpack') - torch.quantization.prepare(model, inplace=True) - # Calibrate your model - def calibrate(model, calibration_data): - # Your calibration code here - return - calibrate(model, []) - torch.quantization.convert(model, inplace=True) - - - -3. Use torch.utils.mobile_optimizer -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Torch mobile_optimizer package does several optimizations with the scripted model, -which will help to conv2d and linear operations. 
-It pre-packs model weights in an optimized format and fuses ops above with relu -if it is the next operation. - -First we script the result model from previous step: - -:: - - torchscript_model = torch.jit.script(model) - -Next we call ``optimize_for_mobile`` and save model on the disk. - -:: - - torchscript_model_optimized = optimize_for_mobile(torchscript_model) - torch.jit.save(torchscript_model_optimized, "model.pt") - -4. Prefer Using Channels Last Tensor memory format -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Channels Last(NHWC) memory format was introduced in PyTorch 1.4.0. It is supported only for four-dimensional tensors. This memory format gives a better memory locality for most operators, especially convolution. Our measurements showed a 3x speedup of MobileNetV2 model compared with the default Channels First(NCHW) format. - -At the moment of writing this recipe, PyTorch Android java API does not support using inputs in Channels Last memory format. But it can be used on the TorchScript model level, by adding the conversion to it for model inputs. - -.. code-block:: python - - def forward(self, x): - x = x.contiguous(memory_format=torch.channels_last) - ... - - -This conversion is zero cost if your input is already in Channels Last memory format. After it, all operators will work preserving ChannelsLast memory format. - -5. Android - Reusing tensors for forward -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -This part of the recipe is Android only. - -Memory is a critical resource for android performance, especially on old devices. -Tensors can need a significant amount of memory. -For example, standard computer vision tensor contains 1*3*224*224 elements, -assuming that data type is float and will need 588Kb of memory. - -:: - - FloatBuffer buffer = Tensor.allocateFloatBuffer(1*3*224*224); - Tensor tensor = Tensor.fromBlob(buffer, new long[]{1, 3, 224, 224}); - - -Here we allocate native memory as ``java.nio.FloatBuffer`` and creating ``org.pytorch.Tensor`` which storage will be pointing to the memory of the allocated buffer. - -For most of the use cases, we do not do model forward only once, repeating it with some frequency or as fast as possible. - -If we are doing new memory allocation for every module forward - that will be suboptimal. -Instead of this, we can reuse the same memory that we allocated on the previous step, fill it with new data, and run module forward again on the same tensor object. - -You can check how it looks in code in `pytorch android application example `_. - -:: - - protected AnalysisResult analyzeImage(ImageProxy image, int rotationDegrees) { - if (mModule == null) { - mModule = Module.load(moduleFileAbsoluteFilePath); - mInputTensorBuffer = - Tensor.allocateFloatBuffer(3 * 224 * 224); - mInputTensor = Tensor.fromBlob(mInputTensorBuffer, new long[]{1, 3, 224, 224}); - } - - TensorImageUtils.imageYUV420CenterCropToFloatBuffer( - image.getImage(), rotationDegrees, - 224, 224, - TensorImageUtils.TORCHVISION_NORM_MEAN_RGB, - TensorImageUtils.TORCHVISION_NORM_STD_RGB, - mInputTensorBuffer, 0); - - Tensor outputTensor = mModule.forward(IValue.from(mInputTensor)).toTensor(); - } - -Member fields ``mModule``, ``mInputTensorBuffer`` and ``mInputTensor`` are initialized only once -and buffer is refilled using ``org.pytorch.torchvision.TensorImageUtils.imageYUV420CenterCropToFloatBuffer``. - -6. 
Load time optimization -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -**Available since Pytorch 1.13** - -PyTorch Mobile also supports a FlatBuffer-based file format that is faster -to load. Both flatbuffer and pickle-based model file can be load with the -same ``_load_for_lite_interpreter`` (Python) or ``_load_for_mobile``(C++) API. - -To use the FlatBuffer format, instead of creating the model file with -``model._save_for_lite_interpreter('path/to/file.ptl')``, you can run the following command: - - -One can save using - -:: - - model._save_for_lite_interpreter('path/to/file.ptl', _use_flatbuffer=True) - - -The extra argument ``_use_flatbuffer`` makes a FlatBuffer file instead of a -zip file. The created file will be faster to load. - -For example, using ResNet-50 and running the following script: - -:: - - import torch - from torch.jit import mobile - import time - model = torch.hub.load('pytorch/vision:v0.10.0', 'deeplabv3_resnet50', pretrained=True) - model.eval() - jit_model = torch.jit.script(model) - - jit_model._save_for_lite_interpreter('/tmp/jit_model.ptl') - jit_model._save_for_lite_interpreter('/tmp/jit_model.ff', _use_flatbuffer=True) - - import timeit - print('Load ptl file:') - print(timeit.timeit('from torch.jit import mobile; mobile._load_for_lite_interpreter("/tmp/jit_model.ptl")', - number=20)) - print('Load flatbuffer file:') - print(timeit.timeit('from torch.jit import mobile; mobile._load_for_lite_interpreter("/tmp/jit_model.ff")', - number=20)) - - - -you would get the following result: - -:: - - Load ptl file: - 0.5387594579999999 - Load flatbuffer file: - 0.038842832999999466 - -While speed ups on actual mobile devices will be smaller, you can still expect -3x - 6x load time reductions. - -### Reasons to avoid using a FlatBuffer-based mobile model - -However, FlatBuffer format also has some limitations that you might want to consider: - -* It is only available in PyTorch 1.13 or later. Therefore, client devices compiled - with earlier PyTorch versions might not be able to load it. -* The Flatbuffer library imposes a 4GB limit for file sizes. So it is not suitable - for large models. - -Benchmarking ------------- - -The best way to benchmark (to check if optimizations helped your use case) - is to measure your particular use case that you want to optimize, as performance behavior can vary in different environments. - -PyTorch distribution provides a way to benchmark naked binary that runs the model forward, -this approach can give more stable measurements rather than testing inside the application. - - -Android - Benchmarking Setup -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -This part of the recipe is Android only. - -For this you first need to build benchmark binary: - -:: - - - rm -rf build_android - BUILD_PYTORCH_MOBILE=1 ANDROID_ABI=arm64-v8a ./scripts/build_android.sh -DBUILD_BINARY=ON - -You should have arm64 binary at: ``build_android/bin/speed_benchmark_torch``. -This binary takes ``--model=``, ``--input_dim="1,3,224,224"`` as dimension information for the input and ``--input_type="float"`` as the type of the input as arguments. - -Once you have your android device connected, -push speedbenchark_torch binary and your model to the phone: - -:: - - adb push /data/local/tmp - adb push /data/local/tmp - - -Now we are ready to benchmark your model: - -:: - - adb shell "/data/local/tmp/speed_benchmark_torch --model=/data/local/tmp/model.pt" --input_dims="1,3,224,224" --input_type="float" - ----- output ----- - Starting benchmark. - Running warmup runs. 
- Main runs. - Main run finished. Microseconds per iter: 121318. Iters per second: 8.24281 - - -iOS - Benchmarking Setup -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -For iOS, we'll be using our `TestApp `_ as the benchmarking tool. - -To begin with, let's apply the ``optimize_for_mobile`` method to our python script located at `TestApp/benchmark/trace_model.py `_. Simply modify the code as below. - -:: - - import torch - import torchvision - from torch.utils.mobile_optimizer import optimize_for_mobile - - model = torchvision.models.mobilenet_v2(pretrained=True) - model.eval() - example = torch.rand(1, 3, 224, 224) - traced_script_module = torch.jit.trace(model, example) - torchscript_model_optimized = optimize_for_mobile(traced_script_module) - torch.jit.save(torchscript_model_optimized, "model.pt") - -Now let's run ``python trace_model.py``. If everything works well, we should be able to generate our optimized model in the benchmark directory. - -Next, we're going to build the PyTorch libraries from source. - -:: - - BUILD_PYTORCH_MOBILE=1 IOS_ARCH=arm64 ./scripts/build_ios.sh - -Now that we have the optimized model and PyTorch ready, it's time to generate our XCode project and do benchmarking. To do that, we'll be using a ruby script - `setup.rb` which does the heavy lifting jobs of setting up the XCode project. - -:: - - ruby setup.rb - -Now open the `TestApp.xcodeproj` and plug in your iPhone, you're ready to go. Below is an example result from iPhoneX - -:: - - TestApp[2121:722447] Main runs - TestApp[2121:722447] Main run finished. Milliseconds per iter: 28.767 - TestApp[2121:722447] Iters per second: : 34.762 - TestApp[2121:722447] Done. + diff --git a/recipes_source/model_preparation_android.rst b/recipes_source/model_preparation_android.rst index 55ef7d9735c..22c0e17df31 100644 --- a/recipes_source/model_preparation_android.rst +++ b/recipes_source/model_preparation_android.rst @@ -1,85 +1,10 @@ Model Preparation for Android Recipe ===================================== -This recipe demonstrates how to prepare a PyTorch MobileNet v2 image classification model for Android apps, and how to set up Android projects to use the mobile-ready model file. +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -Introduction ------------------ +Redirecting in 3 seconds... -After a PyTorch model is trained or a pre-trained model is made available, it is normally not ready to be used in mobile apps yet. It needs to be quantized (see the `Quantization Recipe `_), converted to TorchScript so Android apps can load it, and optimized for mobile apps. Furthermore, Android apps need to be set up correctly to enable the use of PyTorch Mobile libraries, before they can load and use the model for inference. +.. raw:: html -Pre-requisites ------------------ - -PyTorch 1.6.0 or 1.7.0 - -torchvision 0.6.0 or 0.7.0 - -Android Studio 3.5.1 or above with NDK installed - -Steps ------------------ - -1. Get Pretrained and Quantized MobileNet v2 Model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To get the MobileNet v2 quantized model, simply do: - -:: - - import torchvision - - model_quantized = torchvision.models.quantization.mobilenet_v2(pretrained=True, quantize=True) - -2. 
Script and Optimize the Model for Mobile Apps -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Use either the `script` or `trace` method to convert the quantized model to the TorchScript format: - -:: - - import torch - - dummy_input = torch.rand(1, 3, 224, 224) - torchscript_model = torch.jit.trace(model_quantized, dummy_input) - -or - -:: - - torchscript_model = torch.jit.script(model_quantized) - - -.. warning:: - The `trace` method only scripts the code path executed during the trace, so it will not work properly for models that include decision branches. See the `Script and Optimize for Mobile Recipe `_ for more details. - -Then optimize the TorchScript formatted model for mobile and save it: - -:: - - from torch.utils.mobile_optimizer import optimize_for_mobile - torchscript_model_optimized = optimize_for_mobile(torchscript_model) - torch.jit.save(torchscript_model_optimized, "mobilenetv2_quantized.pt") - -With the total 7 or 8 (depending on if the `script` or `trace` method is called to get the TorchScript format of the model) lines of code in the two steps above, we have a model ready to be added to mobile apps. - -3. Add the Model and PyTorch Library on Android -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -* In your current or a new Android Studio project, open the build.gradle file, and add the following two lines (the second one is required only if you plan to use a TorchVision model): - -:: - - implementation 'org.pytorch:pytorch_android:1.6.0' - implementation 'org.pytorch:pytorch_android_torchvision:1.6.0' - -* Drag and drop the model file `mobilenetv2_quantized.pt` to your project's assets folder. - -That's it! Now you can build your Android app with the PyTorch library and the model ready to use. To actually write code to use the model, refer to the PyTorch Mobile `Android Quickstart with a HelloWorld Example `_ and `Android Hackathon Example `_. - -Learn More ------------------ - -1. `PyTorch Mobile site `_ - -2. `Introduction to TorchScript `_ + diff --git a/recipes_source/model_preparation_ios.rst b/recipes_source/model_preparation_ios.rst index 2fbacd7fa68..cbb4927eaeb 100644 --- a/recipes_source/model_preparation_ios.rst +++ b/recipes_source/model_preparation_ios.rst @@ -1,95 +1,10 @@ Model Preparation for iOS Recipe ===================================== -This recipe demonstrates how to prepare a PyTorch MobileNet v2 image classification model for iOS apps, and how to set up an iOS project to use the mobile-ready model file. +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -Introduction ------------------ +Redirecting in 3 seconds... -After a PyTorch model is trained or a pre-trained model is made available, it is normally not ready to be used in mobile apps yet. It needs to be quantized (see `Quantization Recipe `_ for more details), converted to TorchScript so iOS apps can load it and optimized for mobile apps (see `Script and Optimize for Mobile Recipe `_). Furthermore, iOS apps need to be set up correctly to enable the use of PyTorch Mobile libraries, before they can load and use the model for inference. +.. raw:: html -Pre-requisites ------------------ - -PyTorch 1.6.0 or 1.7.0 - -torchvision 0.6.0 or 0.7.0 - -Xcode 11 or 12 - -Steps ------------------ - -1. 
Get Pretrained and Quantized MobileNet v2 Model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To get the MobileNet v2 quantized model, simply do: - -:: - - import torchvision - - model_quantized = torchvision.models.quantization.mobilenet_v2(pretrained=True, quantize=True) - -2. Script and Optimize the Model for Mobile Apps -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Use either the script or trace method to convert the quantized model to the TorchScript format: - -:: - - import torch - - dummy_input = torch.rand(1, 3, 224, 224) - torchscript_model = torch.jit.trace(model_quantized, dummy_input) - -or - -:: - - torchscript_model = torch.jit.script(model_quantized) - -.. warning:: - The `trace` method only scripts the code path executed during the trace, so it will not work properly for models that include decision branches. See the `Script and Optimize for Mobile Recipe `_ for more details. - - -Then optimize the TorchScript formatted model for mobile and save it: - -:: - - from torch.utils.mobile_optimizer import optimize_for_mobile - torchscript_model_optimized = optimize_for_mobile(torchscript_model) - torch.jit.save(torchscript_model_optimized, "mobilenetv2_quantized.pt") - -With the total 7 or 8 (depending on if the script or trace method is called to get the TorchScript format of the model) lines of code in the two steps above, we have a model ready to be added to mobile apps. - -3. Add the Model and PyTorch Library on iOS -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To use the mobile-ready model `mobilenetv2_quantized.pt` in an iOS app, either create a new Xcode project or in your existing Xcode project, then follow the steps below: - -* Open a Mac Terminal, cd to your iOS app's project folder; - -* If your iOS app does not use Cocoapods yet, run `pod init` first to generate the `Podfile` file. - -* Edit `Podfile` either from Xcode or any editor, and add the following line under the target: - -:: - - pod 'LibTorch', '~>1.6.1' - -* Run `pod install` from the Terminal and then open your project's xcworkspace file; - -* Save the two files `TorchModule.h` and `TorchModule.mm` from `here `_ and drag and drop them to your project. If your project is Swift based, a message box with the title "Would you like to configure an Objective-C bridging header?" will show up; click the "Create Bridging Header" button to create a Swift to Objective-c bridging header file, and add `#import "TorchModule.h"` to the header file `-Bridging-Header.h`; - -* Drag and drop the model file `mobilenetv2_quantized.pt` to the project. - -After these steps, you can successfully build and run your Xcode project. To actually write code to use the model, refer to the PyTorch Mobile `iOS Code Walkthrough `_ and two complete ready-to-run sample iOS apps `HelloWorld `_ and `iOS Hackathon Example `_. - - -Learn More ------------------ - -1. `PyTorch Mobile site `_ - -2. `Introduction to TorchScript `_ + diff --git a/recipes_source/profile_with_itt.rst b/recipes_source/profile_with_itt.rst index 7ddb1ab3fee..566fd614f22 100644 --- a/recipes_source/profile_with_itt.rst +++ b/recipes_source/profile_with_itt.rst @@ -58,6 +58,10 @@ Launch Intel® VTune™ Profiler To verify the functionality, you need to start an Intel® VTune™ Profiler instance. Please check the `Intel® VTune™ Profiler User Guide `__ for steps to launch Intel® VTune™ Profiler. +.. 
note:: + Users can also use web-server-ui by following `Intel® VTune™ Profiler Web Server UI Guide `__ + ex : vtune-backend --web-port=8080 --allow-remote-access --enable-server-profiling + Once you get the Intel® VTune™ Profiler GUI launched, you should see a user interface as below: .. figure:: /_static/img/itt_tutorial/vtune_start.png @@ -66,8 +70,8 @@ Once you get the Intel® VTune™ Profiler GUI launched, you should see a user i Three sample results are available on the left side navigation bar under `sample (matrix)` project. If you do not want profiling results appear in this default sample project, you can create a new project via the button `New Project...` under the blue `Configure Analysis...` button. To start a new profiling, click the blue `Configure Analysis...` button to initiate configuration of the profiling. -Configure Profiling -~~~~~~~~~~~~~~~~~~~ +Configure Profiling for CPU +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Once you click the `Configure Analysis...` button, you should see the screen below: @@ -77,6 +81,16 @@ Once you click the `Configure Analysis...` button, you should see the screen bel The right side of the windows is split into 3 parts: `WHERE` (top left), `WHAT` (bottom left), and `HOW` (right). With `WHERE`, you can assign a machine where you want to run the profiling on. With `WHAT`, you can set the path of the application that you want to profile. To profile a PyTorch script, it is recommended to wrap all manual steps, including activating a Python environment and setting required environment variables, into a bash script, then profile this bash script. In the screenshot above, we wrapped all steps into the `launch.sh` bash script and profile `bash` with the parameter to be ``. On the right side `HOW`, you can choose whatever type that you would like to profile. Intel® VTune™ Profiler provides a bunch of profiling types that you can choose from. Details can be found at `Intel® VTune™ Profiler User Guide `__. + +Configure Profiling for XPU +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Pick GPU Offload Profiling Type instead of Hotspots, and follow the same instructions as CPU to Launch the Application. + +.. figure:: /_static/img/itt_tutorial/vtune_xpu_config.png + :width: 100% + :align: center + + Read Profiling Result ~~~~~~~~~~~~~~~~~~~~~ @@ -101,6 +115,18 @@ As illustrated on the right side navigation bar, brown portions in the timeline Of course there are much more enriched sets of profiling features that Intel® VTune™ Profiler provides to help you understand a performance issue. When you understand the root cause of a performance issue, you can get it fixed. More detailed usage instructions are available at `Intel® VTune™ Profiler User Guide `__. +Read XPU Profiling Result +~~~~~~~~~~~~~~~~~~~~~~~~~ + +With a successful profiling with ITT, you can open `Platform` tab of the profiling result to see labels in the Intel® VTune™ Profiler timeline. + +.. figure:: /_static/img/itt_tutorial/vtune_xpu_timeline.png + :width: 100% + :align: center + + +The timeline shows the main thread as a `python` thread on the top. Labeled PyTorch operators and customized regions are shown in the main thread row. All operators starting with `aten::` are operators labeled implicitly by the ITT feature in PyTorch. The timeline also shows the GPU Computing Queue on the top, and users could see different XPU Kernels dispatched into GPU Queue. 
+ A short sample code showcasing how to use PyTorch ITT APIs ---------------------------------------------------------- @@ -128,8 +154,12 @@ The topology is formed by two operators, `Conv2d` and `Linear`. Three iterations return x def main(): - m = ITTSample() + m = ITTSample + # unmark below code for XPU + # m = m.to("xpu") x = torch.rand(10, 3, 244, 244) + # unmark below code for XPU + # x = x.to("xpu") with torch.autograd.profiler.emit_itt(): for i in range(3) # Labeling a region with pair of range_push and range_pop diff --git a/recipes_source/ptmobile_recipes_summary.rst b/recipes_source/ptmobile_recipes_summary.rst index cddee940f2a..fdf9f58e43d 100644 --- a/recipes_source/ptmobile_recipes_summary.rst +++ b/recipes_source/ptmobile_recipes_summary.rst @@ -1,40 +1,10 @@ Summary of PyTorch Mobile Recipes ===================================== -This summary provides a top level overview of recipes for PyTorch Mobile to help developers choose which recipes to follow for their PyTorch-powered mobile app development. +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -Introduction ----------------- +Redirecting in 3 seconds... -When a PyTorch model is trained or retrained, or when a pre-trained model is available, for mobile deployment, follow the the recipes outlined in this summary so mobile apps can successfully use the model. +.. raw:: html -Pre-requisites ----------------- - -PyTorch 1.6.0 or 1.7.0 - -(Optional) torchvision 0.6.0 or 0.7.0 - -For iOS development: Xcode 11 or 12 - -For Android development: Android Studio 3.5.1 or above (with NDK installed); or Android SDK, NDK, Gradle, JDK. - -New Recipes for PyTorch Mobile --------------------------------- - -* (Recommended) To fuse a list of PyTorch modules into a single module to reduce the model size before quantization, read the `Fuse Modules recipe `_. - -* (Recommended) To reduce the model size and make it run faster without losing much on accuracy, read the `Quantization Recipe `_. - -* (Must) To convert the model to TorchScipt and (optional) optimize it for mobile apps, read the `Script and Optimize for Mobile Recipe `_. - -* (Must for iOS development) To add the model in an iOS project and use PyTorch pod for iOS, read the `Model preparation for iOS Recipe `_. - -* (Must for Android development) To add the model in an Android project and use the PyTorch library for Android, read the `Model preparation for Android Recipe `_. - - -Learn More ------------------ - -1. `PyTorch Mobile site `_ -2. `PyTorch Mobile Performance Recipes `_ + diff --git a/recipes_source/quantization.rst b/recipes_source/quantization.rst deleted file mode 100644 index ac9cd48fe8c..00000000000 --- a/recipes_source/quantization.rst +++ /dev/null @@ -1,135 +0,0 @@ -Quantization Recipe -===================================== - -This recipe demonstrates how to quantize a PyTorch model so it can run with reduced size and faster inference speed with about the same accuracy as the original model. Quantization can be applied to both server and mobile model deployment, but it can be especially important or even critical on mobile, because a non-quantized model's size may exceed the limit that an iOS or Android app allows for, cause the deployment or OTA update to take too much time, and make the inference too slow for a good user experience. - -Introduction ------------- - -Quantization is a technique that converts 32-bit floating numbers in the model parameters to 8-bit integers. 
With quantization, the model size and memory footprint can be reduced to 1/4 of its original size, and the inference can be made about 2-4 times faster, while the accuracy stays about the same. - -There are overall three approaches or workflows to quantize a model: post training dynamic quantization, post training static quantization, and quantization aware training. But if the model you want to use already has a quantized version, you can use it directly without going through any of the three workflows above. For example, the `torchvision` library already includes quantized versions for models MobileNet v2, ResNet 18, ResNet 50, Inception v3, GoogleNet, among others. So we will make the last approach another workflow, albeit a simple one. - -.. note:: - The quantization support is available for a limited set of operators. See `this `_ for more information. - -Pre-requisites ------------------ - -PyTorch 1.6.0 or 1.7.0 - -torchvision 0.6.0 or 0.7.0 - -Workflows ------------- - -Use one of the four workflows below to quantize a model. - -1. Use Pretrained Quantized MobileNet v2 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To get the MobileNet v2 quantized model, simply do: - -:: - - import torchvision - model_quantized = torchvision.models.quantization.mobilenet_v2(pretrained=True, quantize=True) - - -To compare the size difference of a non-quantized MobileNet v2 model with its quantized version: - -:: - - model = torchvision.models.mobilenet_v2(pretrained=True) - - import os - import torch - - def print_model_size(mdl): - torch.save(mdl.state_dict(), "tmp.pt") - print("%.2f MB" %(os.path.getsize("tmp.pt")/1e6)) - os.remove('tmp.pt') - - print_model_size(model) - print_model_size(model_quantized) - - -The outputs will be: - -:: - - 14.27 MB - 3.63 MB - -2. Post Training Dynamic Quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To apply Dynamic Quantization, which converts all the weights in a model from 32-bit floating numbers to 8-bit integers but doesn't convert the activations to int8 till just before performing the computation on the activations, simply call `torch.quantization.quantize_dynamic`: - -:: - - model_dynamic_quantized = torch.quantization.quantize_dynamic( - model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8 - ) - -where `qconfig_spec` specifies the list of submodule names in `model` to apply quantization to. - -.. warning:: An important limitation of Dynamic Quantization, while it is the easiest workflow if you do not have a pre-trained quantized model ready for use, is that it currently only supports `nn.Linear` and `nn.LSTM` in `qconfig_spec`, meaning that you will have to use Static Quantization or Quantization Aware Training, to be discussed later, to quantize other modules such as `nn.Conv2d`. - -The full documentation of the `quantize_dynamic` API call is `here `_. Three other examples of using the post training dynamic quantization are `the Bert example `_, `an LSTM model example `_, and another `demo LSTM example `_. - -3. Post Training Static Quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -This method converts both the weights and the activations to 8-bit integers beforehand so there won’t be on-the-fly conversion on the activations during the inference, as the dynamic quantization does. While post-training static quantization can significantly enhance inference speed and reduce model size, this method may degrade the original model's accuracy more compared to post training dynamic quantization. 
- -To apply static quantization on a model, run the following code: - -:: - - backend = "qnnpack" - model.qconfig = torch.quantization.get_default_qconfig(backend) - torch.backends.quantized.engine = backend - model_static_quantized = torch.quantization.prepare(model, inplace=False) - model_static_quantized = torch.quantization.convert(model_static_quantized, inplace=False) - -After this, running `print_model_size(model_static_quantized)` shows the static quantized model is `3.98MB`. - -A complete model definition and static quantization example is `here `_. A dedicated static quantization tutorial is `here `_. - -.. note:: - To make the model run on mobile devices which normally have arm architecture, you need to use `qnnpack` for `backend`; to run the model on computer with x86 architecture, use `x86`` (the old `fbgemm` is still available but 'x86' is the recommended default). - -4. Quantization Aware Training -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Quantization aware training inserts fake quantization to all the weights and activations during the model training process and results in higher inference accuracy than the post-training quantization methods. It is typically used in CNN models. - -To enable a model for quantization aware traing, define in the `__init__` method of the model definition a `QuantStub` and a `DeQuantStub` to convert tensors from floating point to quantized type and vice versa: - -:: - - self.quant = torch.quantization.QuantStub() - self.dequant = torch.quantization.DeQuantStub() - -Then in the beginning and the end of the `forward` method of the model definition, call `x = self.quant(x)` and `x = self.dequant(x)`. - -To do a quantization aware training, use the following code snippet: - -:: - - model.qconfig = torch.quantization.get_default_qat_qconfig(backend) - model_qat = torch.quantization.prepare_qat(model, inplace=False) - # quantization aware training goes here - model_qat = torch.quantization.convert(model_qat.eval(), inplace=False) - -For more detailed examples of the quantization aware training, see `here `_ and `here `_. - -A pre-trained quantized model can also be used for quantized aware transfer learning, using the same `quant` and `dequant` calls shown above. See `here `_ for a complete example. - -After a quantized model is generated using one of the steps above, before the model can be used to run on mobile devices, it needs to be further converted to the `TorchScript` format and then optimized for mobile apps. See the `Script and Optimize for Mobile recipe `_ for details. - -Learn More ------------------ - -For more info on the different workflows of quantization, see `here `_ and `here `_. diff --git a/recipes_source/recipes/README.txt b/recipes_source/recipes/README.txt index 18e4d7106b1..4ed6d351ae3 100644 --- a/recipes_source/recipes/README.txt +++ b/recipes_source/recipes/README.txt @@ -25,34 +25,22 @@ PyTorch Recipes Dynamic Quantization https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html -7. save_load_across_devices.py - Saving and loading models across devices in PyTorch - https://pytorch.org/tutorials/recipes/recipes/save_load_across_devices.html - -8. saving_and_loading_a_general_checkpoint.py - Saving and loading a general checkpoint in PyTorch - https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_a_general_checkpoint.html - -9. 
saving_and_loading_models_for_inference.py - Saving and loading models for inference in PyTorch - https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_models_for_inference.html - -10. saving_multiple_models_in_one_file.py - Saving and loading multiple models in one file using PyTorch - https://pytorch.org/tutorials/recipes/recipes/saving_multiple_models_in_one_file.html - -11. warmstarting_model_using_parameters_from_a_different_model.py +7. warmstarting_model_using_parameters_from_a_different_model.py Warmstarting models using parameters from different model https://pytorch.org/tutorials/recipes/recipes/warmstarting_model_using_parameters_from_a_different_model.html -12. zeroing_out_gradients.py +8. zeroing_out_gradients.py Zeroing out gradients https://pytorch.org/tutorials/recipes/recipes/zeroing_out_gradients.html -13. mobile_perf.py +9. mobile_perf.py PyTorch Mobile Performance Recipes https://pytorch.org/tutorials/recipes/mobile_perf.html -14. amp_recipe.py +10. amp_recipe.py Automatic Mixed Precision https://pytorch.org/tutorials/recipes/amp_recipe.html + +11. regional_compilation.py + Reducing torch.compile cold start compilation time with regional compilation + https://pytorch.org/tutorials/recipes/regional_compilation.html diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py index b8a4d942333..91ce19a93a9 100644 --- a/recipes_source/recipes/amp_recipe.py +++ b/recipes_source/recipes/amp_recipe.py @@ -150,7 +150,7 @@ def make_model(in_size, out_size, num_layers): # The same ``GradScaler`` instance should be used for the entire convergence run. # If you perform multiple convergence runs in the same script, each run should use # a dedicated fresh ``GradScaler`` instance. ``GradScaler`` instances are lightweight. -scaler = torch.cuda.amp.GradScaler() +scaler = torch.amp.GradScaler("cuda") for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): @@ -182,7 +182,7 @@ def make_model(in_size, out_size, num_layers): net = make_model(in_size, out_size, num_layers) opt = torch.optim.SGD(net.parameters(), lr=0.001) -scaler = torch.cuda.amp.GradScaler(enabled=use_amp) +scaler = torch.amp.GradScaler("cuda" ,enabled=use_amp) start_timer() for epoch in range(epochs): diff --git a/recipes_source/recipes/benchmark.py b/recipes_source/recipes/benchmark.py index d02157a83e4..96fdf109bde 100644 --- a/recipes_source/recipes/benchmark.py +++ b/recipes_source/recipes/benchmark.py @@ -292,7 +292,7 @@ def batched_dot_bmm(a, b): # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # While ``timeit.Timer.autorange`` takes a single continuous measurement -# of at least 0.2 seconds, `torch.utils.benchmark.blocked_autorange` +# of at least 0.2 seconds, `torch.utils.benchmark.Timer.blocked_autorange` # takes many measurements whose times total at least 0.2 seconds (which # can be changed by the `min_run_time` parameter) subject to the constraint # that timing overhead is a small fraction of the overall measurement. 
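For reference, ``Timer.blocked_autorange`` can be exercised with a minimal sketch like the one below; the shapes and the matmul workload are arbitrary and chosen only for illustration.

.. code-block:: python

    import torch
    from torch.utils.benchmark import Timer

    x = torch.randn(64, 1024)
    w = torch.randn(1024, 1024)

    t = Timer(stmt="x @ w", globals={"x": x, "w": w})
    # Collect measurements until at least 0.2 seconds of run time has accumulated
    m = t.blocked_autorange(min_run_time=0.2)
    print(m)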
@@ -884,5 +884,5 @@ def pretty_print(result): # # Take a look at these other recipes to continue your learning: # -# - `PyTorch Profiler `_ +# - `PyTorch Profiler `_ # diff --git a/recipes_source/recipes/dynamic_quantization.py b/recipes_source/recipes/dynamic_quantization.py deleted file mode 100644 index eb9605d0c63..00000000000 --- a/recipes_source/recipes/dynamic_quantization.py +++ /dev/null @@ -1,294 +0,0 @@ -""" -Dynamic Quantization -==================== - -In this recipe you will see how to take advantage of Dynamic -Quantization to accelerate inference on an LSTM-style recurrent neural -network. This reduces the size of the model weights and speeds up model -execution. - -Introduction -------------- - -There are a number of trade-offs that can be made when designing neural -networks. During model development and training you can alter the -number of layers and number of parameters in a recurrent neural network -and trade-off accuracy against model size and/or model latency or -throughput. Such changes can take lot of time and compute resources -because you are iterating over the model training. Quantization gives -you a way to make a similar trade off between performance and model -accuracy with a known model after training is completed. - -You can give it a try in a single session and you will certainly reduce -your model size significantly and may get a significant latency -reduction without losing a lot of accuracy. - -What is dynamic quantization? ------------------------------ - -Quantizing a network means converting it to use a reduced precision -integer representation for the weights and/or activations. This saves on -model size and allows the use of higher throughput math operations on -your CPU or GPU. - -When converting from floating point to integer values you are -essentially multiplying the floating point value by some scale factor -and rounding the result to a whole number. The various quantization -approaches differ in the way they approach determining that scale -factor. - -The key idea with dynamic quantization as described here is that we are -going to determine the scale factor for activations dynamically based on -the data range observed at runtime. This ensures that the scale factor -is "tuned" so that as much signal as possible about each observed -dataset is preserved. - -The model parameters on the other hand are known during model conversion -and they are converted ahead of time and stored in INT8 form. - -Arithmetic in the quantized model is done using vectorized INT8 -instructions. Accumulation is typically done with INT16 or INT32 to -avoid overflow. This higher precision value is scaled back to INT8 if -the next layer is quantized or converted to FP32 for output. - -Dynamic quantization is relatively free of tuning parameters which makes -it well suited to be added into production pipelines as a standard part -of converting LSTM models to deployment. - - - -.. note:: - Limitations on the approach taken here - - - This recipe provides a quick introduction to the dynamic quantization - features in PyTorch and the workflow for using it. Our focus is on - explaining the specific functions used to convert the model. We will - make a number of significant simplifications in the interest of brevity - and clarity - - -1. You will start with a minimal LSTM network -2. You are simply going to initialize the network with a random hidden - state -3. You are going to test the network with random inputs -4. 
You are not going to train the network in this tutorial -5. You will see that the quantized form of this network is smaller and - runs faster than the floating point network we started with -6. You will see that the output values are generally in the same - ballpark as the output of the FP32 network, but we are not - demonstrating here the expected accuracy loss on a real trained - network - -You will see how dynamic quantization is done and be able to see -suggestive reductions in memory use and latency times. Providing a -demonstration that the technique can preserve high levels of model -accuracy on a trained LSTM is left to a more advanced tutorial. If you -want to move right away to that more rigorous treatment please proceed -to the `advanced dynamic quantization -tutorial `__. - -Steps -------------- - -This recipe has 5 steps. - -1. Set Up - Here you define a very simple LSTM, import modules, and establish - some random input tensors. - -2. Do the Quantization - Here you instantiate a floating point model and then create quantized - version of it. - -3. Look at Model Size - Here you show that the model size gets smaller. - -4. Look at Latency - Here you run the two models and compare model runtime (latency). - -5. Look at Accuracy - Here you run the two models and compare outputs. - - -1: Set Up -~~~~~~~~~~~~~~~ -This is a straightforward bit of code to set up for the rest of the -recipe. - -The unique module we are importing here is torch.quantization which -includes PyTorch's quantized operators and conversion functions. We also -define a very simple LSTM model and set up some inputs. - -""" - -# import the modules used here in this recipe -import torch -import torch.quantization -import torch.nn as nn -import copy -import os -import time - -# define a very, very simple LSTM for demonstration purposes -# in this case, we are wrapping ``nn.LSTM``, one layer, no preprocessing or postprocessing -# inspired by -# `Sequence Models and Long Short-Term Memory Networks tutorial `__. -class lstm_for_demonstration(nn.Module): - """Elementary Long Short Term Memory style model which simply wraps ``nn.LSTM`` - Not to be used for anything other than demonstration. - """ - def __init__(self,in_dim,out_dim,depth): - super(lstm_for_demonstration,self).__init__() - self.lstm = nn.LSTM(in_dim,out_dim,depth) - - def forward(self,inputs,hidden): - out,hidden = self.lstm(inputs,hidden) - return out, hidden - - -torch.manual_seed(29592) # set the seed for reproducibility - -#shape parameters -model_dimension=8 -sequence_length=20 -batch_size=1 -lstm_depth=1 - -# random data for input -inputs = torch.randn(sequence_length,batch_size,model_dimension) -# hidden is actually is a tuple of the initial hidden state and the initial cell state -hidden = (torch.randn(lstm_depth,batch_size,model_dimension), torch.randn(lstm_depth,batch_size,model_dimension)) - - -###################################################################### -# 2: Do the Quantization -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Now we get to the fun part. First we create an instance of the model -# called ``float\_lstm`` then we are going to quantize it. We're going to use -# the `torch.quantization.quantize_dynamic `__ function, which takes the model, then a list of the submodules -# which we want to -# have quantized if they appear, then the datatype we are targeting. This -# function returns a quantized version of the original model as a new -# module. -# -# That's all it takes. 
-# - - # here is our floating point instance -float_lstm = lstm_for_demonstration(model_dimension, model_dimension,lstm_depth) - -# this is the call that does the work -quantized_lstm = torch.quantization.quantize_dynamic( - float_lstm, {nn.LSTM, nn.Linear}, dtype=torch.qint8 -) - -# show the changes that were made -print('Here is the floating point version of this module:') -print(float_lstm) -print('') -print('and now the quantized version:') -print(quantized_lstm) - - -###################################################################### -# 3. Look at Model Size -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# We've quantized the model. What does that get us? Well the first -# benefit is that we've replaced the FP32 model parameters with INT8 -# values (and some recorded scale factors). This means about 75% less data -# to store and move around. With the default values the reduction shown -# below will be less than 75% but if you increase the model size above -# (for example you can set model dimension to something like 80) this will -# converge towards 4x smaller as the stored model size dominated more and -# more by the parameter values. -# - -def print_size_of_model(model, label=""): - torch.save(model.state_dict(), "temp.p") - size=os.path.getsize("temp.p") - print("model: ",label,' \t','Size (KB):', size/1e3) - os.remove('temp.p') - return size - -# compare the sizes -f=print_size_of_model(float_lstm,"fp32") -q=print_size_of_model(quantized_lstm,"int8") -print("{0:.2f} times smaller".format(f/q)) - - -###################################################################### -# 4. Look at Latency -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# The second benefit is that the quantized model will typically run -# faster. This is due to a combinations of effects including at least: -# -# 1. Less time spent moving parameter data in -# 2. Faster INT8 operations -# -# As you will see the quantized version of this super-simple network runs -# faster. This will generally be true of more complex networks but as they -# say "your mileage may vary" depending on a number of factors including -# the structure of the model and the hardware you are running on. -# - -# compare the performance -print("Floating point FP32") - -##################################################################### -# .. code-block:: python -# -# %timeit float_lstm.forward(inputs, hidden) - -print("Quantized INT8") - -###################################################################### -# .. code-block:: python -# -# %timeit quantized_lstm.forward(inputs,hidden) - - -###################################################################### -# 5: Look at Accuracy -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# We are not going to do a careful look at accuracy here because we are -# working with a randomly initialized network rather than a properly -# trained one. However, I think it is worth quickly showing that the -# quantized network does produce output tensors that are "in the same -# ballpark" as the original one. -# -# For a more detailed analysis please see the more advanced tutorials -# referenced at the end of this recipe. 
-# - -# run the float model -out1, hidden1 = float_lstm(inputs, hidden) -mag1 = torch.mean(abs(out1)).item() -print('mean absolute value of output tensor values in the FP32 model is {0:.5f} '.format(mag1)) - -# run the quantized model -out2, hidden2 = quantized_lstm(inputs, hidden) -mag2 = torch.mean(abs(out2)).item() -print('mean absolute value of output tensor values in the INT8 model is {0:.5f}'.format(mag2)) - -# compare them -mag3 = torch.mean(abs(out1-out2)).item() -print('mean absolute value of the difference between the output tensors is {0:.5f} or {1:.2f} percent'.format(mag3,mag3/mag1*100)) - - -###################################################################### -# Learn More -# ------------ -# We've explained what dynamic quantization is, what benefits it brings, -# and you have used the ``torch.quantization.quantize_dynamic()`` function -# to quickly quantize a simple LSTM model. -# -# This was a fast and high level treatment of this material; for more -# detail please continue learning with `(beta) Dynamic Quantization on an LSTM Word Language Model Tutorial `_. -# -# -# Additional Resources -# -------------------- -# -# * `Quantization API Documentaion `_ -# * `(beta) Dynamic Quantization on BERT `_ -# * `(beta) Dynamic Quantization on an LSTM Word Language Model `_ -# * `Introduction to Quantization on PyTorch `_ -# diff --git a/recipes_source/recipes/module_load_state_dict_tips.py b/recipes_source/recipes/module_load_state_dict_tips.py index 17c812b016f..70e9830cb3c 100644 --- a/recipes_source/recipes/module_load_state_dict_tips.py +++ b/recipes_source/recipes/module_load_state_dict_tips.py @@ -39,7 +39,7 @@ def forward(self, x): # to ``torch.load``, the ``torch.device()`` context manager and the ``assign`` # keyword argument to ``nn.Module.load_state_dict()``. -state_dict = torch.load('checkpoint.pth', mmap=True) +state_dict = torch.load('checkpoint.pth', mmap=True, weights_only=True) with torch.device('meta'): meta_m = SomeModule(1000) meta_m.load_state_dict(state_dict, assign=True) @@ -47,7 +47,7 @@ def forward(self, x): ############################################################################# # Compare the snippet below to the one above: -state_dict = torch.load('checkpoint.pth') +state_dict = torch.load('checkpoint.pth', weights_only=True) m = SomeModule(1000) m.load_state_dict(state_dict) @@ -71,7 +71,7 @@ def forward(self, x): # * Waiting for the entire checkpoint to be loaded into RAM before performing, for example, some per-tensor processing. start_time = time.time() -state_dict = torch.load('checkpoint.pth') +state_dict = torch.load('checkpoint.pth', weights_only=True) end_time = time.time() print(f"loading time without mmap={end_time - start_time}") @@ -84,7 +84,7 @@ def forward(self, x): # storages will be memory-mapped. start_time = time.time() -state_dict = torch.load('checkpoint.pth', mmap=True) +state_dict = torch.load('checkpoint.pth', mmap=True, weights_only=True) end_time = time.time() print(f"loading time with mmap={end_time - start_time}") diff --git a/recipes_source/recipes/profiler_recipe.py b/recipes_source/recipes/profiler_recipe.py index 47d9f86d8a8..a8d1a4dc6b3 100644 --- a/recipes_source/recipes/profiler_recipe.py +++ b/recipes_source/recipes/profiler_recipe.py @@ -1,28 +1,33 @@ """ PyTorch Profiler ==================================== -This recipe explains how to use PyTorch profiler and measure the time and -memory consumption of the model's operators. 
- -Introduction ------------- -PyTorch includes a simple profiler API that is useful when user needs -to determine the most expensive operators in the model. - -In this recipe, we will use a simple Resnet model to demonstrate how to -use profiler to analyze model performance. - -Setup ------ -To install ``torch`` and ``torchvision`` use the following command: - -.. code-block:: sh - - pip install torch torchvision - - +**Author:** `Shivam Raikundalia `_ """ +###################################################################### +# This recipe explains how to use PyTorch profiler and measure the time and +# memory consumption of the model's operators. +# +# Introduction +# ------------ +# PyTorch includes a simple profiler API that is useful when the user needs +# to determine the most expensive operators in the model. +# +# In this recipe, we will use a simple Resnet model to demonstrate how to +# use the profiler to analyze model performance. +# +# Prerequisites +# --------------- +# - ``torch >= 2.3.0`` +# +# Setup +# ----- +# To install ``torch`` and ``torchvision`` use the following command: +# +# .. code-block:: sh +# +# pip install torch torchvision +# ###################################################################### # Steps @@ -45,7 +50,7 @@ import torch import torchvision.models as models -from torch.profiler import profile, record_function, ProfilerActivity +from torch.profiler import profile, ProfilerActivity, record_function ###################################################################### @@ -70,10 +75,10 @@ # - ``ProfilerActivity.CPU`` - PyTorch operators, TorchScript functions and # user-defined code labels (see ``record_function`` below); # - ``ProfilerActivity.CUDA`` - on-device CUDA kernels; +# - ``ProfilerActivity.XPU`` - on-device XPU kernels; # - ``record_shapes`` - whether to record shapes of the operator inputs; # - ``profile_memory`` - whether to report amount of memory consumed by # model's Tensors; -# - ``use_cuda`` - whether to measure execution time of CUDA kernels. # # Note: when using CUDA, profiler also shows the runtime CUDA events # occurring on the host. @@ -105,22 +110,24 @@ ###################################################################### # The output will look like (omitting some columns): - -# --------------------------------- ------------ ------------ ------------ ------------ -# Name Self CPU CPU total CPU time avg # of Calls -# --------------------------------- ------------ ------------ ------------ ------------ -# model_inference 5.509ms 57.503ms 57.503ms 1 -# aten::conv2d 231.000us 31.931ms 1.597ms 20 -# aten::convolution 250.000us 31.700ms 1.585ms 20 -# aten::_convolution 336.000us 31.450ms 1.573ms 20 -# aten::mkldnn_convolution 30.838ms 31.114ms 1.556ms 20 -# aten::batch_norm 211.000us 14.693ms 734.650us 20 -# aten::_batch_norm_impl_index 319.000us 14.482ms 724.100us 20 -# aten::native_batch_norm 9.229ms 14.109ms 705.450us 20 -# aten::mean 332.000us 2.631ms 125.286us 21 -# aten::select 1.668ms 2.292ms 8.988us 255 -# --------------------------------- ------------ ------------ ------------ ------------ -# Self CPU time total: 57.549m +# +# .. 
code-block:: sh +# +# --------------------------------- ------------ ------------ ------------ ------------ +# Name Self CPU CPU total CPU time avg # of Calls +# --------------------------------- ------------ ------------ ------------ ------------ +# model_inference 5.509ms 57.503ms 57.503ms 1 +# aten::conv2d 231.000us 31.931ms 1.597ms 20 +# aten::convolution 250.000us 31.700ms 1.585ms 20 +# aten::_convolution 336.000us 31.450ms 1.573ms 20 +# aten::mkldnn_convolution 30.838ms 31.114ms 1.556ms 20 +# aten::batch_norm 211.000us 14.693ms 734.650us 20 +# aten::_batch_norm_impl_index 319.000us 14.482ms 724.100us 20 +# aten::native_batch_norm 9.229ms 14.109ms 705.450us 20 +# aten::mean 332.000us 2.631ms 125.286us 21 +# aten::select 1.668ms 2.292ms 8.988us 255 +# --------------------------------- ------------ ------------ ------------ ------------ +# Self CPU time total: 57.549m # ###################################################################### @@ -133,7 +140,11 @@ # To get a finer granularity of results and include operator input shapes, pass ``group_by_input_shape=True`` # (note: this requires running the profiler with ``record_shapes=True``): -print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10)) +print( + prof.key_averages(group_by_input_shape=True).table( + sort_by="cpu_time_total", row_limit=10 + ) +) ######################################################################################## # The output might look like this (omitting some columns): @@ -162,16 +173,32 @@ ###################################################################### # Profiler can also be used to analyze performance of models executed on GPUs: - -model = models.resnet18().cuda() -inputs = torch.randn(5, 3, 224, 224).cuda() - -with profile(activities=[ - ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof: +# Users could switch between cpu, cuda and xpu +activities = [ProfilerActivity.CPU] +if torch.cuda.is_available(): + device = "cuda" + activities += [ProfilerActivity.CUDA] +elif torch.xpu.is_available(): + device = "xpu" + activities += [ProfilerActivity.XPU] +else: + print( + "Neither CUDA nor XPU devices are available to demonstrate profiling on acceleration devices" + ) + import sys + + sys.exit(0) + +sort_by_keyword = device + "_time_total" + +model = models.resnet18().to(device) +inputs = torch.randn(5, 3, 224, 224).to(device) + +with profile(activities=activities, record_shapes=True) as prof: with record_function("model_inference"): model(inputs) -print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)) +print(prof.key_averages().table(sort_by=sort_by_keyword, row_limit=10)) ###################################################################### # (Note: the first use of CUDA profiling may bring an extra overhead.) @@ -200,7 +227,33 @@ # ###################################################################### -# Note the occurrence of on-device kernels in the output (e.g. ``sgemm_32x32x32_NN``). +# (Note: the first use of XPU profiling may bring an extra overhead.) + +###################################################################### +# The resulting table output (omitting some columns): +# +# .. 
code-block:: sh +# +# ------------------------------ ------------ ------------ ------------ ------------ ------------ +# Name Self XPU Self XPU % XPU total XPU time avg # of Calls +# ------------------------------ ------------ ------------ ------------ ------------ ------------ +# model_inference 0.000us 0.00% 2.567ms 2.567ms 1 +# aten::conv2d 0.000us 0.00% 1.871ms 93.560us 20 +# aten::convolution 0.000us 0.00% 1.871ms 93.560us 20 +# aten::_convolution 0.000us 0.00% 1.871ms 93.560us 20 +# aten::convolution_overrideable 1.871ms 72.89% 1.871ms 93.560us 20 +# gen_conv 1.484ms 57.82% 1.484ms 74.216us 20 +# aten::batch_norm 0.000us 0.00% 432.640us 21.632us 20 +# aten::_batch_norm_impl_index 0.000us 0.00% 432.640us 21.632us 20 +# aten::native_batch_norm 432.640us 16.85% 432.640us 21.632us 20 +# conv_reorder 386.880us 15.07% 386.880us 6.448us 60 +# ------------------------------ ------------ ------------ ------------ ------------ ------------ +# Self CPU time total: 712.486ms +# Self XPU time total: 2.567ms +# + +###################################################################### +# Note the occurrence of on-device kernels in the output (e.g. ``sgemm_32x32x32_NN`` for CUDA or ``gen_conv`` for XPU). ###################################################################### # 4. Using profiler to analyze memory consumption @@ -215,8 +268,9 @@ model = models.resnet18() inputs = torch.randn(5, 3, 224, 224) -with profile(activities=[ProfilerActivity.CPU], - profile_memory=True, record_shapes=True) as prof: +with profile( + activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True +) as prof: model(inputs) print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10)) @@ -267,17 +321,33 @@ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # Profiling results can be outputted as a ``.json`` trace file: - -model = models.resnet18().cuda() -inputs = torch.randn(5, 3, 224, 224).cuda() - -with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: +# Tracing CUDA or XPU kernels +# Users could switch between cpu, cuda and xpu +activities = [ProfilerActivity.CPU] +if torch.cuda.is_available(): + device = "cuda" + activities += [ProfilerActivity.CUDA] +elif torch.xpu.is_available(): + device = "xpu" + activities += [ProfilerActivity.XPU] +else: + print( + "Neither CUDA nor XPU devices are available to demonstrate profiling on acceleration devices" + ) + import sys + + sys.exit(0) + +model = models.resnet18().to(device) +inputs = torch.randn(5, 3, 224, 224).to(device) + +with profile(activities=activities) as prof: model(inputs) prof.export_chrome_trace("trace.json") ###################################################################### -# You can examine the sequence of profiled operators and CUDA kernels +# You can examine the sequence of profiled operators and CUDA/XPU kernels # in Chrome trace viewer (``chrome://tracing``): # # .. 
image:: ../../_static/img/trace_img.png @@ -288,15 +358,17 @@ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # Profiler can be used to analyze Python and TorchScript stack traces: +sort_by_keyword = "self_" + device + "_time_total" with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + activities=activities, with_stack=True, + experimental_config=torch._C._profiler._ExperimentalConfig(verbose=True), ) as prof: model(inputs) # Print aggregated stats -print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total", row_limit=2)) +print(prof.key_averages(group_by_stack_n=5).table(sort_by=sort_by_keyword, row_limit=2)) ################################################################################# # The output might look like this (omitting some columns): @@ -346,12 +418,7 @@ from torch.profiler import schedule -my_schedule = schedule( - skip_first=10, - wait=5, - warmup=1, - active=3, - repeat=2) +my_schedule = schedule(skip_first=10, wait=5, warmup=1, active=3, repeat=2) ###################################################################### # Profiler assumes that the long-running job is composed of steps, numbered @@ -385,33 +452,32 @@ # To send the signal to the profiler that the next step has started, call ``prof.step()`` function. # The current profiler step is stored in ``prof.step_num``. # -# The following example shows how to use all of the concepts above: +# The following example shows how to use all of the concepts above for CUDA and XPU Kernels: + +sort_by_keyword = "self_" + device + "_time_total" + def trace_handler(p): - output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10) + output = p.key_averages().table(sort_by=sort_by_keyword, row_limit=10) print(output) p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json") + with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], - schedule=torch.profiler.schedule( - wait=1, - warmup=1, - active=2), - on_trace_ready=trace_handler + activities=activities, + schedule=torch.profiler.schedule(wait=1, warmup=1, active=2), + on_trace_ready=trace_handler, ) as p: for idx in range(8): model(inputs) p.step() - ###################################################################### # Learn More # ---------- # # Take a look at the following recipes/tutorials to continue your learning: # -# - `PyTorch Benchmark `_ -# - `PyTorch Profiler with TensorBoard `_ tutorial -# - `Visualizing models, data, and training with TensorBoard `_ tutorial +# - `PyTorch Benchmark `_ +# - `Visualizing models, data, and training with TensorBoard `_ tutorial # diff --git a/recipes_source/recipes/save_load_across_devices.py b/recipes_source/recipes/save_load_across_devices.py deleted file mode 100644 index be950e15b13..00000000000 --- a/recipes_source/recipes/save_load_across_devices.py +++ /dev/null @@ -1,181 +0,0 @@ -""" -Saving and loading models across devices in PyTorch -=================================================== - -There may be instances where you want to save and load your neural -networks across different devices. - -Introduction ------------- - -Saving and loading models across devices is relatively straightforward -using PyTorch. In this recipe, we will experiment with saving and -loading models across CPUs and GPUs. - -Setup ------ - -In order for every code block to run properly in this recipe, you must -first change the runtime to “GPU” or higher. Once you do, we need to -install ``torch`` if it isn’t already available. - -.. 
code-block:: sh - - pip install torch - -""" - -###################################################################### -# Steps -# ----- -# -# 1. Import all necessary libraries for loading our data -# 2. Define and initialize the neural network -# 3. Save on a GPU, load on a CPU -# 4. Save on a GPU, load on a GPU -# 5. Save on a CPU, load on a GPU -# 6. Saving and loading ``DataParallel`` models -# -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For this recipe, we will use ``torch`` and its subsidiaries ``torch.nn`` -# and ``torch.optim``. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -###################################################################### -# 2. Define and initialize the neural network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For sake of example, we will create a neural network for training -# images. To learn more see the Defining a Neural Network recipe. -# - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - -net = Net() -print(net) - - -###################################################################### -# 3. Save on GPU, Load on CPU -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# When loading a model on a CPU that was trained with a GPU, pass -# ``torch.device('cpu')`` to the ``map_location`` argument in the -# ``torch.load()`` function. -# - -# Specify a path to save to -PATH = "model.pt" - -# Save -torch.save(net.state_dict(), PATH) - -# Load -device = torch.device('cpu') -model = Net() -model.load_state_dict(torch.load(PATH, map_location=device)) - - -###################################################################### -# In this case, the storages underlying the tensors are dynamically -# remapped to the CPU device using the ``map_location`` argument. -# -# 4. Save on GPU, Load on GPU -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# When loading a model on a GPU that was trained and saved on GPU, simply -# convert the initialized model to a CUDA optimized model using -# ``model.to(torch.device('cuda'))``. -# -# Be sure to use the ``.to(torch.device('cuda'))`` function on all model -# inputs to prepare the data for the model. -# - -# Save -torch.save(net.state_dict(), PATH) - -# Load -device = torch.device("cuda") -model = Net() -model.load_state_dict(torch.load(PATH)) -model.to(device) - - -###################################################################### -# Note that calling ``my_tensor.to(device)`` returns a new copy of -# ``my_tensor`` on GPU. It does NOT overwrite ``my_tensor``. Therefore, -# remember to manually overwrite tensors: -# ``my_tensor = my_tensor.to(torch.device('cuda'))``. -# -# 5. Save on CPU, Load on GPU -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# When loading a model on a GPU that was trained and saved on CPU, set the -# ``map_location`` argument in the ``torch.load()`` function to -# ``cuda:device_id``. This loads the model to a given GPU device. -# -# Be sure to call ``model.to(torch.device('cuda'))`` to convert the -# model’s parameter tensors to CUDA tensors. 
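######################################################################
# For instance, a minimal end-to-end sketch of this load-on-GPU pattern
# (reusing the ``Net`` and ``PATH`` defined above; the 3x32x32 input batch is
# a hypothetical placeholder, and ``torch.nn.functional`` is imported
# explicitly because ``Net.forward`` relies on it):
#
# .. code-block:: python
#
#     import torch.nn.functional as F  # used inside Net.forward
#
#     device = torch.device("cuda")
#     model = Net()
#     model.load_state_dict(torch.load(PATH, map_location="cuda:0"))
#     model.to(device)
#     model.eval()
#
#     batch = torch.randn(1, 3, 32, 32).to(device)  # hypothetical input batch
#     with torch.no_grad():
#         prediction = model(batch).argmax(dim=1)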
-# -# Finally, also be sure to use the ``.to(torch.device('cuda'))`` function -# on all model inputs to prepare the data for the CUDA optimized model. -# - -# Save -torch.save(net.state_dict(), PATH) - -# Load -device = torch.device("cuda") -model = Net() -# Choose whatever GPU device number you want -model.load_state_dict(torch.load(PATH, map_location="cuda:0")) -# Make sure to call input = input.to(device) on any input tensors that you feed to the model -model.to(device) - - -###################################################################### -# 6. Saving ``torch.nn.DataParallel`` Models -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# ``torch.nn.DataParallel`` is a model wrapper that enables parallel GPU -# utilization. -# -# To save a ``DataParallel`` model generically, save the -# ``model.module.state_dict()``. This way, you have the flexibility to -# load the model any way you want to any device you want. -# - -# Save -torch.save(net.module.state_dict(), PATH) - -# Load to whatever device you want - - -###################################################################### -# Congratulations! You have successfully saved and loaded models across -# devices in PyTorch. -# diff --git a/recipes_source/recipes/save_load_across_devices.rst b/recipes_source/recipes/save_load_across_devices.rst new file mode 100644 index 00000000000..fbda1562201 --- /dev/null +++ b/recipes_source/recipes/save_load_across_devices.rst @@ -0,0 +1,10 @@ +Save Load Across Devices +======================== + +This tutorial was deprecated. There is a newer tutorial that covers the same topic: https://pytorch.org/tutorials/beginner/saving_loading_models.html + +Redirecting in 3 seconds... + +.. raw:: html + + diff --git a/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py b/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py deleted file mode 100644 index 31b14f3a28a..00000000000 --- a/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py +++ /dev/null @@ -1,155 +0,0 @@ -""" -Saving and loading a general checkpoint in PyTorch -================================================== -Saving and loading a general checkpoint model for inference or -resuming training can be helpful for picking up where you last left off. -When saving a general checkpoint, you must save more than just the -model’s state_dict. It is important to also save the optimizer’s -state_dict, as this contains buffers and parameters that are updated as -the model trains. Other items that you may want to save are the epoch -you left off on, the latest recorded training loss, external -``torch.nn.Embedding`` layers, and more, based on your own algorithm. - -Introduction ------------- -To save multiple checkpoints, you must organize them in a dictionary and -use ``torch.save()`` to serialize the dictionary. A common PyTorch -convention is to save these checkpoints using the ``.tar`` file -extension. To load the items, first initialize the model and optimizer, -then load the dictionary locally using torch.load(). From here, you can -easily access the saved items by simply querying the dictionary as you -would expect. - -In this recipe, we will explore how to save and load multiple -checkpoints. - -Setup ------ -Before we begin, we need to install ``torch`` if it isn’t already -available. - -:: - - pip install torch - - -""" - - - -###################################################################### -# Steps -# ----- -# -# 1. Import all necessary libraries for loading our data -# 2. 
Define and initialize the neural network -# 3. Initialize the optimizer -# 4. Save the general checkpoint -# 5. Load the general checkpoint -# -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For this recipe, we will use ``torch`` and its subsidiaries ``torch.nn`` -# and ``torch.optim``. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -###################################################################### -# 2. Define and initialize the neural network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For sake of example, we will create a neural network for training -# images. To learn more see the Defining a Neural Network recipe. -# - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - -net = Net() -print(net) - - -###################################################################### -# 3. Initialize the optimizer -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We will use SGD with momentum. -# - -optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) - - -###################################################################### -# 4. Save the general checkpoint -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Collect all relevant information and build your dictionary. -# - -# Additional information -EPOCH = 5 -PATH = "model.pt" -LOSS = 0.4 - -torch.save({ - 'epoch': EPOCH, - 'model_state_dict': net.state_dict(), - 'optimizer_state_dict': optimizer.state_dict(), - 'loss': LOSS, - }, PATH) - - -###################################################################### -# 5. Load the general checkpoint -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Remember to first initialize the model and optimizer, then load the -# dictionary locally. -# - -model = Net() -optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) - -checkpoint = torch.load(PATH) -model.load_state_dict(checkpoint['model_state_dict']) -optimizer.load_state_dict(checkpoint['optimizer_state_dict']) -epoch = checkpoint['epoch'] -loss = checkpoint['loss'] - -model.eval() -# - or - -model.train() - - -###################################################################### -# You must call ``model.eval()`` to set dropout and batch normalization -# layers to evaluation mode before running inference. Failing to do this -# will yield inconsistent inference results. -# -# If you wish to resuming training, call ``model.train()`` to ensure these -# layers are in training mode. -# -# Congratulations! You have successfully saved and loaded a general -# checkpoint for inference and/or resuming training in PyTorch. -# diff --git a/recipes_source/recipes/saving_and_loading_a_general_checkpoint.rst b/recipes_source/recipes/saving_and_loading_a_general_checkpoint.rst new file mode 100644 index 00000000000..b868c26a6cd --- /dev/null +++ b/recipes_source/recipes/saving_and_loading_a_general_checkpoint.rst @@ -0,0 +1,10 @@ +Saving And Loading A General Checkpoint +======================================= + +This tutorial was deprecated. 
There is a newer tutorial that covers the same topic: https://pytorch.org/tutorials/beginner/saving_loading_models.html + +Redirecting in 3 seconds... + +.. raw:: html + + diff --git a/recipes_source/recipes/saving_and_loading_models_for_inference.py b/recipes_source/recipes/saving_and_loading_models_for_inference.py deleted file mode 100644 index cd24b77c1de..00000000000 --- a/recipes_source/recipes/saving_and_loading_models_for_inference.py +++ /dev/null @@ -1,168 +0,0 @@ -""" -Saving and loading models for inference in PyTorch -================================================== -There are two approaches for saving and loading models for inference in -PyTorch. The first is saving and loading the ``state_dict``, and the -second is saving and loading the entire model. - -Introduction ------------- -Saving the model’s ``state_dict`` with the ``torch.save()`` function -will give you the most flexibility for restoring the model later. This -is the recommended method for saving models, because it is only really -necessary to save the trained model’s learned parameters. -When saving and loading an entire model, you save the entire module -using Python’s -`pickle `__ module. Using -this approach yields the most intuitive syntax and involves the least -amount of code. The disadvantage of this approach is that the serialized -data is bound to the specific classes and the exact directory structure -used when the model is saved. The reason for this is because pickle does -not save the model class itself. Rather, it saves a path to the file -containing the class, which is used during load time. Because of this, -your code can break in various ways when used in other projects or after -refactors. -In this recipe, we will explore both ways on how to save and load models -for inference. - -Setup ------ -Before we begin, we need to install ``torch`` if it isn’t already -available. - - -:: - - pip install torch - - -""" - - -###################################################################### -# Steps -# ----- -# -# 1. Import all necessary libraries for loading our data -# 2. Define and initialize the neural network -# 3. Initialize the optimizer -# 4. Save and load the model via ``state_dict`` -# 5. Save and load the entire model -# -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For this recipe, we will use ``torch`` and its subsidiaries ``torch.nn`` -# and ``torch.optim``. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -###################################################################### -# 2. Define and initialize the neural network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For sake of example, we will create a neural network for training -# images. To learn more see the Defining a Neural Network recipe. -# - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - -net = Net() -print(net) - - -###################################################################### -# 3. 
Initialize the optimizer -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We will use SGD with momentum. -# - -optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) - - -###################################################################### -# 4. Save and load the model via ``state_dict`` -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Let’s save and load our model using just ``state_dict``. -# - -# Specify a path -PATH = "state_dict_model.pt" - -# Save -torch.save(net.state_dict(), PATH) - -# Load -model = Net() -model.load_state_dict(torch.load(PATH)) -model.eval() - - -###################################################################### -# A common PyTorch convention is to save models using either a ``.pt`` or -# ``.pth`` file extension. -# -# Notice that the ``load_state_dict()`` function takes a dictionary -# object, NOT a path to a saved object. This means that you must -# deserialize the saved state_dict before you pass it to the -# ``load_state_dict()`` function. For example, you CANNOT load using -# ``model.load_state_dict(PATH)``. -# -# Remember too, that you must call ``model.eval()`` to set dropout and -# batch normalization layers to evaluation mode before running inference. -# Failing to do this will yield inconsistent inference results. -# -# 5. Save and load entire model -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Now let’s try the same thing with the entire model. -# - -# Specify a path -PATH = "entire_model.pt" - -# Save -torch.save(net, PATH) - -# Load -model = torch.load(PATH) -model.eval() - - -###################################################################### -# Again here, remember that you must call ``model.eval()`` to set dropout and -# batch normalization layers to evaluation mode before running inference. -# -# Congratulations! You have successfully saved and load models for -# inference in PyTorch. -# -# Learn More -# ---------- -# -# Take a look at these other recipes to continue your learning: -# -# - `Saving and loading a general checkpoint in PyTorch `__ -# - `Saving and loading multiple models in one file using PyTorch `__ diff --git a/recipes_source/recipes/saving_and_loading_models_for_inference.rst b/recipes_source/recipes/saving_and_loading_models_for_inference.rst new file mode 100644 index 00000000000..19e1405dd81 --- /dev/null +++ b/recipes_source/recipes/saving_and_loading_models_for_inference.rst @@ -0,0 +1,10 @@ +Saving And Loading Models For Inference +======================================= + +This tutorial was deprecated. There is a newer tutorial that covers the same topic: https://pytorch.org/tutorials/beginner/saving_loading_models.html + +Redirecting in 3 seconds... + +.. raw:: html + + diff --git a/recipes_source/recipes/saving_multiple_models_in_one_file.py b/recipes_source/recipes/saving_multiple_models_in_one_file.py deleted file mode 100644 index f468d7ac6a1..00000000000 --- a/recipes_source/recipes/saving_multiple_models_in_one_file.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Saving and loading multiple models in one file using PyTorch -============================================================ -Saving and loading multiple models can be helpful for reusing models -that you have previously trained. - -Introduction ------------- -When saving a model comprised of multiple ``torch.nn.Modules``, such as -a GAN, a sequence-to-sequence model, or an ensemble of models, you must -save a dictionary of each model’s state_dict and corresponding -optimizer. 
You can also save any other items that may aid you in -resuming training by simply appending them to the dictionary. -To load the models, first initialize the models and optimizers, then -load the dictionary locally using ``torch.load()``. From here, you can -easily access the saved items by simply querying the dictionary as you -would expect. -In this recipe, we will demonstrate how to save multiple models to one -file using PyTorch. - -Setup ------ -Before we begin, we need to install ``torch`` if it isn’t already -available. - -.. code-block:: sh - - pip install torch - -""" - - - -###################################################################### -# Steps -# ----- -# -# 1. Import all necessary libraries for loading our data -# 2. Define and initialize the neural network -# 3. Initialize the optimizer -# 4. Save multiple models -# 5. Load multiple models -# -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For this recipe, we will use ``torch`` and its subsidiaries ``torch.nn`` -# and ``torch.optim``. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -###################################################################### -# 2. Define and initialize the neural network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For sake of example, we will create a neural network for training -# images. To learn more see the Defining a Neural Network recipe. Build -# two variables for the models to eventually save. -# - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - -netA = Net() -netB = Net() - - -###################################################################### -# 3. Initialize the optimizer -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We will use SGD with momentum to build an optimizer for each model we -# created. -# - -optimizerA = optim.SGD(netA.parameters(), lr=0.001, momentum=0.9) -optimizerB = optim.SGD(netB.parameters(), lr=0.001, momentum=0.9) - - -###################################################################### -# 4. Save multiple models -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Collect all relevant information and build your dictionary. -# - -# Specify a path to save to -PATH = "model.pt" - -torch.save({ - 'modelA_state_dict': netA.state_dict(), - 'modelB_state_dict': netB.state_dict(), - 'optimizerA_state_dict': optimizerA.state_dict(), - 'optimizerB_state_dict': optimizerB.state_dict(), - }, PATH) - - -###################################################################### -# 4. Load multiple models -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Remember to first initialize the models and optimizers, then load the -# dictionary locally. 
-# - -modelA = Net() -modelB = Net() -optimModelA = optim.SGD(modelA.parameters(), lr=0.001, momentum=0.9) -optimModelB = optim.SGD(modelB.parameters(), lr=0.001, momentum=0.9) - -checkpoint = torch.load(PATH) -modelA.load_state_dict(checkpoint['modelA_state_dict']) -modelB.load_state_dict(checkpoint['modelB_state_dict']) -optimizerA.load_state_dict(checkpoint['optimizerA_state_dict']) -optimizerB.load_state_dict(checkpoint['optimizerB_state_dict']) - -modelA.eval() -modelB.eval() -# - or - -modelA.train() -modelB.train() - - -###################################################################### -# You must call ``model.eval()`` to set dropout and batch normalization -# layers to evaluation mode before running inference. Failing to do this -# will yield inconsistent inference results. -# -# If you wish to resuming training, call ``model.train()`` to ensure these -# layers are in training mode. -# -# Congratulations! You have successfully saved and loaded multiple models -# in PyTorch. -# diff --git a/recipes_source/recipes/saving_multiple_models_in_one_file.rst b/recipes_source/recipes/saving_multiple_models_in_one_file.rst new file mode 100644 index 00000000000..33040e6c87b --- /dev/null +++ b/recipes_source/recipes/saving_multiple_models_in_one_file.rst @@ -0,0 +1,10 @@ +Saving Multiple Models In One File +================================== + +This tutorial was deprecated. There is a newer tutorial that covers the same topic: https://pytorch.org/tutorials/beginner/saving_loading_models.html + +Redirecting in 3 seconds... + +.. raw:: html + + diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py index dc1daae2584..dd13b47a6dd 100644 --- a/recipes_source/recipes/tuning_guide.py +++ b/recipes_source/recipes/tuning_guide.py @@ -8,10 +8,38 @@ techniques often can be implemented by changing only a few lines of code and can be applied to a wide range of deep learning models across all domains. +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * General optimization techniques for PyTorch models + * CPU-specific performance optimizations + * GPU acceleration strategies + * Distributed training optimizations + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch 2.0 or later + * Python 3.8 or later + * CUDA-capable GPU (recommended for GPU optimizations) + * Linux, macOS, or Windows operating system + +Overview +-------- + +Performance optimization is crucial for efficient deep learning model training and inference. +This tutorial covers a comprehensive set of techniques to accelerate PyTorch workloads across +different hardware configurations and use cases. + General optimizations --------------------- """ +import torch +import torchvision + ############################################################################### # Enable asynchronous data loading and augmentation # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -90,7 +118,7 @@ # setting it to zero, for more details refer to the # `documentation `_. # -# Alternatively, starting from PyTorch 1.7, call ``model`` or +# Alternatively, call ``model`` or # ``optimizer.zero_grad(set_to_none=True)``. 
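###############################################################################
# A minimal sketch of a training step written this way (the ``model``,
# ``criterion``, ``data`` and ``target`` objects are hypothetical placeholders):
#
# .. code-block:: python
#
#     optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
#
#     optimizer.zero_grad(set_to_none=True)  # grads are reset to None, not zero-filled
#     loss = criterion(model(data), target)
#     loss.backward()
#     optimizer.step()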
############################################################################### @@ -129,7 +157,7 @@ def gelu(x): ############################################################################### # Enable channels_last memory format for computer vision models # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# PyTorch 1.5 introduced support for ``channels_last`` memory format for +# PyTorch supports ``channels_last`` memory format for # convolutional networks. This format is meant to be used in conjunction with # `AMP `_ to further accelerate # convolutional neural networks with @@ -250,65 +278,6 @@ def gelu(x): # # export LD_PRELOAD=:$LD_PRELOAD -############################################################################### -# Use oneDNN Graph with TorchScript for inference -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# oneDNN Graph can significantly boost inference performance. It fuses some compute-intensive operations such as convolution, matmul with their neighbor operations. -# In PyTorch 2.0, it is supported as a beta feature for ``Float32`` & ``BFloat16`` data-types. -# oneDNN Graph receives the model’s graph and identifies candidates for operator-fusion with respect to the shape of the example input. -# A model should be JIT-traced using an example input. -# Speed-up would then be observed after a couple of warm-up iterations for inputs with the same shape as the example input. -# The example code-snippets below are for resnet50, but they can very well be extended to use oneDNN Graph with custom models as well. - -# Only this extra line of code is required to use oneDNN Graph -torch.jit.enable_onednn_fusion(True) - -############################################################################### -# Using the oneDNN Graph API requires just one extra line of code for inference with Float32. -# If you are using oneDNN Graph, please avoid calling ``torch.jit.optimize_for_inference``. - -# sample input should be of the same shape as expected inputs -sample_input = [torch.rand(32, 3, 224, 224)] -# Using resnet50 from torchvision in this example for illustrative purposes, -# but the line below can indeed be modified to use custom models as well. -model = getattr(torchvision.models, "resnet50")().eval() -# Tracing the model with example input -traced_model = torch.jit.trace(model, sample_input) -# Invoking torch.jit.freeze -traced_model = torch.jit.freeze(traced_model) - -############################################################################### -# Once a model is JIT-traced with a sample input, it can then be used for inference after a couple of warm-up runs. - -with torch.no_grad(): - # a couple of warm-up runs - traced_model(*sample_input) - traced_model(*sample_input) - # speedup would be observed after warm-up runs - traced_model(*sample_input) - -############################################################################### -# While the JIT fuser for oneDNN Graph also supports inference with ``BFloat16`` datatype, -# performance benefit with oneDNN Graph is only exhibited by machines with AVX512_BF16 -# instruction set architecture (ISA). 
-# The following code snippets serves as an example of using ``BFloat16`` datatype for inference with oneDNN Graph: - -# AMP for JIT mode is enabled by default, and is divergent with its eager mode counterpart -torch._C._jit_set_autocast_mode(False) - -with torch.no_grad(), torch.cpu.amp.autocast(cache_enabled=False, dtype=torch.bfloat16): - # Conv-BatchNorm folding for CNN-based Vision Models should be done with ``torch.fx.experimental.optimization.fuse`` when AMP is used - import torch.fx.experimental.optimization as optimization - # Please note that optimization.fuse need not be called when AMP is not used - model = optimization.fuse(model) - model = torch.jit.trace(model, (example_input)) - model = torch.jit.freeze(model) - # a couple of warm-up runs - model(example_input) - model(example_input) - # speedup would be observed in subsequent runs. - model(example_input) - ############################################################################### # Train a model on CPU with PyTorch ``DistributedDataParallel``(DDP) functionality @@ -426,9 +395,8 @@ def gelu(x): # * enable AMP # # * Introduction to Mixed Precision Training and AMP: -# `video `_, # `slides `_ -# * native PyTorch AMP is available starting from PyTorch 1.6: +# * native PyTorch AMP is available: # `documentation `_, # `examples `_, # `tutorial `_ @@ -536,3 +504,31 @@ def gelu(x): # approximately constant number of tokens (and variable number of sequences in a # batch), other models solve imbalance by bucketing samples with similar # sequence length or even by sorting dataset by sequence length. + +############################################################################### +# Conclusion +# ---------- +# +# This tutorial covered a comprehensive set of performance optimization techniques +# for PyTorch models. The key takeaways include: +# +# * **General optimizations**: Enable async data loading, disable gradients for +# inference, fuse operations with ``torch.compile``, and use efficient memory formats +# * **CPU optimizations**: Leverage NUMA controls, optimize OpenMP settings, and +# use efficient memory allocators +# * **GPU optimizations**: Enable Tensor cores, use CUDA graphs, enable cuDNN +# autotuner, and implement mixed precision training +# * **Distributed optimizations**: Use DistributedDataParallel, optimize gradient +# synchronization, and balance workloads across devices +# +# Many of these optimizations can be applied with minimal code changes and provide +# significant performance improvements across a wide range of deep learning models. +# +# Further Reading +# --------------- +# +# * `PyTorch Performance Tuning Documentation `_ +# * `CUDA Best Practices `_ +# * `Distributed Training Documentation `_ +# * `Mixed Precision Training `_ +# * `torch.compile Tutorial `_ diff --git a/recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py b/recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py index 40aeeea9db8..a0752bfc67d 100644 --- a/recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py +++ b/recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py @@ -124,7 +124,7 @@ def forward(self, x): # are loading into. 
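######################################################################
# When some parameter keys in the loaded ``state_dict`` do not match the
# destination model, one option is to rename them before calling
# ``load_state_dict`` with ``strict=False``. A minimal sketch (the
# ``old_fc.*``/``fc1.*`` key names are hypothetical placeholders; ``PATH``
# and ``netB`` come from the steps above):
#
# .. code-block:: python
#
#     state_dict = torch.load(PATH, weights_only=True)
#     state_dict["fc1.weight"] = state_dict.pop("old_fc.weight")
#     state_dict["fc1.bias"] = state_dict.pop("old_fc.bias")
#     netB.load_state_dict(state_dict, strict=False)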
# -netB.load_state_dict(torch.load(PATH), strict=False) +netB.load_state_dict(torch.load(PATH, weights_only=True), strict=False) ###################################################################### diff --git a/recipes_source/recipes/zeroing_out_gradients.py b/recipes_source/recipes/zeroing_out_gradients.py index 0914edbf558..a4f80354961 100644 --- a/recipes_source/recipes/zeroing_out_gradients.py +++ b/recipes_source/recipes/zeroing_out_gradients.py @@ -182,7 +182,7 @@ def forward(self, x): # ``optimizer.zero_grad()`` as long as all your model parameters are in # that optimizer. Use your best judgment to decide which one to use. # -# Congratulations! You have successfully zeroed out gradients PyTorch. +# Congratulations! You have successfully zeroed out gradients in PyTorch. # # Learn More # ---------- diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst deleted file mode 100644 index 0ae5dd06e2c..00000000000 --- a/recipes_source/recipes_index.rst +++ /dev/null @@ -1,444 +0,0 @@ -PyTorch Recipes ---------------------------------------------- -Recipes are bite-sized, actionable examples of how to use specific PyTorch features, different from our full-length tutorials. - -.. raw:: html - -
- -.. Add recipe cards below this line - -.. Basics - -.. customcarditem:: - :header: Defining a Neural Network - :card_description: Learn how to use PyTorch's torch.nn package to create and define a neural network for the MNIST dataset. - :image: ../_static/img/thumbnails/cropped/defining-a-network.PNG - :link: ../recipes/recipes/defining_a_neural_network.html - :tags: Basics - -.. customcarditem:: - :header: What is a state_dict in PyTorch - :card_description: Learn how state_dict objects and Python dictionaries are used in saving or loading models from PyTorch. - :image: ../_static/img/thumbnails/cropped/what-is-a-state-dict.PNG - :link: ../recipes/recipes/what_is_state_dict.html - :tags: Basics - -.. customcarditem:: - :header: Saving and loading models for inference in PyTorch - :card_description: Learn about the two approaches for saving and loading models for inference in PyTorch - via the state_dict and via the entire model. - :image: ../_static/img/thumbnails/cropped/saving-and-loading-models-for-inference.PNG - :link: ../recipes/recipes/saving_and_loading_models_for_inference.html - :tags: Basics - - -.. customcarditem:: - :header: Saving and loading a general checkpoint in PyTorch - :card_description: Saving and loading a general checkpoint model for inference or resuming training can be helpful for picking up where you last left off. In this recipe, explore how to save and load multiple checkpoints. - :image: ../_static/img/thumbnails/cropped/saving-and-loading-general-checkpoint.PNG - :link: ../recipes/recipes/saving_and_loading_a_general_checkpoint.html - :tags: Basics - -.. customcarditem:: - :header: Saving and loading multiple models in one file using PyTorch - :card_description: In this recipe, learn how saving and loading multiple models can be helpful for reusing models that you have previously trained. - :image: ../_static/img/thumbnails/cropped/saving-multiple-models.PNG - :link: ../recipes/recipes/saving_multiple_models_in_one_file.html - :tags: Basics - -.. customcarditem:: - :header: Warmstarting model using parameters from a different model in PyTorch - :card_description: Learn how warmstarting the training process by partially loading a model or loading a partial model can help your model converge much faster than training from scratch. - :image: ../_static/img/thumbnails/cropped/warmstarting-models.PNG - :link: ../recipes/recipes/warmstarting_model_using_parameters_from_a_different_model.html - :tags: Basics - -.. customcarditem:: - :header: Saving and loading models across devices in PyTorch - :card_description: Learn how saving and loading models across devices (CPUs and GPUs) is relatively straightforward using PyTorch. - :image: ../_static/img/thumbnails/cropped/saving-and-loading-models-across-devices.PNG - :link: ../recipes/recipes/save_load_across_devices.html - :tags: Basics - -.. customcarditem:: - :header: Zeroing out gradients in PyTorch - :card_description: Learn when you should zero out gradients and how doing so can help increase the accuracy of your model. - :image: ../_static/img/thumbnails/cropped/zeroing-out-gradients.PNG - :link: ../recipes/recipes/zeroing_out_gradients.html - :tags: Basics - -.. customcarditem:: - :header: PyTorch Benchmark - :card_description: Learn how to use PyTorch's benchmark module to measure and compare the performance of your code - :image: ../_static/img/thumbnails/cropped/profiler.png - :link: ../recipes/recipes/benchmark.html - :tags: Basics - -.. 
customcarditem:: - :header: PyTorch Benchmark (quick start) - :card_description: Learn how to measure snippet run times and collect instructions. - :image: ../_static/img/thumbnails/cropped/profiler.png - :link: ../recipes/recipes/timer_quick_start.html - :tags: Basics - -.. customcarditem:: - :header: PyTorch Profiler - :card_description: Learn how to use PyTorch's profiler to measure operators time and memory consumption - :image: ../_static/img/thumbnails/cropped/profiler.png - :link: ../recipes/recipes/profiler_recipe.html - :tags: Basics - -.. customcarditem:: - :header: PyTorch Profiler with Instrumentation and Tracing Technology API (ITT API) support - :card_description: Learn how to use PyTorch's profiler with Instrumentation and Tracing Technology API (ITT API) to visualize operators labeling in Intel® VTune™ Profiler GUI - :image: ../_static/img/thumbnails/cropped/profiler.png - :link: ../recipes/profile_with_itt.html - :tags: Basics - -.. customcarditem:: - :header: Torch Compile IPEX Backend - :card_description: Learn how to use torch.compile IPEX backend - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../recipes/torch_compile_backend_ipex.html - :tags: Basics - -.. customcarditem:: - :header: Reasoning about Shapes in PyTorch - :card_description: Learn how to use the meta device to reason about shapes in your model. - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../recipes/recipes/reasoning_about_shapes.html - :tags: Basics - -.. customcarditem:: - :header: Tips for Loading an nn.Module from a Checkpoint - :card_description: Learn tips for loading an nn.Module from a checkpoint. - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../recipes/recipes/module_load_state_dict_tips.html - :tags: Basics - -.. customcarditem:: - :header: (beta) Using TORCH_LOGS to observe torch.compile - :card_description: Learn how to use the torch logging APIs to observe the compilation process. - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../recipes/torch_logs.html - :tags: Basics - -.. customcarditem:: - :header: Extension points in nn.Module for loading state_dict and tensor subclasses - :card_description: New extension points in nn.Module. - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../recipes/recipes/swap_tensors.html - :tags: Basics - - -.. Interpretability - -.. customcarditem:: - :header: Model Interpretability using Captum - :card_description: Learn how to use Captum attribute the predictions of an image classifier to their corresponding image features and visualize the attribution results. - :image: ../_static/img/thumbnails/cropped/model-interpretability-using-captum.png - :link: ../recipes/recipes/Captum_Recipe.html - :tags: Interpretability,Captum - -.. customcarditem:: - :header: How to use TensorBoard with PyTorch - :card_description: Learn basic usage of TensorBoard with PyTorch, and how to visualize data in TensorBoard UI - :image: ../_static/img/thumbnails/tensorboard_scalars.png - :link: ../recipes/recipes/tensorboard_with_pytorch.html - :tags: Visualization,TensorBoard - -.. Quantization - -.. customcarditem:: - :header: Dynamic Quantization - :card_description: Apply dynamic quantization to a simple LSTM model. - :image: ../_static/img/thumbnails/cropped/using-dynamic-post-training-quantization.png - :link: ../recipes/recipes/dynamic_quantization.html - :tags: Quantization,Text,Model-Optimization - - -.. 
Production Development - -.. customcarditem:: - :header: TorchScript for Deployment - :card_description: Learn how to export your trained model in TorchScript format and how to load your TorchScript model in C++ and do inference. - :image: ../_static/img/thumbnails/cropped/torchscript_overview.png - :link: ../recipes/torchscript_inference.html - :tags: TorchScript - -.. customcarditem:: - :header: Deploying with Flask - :card_description: Learn how to use Flask, a lightweight web server, to quickly setup a web API from your trained PyTorch model. - :image: ../_static/img/thumbnails/cropped/using-flask-create-restful-api.png - :link: ../recipes/deployment_with_flask.html - :tags: Production,TorchScript - -.. customcarditem:: - :header: PyTorch Mobile Performance Recipes - :card_description: List of recipes for performance optimizations for using PyTorch on Mobile (Android and iOS). - :image: ../_static/img/thumbnails/cropped/mobile.png - :link: ../recipes/mobile_perf.html - :tags: Mobile,Model-Optimization - -.. customcarditem:: - :header: Making Android Native Application That Uses PyTorch Android Prebuilt Libraries - :card_description: Learn how to make Android application from the scratch that uses LibTorch C++ API and uses TorchScript model with custom C++ operator. - :image: ../_static/img/thumbnails/cropped/android.png - :link: ../recipes/android_native_app_with_custom_op.html - :tags: Mobile - -.. customcarditem:: - :header: Fuse Modules recipe - :card_description: Learn how to fuse a list of PyTorch modules into a single module to reduce the model size before quantization. - :image: ../_static/img/thumbnails/cropped/mobile.png - :link: ../recipes/fuse.html - :tags: Mobile - -.. customcarditem:: - :header: Quantization for Mobile Recipe - :card_description: Learn how to reduce the model size and make it run faster without losing much on accuracy. - :image: ../_static/img/thumbnails/cropped/mobile.png - :link: ../recipes/quantization.html - :tags: Mobile,Quantization - -.. customcarditem:: - :header: Script and Optimize for Mobile - :card_description: Learn how to convert the model to TorchScipt and (optional) optimize it for mobile apps. - :image: ../_static/img/thumbnails/cropped/mobile.png - :link: ../recipes/script_optimized.html - :tags: Mobile - -.. customcarditem:: - :header: Model Preparation for iOS Recipe - :card_description: Learn how to add the model in an iOS project and use PyTorch pod for iOS. - :image: ../_static/img/thumbnails/cropped/ios.png - :link: ../recipes/model_preparation_ios.html - :tags: Mobile - -.. customcarditem:: - :header: Model Preparation for Android Recipe - :card_description: Learn how to add the model in an Android project and use the PyTorch library for Android. - :image: ../_static/img/thumbnails/cropped/android.png - :link: ../recipes/model_preparation_android.html - :tags: Mobile - -.. customcarditem:: - :header: Mobile Interpreter Workflow in Android and iOS - :card_description: Learn how to use the mobile interpreter on iOS and Andriod devices. - :image: ../_static/img/thumbnails/cropped/mobile.png - :link: ../recipes/mobile_interpreter.html - :tags: Mobile - -.. customcarditem:: - :header: Profiling PyTorch RPC-Based Workloads - :card_description: How to use the PyTorch profiler to profile RPC-based workloads. - :image: ../_static/img/thumbnails/cropped/profile.png - :link: ../recipes/distributed_rpc_profiling.html - :tags: Production - -.. Automatic Mixed Precision - -.. 
customcarditem:: - :header: Automatic Mixed Precision - :card_description: Use torch.cuda.amp to reduce runtime and save memory on NVIDIA GPUs. - :image: ../_static/img/thumbnails/cropped/amp.png - :link: ../recipes/recipes/amp_recipe.html - :tags: Model-Optimization - -.. Performance - -.. customcarditem:: - :header: Performance Tuning Guide - :card_description: Tips for achieving optimal performance. - :image: ../_static/img/thumbnails/cropped/profiler.png - :link: ../recipes/recipes/tuning_guide.html - :tags: Model-Optimization - -.. customcarditem:: - :header: CPU launcher script for optimal performance on Intel® Xeon - :card_description: How to use launcher script for optimal runtime configurations on Intel® Xeon CPUs. - :image: ../_static/img/thumbnails/cropped/profiler.png - :link: ../recipes/recipes/xeon_run_cpu.html - :tags: Model-Optimization - -.. customcarditem:: - :header: PyTorch Inference Performance Tuning on AWS Graviton Processors - :card_description: Tips for achieving the best inference performance on AWS Graviton CPUs - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../recipes/inference_tuning_on_aws_graviton.html - :tags: Model-Optimization - -.. Leverage Advanced Matrix Extensions - -.. customcarditem:: - :header: Leverage Intel® Advanced Matrix Extensions - :card_description: Learn to leverage Intel® Advanced Matrix Extensions. - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../recipes/amx.html - :tags: Model-Optimization - -.. (beta) Compiling the Optimizer with torch.compile - -.. customcarditem:: - :header: (beta) Compiling the Optimizer with torch.compile - :card_description: Speed up the optimizer using torch.compile - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../recipes/compiling_optimizer.html - :tags: Model-Optimization - -.. (beta) Running the compiled optimizer with an LR Scheduler - -.. customcarditem:: - :header: (beta) Running the compiled optimizer with an LR Scheduler - :card_description: Speed up training with LRScheduler and torch.compiled optimizer - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../recipes/compiling_optimizer_lr_scheduler.html - :tags: Model-Optimization - -.. Using User-Defined Triton Kernels with ``torch.compile`` - -.. customcarditem:: - :header: Using User-Defined Triton Kernels with ``torch.compile`` - :card_description: Learn how to use user-defined kernels with ``torch.compile`` - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../recipes/torch_compile_user_defined_triton_kernel_tutorial.html - :tags: Model-Optimization - -.. Compile Time Caching in ``torch.compile`` - -.. customcarditem:: - :header: Compile Time Caching in ``torch.compile`` - :card_description: Learn how to configure compile time caching in ``torch.compile`` - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../recipes/torch_compile_caching_tutorial.html - :tags: Model-Optimization - -.. Intel(R) Extension for PyTorch* - -.. customcarditem:: - :header: Intel® Extension for PyTorch* - :card_description: Introduction of Intel® Extension for PyTorch* - :image: ../_static/img/thumbnails/cropped/profiler.png - :link: ../recipes/intel_extension_for_pytorch.html - :tags: Model-Optimization - -.. Intel(R) Neural Compressor for PyTorch* - -.. 
customcarditem:: - :header: Intel® Neural Compressor for PyTorch - :card_description: Ease-of-use quantization for PyTorch with Intel® Neural Compressor. - :image: ../_static/img/thumbnails/cropped/profiler.png - :link: ../recipes/intel_neural_compressor_for_pytorch.html - :tags: Quantization,Model-Optimization - -.. Distributed Training - -.. customcarditem:: - :header: Getting Started with DeviceMesh - :card_description: Learn how to use DeviceMesh - :image: ../_static/img/thumbnails/cropped/profiler.png - :link: ../recipes/distributed_device_mesh.html - :tags: Distributed-Training - -.. customcarditem:: - :header: Shard Optimizer States with ZeroRedundancyOptimizer - :card_description: How to use ZeroRedundancyOptimizer to reduce memory consumption. - :image: ../_static/img/thumbnails/cropped/profiler.png - :link: ../recipes/zero_redundancy_optimizer.html - :tags: Distributed-Training - -.. customcarditem:: - :header: Direct Device-to-Device Communication with TensorPipe RPC - :card_description: How to use RPC with direct GPU-to-GPU communication. - :image: ../_static/img/thumbnails/cropped/profiler.png - :link: ../recipes/cuda_rpc.html - :tags: Distributed-Training - -.. customcarditem:: - :header: Distributed Optimizer with TorchScript support - :card_description: How to enable TorchScript support for Distributed Optimizer. - :image: ../_static/img/thumbnails/cropped/profiler.png - :link: ../recipes/distributed_optim_torchscript.html - :tags: Distributed-Training,TorchScript - -.. customcarditem:: - :header: Getting Started with Distributed Checkpoint (DCP) - :card_description: Learn how to checkpoint distributed models with Distributed Checkpoint package. - :image: ../_static/img/thumbnails/cropped/Getting-Started-with-DCP.png - :link: ../recipes/distributed_checkpoint_recipe.html - :tags: Distributed-Training - -.. TorchServe - -.. customcarditem:: - :header: Deploying a PyTorch Stable Diffusion model as a Vertex AI Endpoint - :card_description: Learn how to deploy model in Vertex AI with TorchServe - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../recipes/torchserve_vertexai_tutorial.html - :tags: Production - -.. End of tutorial card section - -.. raw:: html - -
- - - -
- -
- -.. ----------------------------------------- -.. Page TOC -.. ----------------------------------------- -.. toctree:: - :hidden: - - /recipes/recipes/defining_a_neural_network - /recipes/torch_logs - /recipes/recipes/what_is_state_dict - /recipes/recipes/saving_and_loading_models_for_inference - /recipes/recipes/saving_and_loading_a_general_checkpoint - /recipes/recipes/saving_multiple_models_in_one_file - /recipes/recipes/warmstarting_model_using_parameters_from_a_different_model - /recipes/recipes/save_load_across_devices - /recipes/recipes/zeroing_out_gradients - /recipes/recipes/profiler_recipe - /recipes/recipes/profile_with_itt - /recipes/recipes/Captum_Recipe - /recipes/recipes/tensorboard_with_pytorch - /recipes/recipes/dynamic_quantization - /recipes/recipes/amp_recipe - /recipes/recipes/tuning_guide - /recipes/recipes/xeon_run_cpu - /recipes/recipes/intel_extension_for_pytorch - /recipes/compiling_optimizer - /recipes/torch_compile_backend_ipex - /recipes/torchscript_inference - /recipes/deployment_with_flask - /recipes/distributed_rpc_profiling - /recipes/zero_redundancy_optimizer - /recipes/cuda_rpc - /recipes/distributed_optim_torchscript - /recipes/mobile_interpreter diff --git a/recipes_source/regional_aot.py b/recipes_source/regional_aot.py new file mode 100644 index 00000000000..cee8465d9bc --- /dev/null +++ b/recipes_source/regional_aot.py @@ -0,0 +1,241 @@ + +""" +Reducing AoT cold start compilation time with regional compilation +============================================================================ + +**Author:** `Sayak Paul `_, `Charles Bensimon `_, `Angela Yi `_ + +In the `regional compilation recipe `__, we showed +how to reduce cold start compilation times while retaining (almost) full compilation benefits. This was demonstrated for +just-in-time (JIT) compilation. + +This recipe shows how to apply similar principles when compiling a model ahead-of-time (AoT). If you +are not familiar with AOTInductor and ``torch.export``, we recommend you to check out `this tutorial `__. + +Prerequisites +---------------- + +* Pytorch 2.6 or later +* Familiarity with regional compilation +* Familiarity with AOTInductor and ``torch.export`` + +Setup +----- +Before we begin, we need to install ``torch`` if it is not already +available. + +.. code-block:: sh + + pip install torch +""" + +###################################################################### +# Steps +# ----- +# +# In this recipe, we will follow the same steps as the regional compilation recipe mentioned above: +# +# 1. Import all necessary libraries. +# 2. Define and initialize a neural network with repeated regions. +# 3. Measure the compilation time of the full model and the regional compilation with AoT. +# +# First, let's import the necessary libraries for loading our data: +# + +import torch +torch.set_grad_enabled(False) + +from time import perf_counter + +################################################################################### +# Defining the Neural Network +# --------------------------- +# +# We will use the same neural network structure as the regional compilation recipe. +# +# We will use a network, composed of repeated layers. This mimics a +# large language model, that typically is composed of many Transformer blocks. In this recipe, +# we will create a ``Layer`` using the ``nn.Module`` class as a proxy for a repeated region. +# We will then create a ``Model`` which is composed of 64 instances of this +# ``Layer`` class. 
+# +class Layer(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear1 = torch.nn.Linear(10, 10) + self.relu1 = torch.nn.ReLU() + self.linear2 = torch.nn.Linear(10, 10) + self.relu2 = torch.nn.ReLU() + + def forward(self, x): + a = self.linear1(x) + a = self.relu1(a) + a = torch.sigmoid(a) + b = self.linear2(a) + b = self.relu2(b) + return b + + +class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(10, 10) + self.layers = torch.nn.ModuleList([Layer() for _ in range(64)]) + + def forward(self, x): + # In regional compilation, the self.linear is outside of the scope of ``torch.compile``. + x = self.linear(x) + for layer in self.layers: + x = layer(x) + return x + + +################################################################################## +# Compiling the model ahead-of-time +# --------------------------------- +# +# Since we're compiling the model ahead-of-time, we need to prepare representative +# input examples, that we expect the model to see during actual deployments. +# +# Let's create an instance of ``Model`` and pass it some sample input data. +# + +model = Model().cuda() +input = torch.randn(10, 10, device="cuda") +output = model(input) +print(f"{output.shape=}") + +############################################################################################### +# Now, let's compile our model ahead-of-time. We will use ``input`` created above to pass +# to ``torch.export``. This will yield a ``torch.export.ExportedProgram`` which we can compile. + +path = torch._inductor.aoti_compile_and_package( + torch.export.export(model, args=(input,)) +) + +################################################################# +# We can load from this ``path`` and use it to perform inference. + +compiled_binary = torch._inductor.aoti_load_package(path) +output_compiled = compiled_binary(input) +print(f"{output_compiled.shape=}") + +###################################################################################### +# Compiling _regions_ of the model ahead-of-time +# ---------------------------------------------- +# +# Compiling model regions ahead-of-time, on the other hand, requires a few key changes. +# +# Since the compute pattern is shared by all the blocks that +# are repeated in a model (``Layer`` instances in this cases), we can just +# compile a single block and let the inductor reuse it. + +model = Model().cuda() +path = torch._inductor.aoti_compile_and_package( + torch.export.export(model.layers[0], args=(input,)), + inductor_configs={ + # compile artifact w/o saving params in the artifact + "aot_inductor.package_constants_in_so": False, + } +) + +################################################### +# An exported program (``torch.export.ExportedProgram``) contains the Tensor computation, +# a ``state_dict`` containing tensor values of all lifted parameters and buffer alongside +# other metadata. We specify the ``aot_inductor.package_constants_in_so`` to be ``False`` to +# not serialize the model parameters in the generated artifact. +# +# Now, when loading the compiled binary, we can reuse the existing parameters of +# each block. This lets us take advantage of the compiled binary obtained above. 
+ + +for layer in model.layers: + compiled_layer = torch._inductor.aoti_load_package(path) + compiled_layer.load_constants( + layer.state_dict(), check_full_update=True, user_managed=True + ) + layer.forward = compiled_layer + +output_regional_compiled = model(input) +print(f"{output_regional_compiled.shape=}") + +##################################################### +# Just like JIT regional compilation, compiling regions within a model ahead-of-time +# leads to significantly reduced cold start times. The actual number will vary from +# model to model. +# +# Even though full model compilation offers the fullest scope of optimizations, +# for practical purposes and depending on the type of model, we have seen regional +# compilation (both JIT and AoT) providing similar speed benefits, while drastically +# reducing the cold start times. + +################################################### +# Measuring compilation time +# -------------------------- +# Next, let's measure the compilation time of the full model and the regional compilation. +# + +def measure_compile_time(input, regional=False): + start = perf_counter() + model = aot_compile_load_model(regional=regional) + torch.cuda.synchronize() + end = perf_counter() + # Make sure the model works. + _ = model(input) + return end - start + +def aot_compile_load_model(regional=False) -> torch.nn.Module: + input = torch.randn(10, 10, device="cuda") + model = Model().cuda() + + inductor_configs = {} + if regional: + inductor_configs = {"aot_inductor.package_constants_in_so": False} + + # Reset the compiler caches to ensure no reuse between different runs + torch.compiler.reset() + with torch._inductor.utils.fresh_inductor_cache(): + path = torch._inductor.aoti_compile_and_package( + torch.export.export( + model.layers[0] if regional else model, + args=(input,) + ), + inductor_configs=inductor_configs, + ) + + if regional: + for layer in model.layers: + compiled_layer = torch._inductor.aoti_load_package(path) + compiled_layer.load_constants( + layer.state_dict(), check_full_update=True, user_managed=True + ) + layer.forward = compiled_layer + else: + model = torch._inductor.aoti_load_package(path) + return model + +input = torch.randn(10, 10, device="cuda") +full_model_compilation_latency = measure_compile_time(input, regional=False) +print(f"Full model compilation time = {full_model_compilation_latency:.2f} seconds") + +regional_compilation_latency = measure_compile_time(input, regional=True) +print(f"Regional compilation time = {regional_compilation_latency:.2f} seconds") + +assert regional_compilation_latency < full_model_compilation_latency + +############################################################################ +# There may also be layers in a model that are incompatible with compilation. +# In that case, full model compilation results in a fragmented computation graph, +# with potential latency degradation. In these cases, regional compilation +# can be beneficial. +# + +############################################################################ +# Conclusion +# ----------- +# +# This recipe shows how to control the cold start compilation time when compiling your +# model ahead-of-time. This becomes effective when your model has repeated +# blocks, which is typically seen in large generative models. We used this +# recipe on various models to speed up real-time performance. Learn more +# `here `__. 
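As a quick sanity check on the pattern above (not part of the original recipe), you may want to confirm that the regionally AoT-compiled model still matches a plain eager copy numerically. The sketch below assumes the ``Model`` class, the patched ``model`` instance, and the ``input`` tensor from the recipe are still in scope, and uses loose tolerances because compiled kernels are not guaranteed to be bitwise identical to eager execution.

.. code-block:: python

    import torch

    # Eager reference sharing the same weights; the parameters are still registered
    # on ``model`` even though each layer's ``forward`` was swapped for a compiled binary.
    reference = Model().cuda()
    reference.load_state_dict(model.state_dict())

    with torch.no_grad():
        expected = reference(input)
        actual = model(input)

    # Loose tolerances: compiled kernels may differ slightly from eager in floating point.
    assert torch.allclose(expected, actual, rtol=1e-4, atol=1e-5)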
diff --git a/recipes_source/regional_compilation.py b/recipes_source/regional_compilation.py new file mode 100644 index 00000000000..0a665e04b52 --- /dev/null +++ b/recipes_source/regional_compilation.py @@ -0,0 +1,178 @@ +""" +Reducing torch.compile cold start compilation time with regional compilation +============================================================================ + +**Author:** `Animesh Jain `_ + +As deep learning models get larger, the compilation time of these models also +increases. This extended compilation time can result in a large startup time in +inference services or wasted resources in large-scale training. This recipe +shows an example of how to reduce the cold start compilation time by choosing to +compile a repeated region of the model instead of the entire model. + +Prerequisites +---------------- + +* Pytorch 2.5 or later + +Setup +----- +Before we begin, we need to install ``torch`` if it is not already +available. + +.. code-block:: sh + + pip install torch + +.. note:: + This feature is available starting with the 2.5 release. If you are using version 2.4, + you can enable the configuration flag ``torch._dynamo.config.inline_inbuilt_nn_modules=True`` + to prevent recompilations during regional compilation. In version 2.5, this flag is enabled by default. +""" + +from time import perf_counter + +###################################################################### +# Steps +# ----- +# +# In this recipe, we will follow these steps: +# +# 1. Import all necessary libraries. +# 2. Define and initialize a neural network with repeated regions. +# 3. Understand the difference between the full model and the regional compilation. +# 4. Measure the compilation time of the full model and the regional compilation. +# +# First, let's import the necessary libraries for loading our data: +# +# +# + +import torch +import torch.nn as nn + + +########################################################## +# Next, let's define and initialize a neural network with repeated regions. +# +# Typically, neural networks are composed of repeated layers. For example, a +# large language model is composed of many Transformer blocks. In this recipe, +# we will create a ``Layer`` using the ``nn.Module`` class as a proxy for a repeated region. +# We will then create a ``Model`` which is composed of 64 instances of this +# ``Layer`` class. +# +class Layer(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear1 = torch.nn.Linear(10, 10) + self.relu1 = torch.nn.ReLU() + self.linear2 = torch.nn.Linear(10, 10) + self.relu2 = torch.nn.ReLU() + + def forward(self, x): + a = self.linear1(x) + a = self.relu1(a) + a = torch.sigmoid(a) + b = self.linear2(a) + b = self.relu2(b) + return b + + +class Model(torch.nn.Module): + def __init__(self, apply_regional_compilation): + super().__init__() + self.linear = torch.nn.Linear(10, 10) + # Apply compile only to the repeated layers. + if apply_regional_compilation: + self.layers = torch.nn.ModuleList( + [torch.compile(Layer()) for _ in range(64)] + ) + else: + self.layers = torch.nn.ModuleList([Layer() for _ in range(64)]) + + def forward(self, x): + # In regional compilation, the self.linear is outside of the scope of `torch.compile`. + x = self.linear(x) + for layer in self.layers: + x = layer(x) + return x + + +#################################################### +# Next, let's review the difference between the full model and the regional compilation. +# +# In full model compilation, the entire model is compiled as a whole. 
This is the common approach +# most users take with ``torch.compile``. In this example, we apply ``torch.compile`` to +# the ``Model`` object. This will effectively inline the 64 layers, producing a +# large graph to compile. You can look at the full graph by running this recipe +# with ``TORCH_LOGS=graph_code``. +# +# + +model = Model(apply_regional_compilation=False).cuda() +full_compiled_model = torch.compile(model) + + +################################################### +# The regional compilation, on the other hand, compiles a region of the model. +# By strategically choosing to compile a repeated region of the model, we can compile a +# much smaller graph and then reuse the compiled graph for all the regions. +# In the example, ``torch.compile`` is applied only to the ``layers`` and not the full model. +# + +regional_compiled_model = Model(apply_regional_compilation=True).cuda() + +##################################################### +# Applying compilation to a repeated region, instead of full model, leads to +# large savings in compile time. Here, we will just compile a layer instance and +# then reuse it 64 times in the ``Model`` object. +# +# Note that with repeated regions, some part of the model might not be compiled. +# For example, the ``self.linear`` in the ``Model`` is outside of the scope of +# regional compilation. +# +# Also, note that there is a tradeoff between performance speedup and compile +# time. Full model compilation involves a larger graph and, +# theoretically, offers more scope for optimizations. However, for practical +# purposes and depending on the model, we have observed many cases with minimal +# speedup differences between the full model and regional compilation. + + +################################################### +# Next, let's measure the compilation time of the full model and the regional compilation. +# +# ``torch.compile`` is a JIT compiler, which means that it compiles on the first invocation. +# In the code below, we measure the total time spent in the first invocation. While this method is not +# precise, it provides a good estimate since the majority of the time is spent in +# compilation. + + +def measure_latency(fn, input): + # Reset the compiler caches to ensure no reuse between different runs + torch.compiler.reset() + with torch._inductor.utils.fresh_inductor_cache(): + start = perf_counter() + fn(input) + torch.cuda.synchronize() + end = perf_counter() + return end - start + + +input = torch.randn(10, 10, device="cuda") +full_model_compilation_latency = measure_latency(full_compiled_model, input) +print(f"Full model compilation time = {full_model_compilation_latency:.2f} seconds") + +regional_compilation_latency = measure_latency(regional_compiled_model, input) +print(f"Regional compilation time = {regional_compilation_latency:.2f} seconds") + +assert regional_compilation_latency < full_model_compilation_latency + +############################################################################ +# Conclusion +# ----------- +# +# This recipe shows how to control the cold start compilation time if your model +# has repeated regions. This approach requires user modifications to apply `torch.compile` to +# the repeated regions instead of more commonly used full model compilation. We +# are continually working on reducing cold start compilation time. 
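The recipe above times only the first (compiling) invocation. To put rough numbers on the speedup-versus-compile-time tradeoff mentioned in the conclusion, a steady-state measurement can be appended; the sketch below is illustrative only and reuses ``full_compiled_model``, ``regional_compiled_model``, and ``input`` from the recipe.

.. code-block:: python

    from time import perf_counter

    import torch

    def measure_steady_state(fn, input, iters=100, warmup=10):
        # Warm up first so compilation and autotuning are excluded from the timing.
        for _ in range(warmup):
            fn(input)
        torch.cuda.synchronize()
        start = perf_counter()
        for _ in range(iters):
            fn(input)
        torch.cuda.synchronize()
        return (perf_counter() - start) / iters

    print(f"Full model steady-state latency     = {measure_steady_state(full_compiled_model, input):.6f} seconds")
    print(f"Regional model steady-state latency = {measure_steady_state(regional_compiled_model, input):.6f} seconds")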
+# diff --git a/recipes_source/script_optimized.rst b/recipes_source/script_optimized.rst index f4384b1a3ae..ed64419ff41 100644 --- a/recipes_source/script_optimized.rst +++ b/recipes_source/script_optimized.rst @@ -1,218 +1,11 @@ Script and Optimize for Mobile Recipe ===================================== -This recipe demonstrates how to convert a PyTorch model to TorchScript which can run in a high-performance C++ environment such as iOS and Android, and how to optimize the converted TorchScript model for mobile deployment. +This tutorial has been deprecated. There is a new tutorial on this topic. -Introduction ------------- +Redirecting in 3 seconds... -After a PyTorch model is trained and optionally but preferably quantized (see `Quantization Recipe `_ for more details), one essential step before the model can be used in iOS and Android apps is to convert the Python-dependent model to TorchScript, which can then further be optimized for mobile apps. Conversion to TorchScript can be as simple as a single call, or as complicated as changing the original model in many different places. +.. raw:: html -Pre-requisites --------------- + -PyTorch 1.6.0 or 1.7.0 - -Conversion to TorchScript -------------------------- - -There are two basic ways to convert a PyTorch model to TorchScript, using `trace` and `script`. Mixing `trace` and `script` may also be needed in some cases - see `here `_ for more information. - -Use the `trace` Method -^^^^^^^^^^^^^^^^^^^^^^ - -To use the `trace` method on a model, an example or dummy input for the model needs to be specified, the actual input size needs to be the same as the example input size, and the model definition cannot have control flow such as `if` or `for`. The reason for these constraints is that running `trace` on a model with an example input simply calls the model's `forward` method with the input and all operations executed in the model layers are recorded, creating the trace of the model. - -:: - - import torch - - dummy_input = torch.rand(1, 3, 224, 224) - torchscript_model = torch.jit.trace(model_quantized, dummy_input) - - -Use the `script` Method -^^^^^^^^^^^^^^^^^^^^^^^ - -For the example above, calling `script` below makes no difference: - -:: - - torchscript_model = torch.jit.script(model_quantized) - -But if a model has some flow control, then `trace` won't correctly record all the possible traces. Take some code snippet of an example model definition from `here `_ for example: - -:: - - import torch - - class MyDecisionGate(torch.nn.Module): - def forward(self, x): - if x.sum() > 0: - return x - else: - return -x - - x = torch.rand(3, 4) - traced_cell = torch.jit.trace(MyDecisionGate(), x) - print(traced_cell.code) - -The code above will output: - -:: - - TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can''t record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - - if x.sum() > 0: - def forward(self, - x: Tensor) -> Tensor: - return x - - -Note that "the trace might not generalize to other inputs" warning above means that if the model has any kind of data-dependent control flow, `trace` is not the right answer. 
But if we replace the last two lines of the Python code snippet above (before the code output) with: - -:: - - scripted_cell = torch.jit.script(MyDecisionGate()) - print(scripted_cell.code) - -The scripted model as shown by the `print` result below will be covering all possible inputs, thus generalizing to other inputs: - -:: - - def forward(self, - x: Tensor) -> Tensor: - _0 = bool(torch.gt(torch.sum(x, dtype=None), 0)) - if _0: - _1 = x - else: - _1 = torch.neg(x) - return _1 - - -This is another example of using `trace` and `script` - it converts the model trained in the PyTorch tutorial `NLP FROM SCRATCH: TRANSLATION WITH A SEQUENCE TO SEQUENCE NETWORK AND ATTENTION `_: - -:: - - encoder = EncoderRNN(input_lang.n_words, hidden_size) - decoder = AttnDecoderRNN(hidden_size, output_lang.n_words) - - # method 1: using trace with example inputs - - encoder_input=torch.tensor([1]) - encoder_hidden=torch.zeros(1, 1, hidden_size) - - decoder_input1=torch.tensor([[0]]) - decoder_input2=torch.zeros(1, 1, hidden_size) - decoder_input3=torch.zeros(MAX_LENGTH, hidden_size) - - traced_encoder = torch.jit.trace(encoder, (encoder_input, encoder_hidden)) - traced_decoder = torch.jit.trace(decoder, (decoder_input1, decoder_input2, decoder_input3)) - - # method 2: using script - - scripted_encoder = torch.jit.script(encoder) - scripted_decoder = torch.jit.script(decoder) - -So is it true that one can simply always use the `script` call and the model is converted to TorchScript? The answer is no, because TorchScript is actually a subset of Python and to make `script` work, the PyTorch model definition must only use the language features of that TorchScript subset of Python. `TorchScript Language Reference `_ covers all the details of what is supported in TorchScript. Below we will describe some of the common errors when using the `script` method. - - -Fix Common Errors When Using the `script` Method ----------------------------------------------------- - -If you apply the `script` method to a non-trivial model, chances are you may encounter several types of errors. Check out `this tutorial `_ for a complete example of converting a chatbot model to TorchScript. But follow the steps below to fix common errors when you run the `script` method: - -1. RuntimeError `attribute lookup is not defined on python value of type` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -For this error, pass the value of the model as a parameter in the constructor. This is because when calling `script` on a model that accepts another model as a parameter, the model passed is actually of type `TracedModule` or `ScriptModule`, not of type `Module`, making the the model attribute not defined when scripting. - -For example, the `LuongAttnDecoderRNN` module in the tutorial above has an attribute `n_layers`, and the `GreedySearchDecoder` module refers to the `n_layers` attribute of a `decoder` instance of the `LuongAttnDecoderRNN` module, so in order to make `script` work, the `GreedySearchDecoder` module's constructor needs to be changed from: - -:: - - def __init__(self, encoder, decoder): - -to: - -:: - - def __init__(self, encoder, decoder, decoder_n_layers): - ... - self._decoder_n_layers = decoder_n_layers - - -and the `GreedySearchDecoder`'s `forward` method needs to refer `self._decoder_n_layers` instead of `decoder.n_layers`. - -2. RuntimeError `python value of type '...' 
cannot be used as a value.` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The complete error message for this one continues with `Perhaps it is a closed over global variable? If so, please consider passing it in as an argument or use a local variable instead.`, store global variables' values as attributes in the model constructor (there's no need to add them to a special list called `__constants__`). The reason is that global values can be used conveniently in normal model training and inference, but the global values are not accessible during the scripting. - -For example, `device` and `SOS_token` are global variables, and to make `script` work, they need to be added to the `GreedySearchDecoder`'s constructor: - -:: - - self._device = device - self._SOS_token = SOS_token - -and referred to as `self._device` and `self._SOS_token` instead of `device` and `SOS_token` in the `GreedySearchDecoder`'s `forward` method. - -3. RuntimeError `all inputs of range must be '...', found Tensor (inferred) in argument` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The error message continues with: `add type definitions for each of the module's forward method arguments. Because all parameters to a TorchScript function are of the `torch.Tensor` type by default, you need to specifically declare the type for each parameter that is not of type 'Tensor'. For a complete list of TorchScript-supported types, see `here `_. - -For example, the `GreedySearchDecoder`'s `forward` method signature needs to be changed from: - -:: - - def forward(self, input_seq, input_length, max_length): - -to: - -:: - - def forward(self, input_seq, input_length, max_length : int): - -After using the `trace` or `script` method above, and fixing possible errors, you should have a TorchScript model ready to be optimized for mobile. - - -Optimize a TorchScript Model --------------------------------------- - -Simply run the following code snippet to optimize a TorchScript model generated with the `trace` and/or `script` method: - -:: - - from torch.utils.mobile_optimizer import optimize_for_mobile - optimized_torchscript_model = optimize_for_mobile(torchscript_model) - -The optimized model can then be saved and deployed in mobile apps: - -:: - - optimized_torchscript_model.save("optimized_torchscript_model.pth") - -By default, for the CPU backend, `optimize_for_mobile` performs the following types of optimizations: - -* `Conv2D and BatchNorm fusion` which folds Conv2d-BatchNorm2d into Conv2d; - -* `Insert and fold prepacked ops` which rewrites the model graph to replace 2D convolutions and linear ops with their prepacked counterparts. - -* `ReLU and hardtanh fusion` which rewrites graph by finding ReLU/hardtanh ops and fuses them together. - -* `Dropout removal` which removes dropout nodes from this module when training is false. - -* `Conv packed params hoisting` which moves convolution packed params to the root module, so that the convolution structs can be deleted. This decreases model size without impacting numerics. - -For the Vulkan backend,`optimize_for_mobile` performs the following type of optimization: - -* `Automatic GPU transfer` which rewrites the graph so that moving input and output data to and from the GPU becomes part of the model. - -Optimization types can be disabled by passing an optimization blocklist as an argument to `optimize_for_mobile`. - -Learn More ------------------ -1. 
The official `TorchScript Language Reference `_. -2. The `torch.utils.mobile_optimizer` `API documentation `_. diff --git a/recipes_source/torch_compile_backend_ipex.rst b/recipes_source/torch_compile_backend_ipex.rst deleted file mode 100644 index 58a53b525a0..00000000000 --- a/recipes_source/torch_compile_backend_ipex.rst +++ /dev/null @@ -1,168 +0,0 @@ -Intel® Extension for PyTorch* Backend on Intel® CPUs -==================================================== - -To work better with `torch.compile` on Intel® CPUs, Intel® Extension for PyTorch* implements a backend ``ipex``. -It targets to improve hardware resource usage efficiency on Intel platforms for better performance. -The `ipex` backend is implemented with further customizations designed in Intel® Extension for -PyTorch* for the model compilation. - -Usage Example -~~~~~~~~~~~~~ - -Train FP32 ----------- - -Check the example below to learn how to utilize the `ipex` backend with `torch.compile` for model training with FP32 data type. - -.. code:: python - - import torch - import torchvision - - LR = 0.001 - DOWNLOAD = True - DATA = 'datasets/cifar10/' - - transform = torchvision.transforms.Compose([ - torchvision.transforms.Resize((224, 224)), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) - ]) - train_dataset = torchvision.datasets.CIFAR10( - root=DATA, - train=True, - transform=transform, - download=DOWNLOAD, - ) - train_loader = torch.utils.data.DataLoader( - dataset=train_dataset, - batch_size=128 - ) - - model = torchvision.models.resnet50() - criterion = torch.nn.CrossEntropyLoss() - optimizer = torch.optim.SGD(model.parameters(), lr = LR, momentum=0.9) - model.train() - - #################### code changes #################### - import intel_extension_for_pytorch as ipex - - # Invoke the following API optionally, to apply frontend optimizations - model, optimizer = ipex.optimize(model, optimizer=optimizer) - - compile_model = torch.compile(model, backend="ipex") - ###################################################### - - for batch_idx, (data, target) in enumerate(train_loader): - optimizer.zero_grad() - output = compile_model(data) - loss = criterion(output, target) - loss.backward() - optimizer.step() - - -Train BF16 ----------- - -Check the example below to learn how to utilize the `ipex` backend with `torch.compile` for model training with BFloat16 data type. - -.. 
code:: python - - import torch - import torchvision - - LR = 0.001 - DOWNLOAD = True - DATA = 'datasets/cifar10/' - - transform = torchvision.transforms.Compose([ - torchvision.transforms.Resize((224, 224)), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) - ]) - train_dataset = torchvision.datasets.CIFAR10( - root=DATA, - train=True, - transform=transform, - download=DOWNLOAD, - ) - train_loader = torch.utils.data.DataLoader( - dataset=train_dataset, - batch_size=128 - ) - - model = torchvision.models.resnet50() - criterion = torch.nn.CrossEntropyLoss() - optimizer = torch.optim.SGD(model.parameters(), lr = LR, momentum=0.9) - model.train() - - #################### code changes #################### - import intel_extension_for_pytorch as ipex - - # Invoke the following API optionally, to apply frontend optimizations - model, optimizer = ipex.optimize(model, dtype=torch.bfloat16, optimizer=optimizer) - - compile_model = torch.compile(model, backend="ipex") - ###################################################### - - with torch.cpu.amp.autocast(): - for batch_idx, (data, target) in enumerate(train_loader): - optimizer.zero_grad() - output = compile_model(data) - loss = criterion(output, target) - loss.backward() - optimizer.step() - - -Inference FP32 --------------- - -Check the example below to learn how to utilize the `ipex` backend with `torch.compile` for model inference with FP32 data type. - -.. code:: python - - import torch - import torchvision.models as models - - model = models.resnet50(weights='ResNet50_Weights.DEFAULT') - model.eval() - data = torch.rand(1, 3, 224, 224) - - #################### code changes #################### - import intel_extension_for_pytorch as ipex - - # Invoke the following API optionally, to apply frontend optimizations - model = ipex.optimize(model, weights_prepack=False) - - compile_model = torch.compile(model, backend="ipex") - ###################################################### - - with torch.no_grad(): - compile_model(data) - - -Inference BF16 --------------- - -Check the example below to learn how to utilize the `ipex` backend with `torch.compile` for model inference with BFloat16 data type. - -.. code:: python - - import torch - import torchvision.models as models - - model = models.resnet50(weights='ResNet50_Weights.DEFAULT') - model.eval() - data = torch.rand(1, 3, 224, 224) - - #################### code changes #################### - import intel_extension_for_pytorch as ipex - - # Invoke the following API optionally, to apply frontend optimizations - model = ipex.optimize(model, dtype=torch.bfloat16, weights_prepack=False) - - compile_model = torch.compile(model, backend="ipex") - ###################################################### - - with torch.no_grad(), torch.autocast(device_type="cpu", dtype=torch.bfloat16): - compile_model(data) diff --git a/recipes_source/torch_compile_caching_configuration_tutorial.rst b/recipes_source/torch_compile_caching_configuration_tutorial.rst new file mode 100644 index 00000000000..21565d0562d --- /dev/null +++ b/recipes_source/torch_compile_caching_configuration_tutorial.rst @@ -0,0 +1,78 @@ +Compile Time Caching Configuration +========================================================= +**Authors:** `Oguz Ulgen `_ and `Sam Larsen `_ + +Introduction +------------------ + +PyTorch Compiler implements several caches to reduce compilation latency. +This recipe demonstrates how you can configure various parts of the caching in ``torch.compile``. 
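For orientation, here is a minimal sketch, not part of the recipe text, of how the environment variables documented in the sections that follow might be set from Python before the first ``torch.compile`` call. The cache directory and Redis host are placeholders.

.. code-block:: python

    import os

    # Configure Inductor's on-disk caches before the first compile in this process.
    os.environ["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"                      # local FX graph cache
    os.environ["TORCHINDUCTOR_CACHE_DIR"] = "/tmp/shared_inductor_cache"  # placeholder location

    # Optional Redis-backed remote cache (placeholder host; needs a reachable server):
    # os.environ["TORCHINDUCTOR_FX_GRAPH_REMOTE_CACHE"] = "1"
    # os.environ["TORCHINDUCTOR_REDIS_HOST"] = "redis.example.internal"
    # os.environ["TORCHINDUCTOR_REDIS_PORT"] = "6379"

    import torch

    @torch.compile
    def fn(x):
        return x.sin() + x.cos()

    fn(torch.randn(8))  # compiled artifacts now flow through the configured caches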
+ +Prerequisites +------------------- + +Before starting this recipe, make sure that you have the following: + +* Basic understanding of ``torch.compile``. See: + + * `torch.compiler API documentation `__ + * `Introduction to torch.compile `__ + * `Compile Time Caching in torch.compile `__ + +* PyTorch 2.4 or later + +Inductor Cache Settings +---------------------------- + +Most of these caches are in-memory, only used within the same process, and are transparent to the user. An exception is caches that store compiled FX graphs (``FXGraphCache``, ``AOTAutogradCache``). These caches allow Inductor to avoid recompilation across process boundaries when it encounters the same graph with the same Tensor input shapes (and the same configuration). The default implementation stores compiled artifacts in the system temp directory. An optional feature also supports sharing those artifacts within a cluster by storing them in a Redis database. + +There are a few settings relevant to caching and to FX graph caching in particular. +The settings are accessible via environment variables listed below or can be hard-coded in the Inductor’s config file. + +TORCHINDUCTOR_FX_GRAPH_CACHE +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +This setting enables the local FX graph cache feature, which stores artifacts in the host’s temp directory. Setting it to ``1`` enables the feature while any other value disables it. By default, the disk location is per username, but users can enable sharing across usernames by specifying ``TORCHINDUCTOR_CACHE_DIR`` (below). + +TORCHINDUCTOR_AUTOGRAD_CACHE +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +This setting extends ``FXGraphCache`` to store cached results at the ``AOTAutograd`` level, rather than at the Inductor level. Setting it to ``1`` enables this feature, while any other value disables it. +By default, the disk location is per username, but users can enable sharing across usernames by specifying ``TORCHINDUCTOR_CACHE_DIR`` (below). +``TORCHINDUCTOR_AUTOGRAD_CACHE`` requires ``TORCHINDUCTOR_FX_GRAPH_CACHE`` to work. The same cache dir stores cache entries for ``AOTAutogradCache`` (under ``{TORCHINDUCTOR_CACHE_DIR}/aotautograd``) and ``FXGraphCache`` (under ``{TORCHINDUCTOR_CACHE_DIR}/fxgraph``). + +TORCHINDUCTOR_CACHE_DIR +~~~~~~~~~~~~~~~~~~~~~~~~ +This setting specifies the location of all on-disk caches. By default, the location is in the system temp directory under ``torchinductor_``, for example, ``/tmp/torchinductor_myusername``. + +Note that if ``TRITON_CACHE_DIR`` is not set in the environment, Inductor sets the ``Triton`` cache directory to this same temp location, under the Triton sub-directory. + +TORCHINDUCTOR_FX_GRAPH_REMOTE_CACHE +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +This setting enables the remote FX graph cache feature. The current implementation uses ``Redis``. ``1`` enables caching, and any other value disables it. The following environment variables configure the host and port of the Redis server: + +``TORCHINDUCTOR_REDIS_HOST`` (defaults to ``localhost``) +``TORCHINDUCTOR_REDIS_PORT`` (defaults to ``6379``) + +.. note:: + + Note that if Inductor locates a remote cache entry, it stores the compiled artifact in the local on-disk cache; that local artifact would be served on subsequent runs on the same machine. + +TORCHINDUCTOR_AUTOGRAD_REMOTE_CACHE +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Similar to ``TORCHINDUCTOR_FX_GRAPH_REMOTE_CACHE``, this setting enables the remote ``AOTAutogradCache`` feature. The current implementation uses Redis. 
Setting it to ``1`` enables caching, while any other value disables it. The following environment variables are used to configure the host and port of the ``Redis`` server: +* ``TORCHINDUCTOR_REDIS_HOST`` (defaults to ``localhost``) +* ``TORCHINDUCTOR_REDIS_PORT`` (defaults to ``6379``) + +`TORCHINDUCTOR_AUTOGRAD_REMOTE_CACHE`` requires ``TORCHINDUCTOR_FX_GRAPH_REMOTE_CACHE`` to be enabled in order to function. The same Redis server can be used to store both AOTAutograd and FXGraph cache results. + +TORCHINDUCTOR_AUTOTUNE_REMOTE_CACHE +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +This setting enables a remote cache for ``TorchInductor``’s autotuner. Similar to remote FX graph cache, the current implementation uses Redis. Setting it to ``1`` enables caching, while any other value disables it. The same host / port environment variables mentioned above apply to this cache. + +TORCHINDUCTOR_FORCE_DISABLE_CACHES +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Set this value to ``1`` to disable all Inductor caching. This setting is useful for tasks like experimenting with cold-start compile times or forcing recompilation for debugging purposes. + +Conclusion +------------- +In this recipe, we have learned how to configure PyTorch Compiler's caching mechanisms. Additionally, we explored the various settings and environment variables that allow users to configure and optimize these caching features according to their specific needs. + diff --git a/recipes_source/torch_compile_caching_tutorial.rst b/recipes_source/torch_compile_caching_tutorial.rst index a5b392bdd5b..e846817cbc0 100644 --- a/recipes_source/torch_compile_caching_tutorial.rst +++ b/recipes_source/torch_compile_caching_tutorial.rst @@ -1,12 +1,16 @@ Compile Time Caching in ``torch.compile`` ========================================================= -**Authors:** `Oguz Ulgen `_ and `Sam Larsen `_ +**Author:** `Oguz Ulgen `_ Introduction ------------------ -PyTorch Inductor implements several caches to reduce compilation latency. -This recipe demonstrates how you can configure various parts of the caching in ``torch.compile``. +PyTorch Compiler provides several caching offerings to reduce compilation latency. +This recipe will explain these offerings in detail to help users pick the best option for their use case. + +Check out `Compile Time Caching Configurations `__ for how to configure these caches. + +Also check out our caching benchmark at `PT CacheBench Benchmarks `__. Prerequisites ------------------- @@ -17,45 +21,87 @@ Before starting this recipe, make sure that you have the following: * `torch.compiler API documentation `__ * `Introduction to torch.compile `__ + * `Triton language documentation `__ * PyTorch 2.4 or later -Inductor Cache Settings ----------------------------- +Caching Offerings +--------------------- + +``torch.compile`` provides the following caching offerings: + +* End to end caching (also known as ``Mega-Cache``) +* Modular caching of ``TorchDynamo``, ``TorchInductor``, and ``Triton`` + +It is important to note that caching validates that the cache artifacts are used with the same PyTorch and Triton version, as well as, same GPU when device is set to be cuda. + +``torch.compile`` end-to-end caching (``Mega-Cache``) +------------------------------------------------------------ + +End to end caching, from here onwards referred to ``Mega-Cache``, is the ideal solution for users looking for a portable caching solution that can be stored in a database and can later be fetched possibly on a separate machine. 
+ +``Mega-Cache`` provides two compiler APIs: + +* ``torch.compiler.save_cache_artifacts()`` +* ``torch.compiler.load_cache_artifacts()`` + +The intended use case is after compiling and executing a model, the user calls ``torch.compiler.save_cache_artifacts()`` which will return the compiler artifacts in a portable form. Later, potentially on a different machine, the user may call ``torch.compiler.load_cache_artifacts()`` with these artifacts to pre-populate the ``torch.compile`` caches in order to jump-start their cache. + +Consider the following example. First, compile and save the cache artifacts. -Most of these caches are in-memory, only used within the same process, and are transparent to the user. An exception is the FX graph cache that stores compiled FX graphs. This cache allows Inductor to avoid recompilation across process boundaries when it encounters the same graph with the same Tensor input shapes (and the same configuration). The default implementation stores compiled artifacts in the system temp directory. An optional feature also supports sharing those artifacts within a cluster by storing them in a Redis database. +.. code-block:: python -There are a few settings relevant to caching and to FX graph caching in particular. -The settings are accessible via environment variables listed below or can be hard-coded in Inductor’s config file. + @torch.compile + def fn(x, y): + return x.sin() @ y -TORCHINDUCTOR_FX_GRAPH_CACHE -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This setting enables the local FX graph cache feature, i.e., by storing artifacts in the host’s temp directory. ``1`` enables, and any other value disables it. By default, the disk location is per username, but users can enable sharing across usernames by specifying ``TORCHINDUCTOR_CACHE_DIR`` (below). + a = torch.rand(100, 100, dtype=dtype, device=device) + b = torch.rand(100, 100, dtype=dtype, device=device) -TORCHINDUCTOR_CACHE_DIR -~~~~~~~~~~~~~~~~~~~~~~~~ -This setting specifies the location of all on-disk caches. By default, the location is in the system temp directory under ``torchinductor_``, for example, ``/tmp/torchinductor_myusername``. + result = fn(a, b) -Note that if ``TRITON_CACHE_DIR`` is not set in the environment, Inductor sets the Triton cache directory to this same temp location, under the Triton subdirectory. + artifacts = torch.compiler.save_cache_artifacts() -TORCHINDUCTOR_FX_GRAPH_REMOTE_CACHE -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This setting enables the remote FX graph cache feature. The current implementation uses Redis. ``1`` enables caching, and any other value disables it. The following environment variables configure the host and port of the Redis server: + assert artifacts is not None + artifact_bytes, cache_info = artifacts -``TORCHINDUCTOR_REDIS_HOST`` (defaults to ``localhost``) -``TORCHINDUCTOR_REDIS_PORT`` (defaults to ``6379``) + # Now, potentially store artifact_bytes in a database + # You can use cache_info for logging -Note that if Inductor locates a remote cache entry, it stores the compiled artifact in the local on-disk cache; that local artifact would be served on subsequent runs on the same machine. +Later, you can jump-start the cache by the following: -TORCHINDUCTOR_AUTOTUNE_REMOTE_CACHE -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This setting enables a remote cache for Inductor’s autotuner. As with the remote FX graph cache, the current implementation uses Redis. ``1`` enables caching, and any other value disables it. 
The same host / port environment variables listed above apply to this cache. +.. code-block:: python + + # Potentially download/fetch the artifacts from the database + torch.compiler.load_cache_artifacts(artifact_bytes) + +This operation populates all the modular caches that will be discussed in the next section, including ``PGO``, ``AOTAutograd``, ``Inductor``, ``Triton``, and ``Autotuning``. + + +Modular caching of ``TorchDynamo``, ``TorchInductor``, and ``Triton`` +----------------------------------------------------------- + +The aforementioned ``Mega-Cache`` is composed of individual components that can be used without any user intervention. By default, PyTorch Compiler comes with local on-disk caches for ``TorchDynamo``, ``TorchInductor``, and ``Triton``. These caches include: + +* ``FXGraphCache``: A cache of graph-based IR components used in compilation. +* ``TritonCache``: A cache of Triton-compilation results, including ``cubin`` files generated by ``Triton`` and other caching artifacts. +* ``InductorCache``: A bundle of ``FXGraphCache`` and ``Triton`` cache. +* ``AOTAutogradCache``: A cache of joint graph artifacts. +* ``PGO-cache``: A cache of dynamic shape decisions to reduce number of recompilations. +* `AutotuningCache `__: + * ``Inductor`` generates ``Triton`` kernels and benchmarks them to select the fastest kernels. + * ``torch.compile``'s built-in ``AutotuningCache`` caches these results. + +All these cache artifacts are written to ``TORCHINDUCTOR_CACHE_DIR`` which by default will look like ``/tmp/torchinductor_myusername``. + + +Remote Caching +---------------- + +We also provide a remote caching option for users who would like to take advantage of a Redis based cache. Check out `Compile Time Caching Configurations `__ to learn more about how to enable the Redis-based caching. -TORCHINDUCTOR_FORCE_DISABLE_CACHES -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Set this value to ``1`` to disable all Inductor caching. This setting is useful for tasks like experimenting with cold-start compile times or forcing recompilation for debugging purposes. Conclusion ------------- In this recipe, we have learned that PyTorch Inductor's caching mechanisms significantly reduce compilation latency by utilizing both local and remote caches, which operate seamlessly in the background without requiring user intervention. -Additionally, we explored the various settings and environment variables that allow users to configure and optimize these caching features according to their specific needs. + diff --git a/recipes_source/torch_compile_torch_function_modes.py b/recipes_source/torch_compile_torch_function_modes.py new file mode 100644 index 00000000000..7808579563e --- /dev/null +++ b/recipes_source/torch_compile_torch_function_modes.py @@ -0,0 +1,77 @@ +""" +(beta) Utilizing Torch Function modes with torch.compile +============================================================ + +**Author:** `Michael Lazos `_ +""" + +######################################################### +# This recipe covers how to use a key torch extensibility point, +# torch function modes, in tandem with ``torch.compile`` to override +# the behavior of torch operators, also know as **ops**, at trace time, with no runtime overhead. +# +# .. note:: +# +# This recipe requires PyTorch 2.7.0 or later. 
+ + +##################################################################### +# Rewriting a torch op (torch.add -> torch.mul) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# For this example, we'll use torch function modes to rewrite occurences +# of addition with multiply instead. This type of override can be common +# if a certain backend has a custom implementation that should be dispatched +# for a given op. +import torch + +# exit cleanly if we are on a device that doesn't support ``torch.compile`` +if torch.cuda.get_device_capability() < (7, 0): + print("Exiting because torch.compile is not supported on this device.") + import sys + sys.exit(0) + +from torch.overrides import BaseTorchFunctionMode + +# Define our mode, Note: ``BaseTorchFunctionMode`` +# implements the actual invocation of func(..) +class AddToMultiplyMode(BaseTorchFunctionMode): + def __torch_function__(self, func, types, args=(), kwargs=None): + if func == torch.Tensor.add: + func = torch.mul + + return super().__torch_function__(func, types, args, kwargs) + +@torch.compile() +def test_fn(x, y): + return x + y * x # Note: infix operators map to torch.Tensor.* methods + +x = torch.rand(2, 2) +y = torch.rand_like(x) + +with AddToMultiplyMode(): + z = test_fn(x, y) + +assert torch.allclose(z, x * y * x) + +# The mode can also be used within the compiled region as well like this: + +@torch.compile() +def test_fn(x, y): + with AddToMultiplyMode(): + return x + y * x # Note: infix operators map to torch.Tensor.* methods + +x = torch.rand(2, 2) +y = torch.rand_like(x) +z = test_fn(x, y) + +assert torch.allclose(z, x * y * x) + +###################################################################### +# Conclusion +# ~~~~~~~~~~ +# In this recipe we demonstrated how to override the behavior of ``torch.*`` operators +# using torch function modes from within ``torch.compile``. This enables users to utilize +# the extensibility benefits of torch function modes without the runtime overhead +# of calling torch function on every op invocation. +# +# * See `Extending Torch API with Modes `__ for other examples and background on Torch Function modes. diff --git a/recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py b/recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py index 68187d6162a..10ecd74ce91 100644 --- a/recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py +++ b/recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py @@ -140,19 +140,220 @@ def add_fn(x, y): print(f"Vector addition of\nX:\t{x}\nY:\t{y}\nis equal to\n{out}") ###################################################################### -# Composibility and Limitations +# Composability +# ------------------------------------------------------------------- +# +# User-defined Triton kernels do not automatically support all PyTorch +# subsystems. This can be seen in the following use cases: +# +# - Adding a CPU fallback +# - Adding a ``FlopCounter`` formula +# - Composing with Tensor Subclasses +# +# To compose with additional PyTorch subsystems, use ``torch.library.triton_op``. +# +# ``triton_op is`` a structured way of defining a custom operator that is backed by one +# or more Triton kernels: like regular custom operators (``torch.library.custom_op``), +# you are able to specify the interactions with PyTorch subsystems via ``torch.library``. 
+# However, unlike ``torch.library.custom_op``, which creates opaque callables with respect to +# ``torch.compile``, ``torch.compile`` traces into ``triton_op`` to apply optimizations. +# +# Here’s a chart of which API to use when integrating Triton kernels with PyTorch. +# +# .. list-table:: +# :header-rows: 1 +# +# * - +# - Triton kernel (no explicit ``torch.library`` wrapper) +# - ``torch.library.triton_op`` +# - ``torch.library.custom_op`` +# * - Supports inference +# - Yes +# - Yes +# - Yes +# * - Supports training +# - In the majority of cases +# - Yes +# - Yes +# * - Supports ``torch.compile`` +# - Yes +# - Yes +# - Yes +# * - Supports ``torch.compile(fullgraph=True)`` +# - In the majority of cases +# - In the majority of cases +# - In all cases +# * - Does torch.compile trace into the implementation? +# - Yes +# - Yes +# - No +# * - Supports AOTInductor +# - Yes +# - Yes +# - No +# * - Supports PyTorch Subsystems like FlopCounterMode, CPU Fallback, Tensor Subclasses +# - No +# - Yes +# - Yes + +###################################################################### +# Wrapping Triton kernels with ``triton_op`` +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Use ``torch.library.triton_op`` to wrap a function that may invoke one or more Triton kernels. +# Use ``torch.library.wrap_triton`` to wrap the calls to the Triton kernel. + +from torch.library import triton_op, wrap_triton + +@triton_op("mylib::mysin", mutates_args={}) +def mysin(x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + n_elements = x.numel() + wrap_triton(sin_kernel)[(n_elements,)](x, out, n_elements, BLOCK_SIZE=4) + return out + +@triton.jit +def sin_kernel( + in_ptr0, + out_ptr, + n_elements, + BLOCK_SIZE: "tl.constexpr", +): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + offsets = block_start + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + x = tl.load(in_ptr0 + offsets, mask=mask) + output = tl.sin(x) + tl.store(out_ptr + offsets, output, mask=mask) + +###################################################################### +# You can invoke the ``triton_op`` in one of the following two ways. + +x = torch.randn(3, device="cuda") +y = mysin(x) +z = torch.ops.mylib.mysin.default(x) + +assert torch.allclose(y, x.sin()) +assert torch.allclose(z, x.sin()) + +###################################################################### +# The resulting ``triton_op`` works with ``torch.compile`` and ``AOTInductor``. + +y = torch.compile(mysin)(x) +assert torch.allclose(y, x.sin()) + +###################################################################### +# Adding training support +# ^^^^^^^^^^^^^^^^^^^^^^^ +# +# Use ``register_autograd`` to add an autograd formula for the ``triton_op``. +# Prefer this to using ``torch.autograd.Function`` (which has various composability footguns +# with ``torch.compile``). + +def backward(ctx, grad): + x, = ctx.saved_tensors + return grad * x.cos() + +def setup_context(ctx, inputs, output): + x, = inputs + ctx.save_for_backward(x) + +mysin.register_autograd(backward, setup_context=setup_context) + +###################################################################### +# Note that the backward must be a composition of PyTorch-understood operators. 
+# If you want the backward to call Triton kernels, then those must be wrapped in ``triton_op`` as well: + +@triton.jit +def cos_kernel( + in_ptr0, + out_ptr, + n_elements, + BLOCK_SIZE: "tl.constexpr", +): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + offsets = block_start + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + x = tl.load(in_ptr0 + offsets, mask=mask) + output = tl.cos(x) + tl.store(out_ptr + offsets, output, mask=mask) + +@triton_op("mylib::mycos", mutates_args={}) +def mycos(x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + n_elements = x.numel() + wrap_triton(cos_kernel)[(n_elements,)](x, out, n_elements, BLOCK_SIZE=4) + return out + +def backward(ctx, grad): + x, = ctx.saved_tensors + return grad * mycos(x) + +def setup_context(ctx, inputs, output): + x, = inputs + ctx.save_for_backward(x) + +mysin.register_autograd(backward, setup_context=setup_context) + +###################################################################### +# Adding a CPU Fallback +# ^^^^^^^^^^^^^^^^^^^^^ +# Triton kernels don’t run on CPU. Use ``register_kernel`` to add a CPU (or any other device) fallback for the ``triton_op``: + +@mysin.register_kernel("cpu") +def _(x): + return torch.sin(x) + +x = torch.randn(3) +y = mysin(x) +assert torch.allclose(y, x.sin()) + +###################################################################### +# The fallback must be composed of PyTorch operators. + +###################################################################### +# Adding a FlopCounter Formula +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# To specify how many flops the triton kernel reports under PyTorch's flop counter, +# use ``register_flop_formula``. + +from torch.utils.flop_counter import FlopCounterMode, register_flop_formula + +@register_flop_formula(torch.ops.mylib.mysin) +def _(x_shape): + numel = 1 + for s in x_shape: + numel *= s + return numel + +x = torch.randn(3, device="cuda") + +######################################################### +# ``FlopCounterMode`` requires `tabulate `__. +# Before running the code below, make sure you have ``tabulate`` installed or install by +# running ``pip install tabulate``. +# +# >>> with FlopCounterMode() as flop_counter: +# >>> y = mysin(x) + +###################################################################### +# Limitations # -------------------------------------------------------------------- # # As of PyTorch 2.3, the support for user-defined Triton kernels in ``torch.compile`` # includes dynamic shapes, ``torch.autograd.Function``, JIT inductor, and AOT inductor. # You can use these features together to build complex, high-performance models. # +# PyTorch 2.6 added ``torch.library.triton_op``, which adds support for +# user-defined Triton kernels in tensor subclasses and other advanced features. +# # However, there are certain limitations to be aware of: # -# * **Tensor Subclasses:** Currently, there is no support for -# tensor subclasses and other advanced features. # * **Triton Features:** While ``triton.heuristics`` can be used either standalone or -# before ``triton.autotune``, it cannot be used after ```triton.autotune``. This +# before ``triton.autotune``, it cannot be used after ``triton.autotune``. This # implies that if ``triton.heuristics`` and ``triton.autotune`` are to be used # together, ``triton.heuristics`` must be used first. 
# diff --git a/recipes_source/torch_compiler_set_stance_tutorial.py b/recipes_source/torch_compiler_set_stance_tutorial.py new file mode 100644 index 00000000000..56b338db801 --- /dev/null +++ b/recipes_source/torch_compiler_set_stance_tutorial.py @@ -0,0 +1,244 @@ +# -*- coding: utf-8 -*- + +""" +Dynamic Compilation Control with ``torch.compiler.set_stance`` +========================================================================= +**Author:** `William Wen `_ +""" + +###################################################################### +# ``torch.compiler.set_stance`` is a ``torch.compiler`` API that +# enables you to change the behavior of ``torch.compile`` across different +# calls to your model without having to reapply ``torch.compile`` to your model. +# +# This recipe provides some examples on how to use ``torch.compiler.set_stance``. +# +# +# .. contents:: +# :local: +# +# Prerequisites +# --------------- +# +# - ``torch >= 2.6`` + +###################################################################### +# Description +# ----------- +# ``torch.compile.set_stance`` can be used as a decorator, context manager, or raw function +# to change the behavior of ``torch.compile`` across different calls to your model. +# +# In the example below, the ``"force_eager"`` stance ignores all ``torch.compile`` directives. + +import torch + + +@torch.compile +def foo(x): + if torch.compiler.is_compiling(): + # torch.compile is active + return x + 1 + else: + # torch.compile is not active + return x - 1 + + +inp = torch.zeros(3) + +print(foo(inp)) # compiled, prints 1 + +###################################################################### +# Sample decorator usage + + +@torch.compiler.set_stance("force_eager") +def bar(x): + # force disable the compiler + return foo(x) + + +print(bar(inp)) # not compiled, prints -1 + +###################################################################### +# Sample context manager usage + +with torch.compiler.set_stance("force_eager"): + print(foo(inp)) # not compiled, prints -1 + +###################################################################### +# Sample raw function usage + +torch.compiler.set_stance("force_eager") +print(foo(inp)) # not compiled, prints -1 +torch.compiler.set_stance("default") + +print(foo(inp)) # compiled, prints 1 + +###################################################################### +# ``torch.compile`` stance can only be changed **outside** of any ``torch.compile`` region. Attempts +# to do otherwise will result in an error. + + +@torch.compile +def baz(x): + # error! + with torch.compiler.set_stance("force_eager"): + return x + 1 + + +try: + baz(inp) +except Exception as e: + print(e) + + +@torch.compiler.set_stance("force_eager") +def inner(x): + return x + 1 + + +@torch.compile +def outer(x): + # error! + return inner(x) + + +try: + outer(inp) +except Exception as e: + print(e) + +###################################################################### +# Other stances include: +# - ``"default"``: The default stance, used for normal compilation. +# - ``"eager_on_recompile"``: Run code eagerly when a recompile is necessary. If there is cached compiled code valid for the input, it will still be used. +# - ``"fail_on_recompile"``: Raise an error when recompiling a function. +# +# See the ``torch.compiler.set_stance`` `doc page `__ +# for more stances and options. More stances/options may also be added in the future. 
+ +###################################################################### +# Examples +# -------- + +###################################################################### +# Preventing recompilation +# ~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Some models do not expect any recompilations - for example, you may always have inputs with the same shape. +# Since recompilations may be expensive, we may wish to error out when we attempt to recompile so we can detect and fix recompilation cases. +# The ``"fail_on_recompilation"`` stance can be used for this. + + +@torch.compile +def my_big_model(x): + return torch.relu(x) + + +# first compilation +my_big_model(torch.randn(3)) + +with torch.compiler.set_stance("fail_on_recompile"): + my_big_model(torch.randn(3)) # no recompilation - OK + try: + my_big_model(torch.randn(4)) # recompilation - error + except Exception as e: + print(e) + +###################################################################### +# If erroring out is too disruptive, we can use ``"eager_on_recompile"`` instead, +# which will cause ``torch.compile`` to fall back to eager instead of erroring out. +# This may be useful if we don't expect recompilations to happen frequently, but +# when one is required, we'd rather pay the cost of running eagerly over the cost of recompilation. + + +@torch.compile +def my_huge_model(x): + if torch.compiler.is_compiling(): + return x + 1 + else: + return x - 1 + + +# first compilation +print(my_huge_model(torch.zeros(3))) # 1 + +with torch.compiler.set_stance("eager_on_recompile"): + print(my_huge_model(torch.zeros(3))) # 1 + print(my_huge_model(torch.zeros(4))) # -1 + print(my_huge_model(torch.zeros(3))) # 1 + + +###################################################################### +# Measuring performance gains +# =========================== +# +# ``torch.compiler.set_stance`` can be used to compare eager vs. compiled performance +# without having to define a separate eager model. + + +# Returns the result of running `fn()` and the time it took for `fn()` to run, +# in seconds. We use CUDA events and synchronization for the most accurate +# measurements. +def timed(fn): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + result = fn() + end.record() + torch.cuda.synchronize() + return result, start.elapsed_time(end) / 1000 + + +@torch.compile +def my_gigantic_model(x, y): + x = x @ y + x = x @ y + x = x @ y + return x + + +inps = torch.randn(5, 5), torch.randn(5, 5) + +with torch.compiler.set_stance("force_eager"): + print("eager:", timed(lambda: my_gigantic_model(*inps))[1]) + +# warmups +for _ in range(3): + my_gigantic_model(*inps) + +print("compiled:", timed(lambda: my_gigantic_model(*inps))[1]) + + +###################################################################### +# Crashing sooner +# =============== +# +# Running an eager iteration first before a compiled iteration using the ``"force_eager"`` stance +# can help us to catch errors unrelated to ``torch.compile`` before attempting a very long compile. 
+ + +@torch.compile +def my_humongous_model(x): + return torch.sin(x, x) + + +try: + with torch.compiler.set_stance("force_eager"): + print(my_humongous_model(torch.randn(3))) + # this call to the compiled model won't run + print(my_humongous_model(torch.randn(3))) +except Exception as e: + print(e) + +######################################## +# Conclusion +# -------------- +# In this recipe, we have learned how to use the ``torch.compiler.set_stance`` API +# to modify the behavior of ``torch.compile`` across different calls to a model +# without needing to reapply it. The recipe demonstrates using +# ``torch.compiler.set_stance`` as a decorator, context manager, or raw function +# to control compilation stances like ``force_eager``, ``default``, +# ``eager_on_recompile``, and "fail_on_recompile." +# +# For more information, see: `torch.compiler.set_stance API documentation `__. diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py new file mode 100644 index 00000000000..c0cbb7e2800 --- /dev/null +++ b/recipes_source/torch_export_aoti_python.py @@ -0,0 +1,276 @@ +# -*- coding: utf-8 -*- + +""" +.. meta:: + :description: An end-to-end example of how to use AOTInductor for Python runtime. + :keywords: torch.export, AOTInductor, torch._inductor.aoti_compile_and_package, aot_compile, torch._export.aoti_load_package + +``torch.export`` AOTInductor Tutorial for Python runtime (Beta) +=============================================================== +**Author:** Ankith Gunapal, Bin Bao, Angela Yi +""" + +###################################################################### +# +# .. warning:: +# +# ``torch._inductor.aoti_compile_and_package`` and +# ``torch._inductor.aoti_load_package`` are in Beta status and are subject +# to backwards compatibility breaking changes. This tutorial provides an +# example of how to use these APIs for model deployment using Python +# runtime. +# +# It has been shown `previously +# `__ how +# AOTInductor can be used to do Ahead-of-Time compilation of PyTorch exported +# models by creating an artifact that can be run in a non-Python environment. +# In this tutorial, you will learn an end-to-end example of how to use +# AOTInductor for Python runtime. +# +# **Contents** +# +# .. contents:: +# :local: + +###################################################################### +# Prerequisites +# ------------- +# * PyTorch 2.6 or later +# * Basic understanding of ``torch.export`` and AOTInductor +# * Complete the `AOTInductor: Ahead-Of-Time Compilation for Torch.Export-ed Models `_ tutorial + +###################################################################### +# What you will learn +# ---------------------- +# * How to use AOTInductor for Python runtime. +# * How to use :func:`torch._inductor.aoti_compile_and_package` along with :func:`torch.export.export` to generate a compiled artifact +# * How to load and run the artifact in a Python runtime using :func:`torch._export.aot_load`. +# * When to you use AOTInductor with a Python runtime + +###################################################################### +# Model Compilation +# ----------------- +# +# We will use the TorchVision pretrained ``ResNet18`` model as an example. +# +# The first step is to export the model to a graph representation using +# :func:`torch.export.export`. To learn more about using this function, you can +# check out the `docs `_ or the +# `tutorial `_. 
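######################################################################
# As a minimal illustration of this first step, the snippet below exports a
# toy module and prints the captured graph. The ``ToyModel`` class here is
# purely illustrative and is not part of the ResNet18 flow that follows.

import torch

class ToyModel(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x + 1)

# Export the toy module with one example input; the result is an
# ``ExportedProgram`` holding an ATen-level graph plus its signature.
toy_exported_program = torch.export.export(ToyModel(), (torch.randn(4, 8),))
print(toy_exported_program)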
+# +# Once we have exported the PyTorch model and obtained an ``ExportedProgram``, +# we can apply :func:`torch._inductor.aoti_compile_and_package` to AOTInductor +# to compile the program to a specified device, and save the generated contents +# into a ".pt2" artifact. +# +# .. note:: +# +# This API supports the same available options that :func:`torch.compile` +# has, such as ``mode`` and ``max_autotune`` (for those who want to enable +# CUDA graphs and leverage Triton based matrix multiplications and +# convolutions) + +import os +import torch +import torch._inductor +from torchvision.models import ResNet18_Weights, resnet18 + +model = resnet18(weights=ResNet18_Weights.DEFAULT) +model.eval() + +with torch.inference_mode(): + inductor_configs = {} + + if torch.cuda.is_available(): + device = "cuda" + inductor_configs["max_autotune"] = True + else: + device = "cpu" + + model = model.to(device=device) + example_inputs = (torch.randn(2, 3, 224, 224, device=device),) + + exported_program = torch.export.export( + model, + example_inputs, + ) + path = torch._inductor.aoti_compile_and_package( + exported_program, + package_path=os.path.join(os.getcwd(), "resnet18.pt2"), + inductor_configs=inductor_configs + ) + +###################################################################### +# The result of :func:`aoti_compile_and_package` is an artifact "resnet18.pt2" +# which can be loaded and executed in Python and C++. +# +# The artifact itself contains a bunch of AOTInductor generated code, such as +# a generated C++ runner file, a shared library compiled from the C++ file, and +# CUDA binary files, aka cubin files, if optimizing for CUDA. +# +# Structure-wise, the artifact is a structured ``.zip`` file, with the following +# specification: +# +# .. code:: +# . +# ├── archive_format +# ├── version +# ├── data +# │ ├── aotinductor +# │ │ └── model +# │ │ ├── xxx.cpp # AOTInductor generated cpp file +# │ │ ├── xxx.so # AOTInductor generated shared library +# │ │ ├── xxx.cubin # Cubin files (if running on CUDA) +# │ │ └── xxx_metadata.json # Additional metadata to save +# │ ├── weights +# │ │ └── TBD +# │ └── constants +# │ └── TBD +# └── extra +# └── metadata.json +# +# We can use the following command to inspect the artifact contents: +# +# .. code:: bash +# +# $ unzip -l resnet18.pt2 +# +# .. 
code:: +# +# Archive: resnet18.pt2 +# Length Date Time Name +# --------- ---------- ----- ---- +# 1 01-08-2025 16:40 version +# 3 01-08-2025 16:40 archive_format +# 10088 01-08-2025 16:40 data/aotinductor/model/cagzt6akdaczvxwtbvqe34otfe5jlorktbqlojbzqjqvbfsjlge4.cubin +# 17160 01-08-2025 16:40 data/aotinductor/model/c6oytfjmt5w4c7onvtm6fray7clirxt7q5xjbwx3hdydclmwoujz.cubin +# 16616 01-08-2025 16:40 data/aotinductor/model/c7ydp7nocyz323hij4tmlf2kcedmwlyg6r57gaqzcsy3huneamu6.cubin +# 17776 01-08-2025 16:40 data/aotinductor/model/cyqdf46ordevqhiddvpdpp3uzwatfbzdpl3auj2nx23uxvplnne2.cubin +# 10856 01-08-2025 16:40 data/aotinductor/model/cpzfebfgrusqslui7fxsuoo4tvwulmrxirc5tmrpa4mvrbdno7kn.cubin +# 14608 01-08-2025 16:40 data/aotinductor/model/c5ukeoz5wmaszd7vczdz2qhtt6n7tdbl3b6wuy4rb2se24fjwfoy.cubin +# 11376 01-08-2025 16:40 data/aotinductor/model/csu3nstcp56tsjfycygaqsewpu64l5s6zavvz7537cm4s4cv2k3r.cubin +# 10984 01-08-2025 16:40 data/aotinductor/model/cp76lez4glmgq7gedf2u25zvvv6rksv5lav4q22dibd2zicbgwj3.cubin +# 14736 01-08-2025 16:40 data/aotinductor/model/c2bb5p6tnwz4elgujqelsrp3unvkgsyiv7xqxmpvuxcm4jfl7pc2.cubin +# 11376 01-08-2025 16:40 data/aotinductor/model/c6eopmb2b4ngodwsayae4r5q6ni3jlfogfbdk3ypg56tgpzhubfy.cubin +# 11624 01-08-2025 16:40 data/aotinductor/model/chmwe6lvoekzfowdbiizitm3haiiuad5kdm6sd2m6mv6dkn2zk32.cubin +# 15632 01-08-2025 16:40 data/aotinductor/model/c3jop5g344hj3ztsu4qm6ibxyaaerlhkzh2e6emak23rxfje6jam.cubin +# 25472 01-08-2025 16:40 data/aotinductor/model/chaiixybeiuuitm2nmqnxzijzwgnn2n7uuss4qmsupgblfh3h5hk.cubin +# 139389 01-08-2025 16:40 data/aotinductor/model/cvk6qzuybruhwxtfblzxiov3rlrziv5fkqc4mdhbmantfu3lmd6t.cpp +# 27 01-08-2025 16:40 data/aotinductor/model/cvk6qzuybruhwxtfblzxiov3rlrziv5fkqc4mdhbmantfu3lmd6t_metadata.json +# 47195424 01-08-2025 16:40 data/aotinductor/model/cvk6qzuybruhwxtfblzxiov3rlrziv5fkqc4mdhbmantfu3lmd6t.so +# --------- ------- +# 47523148 18 files + + +###################################################################### +# Model Inference in Python +# ------------------------- +# +# To load and run the artifact in Python, we can use :func:`torch._inductor.aoti_load_package`. +# + +import os +import torch +import torch._inductor + +model_path = os.path.join(os.getcwd(), "resnet18.pt2") + +compiled_model = torch._inductor.aoti_load_package(model_path) +example_inputs = (torch.randn(2, 3, 224, 224, device=device),) + +with torch.inference_mode(): + output = compiled_model(example_inputs) + + +###################################################################### +# When to use AOTInductor with a Python Runtime +# --------------------------------------------- +# +# There are mainly two reasons why one would use AOTInductor with a Python Runtime: +# +# - ``torch._inductor.aoti_compile_and_package`` generates a singular +# serialized artifact. This is useful for model versioning for deployments +# and tracking model performance over time. +# - With :func:`torch.compile` being a JIT compiler, there is a warmup +# cost associated with the first compilation. Your deployment needs to +# account for the compilation time taken for the first inference. With +# AOTInductor, the compilation is done ahead of time using +# ``torch.export.export`` and ``torch._inductor.aoti_compile_and_package``. +# At deployment time, after loading the model, running inference does not +# have any additional cost. 
+# +# +# The section below shows the speedup achieved with AOTInductor for first inference +# +# We define a utility function ``timed`` to measure the time taken for inference +# + +import time +def timed(fn): + # Returns the result of running `fn()` and the time it took for `fn()` to run, + # in seconds. We use CUDA events and synchronization for accurate + # measurement on CUDA enabled devices. + if torch.cuda.is_available(): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + else: + start = time.time() + + result = fn() + if torch.cuda.is_available(): + end.record() + torch.cuda.synchronize() + else: + end = time.time() + + # Measure time taken to execute the function in miliseconds + if torch.cuda.is_available(): + duration = start.elapsed_time(end) + else: + duration = (end - start) * 1000 + + return result, duration + + +###################################################################### +# Lets measure the time for first inference using AOTInductor + +torch._dynamo.reset() + +model = torch._inductor.aoti_load_package(model_path) +example_inputs = (torch.randn(1, 3, 224, 224, device=device),) + +with torch.inference_mode(): + _, time_taken = timed(lambda: model(example_inputs)) + print(f"Time taken for first inference for AOTInductor is {time_taken:.2f} ms") + + +###################################################################### +# Lets measure the time for first inference using ``torch.compile`` + +torch._dynamo.reset() + +model = resnet18(weights=ResNet18_Weights.DEFAULT).to(device) +model.eval() + +model = torch.compile(model) +example_inputs = torch.randn(1, 3, 224, 224, device=device) + +with torch.inference_mode(): + _, time_taken = timed(lambda: model(example_inputs)) + print(f"Time taken for first inference for torch.compile is {time_taken:.2f} ms") + +###################################################################### +# We see that there is a drastic speedup in first inference time using AOTInductor compared +# to ``torch.compile`` + +###################################################################### +# Conclusion +# ---------- +# +# In this recipe, we have learned how to effectively use the AOTInductor for Python runtime by +# compiling and loading a pretrained ``ResNet18`` model. This process +# demonstrates the practical application of generating a compiled artifact and +# running it within a Python environment. We also looked at the advantage of using +# AOTInductor in model deployments, with regards to speed up in first inference time. diff --git a/recipes_source/torch_export_challenges_solutions.rst b/recipes_source/torch_export_challenges_solutions.rst new file mode 100644 index 00000000000..1f8b1ae45a4 --- /dev/null +++ b/recipes_source/torch_export_challenges_solutions.rst @@ -0,0 +1,331 @@ +Demonstration of torch.export flow, common challenges and the solutions to address them +======================================================================================= +**Authors:** `Ankith Gunapal `__, `Jordi Ramon `__, `Marcos Carranza `__ + +In the `Introduction to torch.export Tutorial `__ , we learned how to use `torch.export `__. +This tutorial expands on the previous one and explores the process of exporting popular models with code, as well as addresses common challenges that may arise with ``torch.export``. 
+ +In this tutorial, you will learn how to export models for these use cases: + +* Video classifier (`MViT `__) +* Automatic Speech Recognition (`OpenAI Whisper-Tiny `__) +* Image Captioning (`BLIP `__) +* Promptable Image Segmentation (`SAM2 `__) + +Each of the four models were chosen to demonstrate unique features of `torch.export`, as well as some practical considerations +and issues faced in the implementation. + +Prerequisites +------------- + +* PyTorch 2.4 or later +* Basic understanding of ``torch.export`` and PyTorch Eager inference. + + +Key requirement for ``torch.export``: No graph break +---------------------------------------------------- + +`torch.compile `__ speeds up PyTorch code by using JIT to compile PyTorch code into optimized kernels. It optimizes the given model +using ``TorchDynamo`` and creates an optimized graph , which is then lowered into the hardware using the backend specified in the API. +When TorchDynamo encounters unsupported Python features, it breaks the computation graph, lets the default Python interpreter +handle the unsupported code, and then resumes capturing the graph. This break in the computation graph is called a `graph break `__. + +One of the key differences between ``torch.export`` and ``torch.compile`` is that ``torch.export`` doesn’t support graph breaks +which means that the entire model or part of the model that you are exporting needs to be a single graph. This is because handling graph breaks +involves interpreting the unsupported operation with default Python evaluation, which is incompatible with what ``torch.export`` is +designed for. You can read details about the differences between the various PyTorch frameworks in this `link `__ + +You can identify graph breaks in your program by using the following command: + +.. code:: sh + + TORCH_LOGS="graph_breaks" python .py + +You will need to modify your program to get rid of graph breaks. Once resolved, you are ready to export the model. +PyTorch runs `nightly benchmarks `__ for `torch.compile` on popular HuggingFace and TIMM models. +Most of these models have no graph breaks. + +The models in this recipe have no graph breaks, but fail with `torch.export`. + +Video Classification +-------------------- + +MViT is a class of models based on `MultiScale Vision Transformers `__. This model has been trained for video classification using the `Kinetics-400 Dataset `__. +This model with a relevant dataset can be used for action recognition in the context of gaming. + + +The code below exports MViT by tracing with ``batch_size=2`` and then checks if the ExportedProgram can run with ``batch_size=4``. + +.. code:: python + + import numpy as np + import torch + from torchvision.models.video import MViT_V1_B_Weights, mvit_v1_b + import traceback as tb + + model = mvit_v1_b(weights=MViT_V1_B_Weights.DEFAULT) + + # Create a batch of 2 videos, each with 16 frames of shape 224x224x3. + input_frames = torch.randn(2,16, 224, 224, 3) + # Transpose to get [1, 3, num_clips, height, width]. + input_frames = np.transpose(input_frames, (0, 4, 1, 2, 3)) + + # Export the model. + exported_program = torch.export.export( + model, + (input_frames,), + ) + + # Create a batch of 4 videos, each with 16 frames of shape 224x224x3. + input_frames = torch.randn(4,16, 224, 224, 3) + input_frames = np.transpose(input_frames, (0, 4, 1, 2, 3)) + try: + exported_program.module()(input_frames) + except Exception: + tb.print_exc() + + +Error: Static batch size +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: sh + + raise RuntimeError( + RuntimeError: Expected input at *args[0].shape[0] to be equal to 2, but got 4 + + +By default, the exporting flow will trace the program assuming that all input shapes are static, so if you run the program with +input shapes that are different than the ones you used while tracing, you will run into an error. + +Solution +~~~~~~~~ + +To address the error, we specify the first dimension of the input (``batch_size``) to be dynamic , specifying the expected range of ``batch_size``. +In the corrected example shown below, we specify that the expected ``batch_size`` can range from 1 to 16. +One detail to notice that ``min=2`` is not a bug and is explained in `The 0/1 Specialization Problem `__. A detailed description of dynamic shapes +for ``torch.export`` can be found in the export tutorial. The code shown below demonstrates how to export mViT with dynamic batch sizes: + +.. code:: python + + import numpy as np + import torch + from torchvision.models.video import MViT_V1_B_Weights, mvit_v1_b + import traceback as tb + + + model = mvit_v1_b(weights=MViT_V1_B_Weights.DEFAULT) + + # Create a batch of 2 videos, each with 16 frames of shape 224x224x3. + input_frames = torch.randn(2,16, 224, 224, 3) + + # Transpose to get [1, 3, num_clips, height, width]. + input_frames = np.transpose(input_frames, (0, 4, 1, 2, 3)) + + # Export the model. + batch_dim = torch.export.Dim("batch", min=2, max=16) + exported_program = torch.export.export( + model, + (input_frames,), + # Specify the first dimension of the input x as dynamic + dynamic_shapes={"x": {0: batch_dim}}, + ) + + # Create a batch of 4 videos, each with 16 frames of shape 224x224x3. + input_frames = torch.randn(4,16, 224, 224, 3) + input_frames = np.transpose(input_frames, (0, 4, 1, 2, 3)) + try: + exported_program.module()(input_frames) + except Exception: + tb.print_exc() + + +Automatic Speech Recognition +--------------- + +**Automatic Speech Recognition** (ASR) is the use of machine learning to transcribe spoken language into text. +`Whisper `__ is a Transformer based encoder-decoder model from OpenAI, which was trained on 680k hours of labelled data for ASR and speech translation. +The code below tries to export ``whisper-tiny`` model for ASR. + + +.. code:: python + + import torch + from transformers import WhisperProcessor, WhisperForConditionalGeneration + from datasets import load_dataset + + # load model + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") + + # dummy inputs for exporting the model + input_features = torch.randn(1,80, 3000) + attention_mask = torch.ones(1, 3000) + decoder_input_ids = torch.tensor([[1, 1, 1 , 1]]) * model.config.decoder_start_token_id + + model.eval() + + exported_program: torch.export.ExportedProgram= torch.export.export(model, args=(input_features, attention_mask, decoder_input_ids,)) + + + +Error: strict tracing with TorchDynamo +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: console + + torch._dynamo.exc.InternalTorchDynamoError: AttributeError: 'DynamicCache' object has no attribute 'key_cache' + + +By default ``torch.export`` traces your code using `TorchDynamo `__, a byte-code analysis engine, which symbolically analyzes your code and builds a graph. +This analysis provides a stronger guarantee about safety but not all Python code is supported. When we export the ``whisper-tiny`` model using the +default strict mode, it typically returns an error in Dynamo due to an unsupported feature. 
To understand why this errors in Dynamo, you can refer to this `GitHub issue `__. + +Solution +~~~~~~~~ + +To address the above error , ``torch.export`` supports the ``non_strict`` mode where the program is traced using the Python interpreter, which works similar to +PyTorch eager execution. The only difference is that all ``Tensor`` objects will be replaced by ``ProxyTensors``, which will record all their operations into +a graph. By using ``strict=False``, we are able to export the program. + +.. code:: python + + import torch + from transformers import WhisperProcessor, WhisperForConditionalGeneration + from datasets import load_dataset + + # load model + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") + + # dummy inputs for exporting the model + input_features = torch.randn(1,80, 3000) + attention_mask = torch.ones(1, 3000) + decoder_input_ids = torch.tensor([[1, 1, 1 , 1]]) * model.config.decoder_start_token_id + + model.eval() + + exported_program: torch.export.ExportedProgram= torch.export.export(model, args=(input_features, attention_mask, decoder_input_ids,), strict=False) + +Image Captioning +---------------- + +**Image Captioning** is the task of defining the contents of an image in words. In the context of gaming, Image Captioning can be used to enhance the +gameplay experience by dynamically generating text description of the various game objects in the scene, thereby providing the gamer with additional +details. `BLIP `__ is a popular model for Image Captioning `released by SalesForce Research `__. The code below tries to export BLIP with ``batch_size=1``. + + +.. code:: python + + import torch + from models.blip import blip_decoder + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + image_size = 384 + image = torch.randn(1, 3,384,384).to(device) + caption_input = "" + + model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth' + model = blip_decoder(pretrained=model_url, image_size=image_size, vit='base') + model.eval() + model = model.to(device) + + exported_program: torch.export.ExportedProgram= torch.export.export(model, args=(image,caption_input,), strict=False) + + + +Error: Cannot mutate tensors with frozen storage +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +While exporting a model, it might fail because the model implementation might contain certain Python operations which are not yet supported by ``torch.export``. +Some of these failures may have a workaround. BLIP is an example where the original model errors, which can be resolved by making a small change in the code. +``torch.export`` lists the common cases of supported and unsupported operations in `ExportDB `__ and shows how you can modify your code to make it export compatible. + +.. code:: console + + File "/BLIP/models/blip.py", line 112, in forward + text.input_ids[:,0] = self.tokenizer.bos_token_id + File "/anaconda3/envs/export/lib/python3.10/site-packages/torch/_subclasses/functional_tensor.py", line 545, in __torch_dispatch__ + outs_unwrapped = func._op_dk( + RuntimeError: cannot mutate tensors with frozen storage + + + +Solution +~~~~~~~~ + +Clone the `tensor `__ where export fails. + +.. code:: python + + text.input_ids = text.input_ids.clone() # clone the tensor + text.input_ids[:,0] = self.tokenizer.bos_token_id + +.. note:: + This constraint has been relaxed in PyTorch 2.7 nightlies. 
This should work out-of-the-box in PyTorch 2.7 + +Promptable Image Segmentation +----------------------------- + +**Image segmentation** is a computer vision technique that divides a digital image into distinct groups of pixels, or segments, based on their characteristics. +`Segment Anything Model (SAM) `__) introduced promptable image segmentation, which predicts object masks given prompts that indicate the desired object. `SAM 2 `__ is +the first unified model for segmenting objects across images and videos. The `SAM2ImagePredictor `__ class provides an easy interface to the model for prompting +the model. The model can take as input both point and box prompts, as well as masks from the previous iteration of prediction. Since SAM2 provides strong +zero-shot performance for object tracking, it can be used for tracking game objects in a scene. + + +The tensor operations in the predict method of `SAM2ImagePredictor `__ are happening in the `_predict `__ method. So, we try to export like this. + +.. code:: python + + ep = torch.export.export( + self._predict, + args=(unnorm_coords, labels, unnorm_box, mask_input, multimask_output), + kwargs={"return_logits": return_logits}, + strict=False, + ) + + +Error: Model is not of type ``torch.nn.Module`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``torch.export`` expects the module to be of type ``torch.nn.Module``. However, the module we are trying to export is a class method. Hence it errors. + +.. code:: console + + Traceback (most recent call last): + File "/sam2/image_predict.py", line 20, in + masks, scores, _ = predictor.predict( + File "/sam2/sam2/sam2_image_predictor.py", line 312, in predict + ep = torch.export.export( + File "python3.10/site-packages/torch/export/__init__.py", line 359, in export + raise ValueError( + ValueError: Expected `mod` to be an instance of `torch.nn.Module`, got . + + +Solution +~~~~~~~~ + +We write a helper class, which inherits from ``torch.nn.Module`` and call the ``_predict method`` in the ``forward`` method of the class. The complete code can be found `here `__. + +.. code:: python + + class ExportHelper(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(_, *args, **kwargs): + return self._predict(*args, **kwargs) + + model_to_export = ExportHelper() + ep = torch.export.export( + model_to_export, + args=(unnorm_coords, labels, unnorm_box, mask_input, multimask_output), + kwargs={"return_logits": return_logits}, + strict=False, + ) + +Conclusion +---------- + +In this tutorial, we have learned how to use ``torch.export`` to export models for popular use cases by addressing challenges through correct configuration and simple code modifications. +Once you are able to export a model, you can lower the ``ExportedProgram`` into your hardware using `AOTInductor `__ in case of servers and `ExecuTorch `__ in case of edge device. +To learn more about ``AOTInductor`` (AOTI), please refer to the `AOTI tutorial `__. +To learn more about ``ExecuTorch`` , please refer to the `ExecuTorch tutorial `__. diff --git a/recipes_source/torchscript_inference.rst b/recipes_source/torchscript_inference.rst index 54068e70723..01bc497d38e 100644 --- a/recipes_source/torchscript_inference.rst +++ b/recipes_source/torchscript_inference.rst @@ -1,197 +1,6 @@ -TorchScript for Deployment -========================== +.. + TODO(gmagogsfm): Replace/delete this document by 2.9 release. 
https://github.com/pytorch/tutorials/issues/3456 -In this recipe, you will learn: - -- What TorchScript is -- How to export your trained model in TorchScript format -- How to load your TorchScript model in C++ and do inference - -Requirements ------------- - -- PyTorch 1.5 -- TorchVision 0.6.0 -- libtorch 1.5 -- C++ compiler - -The instructions for installing the three PyTorch components are -available at `pytorch.org`_. The C++ compiler will depend on your -platform. - -What is TorchScript? --------------------- - -**TorchScript** is an intermediate representation of a PyTorch model -(subclass of ``nn.Module``) that can then be run in a high-performance -environment like C++. It’s a high-performance subset of Python that is -meant to be consumed by the **PyTorch JIT Compiler,** which performs -run-time optimization on your model’s computation. TorchScript is the -recommended model format for doing scaled inference with PyTorch models. -For more information, see the PyTorch `Introduction to TorchScript -tutorial`_, the `Loading A TorchScript Model in C++ tutorial`_, and the -`full TorchScript documentation`_, all of which are available on -`pytorch.org`_. - -How to Export Your Model ------------------------- - -As an example, let’s take a pretrained vision model. All of the -pretrained models in TorchVision are compatible with TorchScript. - -Run the following Python 3 code, either in a script or from the REPL: - -.. code:: python3 - - import torch - import torch.nn.functional as F - import torchvision.models as models - - r18 = models.resnet18(pretrained=True) # We now have an instance of the pretrained model - r18_scripted = torch.jit.script(r18) # *** This is the TorchScript export - dummy_input = torch.rand(1, 3, 224, 224) # We should run a quick test - -Let’s do a sanity check on the equivalence of the two models: - -:: - - unscripted_output = r18(dummy_input) # Get the unscripted model's prediction... - scripted_output = r18_scripted(dummy_input) # ...and do the same for the scripted version - - unscripted_top5 = F.softmax(unscripted_output, dim=1).topk(5).indices - scripted_top5 = F.softmax(scripted_output, dim=1).topk(5).indices - - print('Python model top 5 results:\n {}'.format(unscripted_top5)) - print('TorchScript model top 5 results:\n {}'.format(scripted_top5)) - -You should see that both versions of the model give the same results: - -:: - - Python model top 5 results: - tensor([[463, 600, 731, 899, 898]]) - TorchScript model top 5 results: - tensor([[463, 600, 731, 899, 898]]) - -With that check confirmed, go ahead and save the model: - -:: - - r18_scripted.save('r18_scripted.pt') - -Loading TorchScript Models in C++ ---------------------------------- - -Create the following C++ file and name it ``ts-infer.cpp``: - -.. 
code:: cpp - - #include - #include - - - int main(int argc, const char* argv[]) { - if (argc != 2) { - std::cerr << "usage: ts-infer \n"; - return -1; - } - - std::cout << "Loading model...\n"; - - // deserialize ScriptModule - torch::jit::script::Module module; - try { - module = torch::jit::load(argv[1]); - } catch (const c10::Error& e) { - std::cerr << "Error loading model\n"; - std::cerr << e.msg_without_backtrace(); - return -1; - } - - std::cout << "Model loaded successfully\n"; - - torch::NoGradGuard no_grad; // ensures that autograd is off - module.eval(); // turn off dropout and other training-time layers/functions - - // create an input "image" - std::vector inputs; - inputs.push_back(torch::rand({1, 3, 224, 224})); - - // execute model and package output as tensor - at::Tensor output = module.forward(inputs).toTensor(); - - namespace F = torch::nn::functional; - at::Tensor output_sm = F::softmax(output, F::SoftmaxFuncOptions(1)); - std::tuple top5_tensor = output_sm.topk(5); - at::Tensor top5 = std::get<1>(top5_tensor); - - std::cout << top5[0] << "\n"; - - std::cout << "\nDONE\n"; - return 0; - } - -This program: - -- Loads the model you specify on the command line -- Creates a dummy “image” input tensor -- Performs inference on the input - -Also, notice that there is no dependency on TorchVision in this code. -The saved version of your TorchScript model has your learning weights -*and* your computation graph - nothing else is needed. - -Building and Running Your C++ Inference Engine ----------------------------------------------- - -Create the following ``CMakeLists.txt`` file: - -:: - - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) - project(custom_ops) - - find_package(Torch REQUIRED) - - add_executable(ts-infer ts-infer.cpp) - target_link_libraries(ts-infer "${TORCH_LIBRARIES}") - set_property(TARGET ts-infer PROPERTY CXX_STANDARD 11) - -Make the program: - -:: - - cmake -DCMAKE_PREFIX_PATH= - make - -Now, we can run inference in C++, and verify that we get a result: - -:: - - $ ./ts-infer r18_scripted.pt - Loading model... - Model loaded successfully - 418 - 845 - 111 - 892 - 644 - [ CPULongType{5} ] - - DONE - -Important Resources -------------------- - -- `pytorch.org`_ for installation instructions, and more documentation - and tutorials. -- `Introduction to TorchScript tutorial`_ for a deeper initial - exposition of TorchScript -- `Full TorchScript documentation`_ for complete TorchScript language - and API reference - -.. _pytorch.org: https://pytorch.org/ -.. _Introduction to TorchScript tutorial: https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html -.. _Full TorchScript documentation: https://pytorch.org/docs/stable/jit.html -.. _Loading A TorchScript Model in C++ tutorial: https://pytorch.org/tutorials/advanced/cpp_export.html -.. _full TorchScript documentation: https://pytorch.org/docs/stable/jit.html +.. warning:: + TorchScript is deprecated, please use + `torch.export `__ instead. \ No newline at end of file diff --git a/recipes_source/torchserve_vertexai_tutorial.rst b/recipes_source/torchserve_vertexai_tutorial.rst deleted file mode 100644 index 9c748e7b8c1..00000000000 --- a/recipes_source/torchserve_vertexai_tutorial.rst +++ /dev/null @@ -1,144 +0,0 @@ -Deploying a PyTorch Stable Diffusion model as a Vertex AI Endpoint -================================================================== - -Deploying large models, like Stable Diffusion, can be challenging and time-consuming. 
- -In this recipe, we will show how you can streamline the deployment of a PyTorch Stable Diffusion -model by leveraging Vertex AI. - -PyTorch is the framework used by Stability AI on Stable -Diffusion v1.5. Vertex AI is a fully-managed machine learning platform with tools and -infrastructure designed to help ML practitioners accelerate and scale ML in production with -the benefit of open-source frameworks like PyTorch. - -In four steps you can deploy a PyTorch Stable Diffusion model (v1.5). - -Deploying your Stable Diffusion model on a Vertex AI Endpoint can be done in four steps: - -* Create a custom TorchServe handler. - -* Upload model artifacts to Google Cloud Storage (GCS). - -* Create a Vertex AI model with the model artifacts and a prebuilt PyTorch container image. - -* Deploy the Vertex AI model onto an endpoint. - -Let’s have a look at each step in more detail. You can follow and implement the steps using the -`Notebook example `__. - -NOTE: Please keep in mind that this recipe requires a billable Vertex AI as explained in more details in the notebook example. - -Create a custom TorchServe handler ----------------------------------- - -TorchServe is an easy and flexible tool for serving PyTorch models. The model deployed to Vertex AI -uses TorchServe to handle requests and return responses from the model. -You must create a custom TorchServe handler to include in the model artifacts uploaded to Vertex AI. Include the handler file in the -directory with the other model artifacts, like this: `model_artifacts/handler.py`. - -After creating the handler file, you must package the handler as a model archiver (MAR) file. -The output file must be named `model.mar`. - - -.. code:: shell - - !torch-model-archiver \ - -f \ - --model-name \ - --version 1.0 \ - --handler model_artifacts/handler.py \ - --export-path model_artifacts - -Upload model artifacts to Google Cloud Storage (GCS) ----------------------------------------------------- - -In this step we are uploading -`model artifacts `__ -to GCS, like the model file or handler. The advantage of storing your artifacts on GCS is that you can -track the artifacts in a central bucket. - - -.. code:: shell - - BUCKET_NAME = "your-bucket-name-unique" # @param {type:"string"} - BUCKET_URI = f"gs://{BUCKET_NAME}/" - - # Will copy the artifacts into the bucket - !gsutil cp -r model_artifacts $BUCKET_URI - -Create a Vertex AI model with the model artifacts and a prebuilt PyTorch container image ----------------------------------------------------------------------------------------- - -Once you've uploaded the model artifacts into a GCS bucket, you can upload your PyTorch model to -`Vertex AI Model Registry `__. -From the Vertex AI Model Registry, you have an overview of your models -so you can better organize, track, and train new versions. For this you can use the -`Vertex AI SDK `__ -and this -`pre-built PyTorch container `__. - - -.. 
code:: shell - - from google.cloud import aiplatform as vertexai - PYTORCH_PREDICTION_IMAGE_URI = ( - "us-docker.pkg.dev/vertex-ai/prediction/pytorch-gpu.1-12:latest" - ) - MODEL_DISPLAY_NAME = "stable_diffusion_1_5-unique" - MODEL_DESCRIPTION = "stable_diffusion_1_5 container" - - vertexai.init(project='your_project', location='us-central1', staging_bucket=BUCKET_NAME) - - model = aiplatform.Model.upload( - display_name=MODEL_DISPLAY_NAME, - description=MODEL_DESCRIPTION, - serving_container_image_uri=PYTORCH_PREDICTION_IMAGE_URI, - artifact_uri=BUCKET_URI, - ) - -Deploy the Vertex AI model onto an endpoint -------------------------------------------- - -Once the model has been uploaded to Vertex AI Model Registry you can then take it and deploy -it to an Vertex AI Endpoint. For this you can use the Console or the Vertex AI SDK. In this -example you will deploy the model on a NVIDIA Tesla P100 GPU and n1-standard-8 machine. You can -specify your machine type. - - -.. code:: shell - - endpoint = aiplatform.Endpoint.create(display_name=ENDPOINT_DISPLAY_NAME) - - model.deploy( - endpoint=endpoint, - deployed_model_display_name=MODEL_DISPLAY_NAME, - machine_type="n1-standard-8", - accelerator_type="NVIDIA_TESLA_P100", - accelerator_count=1, - traffic_percentage=100, - deploy_request_timeout=1200, - sync=True, - ) - -If you follow this -`notebook `__ -you can also get online predictions using the Vertex AI SDK as shown in the following snippet. - - -.. code:: shell - - instances = [{"prompt": "An examplePup dog with a baseball jersey."}] - response = endpoint.predict(instances=instances) - - with open("img.jpg", "wb") as g: - g.write(base64.b64decode(response.predictions[0])) - - display.Image("img.jpg") - -Create a Vertex AI model with the model artifacts and a prebuilt PyTorch container image - -More resources --------------- - -This tutorial was created using the vendor documentation. To refer to the original documentation on the vendor site, please see -`torchserve example `__. diff --git a/recipes_source/xeon_run_cpu.rst b/recipes_source/xeon_run_cpu.rst index fcf96a2ee84..9ff14be08e3 100644 --- a/recipes_source/xeon_run_cpu.rst +++ b/recipes_source/xeon_run_cpu.rst @@ -1,4 +1,4 @@ -Optimizing PyTorch Inference with Intel® Xeon® Scalable Processors +Optimizing CPU Performance on Intel® Xeon® with run_cpu Script ====================================================================== There are several configuration options that can impact the performance of PyTorch inference when executed on Intel® Xeon® Scalable Processors. 
@@ -361,4 +361,3 @@ See also: * `PyTorch Performance Tuning Guide `__ * `PyTorch Multiprocessing Best Practices `__ -* Grokking PyTorch Intel CPU performance: `Part 1 `__ `Part 2 `__ diff --git a/redirects.py b/redirects.py new file mode 100644 index 00000000000..dd639976ebd --- /dev/null +++ b/redirects.py @@ -0,0 +1,55 @@ +redirects = { + "advanced/cpp_extension.html": "https://docs.pytorch.org/tutorials/advanced/custom_ops_landing_page.html", + "advanced/cpp_cuda_graphs.html": "../index.html", + "advanced/dynamic_quantization_tutorial.html": "../index.html", + "advanced/static_quantization_tutorial.html": "../index.html", + "advanced/super_resolution_with_onnxruntime.html": "../index.html", + "advanced/torch_script_custom_classes": "../index.html", + "advanced_source/static_quantization_tutorial.rst": "../index.html", + "beginner/Intro_to_TorchScript_tutorial.html": "../index.html", + "beginner/deploy_seq2seq_hybrid_frontend_tutorial.html": "../index.html", + "beginner/hybrid_frontend_tutorial": "../index.html", + "beginner/hybrid_frontend/learning_hybrid_frontend_through_example_tutorial.html": "../../index.html", + "beginner/flava_finetuning_tutorial.html": "../index.html", + "beginner/ptcheat.html": "../index.html", + "beginner/audio_io_tutorial.html": "../index.html", + "intermediate/FSDP_adavnced_tutorial.html": "https://docs.pytorch.org/tutorials/intermediate/FSDP_advanced_tutorial.html", + "intermediate/dynamic_quantization_bert_tutorial.html": "../index.html", + "intermediate/flask_rest_api_tutorial": "../index.html", + "intermediate/quantized_transfer_learning_tutorial.html": "../index.html", + "intermediate/torchserve_with_ipex.html": "../index.html", + "intermediate/torchserve_with_ipex_2.html": "../index.html", + "intermediate/tiatoolbox_tutorial.html": "../index.html", + "prototype/backend_config_tutorial.html": "../index.html", + "prototype/flight_recorder_tutorial.html": "https://docs.pytorch.org/tutorials/unstable/flight_recorder_tutorial.html", + "prototype/fx_graph_mode_ptq_dynamic.html": "../index.html", + "prototype/fx_graph_mode_ptq_static.html": "../index.html", + "prototype/fx_graph_mode_quant_guide.html": "../index.html", + "prototype/graph_mode_dynamic_bert_tutorial.html": "../index.html", + "prototype/inductor_windows.html": "https://pytorch.org/tutorials/unstable/inductor_windows.html", + "prototype/numeric_suite_tutorial.html": "../index.html", + "prototype/openvino_quantizer.html": "https://docs.pytorch.org/ao/main/tutorials_source/pt2e_quant_openvino_inductor.html", + "prototype/pt2e_quant_ptq.html": "https://docs.pytorch.org/ao/main/tutorials_source/pt2e_quant_ptq.html", + "prototype/pt2e_quant_ptq_x86_inductor.html": "https://docs.pytorch.org/ao/main/tutorials_source/pt2e_quant_x86_inductor.html", + "prototype/pt2e_quant_qat.html": "https://docs.pytorch.org/ao/main/tutorials_source/pt2e_quant_qat.html", + "prototype/pt2e_quant_x86_inductor.html": "https://docs.pytorch.org/ao/main/tutorials_source/pt2e_quant_x86_inductor.html", + "prototype/pt2e_quant_xpu_inductor.html": "https://docs.pytorch.org/ao/main/tutorials_source/pt2e_quant_xpu_inductor.html", + "prototype/pt2e_quantizer.html": "https://docs.pytorch.org/ao/main/tutorials_source/pt2e_quantizer.html", + "prototype/quantization_in_pytorch_2_0_export_tutorial.html": "../index.html", + "prototype/torchscript_freezing.html": "../index.html", + "recipes_source/cuda_rpc.rst": "../index.html", + "receipes/fuse.html": "../index.html", + "receipes/quantization.html": "../index.html", + 
"receipes/receipes/dynamic_quantization.html": "../index.html", + "recipes/bundled_inputs.html": "../index.html", + "recipes/inference_tuning_on_aws_graviton.html": "../index.html", + "recipes/recipes_index.html": "../recipes_index.html", + "recipes/intel_extension_for_pytorch.html": "../index.html", + "recipes/torch_compile_backend_ipex.html": "../index.html", + "recipes/torchserve_vertexai_tutorial.html": "../index.html", + "recipes/amx.html": "../index.html", + "unstable_source/vulkan_workflow.rst": "../index.html", + "unstable/semi_structured_sparse.html": "https://docs.pytorch.org/tutorials/advanced/semi_structured_sparse.html", + "unstable/skip_param_init.html": "https://docs.pytorch.org/tutorials/recipes/recipes/module_load_state_dict_tips.html", + "unstable_source/backend_config_tutorial.rst": "../index.html", +} diff --git a/tools/linter/adapters/run_from_link.py b/tools/linter/adapters/run_from_link.py new file mode 100644 index 00000000000..57c2f89f9a5 --- /dev/null +++ b/tools/linter/adapters/run_from_link.py @@ -0,0 +1,81 @@ +import argparse +import subprocess +import urllib.request +from pathlib import Path + + +REPO_ROOT = Path(__file__).absolute().parents[3] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Use a formatter in a different repository.", + ) + parser.add_argument( + "--run-init", + action="store_true", + help="Run the initialization script specified by --init-name.", + ) + parser.add_argument( + "--run-lint", + action="store_true", + help="Run the linting script specified by --lint-name.", + ) + parser.add_argument( + "--init-name", + help="Name of the initialization script. This also serves as the filename.", + ) + parser.add_argument( + "--init-link", + help="URL to download the initialization script from.", + ) + parser.add_argument( + "--lint-name", + help="Name of the linting script. This also serves as the filename.", + ) + parser.add_argument( + "--lint-link", + help="URL to download the linting script from.", + ) + + parser.add_argument("args_for_file", nargs=argparse.REMAINDER) + args = parser.parse_args() + # Skip the first -- if present + if args.args_for_file and args.args_for_file[0] == "--": + args.args_for_file = args.args_for_file[1:] + return args + + +def download_file(url: str, location: Path) -> bytes: + response = urllib.request.urlopen(url) + content = response.read() + location.write_bytes(content) + return content + + +def main() -> None: + args = parse_args() + + location = REPO_ROOT / ".lintbin" / "from_link" / "adapters" + location.mkdir(parents=True, exist_ok=True) + + if args.lint_link: + download_file(args.lint_link, location / args.lint_name) + + if args.init_link: + download_file(args.init_link, location / args.init_name) + + if args.run_init: + # Save the content to a file named after the name argument + subprocess_args = ["python3", location / args.init_name] + args.args_for_file + subprocess.run(subprocess_args, check=True) + if args.run_lint: + subprocess_args = ["python3", location / args.lint_name] + args.args_for_file + subprocess.run( + subprocess_args, + check=True, + ) + + +if __name__ == "__main__": + main() diff --git a/tutorial_submission_policy.md b/tutorial_submission_policy.md new file mode 100644 index 00000000000..4717ec438d4 --- /dev/null +++ b/tutorial_submission_policy.md @@ -0,0 +1,106 @@ +# PyTorch Tutorial Submission Policy + +This policy outlines the criteria and process for submitting new +tutorials to the PyTorch community. 
+Our goal is to ensure that all tutorials are of high quality, +relevant, and up-to-date, supporting both the growth of the PyTorch +users and the evolution of the PyTorch framework itself. By following +these guidelines, contributors can help us maintain a robust and +informative educational environment. + +## Acceptance Criteria For New Tutorials + +We accept new tutorials that adhere to one of the following use cases: + +* **Demonstrate New PyTorch Features:** Tutorials that support new features + for upcoming PyTorch releases are typically authored by the engineers who + are developing these features. These tutorials are crucial for showcasing + the latest advancements in PyTorch. We typically do not require more than + one tutorial per feature. + +* **Tutorials showcasing PyTorch usage with other tools and libraries:** We + accept community-contributed tutorials that illustrate innovative uses of + PyTorch alongside other open-source projects, models, and tools. Please + ensure that your tutorial remains neutral and does not promote or endorse + proprietary technologies over others. + +The first use case does not require going through the submission +process outlined below. If your tutorial falls under the second category, +please read and follow the instructions in the +**Submission Process For Community-Contributed Tutorials** section. + +## Submission Process For Community-Contributed Tutorials + +To maintain the quality and relevance of tutorials, we request that +community-contributed tutorials undergo a review process. If you are +interested in contributing a tutorial, please follow these steps: + +1. **Create an issue:** + * Open an issue in the pytorch/tutorials repository proposing the + new tutorial. Clearly explain the importance of the tutorial and + confirm that there is no existing tutorial covering the same or + similar topic. A tutorial should not disproportionately endorse + one technology over another. Please consult with Core Maintainers + to ensure your content adheres to these guidelines. + Use the provided [ISSUE_TEMPLATE](https://github.com/pytorch/tutorials/blob/main/.github/ISSUE_TEMPLATE/feature-request.yml) for the new tutorial request - select **Feature request** when submitting an issue. + + * If there is an existing tutorial on the topic that you would + like to significantly refactor, you can submit a PR. In the + description of the PR, explain why the changes are needed and + how they improve the tutorial. + + * These issues will be triaged by PyTorch maintainers on a case-by-case basis. + * Link any supporting materials including discussions in other repositories. + +1. **Await Approval:** + * Wait for a response from the PyTorch Tutorials maintainers. A PyTorch + tutorial maintainer will review your proposal and + determine whether a tutorial on the proposed topic is desirable. + A comment and an **approved** label will be added to your issue + by a maintainer. The review process for new tutorial PRs submitted + without the corresponding issue may take longer. + +1. **Adhere to writing and styling guidelines:** + * Once approved, follow the guidelines outlined in [CONTRIBUTING.md](https://github.com/pytorch/tutorials/blob/main/CONTRIBUTING.md) + and use the provided [template](https://github.com/pytorch/tutorials/blob/main/beginner_source/template_tutorial.py) for creating your tutorial. + * Link the issue in which you received approval for your tutorial + in the PR. 
+ * We accept tutorials in both ``.rst`` (ReStructuredText) and ``.py`` + (Python) formats. However, unless your tutorial involves using + multiple GPU, parallel/distributed training, or requires extended + execution time (25 minutes or more), we prefer submissions + in Python file format. + +## Maintaining Tutorials + +When you submit a new tutorial, we encourage you to keep it in sync +with the latest PyTorch updates and features. Additionally, we may +contact you to review any PRs, issues, and other related matters to +ensure the tutorial remains a valuable resource. + +Please note the following: + +* If a tutorial breaks against the main branch, it will + be excluded from the build and an issue will be filed against it, + with the author/maintainer notified. If the issue is not resolved + within 90 days, the tutorial might be deleted from the repository. + +* We recommend that each tutorial is reviewed at least once a year to + ensure its relevance. + +## Deleting Stale Tutorials + +A tutorial might be considered stale when it no longer aligns with +the latest PyTorch updates, features, or best practices: + +* The tutorial is no longer functional due to changes in PyTorch or + its dependencies +* The tutorial has been superseded by a newer, more comprehensive, or + more accurate tutorial +* The tutorial does not run successfully in the (CI), indicating + potential compatibility or dependency issues. + +If a tutorial is deemed stale, we will attempt to contact the code owner, +or someone from the tutorial mainatainers might attempt to update it. +However, if despite those attempts we fail to fix it, the tutorial +might be removed from the repository. diff --git a/unstable_index.rst b/unstable_index.rst new file mode 100644 index 00000000000..6e3cfd4364c --- /dev/null +++ b/unstable_index.rst @@ -0,0 +1,164 @@ +Unstable +======== + +API unstable features are not available as part of binary distributions +like PyPI or Conda (except maybe behind run-time flags). To test these +features we would, depending on the feature, recommend building PyTorch +from source (main) or using the nightly wheels that are made +available on `pytorch.org `_. + +*Level of commitment*: We are committing to gathering high bandwidth +feedback only on these features. Based on this feedback and potential +further engagement between community members, we as a community will +decide if we want to upgrade the level of commitment or to fail fast. + + +.. raw:: html + +
+ + + +
+ +
+ +
+
+ +.. Add prototype tutorial cards below this line + +.. vmap + +.. customcarditem:: + :header: Using torch.vmap + :card_description: Learn about torch.vmap, an autovectorizer for PyTorch operations. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: unstable/vmap_recipe.html + :tags: vmap + +.. NestedTensor + +.. customcarditem:: + :header: Nested Tensor + :card_description: Learn about nested tensors, the new way to batch heterogeneous-length data + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: unstable/nestedtensor.html + :tags: NestedTensor + +.. MaskedTensor + +.. customcarditem:: + :header: MaskedTensor Overview + :card_description: Learn about masked tensors, the source of truth for specified and unspecified values + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: unstable/maskedtensor_overview.html + :tags: MaskedTensor + +.. customcarditem:: + :header: Masked Tensor Sparsity + :card_description: Learn about how to leverage sparse layouts (e.g. COO and CSR) in MaskedTensor + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: unstable/maskedtensor_sparsity.html + :tags: MaskedTensor + +.. customcarditem:: + :header: Masked Tensor Advanced Semantics + :card_description: Learn more about Masked Tensor's advanced semantics (reductions and comparing vs. NumPy's MaskedArray) + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: unstable/maskedtensor_advanced_semantics.html + :tags: MaskedTensor + +.. customcarditem:: + :header: MaskedTensor: Simplifying Adagrad Sparse Semantics + :card_description: See a showcase on how masked tensors can enable sparse semantics and provide for a cleaner dev experience + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: unstable/maskedtensor_adagrad.html + :tags: MaskedTensor + +.. Model-Optimization + +.. customcarditem:: + :header: Inductor Cpp Wrapper Tutorial + :card_description: Speed up your models with Inductor Cpp Wrapper + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: unstable/inductor_cpp_wrapper_tutorial.html + :tags: Model-Optimization + +.. customcarditem:: + :header: Inductor Windows CPU Tutorial + :card_description: Speed up your models with Inductor On Windows CPU + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: unstable/inductor_windows.html + :tags: Model-Optimization + +.. customcarditem:: + :header: Use max-autotune compilation on CPU to gain additional performance boost + :card_description: Tutorial for max-autotune mode on CPU to gain additional performance boost + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: unstable/max_autotune_on_CPU_tutorial.html + :tags: Model-Optimization + +.. Distributed +.. customcarditem:: + :header: Flight Recorder Tutorial + :card_description: Debug stuck jobs easily with Flight Recorder + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: unstable/flight_recorder_tutorial.html + :tags: Distributed, Debugging, FlightRecorder + +.. customcarditem:: + :header: Context Parallel Tutorial + :card_description: Parallelize the attention computation along sequence dimension + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: unstable/context_parallel.html + :tags: Distributed, Context Parallel + +.. Integration +.. 
customcarditem:: + :header: Out-of-tree extension autoloading in Python + :card_description: Learn how to improve the seamless integration of out-of-tree extension with PyTorch based on the autoloading mechanism. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: unstable/python_extension_autoload.html + :tags: Extending-PyTorch, Frontend-APIs + +.. GPUDirect Storage +.. customcarditem:: + :header: (prototype) Using GPUDirect Storage + :card_description: Learn how to use GPUDirect Storage in PyTorch. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: unstable/gpu_direct_storage.html + :tags: GPUDirect-Storage + +.. End of tutorial card section + +.. ----------------------------------------- +.. Page TOC +.. ----------------------------------------- + +.. toctree:: + :maxdepth: 2 + :hidden: + + unstable/context_parallel + unstable/flight_recorder_tutorial + unstable/inductor_cpp_wrapper_tutorial + unstable/inductor_windows + unstable/vmap_recipe + unstable/nestedtensor + unstable/maskedtensor_overview + unstable/maskedtensor_sparsity + unstable/maskedtensor_advanced_semantics + unstable/maskedtensor_adagrad + unstable/python_extension_autoload + unstable/gpu_direct_storage.html + unstable/max_autotune_on_CPU_tutorial + unstable/skip_param_init.html diff --git a/prototype_source/README.md b/unstable_source/README.md similarity index 100% rename from prototype_source/README.md rename to unstable_source/README.md diff --git a/unstable_source/README.txt b/unstable_source/README.txt new file mode 100644 index 00000000000..55a94b43626 --- /dev/null +++ b/unstable_source/README.txt @@ -0,0 +1,13 @@ +Prototype Tutorials +------------------ +1. distributed_rpc_profiling.rst + Profiling PyTorch RPC-Based Workloads + https://github.com/pytorch/tutorials/blob/main/unstable_source/distributed_rpc_profiling.rst + +2. flight_recorder_tutorial.rst + Flight Recorder User Guide + https://pytorch.org/tutorials/prototype/flight_recorder_tutorial.html + +3. python_extension_autoload.rst + Autoloading Out-of-Tree Extension + https://github.com/pytorch/tutorials/blob/main/unstable_source/python_extension_autoload.rst diff --git a/unstable_source/context_parallel.rst b/unstable_source/context_parallel.rst new file mode 100644 index 00000000000..b12a4030016 --- /dev/null +++ b/unstable_source/context_parallel.rst @@ -0,0 +1,228 @@ +Introduction to Context Parallel +====================================== +**Authors**: `Xilun Wu `_, `Chien-Chin Huang `__ + +.. note:: + |edit| View and edit this tutorial in `GitHub `__. + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * `Context Parallel APIs `__ + * `1M sequence training in TorchTitan with Context Parallel `__ + + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch 2.7 or later + + +Introduction +------------ + +Context Parallel is an approach used in large language model training to reduce peak activation size by sharding the long input sequence across multiple devices. +It breaks the constraint on input sequence length resulting from peak memory usage on storing activations in Transformer blocks. + +Ring Attention, a novel parallel implementation of the Attention layer, is critical to performant Context Parallel. +Ring Attention shuffles the KV shards and calculates the partial attention scores, repeats until all KV shards have been used on each device. 
+Two Ring Attention variants have been implemented: `the all-gather based pass-KV `__ and `the all-to-all based pass-KV `__: + +1. The all-gather based pass-KV algorithm is used in Llama3 training, which initially performs an all-gather on the key and value tensors, followed by computing the attention output for the + local query tensor chunk. Our modified all-gather based pass-KV algorithm concurrently all-gathers KV shards and computes attention output for the local query tensor chunk + using local key and value tensor chunks, followed by a final computation of attention output for the local query tensor and remaining KV shards. This allows some degree of + overlap between the attention computation and the all-gather collective. For example, in the case of Llama3 training, we also shard ``freq_cis`` over the sequence dimension. +2. The all-to-all approach uses interleaved all-to-all collectives to ring shuffle KV shards to overlap the SDPA (Scaled Dot Product Attention) computation and the all-to-all communication + necessary for the next SDPA. + +The Context Parallel APIs consist of two parts: + +1. ``context_parallel()`` allows users to create a Python context where the SDPA function (``torch.nn.functional.scaled_dot_product_attention``) + will be automatically replaced with Ring Attention. To shard Tensors along a dimension, simply pass the Tensors and their sharding dimensions to + argument ``buffers`` and ``buffer_seq_dims`` respectively. We recommend that users add tensors computing along the sequence dimension to ``buffers`` + and shard them along this dimension. Taking Llama3 training as an example, missing ``freq_cis`` in ``buffers`` will result in a miscalculated rotary embedding. +2. ``set_rotate_method()`` allows users to choose between the all-gather based pass-KV approach and the all-to-all based pass-KV approach. + + +Setup +--------------------- + +With ``torch.distributed.tensor.experimental.context_parallel()``, users can easily shard the Tensor input and parallelize the execution of the SDPA function. +To better demonstrate the usage of this API, we start with a simple code snippet doing SDPA and then parallelize it using the API: + +.. code:: python + + import torch + import torch.nn.functional as F + + from torch.nn.attention import sdpa_kernel, SDPBackend + + + def sdpa_example(): + assert torch.cuda.is_available() + torch.cuda.set_device("cuda:0") + torch.cuda.manual_seed(0) + + batch = 8 + nheads = 8 + qkv_len = 8192 + dim = 32 + backend = SDPBackend.FLASH_ATTENTION + dtype = ( + torch.bfloat16 + if backend == SDPBackend.FLASH_ATTENTION + or backend == SDPBackend.CUDNN_ATTENTION + else torch.float32 + ) + + qkv = [ + torch.rand( + (batch, nheads, qkv_len, dim), + dtype=dtype, + requires_grad=True, + device='cuda', + ) + for _ in range(3) + ] + # specify the SDPBackend to use + with sdpa_kernel(backend): + out = F.scaled_dot_product_attention(*qkv, is_causal=True) + + + if __name__ == "__main__": + sdpa_example() + + +Enable Context Parallel +----------------------- + +Now, let's first adapt it to a distributed program where each rank has the same tensor input. Then we apply the context parallel API to +shard to input and distribute the computation across ranks: + +.. 
code:: python + + # file: cp_sdpa_example.py + import os + + import torch + import torch.distributed as dist + import torch.nn.functional as F + from torch.distributed.device_mesh import init_device_mesh + from torch.distributed.tensor.experimental import context_parallel + from torch.distributed.tensor.experimental._attention import context_parallel_unshard + from torch.nn.attention import sdpa_kernel, SDPBackend + + + def context_parallel_sdpa_example(world_size: int, rank: int): + assert torch.cuda.is_available() + assert dist.is_nccl_available() + torch.cuda.set_device(f"cuda:{rank}") + torch.cuda.manual_seed(0) + + dist.init_process_group( + backend="nccl", + init_method="env://", + world_size=world_size, + rank=rank, + ) + device_mesh = init_device_mesh( + device_type="cuda", mesh_shape=(world_size,), mesh_dim_names=("cp",) + ) + + batch = 8 + nheads = 8 + qkv_len = 64 + dim = 32 + backend = SDPBackend.FLASH_ATTENTION + dtype = ( + torch.bfloat16 + if backend == SDPBackend.FLASH_ATTENTION + or backend == SDPBackend.CUDNN_ATTENTION + else torch.float32 + ) + + qkv = [ + torch.rand( + (batch, nheads, qkv_len, dim), + dtype=dtype, + requires_grad=True, + device='cuda', + ) + for _ in range(3) + ] + # specify the SDPBackend to use + with sdpa_kernel(backend): + out = F.scaled_dot_product_attention(*qkv, is_causal=True) + + # make a clean copy of QKV for output comparison + cp_qkv = [t.detach().clone() for t in qkv] + + with sdpa_kernel(backend): + # This `context_parallel()` performs two actions: + # 1. Shard the tensor objects in `buffers` in-place along the dimension + # specified in `buffer_seq_dims`, the tensors in `buffers` and their + # sharding dims in `buffer_seq_dims` are organized in the same order. + # 2. Replace the execution of `F.scaled_dot_product_attention` with a + # context-paralleled-enabled Ring Attention. + with context_parallel( + device_mesh, buffers=tuple(cp_qkv), buffer_seq_dims=(2, 2, 2) + ): + cp_out = F.scaled_dot_product_attention(*cp_qkv, is_causal=True) + + # The output `cp_out` is still sharded in the same way as QKV + # the `context_parallel_unshard` API allows users to easily + # unshard to gain the full tensor. + (cp_out,) = context_parallel_unshard(device_mesh, [cp_out], [2]) + + assert torch.allclose( + cp_out, + out, + atol=(1e-08 if dtype == torch.float32 else 1e-03 * world_size), + ) + + + if __name__ == "__main__": + rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + + try: + context_parallel_sdpa_example(world_size, rank) + finally: + dist.barrier() + dist.destroy_process_group() + + +You can use the command ``torchrun --standalone --nnodes=1 --nproc-per-node=4 cp_sdpa_example.py`` to launch the above context parallel +SDPA on 4 GPUs. We demonstrate the numeric correctness by comparing the output of Ring Attention to that of SDPA on a single GPU. + + +Select Rotation Approach +------------------------ + +You can choose the desired shards rotation approach in Ring Attention by using ``torch.distributed.tensor.experimental._attention.set_rotate_method()``: + +.. code:: python + + # file: cp_sdpa_example.py + from torch.distributed.tensor.experimental._attention import set_rotate_method + + set_rotate_method("alltoall") # rotate shards using all-to-all + + with sdpa_kernel(backend): + with context_parallel( + device_mesh, buffers=tuple(cp_qkv), buffer_seq_dims=(2, 2, 2) + ): + cp_out = F.scaled_dot_product_attention(*cp_qkv, is_causal=True) + + +The default rotation approach is the all-gather based pass-KV. 
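If you want to make that choice explicit in the script (for example, to switch back after experimenting with ``alltoall``), the same API accepts the all-gather identifier. The snippet below is a minimal sketch, not part of the original example; it assumes ``"allgather"`` is the string accepted by ``set_rotate_method`` for the default rotation (check the function's docstring to confirm):

.. code:: python

    # file: cp_sdpa_example.py (hypothetical variant)
    # Assumption: "allgather" selects the default all-gather based pass-KV
    # rotation; verify against the set_rotate_method docstring.
    from torch.distributed.tensor.experimental._attention import set_rotate_method

    set_rotate_method("allgather")  # rotate shards using all-gather (the default)

    with sdpa_kernel(backend):
        with context_parallel(
            device_mesh, buffers=tuple(cp_qkv), buffer_seq_dims=(2, 2, 2)
        ):
            cp_out = F.scaled_dot_product_attention(*cp_qkv, is_causal=True)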
+
+
+Conclusion
+----------
+
+In this tutorial, we have learned how to parallelize the SDPA computation along the sequence dimension easily with our Context Parallel APIs. For
+design and implementation details, performance analysis, and an end-to-end training example in `TorchTitan `__,
+see our post on `PyTorch native long-context training `__.
diff --git a/prototype_source/distributed_rpc_profiling.rst b/unstable_source/distributed_rpc_profiling.rst
similarity index 100%
rename from prototype_source/distributed_rpc_profiling.rst
rename to unstable_source/distributed_rpc_profiling.rst
diff --git a/unstable_source/flight_recorder_tutorial.rst b/unstable_source/flight_recorder_tutorial.rst
new file mode 100644
index 00000000000..35477779083
--- /dev/null
+++ b/unstable_source/flight_recorder_tutorial.rst
@@ -0,0 +1,304 @@
+Flight Recorder for Debugging Stuck Jobs
+====================================================
+**Authors**: `Chirag Pandya `_, `Junjie Wang `_
+
+What you will learn
+-------------------
+* Learn about a new tool for debugging stuck jobs during distributed training.
+* Learn how you can enable the tool and use the collected data for analyzing stuck jobs.
+
+Prerequisites
+-------------
+
+- PyTorch version 2.5 or later.
+- `tabulate `__. You can install it by running ``pip install tabulate``.
+
+
+Overview
+--------
+An AI distributed training job refers to the process of training a machine learning model using multiple devices, such
+as GPUs or CPUs, connected in a network. This approach allows for faster and more efficient training of large models
+that require significant computational resources.
+An engineer’s goal is to complete an AI training job as quickly as possible and make continuous improvements so that
+subsequent training can be done faster. A trained, usable model is the final desired outcome.
+One of the biggest impediments to completing training is the concept of a *stuck job*.
+
+A distributed AI training job is considered `stuck` when it stops making meaningful progress for an extended period of
+time.
+
+A job can get stuck for various reasons:
+
+- **Data Starvation:** This occurs when the training job is not receiving data at the expected rate, possibly due to issues with the data pipeline or the data source.
+
+- **Resource Constraints:** If the system running the job does not have enough computational resources (such as CPU, GPU, or memory), the job might not be able to proceed.
+
+- **Network Issues:** In a distributed training setup, different parts of the model or data may be processed on different devices. If there are network issues, communication between these devices may be disrupted, causing the job to get stuck.
+
+- **Software Bugs or Errors:** Errors in the training code or the underlying libraries and frameworks can also cause a job to get stuck.
+
+- **Synchronization Issues:** In distributed training, different parts of the computation are often run in parallel and need to be synchronized at certain points. If this synchronization fails, the job can get stuck. For example, a deadlock can occur if one or more ranks fail to join a collective while the remaining ranks have joined. This results in an indefinite wait for the job to progress.
+
+Flight Recorder, as the name suggests, captures diagnostic information as collectives run. The captured diagnostic
+information is used to help identify the root causes of issues when jobs become stuck.
+Flight Recorder consists of two core parts:
+
+- The collection portion: when enabled, information about collectives is recorded in an in-memory circular buffer. Upon job timeout, or on demand, the in-memory buffer can be retrieved or dumped to a file.
+
+- An analyzer script is available in the `tools/flight_recorder `__ directory (details below).
+  The analyzer script runs known heuristics using the collected data and attempts to automatically identify the underlying issue that caused the job to stall.
+
+Enabling Flight Recorder
+------------------------
+There are three required environment variables to get the initial version of Flight Recorder working.
+
+- ``TORCH_NCCL_TRACE_BUFFER_SIZE = (0, N)``: Setting ``N`` to a positive number enables collection.
+  ``N`` represents the number of entries that will be kept internally in a circular buffer.
+  We recommend setting this value to *2000*. The default value is ``2000``.
+- ``TORCH_NCCL_DUMP_ON_TIMEOUT = (true, false)``: Setting this to ``true`` will write out diagnostic files to disk on job timeout.
+  If enabled, there will be one file per rank output in the job's running directory. The default value is ``false``.
+- ``TORCH_FR_DUMP_TEMP_FILE``: Sets the path and file prefix that the Flight Recorder dump will be written to. One file per
+  rank. The default value is ``/tmp/nccl_trace_rank_``.
+
+**Optional settings:**
+
+- ``TORCH_NCCL_TRACE_CPP_STACK = (true, false)``: Setting this to ``true`` enables C++ stack traces to be captured in Flight Recorder.
+  C++ stack traces can be useful in providing the exact code path from a PyTorch Python call down to the primitive
+  C++ implementation. Also see ``TORCH_SYMBOLIZE_MODE`` in additional settings.
+- ``TORCH_NCCL_ENABLE_TIMING = (true, false)``: Setting this to ``true`` will enable additional CUDA events at the start of each collective and
+  record the *duration* of each collective. This may incur some CPU overhead. In the collected data, the
+  *duration* field indicates how long each collective took to execute.
+
+Additional Settings
+-------------------
+
+- ``TORCH_SYMBOLIZE_MODE = (dladdr, addr2line, fast)``: This setting determines the program used to retrieve C++ traces from a running program.
+  The default setting is ``addr2line``.
+
+  ``fast`` is a new experimental mode that is shown to be much faster than the traditional ``addr2line``.
+  Use this setting in conjunction with ``TORCH_NCCL_TRACE_CPP_STACK`` to collect C++ traces in the Flight Recorder data.
+- If you prefer not to have the Flight Recorder data dumped to the local disk but rather to your own storage, you can define your own writer class.
+  This class should inherit from class ``::c10d::DebugInfoWriter`` `(code) `__
+  and then register the new writer using ``::c10d::DebugInfoWriter::registerWriter`` `(code) `__
+  before initializing PyTorch distributed.
+
+Retrieving Flight Recorder Data via an API
+------------------------------------------
+
+You can also retrieve Flight Recorder data with an API call.
+The API with the default arguments is shown below:
+
+.. code:: python
+
+    torch._C._distributed_c10d._dump_nccl_trace(includeCollectives=True, includeStackTraces=True, onlyActive=False)
+
+To view the data, you can ``unpickle`` it as shown below:
+
+.. code:: python
+
+    t = pickle.loads(torch._C._distributed_c10d._dump_nccl_trace())
+    print(t)
+
+Flight Recorder File Formats
+----------------------------
+
+Flight Recorder files are dumped in ``pickle`` format.
Files are written to local disks
+folders.
+
+The contents of an ``unpickled`` Flight Recorder file are shown below:
+
+.. code-block:: json
+
+   {
+     "version": "2.5",
+     "pg_config": {
+       "0": {
+         "name": "0",
+         "desc": "default_pg",
+         "ranks": "[0, 1]"
+       }
+     },
+     "pg_status": {
+       "0": {
+         "last_enqueued_collective": 2,
+         "last_started_collective": -1,
+         "last_completed_collective": 2
+       }
+     },
+     "entries": [
+       {
+         "frames": [
+           {
+             "name": "test_short_pickle",
+             "filename": "pytorch/test/distributed/test_c10d_nccl.py",
+             "line": 3647
+           },
+           {
+             "name": "spawn_main",
+             "filename": ".conda/envs/pytorch-3.10/lib/python3.10/multiprocessing/spawn.py",
+             "line": 116
+           },
+           {
+             "name": "",
+             "filename": "",
+             "line": 1
+           }
+         ],
+         "record_id": 0,
+         "pg_id": 0,
+         "process_group": ("0", "default_pg"),
+         "collective_seq_id": 1,
+         "p2p_seq_id": 0,
+         "op_id": 1,
+         "profiling_name": "nccl:all_reduce",
+         "time_created_ns": 1724779239936775119,
+         "input_sizes": [[3, 4]],
+         "input_dtypes": ["Float"],
+         "output_sizes": [[3, 4]],
+         "output_dtypes": ["Float"],
+         "state": "completed",
+         "time_discovered_started_ns": null,
+         "time_discovered_completed_ns": 1724779239975811724,
+         "retired": true,
+         "timeout_ms": 600000,
+         "is_p2p": false
+       },
+       ...
+     ]
+   }
+
+Analyzing Flight Recorder Dumps
+-------------------------------
+
+We have convenient scripts available in the `pytorch/tools/flight_recorder` directory for analyzing captured
+data.
+
+To run the convenience script, follow these steps:
+
+1. Copy all files from a rank into a single directory.
+
+2. To run the script, use this command:
+
+.. code:: shell
+
+   python fr_trace.py [-o ]
+
+If you install the PyTorch nightly build or build from scratch with ``USE_DISTRIBUTED=1``, you can use the following
+command directly:
+
+.. code:: shell
+
+   torchfrtrace [-o ]
+
+
+Currently, we support two modes for the analyzer script. The first mode allows the script to apply some heuristics to the parsed flight
+recorder dumps to generate a report identifying potential culprits for the timeout. The second mode simply outputs the raw dumps.
+By default, the script prints flight recorder dumps for all ranks and all ``ProcessGroups`` (PGs). This can be narrowed down to certain
+ranks and PGs using the *--selected-ranks* argument for ranks and the *--pg-filters* argument for PGs. An example command is shown below.
+
+Caveat: the ``tabulate`` module is needed, so you might need to install it with pip first.
+
+.. code:: shell
+
+   python fr_trace.py -j [--selected-ranks i j k ...] [--pg-filters tp dp]
+   torchfrtrace -j [--selected-ranks i j k ...] [--pg-filters 0 2]
+
+An End-to-End Example
+------------------------------------
+To demonstrate the use of Flight Recorder, we will use a small program where we induce mismatched collectives.
+In this example, ``rank0`` is programmed to do an additional collective.
+The Flight Recorder dump files are saved to the ``/tmp`` directory.
+For demonstration purposes, we named this program ``crash.py``.
+
+.. note::
+   Please note that this is a simplified example. In real-world scenarios, the process would involve more
+   complexities.
+
+..
code:: python + + import torch + import torch.distributed as dist + import os + from datetime import timedelta + + local_rank = int(os.environ["LOCAL_RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + assert world_size <= 8, "world size must be less than or equal to 8" + os.environ["TORCH_NCCL_DEBUG_INFO_TEMP_FILE"] = "/tmp/trace_" + os.environ["TORCH_NCCL_DUMP_ON_TIMEOUT"] = "1" + os.environ["TORCH_NCCL_TRACE_BUFFER_SIZE"] = "2000" + device = torch.device(f"cuda:{local_rank}") + print(f"{local_rank=} {world_size=} master addr: {os.environ['MASTER_ADDR']} master port: {os.environ['MASTER_PORT']} {device=}") + + # Initialize the process group with a small timeout so that jobs fail quickly + dist.init_process_group("nccl", world_size=world_size, rank=local_rank, timeout=timedelta(seconds=1)) + + a = torch.full((3, 4), float(local_rank), device=device) + # Write some collectives to populate Flight Recorder data + for i in range(2): + print(f"calling allreduce on {local_rank=}") + f = dist.all_reduce(a) + + # rank0 is doing an additional collective + if local_rank == 0: + print("rank0 is doing an allreduce on tensor b, but other ranks forgot") + b = torch.full((4,5), float(local_rank), device=device) + f = dist.all_reduce(b) + + for i in range(2): + print(f"calling allreduce on {local_rank=}") + f = dist.all_reduce(a) + + torch.cuda.synchronize(device=device) + print(f"{local_rank=} exiting") + + +To run this program, use ``torchrun``: + + +.. code:: python + + torchrun --nnodes=1 --nproc_per_node=2 crash.py + +You should see two files in the ``/tmp`` directory: + +.. code:: bash + + $ls /tmp/trace* + # Expected output + /tmp/trace_0 /tmp/trace_1 + +Finally, to analyze these two files, we use the ``torchfrtrace`` command: + +.. code:: bash + + torchfrtrace --prefix "trace_" /tmp/ + +The output from the trace command is meant to be human-readable. It includes information about the +set of collectives that caused a failure. +The output for the command above is shown below. +We can clearly see that rank 1 did not join the "all_reduce" collective. + +.. code-block:: bash + $torchfrtrace --prefix "trace_" /tmp/ + Not all ranks joining collective 5 at entry 4 + group info: 0:default_pg + collective: nccl:all_reduce + missing ranks: {1} + input sizes: [[3, 4]] + output sizes: [[3, 4]] + expected ranks: 2 + collective state: scheduled + collective stack trace: + all_reduce at /home/cpio/local/pytorch/torch/distributed/distributed_c10d.py:2696 + wrapper at /home/cpio/local/pytorch/torch/distributed/c10d_logger.py:83 + at /home/cpio/test/crash.py:44 + + + +Conclusion +---------- +In this tutorial, we have learned about a new PyTorch diagnostic tool called Flight Recorder. +We have discussed how to enable Flight Recorder to collect diagnostic data from a machine. +Additionally, we explored how to analyze the data captured from the Flight Recorder using a +convenience script located in the `tools/flight_recorder `__ +directory of the PyTorch repository. diff --git a/unstable_source/gpu_direct_storage.py b/unstable_source/gpu_direct_storage.py new file mode 100644 index 00000000000..2b06c53bc7f --- /dev/null +++ b/unstable_source/gpu_direct_storage.py @@ -0,0 +1,132 @@ +""" +(prototype) Accelerating ``torch.save`` and ``torch.load`` with GPUDirect Storage +================================================================================= + +GPUDirect Storage enables a direct data path for direct memory access transfers +between GPU memory and storage, avoiding a bounce buffer through the CPU. 
+ +In version **2.7**, we introduced new prototype APIs to ``torch.cuda.gds`` that serve as thin wrappers around +the `cuFile APIs `_ +that can be used with ``torch.Tensor`` to achieve improved I/O performance. + +In this tutorial, we will demonstrate how to use the ``torch.cuda.gds`` APIs in conjunction with +checkpoints generated by ``torch.save`` and ``torch.load`` on local filesystem. + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * Understand how to use the ``torch.cuda.gds`` APIs in conjunction with + checkpoints generated by ``torch.save`` and ``torch.load`` on local filesystem + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch v.2.7.0 or later + * GPUDirect Storage must be installed per + `the documentation `_ + * Ensure that the filesystem that you are saving/loading to supports GPUDirect Storage. +""" + +################################################################################ +# Using GPUDirect Storage with ``torch.save`` and ``torch.load`` +# ------------------------------------------------------------------------------------ +# GPUDirect Storage requires a storage alignment of 4KB. You can toggle this by using +# ``torch.utils.serialization.config.save.storage_alignment``: + +import torch +from torch.utils.serialization import config as serialization_config + +serialization_config.save.storage_alignment = 4096 + +################################################################################ +# The steps involved in the process are as follows: +# * Write the checkpoint file without any actual data. This reserves the space on disk. +# * Read the offsets for the storage associated with each tensor in the checkpoint using ``FakeTensor``. +# * Use ``GDSFile`` to write the appropriate data at these offsets. +# +# Given a state dictionary of tensors that are on the GPU, one can use the ``torch.serialization.skip_data`` context +# manager to save a checkpoint that contains all relevant metadata except the storage bytes. For each ``torch.Storage`` +# in the state dictionary, space will be reserved within the checkpoint for the storage bytes. + +import torch.nn as nn + +m = nn.Linear(5, 10, device='cuda') +sd = m.state_dict() + +with torch.serialization.skip_data(): + torch.save(sd, "checkpoint.pt") + +################################################################################ +# We can get the offsets that each storage should be written to within the checkpoint by loading under +# a ``FakeTensorMode``. A FakeTensor is a tensor that has metadata (such as sizes, strides, dtype, device) +# information about the tensor but does not have any storage bytes. The following snippet will not materialize +# any data but will tag each ``FakeTensor`` with the offset within the checkpoint that +# corresponds to the tensor. +# +# If you are continuously saving the same state dictionary during training, you +# would only need to obtain the offsets once and the same offsets can be re-used. Similarly if tensor is going to +# be saved or loaded to repeatedly you can use the ``torch.cuda.gds.gds_register_buffer`` which wraps +# ``cuFileBufRegister`` to register the storages as GDS buffers. +# +# Note that ``torch.cuda.gds.GdsFile.save_storage`` binds to the synchronous ``cuFileWrite`` API, +# so no synchronization is needed afterwards. 
+ + +import os +from torch._subclasses.fake_tensor import FakeTensorMode + +with FakeTensorMode() as mode: + fake_sd = torch.load("checkpoint.pt") + +for k, v in fake_sd.items(): + print(f"key={k}, offset={v.untyped_storage()._checkpoint_offset}") + +f = torch.cuda.gds.GdsFile("checkpoint.pt", os.O_RDWR) + +for k, v in sd.items(): + offset = fake_sd[k].untyped_storage()._checkpoint_offset + # save_storage is a wrapper around `cuFileWrite` + f.save_storage(v.untyped_storage(), offset) + + +################################################################################ +# We verify correctness of the saved checkpoint by ``torch.load`` and comparing. + +sd_loaded = torch.load("checkpoint.pt") +for k, v in sd_loaded.items(): + assert torch.equal(v, sd[k]) + +################################################################################ +# The loading flow is the inverse: you can use ``torch.load`` with the ``torch.serialization.skip_data`` context +# manager to load everything except the storage bytes. This means that any tensors in the checkpoint will be +# created but their storages will be empty (as if the tensors were created via ``torch.empty``). + +with torch.serialization.skip_data(): + sd_loaded = torch.load("checkpoint.pt") + +################################################################################ +# We once again use the ``FakeTensorMode`` to get the checkpoint offsets and +# ascertain that the loaded checkpoint is the same as the saved checkpoint. +# +# Similar to ``torch.cuda.gds.GdsFile.save_storage``, ``torch.cuda.gds.GdsFile.load_storage`` +# binds to the synchronous ``cuFileRead`` API, so no synchronization is needed afterwards. + +for k, v in sd_loaded.items(): + assert not torch.equal(v, sd[k]) + offset = fake_sd[k].untyped_storage()._checkpoint_offset + # load_storage is a wrapper around `cuFileRead` + f.load_storage(v.untyped_storage(), offset) + +for k, v in sd_loaded.items(): + assert torch.equal(v, sd[k]) + +del f +########################################################## +# Conclusion +# ========== +# +# In this tutorial we have demonstrated how to use the prototype ``torch.cuda.gds`` APIs +# in conjunction with ``torch.save`` and ``torch.load`` on local filesystem. Please +# file an issue in the PyTorch GitHub repo if you have any feedback. diff --git a/prototype_source/gpu_quantization_torchao_tutorial.py b/unstable_source/gpu_quantization_torchao_tutorial.py similarity index 88% rename from prototype_source/gpu_quantization_torchao_tutorial.py rename to unstable_source/gpu_quantization_torchao_tutorial.py index 513d54faba7..874f3227636 100644 --- a/prototype_source/gpu_quantization_torchao_tutorial.py +++ b/unstable_source/gpu_quantization_torchao_tutorial.py @@ -7,7 +7,7 @@ In this tutorial, we will walk you through the quantization and optimization of the popular `segment anything model `_. These steps will mimic some of those taken to develop the -`segment-anything-fast `_ +`segment-anything-fast `_ repo. This step-by-step guide demonstrates how you can apply these techniques to speed up your own models, especially those that use transformers. 
To that end, we will focus on widely applicable @@ -31,22 +31,21 @@ # > conda create -n myenv python=3.10 # > pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 # > pip install git+https://github.com/facebookresearch/segment-anything.git -# > pip install git+https://github.com/pytorch-labs/ao.git +# > pip install git+https://github.com/pytorch/ao.git # # Segment Anything Model checkpoint setup: # -# 1. Go to the `segment-anything repo `_ and download the ``vit_h`` checkpoint. Alternatively, you can just use ``wget``: `wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth --directory-prefix= +# 1. Go to the `segment-anything repo checkpoint `_ and download the ``vit_h`` checkpoint. Alternatively, you can use ``wget`` (for example, ``wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth --directory-prefix=``). # 2. Pass in that directory by editing the code below to say: # -# .. code-block:: -# -# {sam_checkpoint_base_path}= +# .. code-block:: bash # -# This was run on an A100-PG509-200 power limited to 330.00 W +# {sam_checkpoint_base_path}= # import torch -from torchao.quantization import change_linear_weights_to_int8_dqtensors +from torchao.quantization.quant_api import quantize_, Int8DynamicActivationInt8WeightConfig +from torchao.utils import unwrap_tensor_subclass, TORCH_VERSION_AT_LEAST_2_5 from segment_anything import sam_model_registry from torch.utils.benchmark import Timer @@ -144,7 +143,7 @@ def get_sam_model(only_one_block=False, batchsize=1): # for improvements. # # Next, let's apply quantization. Quantization for GPUs comes in three main forms -# in `torchao `_ which is just native +# in `torchao `_ which is just native # pytorch+python code. This includes: # # * int8 dynamic quantization @@ -158,9 +157,9 @@ def get_sam_model(only_one_block=False, batchsize=1): # in memory bound situations where the benefit comes from loading less # weight data, rather than doing less computation. The torchao APIs: # -# ``change_linear_weights_to_int8_dqtensors``, -# ``change_linear_weights_to_int8_woqtensors`` or -# ``change_linear_weights_to_int4_woqtensors`` +# ``Int8DynamicActivationInt8WeightConfig()``, +# ``Int8WeightOnlyConfig()`` or +# ``Int4WeightOnlyConfig()`` # # can be used to easily apply the desired quantization technique and then # once the model is compiled with ``torch.compile`` with ``max-autotune``, quantization is @@ -172,7 +171,7 @@ def get_sam_model(only_one_block=False, batchsize=1): # ``apply_weight_only_int8_quant`` instead as drop in replacement for the two # above (no replacement for int4). # -# The difference between the two APIs is that ``change_linear_weights`` API +# The difference between the two APIs is that the ``Int8DynamicActivationInt8WeightConfig`` API # alters the weight tensor of the linear module so instead of doing a # normal linear, it does a quantized operation. This is helpful when you # have non-standard linear ops that do more than one thing. 
The ``apply`` @@ -187,7 +186,10 @@ def get_sam_model(only_one_block=False, batchsize=1): model, image = get_sam_model(only_one_block, batchsize) model = model.to(torch.bfloat16) image = image.to(torch.bfloat16) -change_linear_weights_to_int8_dqtensors(model) +quantize_(model, Int8DynamicActivationInt8WeightConfig()) +if not TORCH_VERSION_AT_LEAST_2_5: + # needed for subclass + compile to work on older versions of pytorch + unwrap_tensor_subclass(model) model_c = torch.compile(model, mode='max-autotune') quant_res = benchmark(model_c, image) print(f"bf16 compiled runtime of the quantized block is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB") @@ -222,7 +224,10 @@ def get_sam_model(only_one_block=False, batchsize=1): model = model.to(torch.bfloat16) image = image.to(torch.bfloat16) torch._inductor.config.force_fuse_int_mm_with_mul = True -change_linear_weights_to_int8_dqtensors(model) +quantize_(model, Int8DynamicActivationInt8WeightConfig()) +if not TORCH_VERSION_AT_LEAST_2_5: + # needed for subclass + compile to work on older versions of pytorch + unwrap_tensor_subclass(model) model_c = torch.compile(model, mode='max-autotune') quant_res = benchmark(model_c, image) print(f"bf16 compiled runtime of the fused quantized block is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB") @@ -253,7 +258,10 @@ def get_sam_model(only_one_block=False, batchsize=1): torch._inductor.config.coordinate_descent_tuning = True torch._inductor.config.coordinate_descent_check_all_directions = True torch._inductor.config.force_fuse_int_mm_with_mul = True -change_linear_weights_to_int8_dqtensors(model) +quantize_(model, Int8DynamicActivationInt8WeightConfig()) +if not TORCH_VERSION_AT_LEAST_2_5: + # needed for subclass + compile to work on older versions of pytorch + unwrap_tensor_subclass(model) model_c = torch.compile(model, mode='max-autotune') quant_res = benchmark(model_c, image) print(f"bf16 compiled runtime of the final quantized block is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB") @@ -282,7 +290,10 @@ def get_sam_model(only_one_block=False, batchsize=1): model, image = get_sam_model(False, batchsize) model = model.to(torch.bfloat16) image = image.to(torch.bfloat16) - change_linear_weights_to_int8_dqtensors(model) + quantize_(model, Int8DynamicActivationInt8WeightConfig()) + if not TORCH_VERSION_AT_LEAST_2_5: + # needed for subclass + compile to work on older versions of pytorch + unwrap_tensor_subclass(model) model_c = torch.compile(model, mode='max-autotune') quant_res = benchmark(model_c, image) print(f"bf16 compiled runtime of the quantized full model is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB") @@ -297,13 +308,13 @@ def get_sam_model(only_one_block=False, batchsize=1): # ----------------- # In this tutorial, we have learned about the quantization and optimization techniques # on the example of the segment anything model. - +# # In the end, we achieved a full-model apples to apples quantization speedup # of about 7.7% on batch size 16 (677.28ms to 729.65ms). We can push this a # bit further by increasing the batch size and optimizing other parts of # the model. For example, this can be done with some form of flash attention. # # For more information visit -# `torchao `_ and try it on your own +# `torchao `_ and try it on your own # models. 
# diff --git a/unstable_source/inductor_cpp_wrapper_tutorial.rst b/unstable_source/inductor_cpp_wrapper_tutorial.rst new file mode 100644 index 00000000000..9b522a8947b --- /dev/null +++ b/unstable_source/inductor_cpp_wrapper_tutorial.rst @@ -0,0 +1,189 @@ +TorchInductor C++ Wrapper Tutorial +============================================================== + +**Author**: `Chunyuan Wu `_, `Bin Bao `__, `Jiong Gong `__ + +Prerequisites: +---------------- +- `torch.compile and TorchInductor concepts in PyTorch `__ + +Introduction +------------ + +In ``torch.compile``, the default backend **TorchInductor** emits Python wrapper +code that manages memory allocation and kernel invocation. This design provides +flexibility and ease of debugging, but the interpreted nature of Python +introduces runtime overhead in performance-sensitive environments. + +To address this limitation, TorchInductor includes a specialized mode that +generates **C++ wrapper code** in place of the Python wrapper, enabling faster +execution with minimal Python involvement. + + +Enabling the C++ wrapper mode +---------------- +To enable this C++ wrapper mode for TorchInductor, add the following config to your code: + +.. code:: python + + import torch._inductor.config as config + config.cpp_wrapper = True + + +Example code +------------ + +We will use the following model code as an example: + +.. code:: python + + import torch + import torch._inductor.config as config + + config.cpp_wrapper = True + + def fn(x, y): + return (x + y).sum() + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + x = torch.randn(128, 128, device=device) + y = torch.randn(128, 128, device=device) + + opt_fn = torch.compile(fn) + result = opt_fn(x, y) + + +**For CPU** + +The main part of TorchInductor-generated code with the default Python wrapper will look like this: + +.. code:: python + + class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def call(self, args): + arg0_1, arg1_1 = args + args.clear() + assert_size_stride(arg0_1, (128, 128), (128, 1)) + assert_size_stride(arg1_1, (128, 128), (128, 1)) + buf0 = empty_strided_cpu((), (), torch.float32) + cpp_fused_add_sum_0(arg0_1, arg1_1, buf0) + del arg0_1 + del arg1_1 + return (buf0, ) + +By turning on the C++ wrapper, the generated code for the ``call`` function becomes a C++ function +``inductor_entry_impl``: + +.. 
code:: python + + cpp_wrapper_src = ( + r''' + #include + extern "C" void cpp_fused_add_sum_0(const float* in_ptr0, + const float* in_ptr1, + float* out_ptr0); + CACHE_TORCH_DTYPE(float32); + CACHE_TORCH_DEVICE(cpu); + + void inductor_entry_impl( + AtenTensorHandle* + input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + AtenTensorHandle* + output_handles // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed) + ) { + py::gil_scoped_release_simple release; + + auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 2); + auto arg0_1 = std::move(inputs[0]); + auto arg1_1 = std::move(inputs[1]); + static constexpr int64_t *int_array_0=nullptr; + AtenTensorHandle buf0_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(0, int_array_0, int_array_0, cached_torch_dtype_float32, cached_torch_device_type_cpu, 0, &buf0_handle)); + RAIIAtenTensorHandle buf0(buf0_handle); + cpp_fused_add_sum_0((const float*)(arg0_1.data_ptr()), (const float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr())); + arg0_1.reset(); + arg1_1.reset(); + output_handles[0] = buf0.release(); + } // inductor_entry_impl + ... + ''' + ) + + inductor_entry = CppWrapperCodeCache.load_pybinding( + argtypes=["std::vector"], + main_code=cpp_wrapper_src, + device_type="cpu", + num_outputs=1, + kernel_code=None, + ) + + call = _wrap_func(inductor_entry) + +**For GPU** + +Based on the same example code, the generated code for GPU will look like this: + +.. code:: python + + def call(args): + arg0_1, = args + args.clear() + assert_size_stride(arg0_1, (1, ), (1, )) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) # no-op to ensure context + buf0 = empty_strided((19, ), (1, ), device='cuda', dtype=torch.float32) + # Source Nodes: [add, tensor], Original ATen: [aten.add, aten.lift_fresh] + stream0 = get_cuda_stream(0) + triton_poi_fused_add_lift_fresh_0.run(constant0, arg0_1, buf0, 19, grid=grid(19), stream=stream0) + run_intermediate_hooks('add', buf0) + del arg0_1 + return (buf0, ) + +With the C++ wrapper turned on, the below equivalent C++ code will be generated: + +.. code:: python + + inductor_entry = CppWrapperCodeCache.load_pybinding( + argtypes=["std::vector"], + main_code=cpp_wrapper_src, + device_type="cuda", + num_outputs=1, + kernel_code=None, + ) + + def _wrap_func(f): + def g(args): + input_tensors = [arg if isinstance(arg, torch.Tensor) else torch.tensor(arg, device='cpu') for arg in args] + input_handles = torch._C._aoti.unsafe_alloc_void_ptrs_from_tensors(input_tensors) + + args.clear() + del input_tensors + + output_handles = f(input_handles) + output_tensors = torch._C._aoti.alloc_tensors_by_stealing_from_void_ptrs(output_handles) + return output_tensors + + return g + + call = _wrap_func(inductor_entry) + + +Conclusion +------------ + +This tutorial introduced the **C++ wrapper** feature in TorchInductor, designed +to improve model performance with minimal code modification. We described the +motivation for this feature, detailed the experimental API used to enable it, +and compared the generated outputs of the default Python wrapper and the new +C++ wrapper on both CPU and GPU backends to illustrate their distinctions. + +.. For more information on torch.compile, see +.. +.. .. _torch.compile tutorial: https://docs.pytorch.org/tutorials/intermediate/torch_compile_tutorial.html +.. .. 
TORCH_LOGS tutorial: https://docs.pytorch.org/tutorials/recipes/torch_logs.html
diff --git a/unstable_source/inductor_windows.rst b/unstable_source/inductor_windows.rst
new file mode 100644
index 00000000000..871cc48a33e
--- /dev/null
+++ b/unstable_source/inductor_windows.rst
@@ -0,0 +1,105 @@
+How to use ``torch.compile`` on Windows CPU/XPU
+===============================================
+
+**Authors**: `Zhaoqiong Zheng `_, `Xu, Han `_
+
+
+Introduction
+------------
+
+TorchInductor is the new compiler backend that compiles the FX Graphs generated by TorchDynamo into optimized C++/Triton kernels.
+
+This tutorial introduces the steps for using TorchInductor via ``torch.compile`` on Windows CPU/XPU.
+
+
+Software Installation
+---------------------
+
+Now, we will walk you through a step-by-step tutorial for how to use ``torch.compile`` on Windows CPU/XPU.
+
+Install a Compiler
+^^^^^^^^^^^^^^^^^^
+
+A C++ compiler is required for TorchInductor optimization. Let's take Microsoft Visual C++ (MSVC) as an example.
+
+#. Download and install `MSVC `_.
+
+#. During installation, select **Workloads** and then **Desktop & Mobile**. Check **Desktop Development with C++** and install.
+
+.. image:: ../_static/img/install_msvc.png
+
+
+.. note::
+
+   The Windows CPU Inductor also supports the `LLVM Compiler `_ and the `Intel Compiler `_ as C++ compilers for better performance.
+   Please check `Alternative Compiler for better performance on CPU <#alternative-compiler-for-better-performance>`_.
+
+Set Up Environment
+^^^^^^^^^^^^^^^^^^
+Next, let's configure our environment.
+
+#. Open a command line environment via cmd.exe.
+#. Activate ``MSVC`` via the command below::
+
+    "C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Auxiliary/Build/vcvars64.bat"
+#. Create and activate a virtual environment: ::
+
+#. Install `PyTorch 2.5 `_ or later for CPU usage. For XPU usage, install PyTorch 2.7 or later and refer to `Getting Started on Intel GPU `_.
+
+#. Here is an example of how to use TorchInductor on Windows:
+
+   .. code-block:: python
+
+      import torch
+      device="cpu" # or "xpu" for XPU
+      def foo(x, y):
+          a = torch.sin(x)
+          b = torch.cos(x)
+          return a + b
+      opt_foo1 = torch.compile(foo)
+      print(opt_foo1(torch.randn(10, 10).to(device), torch.randn(10, 10).to(device)))
+
+#. 
Below is the output of the above example:: + + tensor([[-3.9074e-02, 1.3994e+00, 1.3894e+00, 3.2630e-01, 8.3060e-01, + 1.1833e+00, 1.4016e+00, 7.1905e-01, 9.0637e-01, -1.3648e+00], + [ 1.3728e+00, 7.2863e-01, 8.6888e-01, -6.5442e-01, 5.6790e-01, + 5.2025e-01, -1.2647e+00, 1.2684e+00, -1.2483e+00, -7.2845e-01], + [-6.7747e-01, 1.2028e+00, 1.1431e+00, 2.7196e-02, 5.5304e-01, + 6.1945e-01, 4.6654e-01, -3.7376e-01, 9.3644e-01, 1.3600e+00], + [-1.0157e-01, 7.7200e-02, 1.0146e+00, 8.8175e-02, -1.4057e+00, + 8.8119e-01, 6.2853e-01, 3.2773e-01, 8.5082e-01, 8.4615e-01], + [ 1.4140e+00, 1.2130e+00, -2.0762e-01, 3.3914e-01, 4.1122e-01, + 8.6895e-01, 5.8852e-01, 9.3310e-01, 1.4101e+00, 9.8318e-01], + [ 1.2355e+00, 7.9290e-02, 1.3707e+00, 1.3754e+00, 1.3768e+00, + 9.8970e-01, 1.1171e+00, -5.9944e-01, 1.2553e+00, 1.3394e+00], + [-1.3428e+00, 1.8400e-01, 1.1756e+00, -3.0654e-01, 9.7973e-01, + 1.4019e+00, 1.1886e+00, -1.9194e-01, 1.3632e+00, 1.1811e+00], + [-7.1615e-01, 4.6622e-01, 1.2089e+00, 9.2011e-01, 1.0659e+00, + 9.0892e-01, 1.1932e+00, 1.3888e+00, 1.3898e+00, 1.3218e+00], + [ 1.4139e+00, -1.4000e-01, 9.1192e-01, 3.0175e-01, -9.6432e-01, + -1.0498e+00, 1.4115e+00, -9.3212e-01, -9.0964e-01, 1.0127e+00], + [ 5.7244e-04, 1.2799e+00, 1.3595e+00, 1.0907e+00, 3.7191e-01, + 1.4062e+00, 1.3672e+00, 6.8502e-02, 8.5216e-01, 8.6046e-01]]) + +Alternative Compiler for better performance on CPU +-------------------------------------------------- + +To enhance performance for inductor on Windows CPU, you can use the Intel Compiler or LLVM Compiler. However, they rely on the runtime libraries from Microsoft Visual C++ (MSVC). Therefore, your first step should be to install MSVC. + +Intel Compiler +^^^^^^^^^^^^^^ + +#. Download and install `Intel Compiler `_ with Windows version. +#. Set Windows Inductor Compiler via environment variable ``set CXX=icx-cl``. + +LLVM Compiler +^^^^^^^^^^^^^ + +#. Download and install `LLVM Compiler `_ and choose win64 version. +#. Set Windows Inductor Compiler via environment variable ``set CXX=clang-cl``. + +Conclusion +---------- + +In this tutorial, we introduce how to use Inductor on Windows CPU with PyTorch 2.5 or later, and on Windows XPU with PyTorch 2.7 or later. We can also use Intel Compiler or LLVM Compiler to get better performance on CPU. diff --git a/unstable_source/inductor_windows_cpu.rst b/unstable_source/inductor_windows_cpu.rst new file mode 100644 index 00000000000..24ce55a82f9 --- /dev/null +++ b/unstable_source/inductor_windows_cpu.rst @@ -0,0 +1,7 @@ +This tutorial has been moved to https://pytorch.org/tutorials/prototype/inductor_windows.html. + +Redirecting in 3 seconds... + +.. raw:: html + + diff --git a/unstable_source/ios_coreml_workflow.rst b/unstable_source/ios_coreml_workflow.rst new file mode 100644 index 00000000000..db9abcc5076 --- /dev/null +++ b/unstable_source/ios_coreml_workflow.rst @@ -0,0 +1,10 @@ +(Prototype) Convert Mobilenetv2 to Core ML +========================================== + +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. + +Redirecting in 3 seconds... + +.. raw:: html + + diff --git a/unstable_source/ios_gpu_workflow.rst b/unstable_source/ios_gpu_workflow.rst new file mode 100644 index 00000000000..8915e1c4fad --- /dev/null +++ b/unstable_source/ios_gpu_workflow.rst @@ -0,0 +1,10 @@ +(Prototype) Use iOS GPU in PyTorch +================================== + +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. + +Redirecting in 3 seconds... + +.. 
raw:: html + + diff --git a/unstable_source/lite_interpreter.rst b/unstable_source/lite_interpreter.rst new file mode 100644 index 00000000000..73e950d72e2 --- /dev/null +++ b/unstable_source/lite_interpreter.rst @@ -0,0 +1,9 @@ +(Prototype) Introduce lite interpreter workflow in Android and iOS +======================= + +This tutorial has been moved to https://pytorch.org/tutorials/recipes/mobile_interpreter.html + + +.. raw:: html + + diff --git a/prototype_source/maskedtensor_adagrad.py b/unstable_source/maskedtensor_adagrad.py similarity index 99% rename from prototype_source/maskedtensor_adagrad.py rename to unstable_source/maskedtensor_adagrad.py index 445da1e0e28..d4eca31c5cb 100644 --- a/prototype_source/maskedtensor_adagrad.py +++ b/unstable_source/maskedtensor_adagrad.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ -(Prototype) Efficiently writing "sparse" semantics for Adagrad with MaskedTensor +Efficiently writing "sparse" semantics for Adagrad with MaskedTensor ================================================================================ """ diff --git a/prototype_source/maskedtensor_advanced_semantics.py b/unstable_source/maskedtensor_advanced_semantics.py similarity index 99% rename from prototype_source/maskedtensor_advanced_semantics.py rename to unstable_source/maskedtensor_advanced_semantics.py index 7a023304218..3517691611d 100644 --- a/prototype_source/maskedtensor_advanced_semantics.py +++ b/unstable_source/maskedtensor_advanced_semantics.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ -(Prototype) MaskedTensor Advanced Semantics +MaskedTensor Advanced Semantics =========================================== """ diff --git a/prototype_source/maskedtensor_overview.py b/unstable_source/maskedtensor_overview.py similarity index 99% rename from prototype_source/maskedtensor_overview.py rename to unstable_source/maskedtensor_overview.py index 28828693674..955268e0d76 100644 --- a/prototype_source/maskedtensor_overview.py +++ b/unstable_source/maskedtensor_overview.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ -(Prototype) MaskedTensor Overview +MaskedTensor Overview ********************************* """ diff --git a/prototype_source/maskedtensor_sparsity.py b/unstable_source/maskedtensor_sparsity.py similarity index 99% rename from prototype_source/maskedtensor_sparsity.py rename to unstable_source/maskedtensor_sparsity.py index 1985135714e..a1353805f1d 100644 --- a/prototype_source/maskedtensor_sparsity.py +++ b/unstable_source/maskedtensor_sparsity.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ -(Prototype) MaskedTensor Sparsity +MaskedTensor Sparsity ================================= """ diff --git a/unstable_source/max_autotune_on_CPU_tutorial.rst b/unstable_source/max_autotune_on_CPU_tutorial.rst new file mode 100644 index 00000000000..47374744938 --- /dev/null +++ b/unstable_source/max_autotune_on_CPU_tutorial.rst @@ -0,0 +1,215 @@ +Using Max-Autotune Compilation on CPU for Better Performance +================================================================================ + +**Author**: `Jiong Gong `__, `Leslie Fang `__, `Chunyuan Wu `__ + +In this tutorial, you will learn how to boost your PyTorch models' performance on CPU by +leveraging the max-autotune mode in the Inductor CPU backend. Explore the activation +process, understand the differences from traditional methods, and integrate max-autotune +into your code for enhanced computational efficiency. Dive into the use of advanced +GEMM templates for faster processing and superior runtime performance. 
+ +Prerequisites: +---------------- +- `torch.compile and TorchInductor concepts in PyTorch `__ + +Introduction +------------ +The ``max-autotune`` mode for the Inductor CPU backend in ``torch.compile`` (`RFC link `_) +profiles multiple implementations of operations at compile time and selects the best-performing one, +trading longer compilation times for improved runtime performance. This enhancement is particularly beneficial for GEMM-related operations. +In the Inductor CPU backend, we’ve introduced a C++ template-based GEMM implementation as an alternative to the ATen-based approach that relies on oneDNN and MKL libraries. +This is similar to the max-autotune mode on CUDA, where implementations from ATen, Triton, and CUTLASS are considered. + +We have covered most popular data types, including FP32, BF16, FP16, and INT8, with epilogue fusions for x86 CPUs. + +While the development is still in progress, we have already seen promising speedups over pure ATen-based GEMMs as measured by the three benchmark suites and the inference of LLMs. + +Activating the ``max-autotune`` mode +------------------------------------- +To activate the ``max-autotune`` mode in PyTorch, set the ``mode`` argument to ``max-autotune`` when compiling your model using ``torch.compile``. +If you prefer to bypass the tuning process and always use the C++ template implementations, you can configure this via an environment variable: +``export TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS=CPP``. + + +Example +------------ +The below code is an example of using the ``max-autotune`` mode on a simple neural network with a linear layer followed by a ReLU activation. + +In the C++ template-based GEMM implementation, we will pre-pack the weight for good cache usage. +In the case of inference which is the primary scenario of CPU AI workloads, +model weights are constant and we pack them upfront during compilation +so that the data accesses are contiguous within the cache blocks. +Thus, We only support frozen model with ``torch.no_grad`` or the inference mode. +You need to set the environment variable ``export TORCHINDUCTOR_FREEZING=1`` +and ensure that both the compilation and inference steps are executed within the ``torch.no_grad`` context. + +.. code:: python + + import torch + from torch._inductor import config + config.trace.log_autotuning_results = True # enable the log of autotuning results + + class M(torch.nn.Module): + def __init__( + self, + in_features, + out_features, + bias, + **kwargs, + ): + super().__init__() + self.linear = torch.nn.Linear( + in_features, + out_features, + bias, + **kwargs, + ) + self.relu = torch.nn.ReLU() + + def forward(self, x): + x = self.linear(x) + x = self.relu(x) + return x + + amp_enabled = True + batch_size = 64 + in_features = 16 + out_features = 32 + bias = True + + x = torch.randn(batch_size, in_features) + model = M(in_features, out_features, bias) + + with torch.no_grad(), torch.cpu.amp.autocast(enabled=amp_enabled): + compiled = torch.compile(model, mode="max-autotune") # turn on "max-autotune" mode + y = compiled(x) + + +When running the above code snippet, you will see the autotuning result (the performance numbers are for demonstration purposes). +In this example, C++ template outperforms ATen kernel so that it will be selected. + +.. code:: shell + + AUTOTUNE linear_unary(64x16, 32x16, 32) + cpp_packed_gemm_0 0.2142 ms 100.0% + _linear_pointwise 0.2441 ms 87.7% + + +We could check the generated output code by setting ``export TORCH_LOGS="+output_code"``. 
+When C++ template is selected, we won't have ``torch.ops.mkldnn._linear_pointwise.default`` (for bfloat16) or ``torch.ops.mkl._mkl_linear.default`` (for float32) +in the generated code anymore, instead, we'll find kernel based on CPP GEMM template ``cpp_fused__to_copy_relu_1`` +(only part of the code is demonstrated below for simplicity) with the bias and relu epilogues fused inside the C++ GEMM template kernel. + +The generated code differs by CPU architecture and is implementation-specific, which is subject to change. + +.. code:: python + + cpp_fused__to_copy_relu_1 = async_compile.cpp_pybinding(['const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'bfloat16*'], ''' + + ... + + template + inline void kernel_micro_gemm_amx_kernel_32_2( + AMXState& amx_state, + const bfloat16* __restrict__ A, + const bfloat16* __restrict__ B, + float* __restrict__ C, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc, + uint8_t tilecfg_rows + ) { + ... + } + + ... + + template + inline void kernel_micro_gemm( + AMXState& amx_state, + const bfloat16* __restrict__ A, + const bfloat16* __restrict__ B, + float* __restrict__ C, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc + ) { + ... + } + + extern "C" + void kernel(const bfloat16* X, const bfloat16* W, const bfloat16* inp, bfloat16* Y) + { + constexpr int64_t num_threads = 40; + constexpr int64_t N = 32; + constexpr int64_t K = 16; + constexpr int64_t M = static_cast(64L); + ... + #pragma omp parallel num_threads(40) + { + const int tid = omp_get_thread_num(); + ... + for (int64_t mc_block_id = 0; mc_block_id < num_Mc_blocks_per_thread; mc_block_id++) { + ... + for (int64_t nc = n_block_start; nc < n_block_end; nc += Nc_blocks) { + ... + for (int64_t kc = k_block_start; kc < k_block_end; kc += Kc_blocks) { + ... + for (int64_t nci = nc; nci < nc_block_end; nci++) { + if (kc == k_block_start) { + kernel_micro_gemm(false)>( + ... + ); + + } else { + kernel_micro_gemm(true)>( + ... + ); + + } + } + } + { + { + // Epilogue fusion here for bias and relu + #pragma GCC ivdep + for(int64_t x0=static_cast(0L); x0(m_end + ((-1L)*m_start)); x0+=static_cast(1L)) + { + for(int64_t x1=static_cast(0L); x1(16L*(c10::div_floor_integer(static_cast((n_end + ((-1L)*n_start))), static_cast(16L)))); x1+=static_cast(16L)) + { + auto tmp0 = at::vec::Vectorized::loadu(inp + static_cast(n_start + x1), static_cast(16)); + auto tmp2 = at::vec::Vectorized::loadu(local_acc_buf + static_cast(x1 + (Nc_blocks*Nr*x0)), static_cast(16)); + auto tmp1 = at::vec::convert(tmp0); + auto tmp3 = tmp1 + tmp2; + auto tmp4 = at::vec::convert(tmp3); + auto tmp5 = static_cast(0.0); + auto tmp6 = at::vec::Vectorized(tmp5); + auto tmp7 = at::vec::maximum(tmp3, tmp6); + auto tmp8 = at::vec::convert(tmp7); + tmp8.store(Y + static_cast(n_start + x1 + (32L*m_start) + (32L*x0)), static_cast(16)); + } + + ... + + } + } + + } + } + } + ... + } + } + ''') + +Conclusion +------------ +In this tutorial, we introduced max-autotune support on CPU with GEMM template. We explained the API to activate this feature, and demonstrated +the generated code of the GEMM template. + +This feature is in prototype stage. If you have any feature requests or run into any issues, please file a bug report at `GitHub issues `_. 
diff --git a/prototype_source/nestedtensor.py b/unstable_source/nestedtensor.py similarity index 98% rename from prototype_source/nestedtensor.py rename to unstable_source/nestedtensor.py index ecf099c1e02..77f8a4cebe1 100644 --- a/prototype_source/nestedtensor.py +++ b/unstable_source/nestedtensor.py @@ -369,3 +369,8 @@ def benchmark(func, *args, **kwargs): # how implement multi-head attention for transformers in a way that avoids computation on padding. # For more information, check out the docs for the # `torch.nested `__ namespace. +# +# See Also +# -------- +# +# * `Accelerating PyTorch Transformers by replacing nn.Transformer with Nested Tensors and torch.compile `__ diff --git a/unstable_source/nnapi_mobilenetv2.rst b/unstable_source/nnapi_mobilenetv2.rst new file mode 100644 index 00000000000..ef7edc92d12 --- /dev/null +++ b/unstable_source/nnapi_mobilenetv2.rst @@ -0,0 +1,10 @@ +(Beta) Convert MobileNetV2 to NNAPI +======================================== + +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. + +Redirecting in 3 seconds... + +.. raw:: html + + diff --git a/unstable_source/openvino_quantizer.rst b/unstable_source/openvino_quantizer.rst new file mode 100644 index 00000000000..9412c772204 --- /dev/null +++ b/unstable_source/openvino_quantizer.rst @@ -0,0 +1,250 @@ +PyTorch 2 Export Quantization for OpenVINO torch.compile Backend +=========================================================================== + +**Authors**: `Daniil Lyakhov `_, `Aamir Nazir `_, `Alexander Suslov `_, `Yamini Nimmagadda `_, `Alexander Kozlov `_ + +Prerequisites +-------------- +- `PyTorch 2 Export Post Training Quantization `_ +- `How to Write a Quantizer for PyTorch 2 Export Quantization `_ + +Introduction +-------------- + +.. note:: + + This is an experimental feature, the quantization API is subject to change. + +This tutorial demonstrates how to use ``OpenVINOQuantizer`` from `Neural Network Compression Framework (NNCF) `_ in PyTorch 2 Export Quantization flow to generate a quantized model customized for the `OpenVINO torch.compile backend `_ and explains how to lower the quantized model into the `OpenVINO `_ representation. +``OpenVINOQuantizer`` unlocks the full potential of low-precision OpenVINO kernels due to the placement of quantizers designed specifically for the OpenVINO. + +The PyTorch 2 export quantization flow uses ``torch.export`` to capture the model into a graph and performs quantization transformations on top of the ATen graph. +This approach is expected to have significantly higher model coverage, improved flexibility, and a simplified UX. +OpenVINO backend compiles the FX Graph generated by TorchDynamo into an optimized OpenVINO model. + +The quantization flow mainly includes four steps: + +- Step 1: Capture the FX Graph from the eager Model based on the `torch export mechanism `_. +- Step 2: Apply the PyTorch 2 Export Quantization flow with OpenVINOQuantizer based on the captured FX Graph. +- Step 3: Lower the quantized model into OpenVINO representation with the `torch.compile `_ API. +- Optional step 4: : Improve quantized model metrics via `quantize_pt2e `_ method. 
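+
+In code, these steps map onto just a few calls. The following condensed sketch assumes that ``model`` and ``example_inputs`` are already defined; each step, including installation, imports, and calibration, is described in detail in the sections below:
+
+.. code-block:: python
+
+    import torch
+    import openvino.torch
+    import nncf.torch
+    from nncf.experimental.torch.fx import OpenVINOQuantizer
+    from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e
+
+    with torch.no_grad(), nncf.torch.disable_patching():
+        exported = torch.export.export(model, example_inputs).module()  # Step 1: capture the FX Graph
+        prepared = prepare_pt2e(exported, OpenVINOQuantizer())          # Step 2: insert observers
+        prepared(*example_inputs)                                       #         calibrate
+        quantized = convert_pt2e(prepared, fold_quantize=False)         #         convert to a quantized model
+        optimized = torch.compile(quantized, backend="openvino")        # Step 3: lower into OpenVINO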
+ +The high-level architecture of this flow could look like this: + +:: + + float_model(Python) Example Input + \ / + \ / + —-------------------------------------------------------- + | export | + —-------------------------------------------------------- + | + FX Graph in ATen + | + | OpenVINOQuantizer + | / + —-------------------------------------------------------- + | prepare_pt2e | + | | | + | Calibrate + | | | + | convert_pt2e | + —-------------------------------------------------------- + | + Quantized Model + | + —-------------------------------------------------------- + | Lower into Inductor | + —-------------------------------------------------------- + | + OpenVINO model + +Post Training Quantization +---------------------------- + +Now, we will walk you through a step-by-step tutorial for how to use it with `torchvision resnet18 model `_ +for post training quantization. + +Prerequisite: OpenVINO and NNCF installation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +OpenVINO and NNCF could be easily installed via `pip distribution `_: + +.. code-block:: bash + + pip install -U pip + pip install openvino, nncf + + +1. Capture FX Graph +^^^^^^^^^^^^^^^^^^^^^ + +We will start by performing the necessary imports, capturing the FX Graph from the eager module. + +.. code-block:: python + + import copy + import openvino.torch + import torch + import torchvision.models as models + from torch.ao.quantization.quantize_pt2e import convert_pt2e + from torch.ao.quantization.quantize_pt2e import prepare_pt2e + + import nncf.torch + + # Create the Eager Model + model_name = "resnet18" + model = models.__dict__[model_name](pretrained=True) + + # Set the model to eval mode + model = model.eval() + + # Create the data, using the dummy data here as an example + traced_bs = 50 + x = torch.randn(traced_bs, 3, 224, 224) + example_inputs = (x,) + + # Capture the FX Graph to be quantized + with torch.no_grad(), nncf.torch.disable_patching(): + exported_model = torch.export.export(model, example_inputs).module() + + + +2. Apply Quantization +^^^^^^^^^^^^^^^^^^^^^^^ + +After we capture the FX Module to be quantized, we will import the OpenVINOQuantizer. + + +.. code-block:: python + + from nncf.experimental.torch.fx import OpenVINOQuantizer + + quantizer = OpenVINOQuantizer() + +``OpenVINOQuantizer`` has several optional parameters that allow tuning the quantization process to get a more accurate model. +Below is the list of essential parameters and their description: + + +* ``preset`` - defines quantization scheme for the model. Two types of presets are available: + + * ``PERFORMANCE`` (default) - defines symmetric quantization of weights and activations + + * ``MIXED`` - weights are quantized with symmetric quantization and the activations are quantized with asymmetric quantization. This preset is recommended for models with non-ReLU and asymmetric activation functions, e.g. ELU, PReLU, GELU, etc. + + .. code-block:: python + + OpenVINOQuantizer(preset=nncf.QuantizationPreset.MIXED) + +* ``model_type`` - used to specify quantization scheme required for specific type of the model. Transformer is the only supported special quantization scheme to preserve accuracy after quantization of Transformer models (BERT, Llama, etc.). None is default, i.e. no specific scheme is defined. + + .. code-block:: python + + OpenVINOQuantizer(model_type=nncf.ModelType.Transformer) + +* ``ignored_scope`` - this parameter can be used to exclude some layers from the quantization process to preserve the model accuracy. 
For example, when you want to exclude the last layer of the model from quantization. Below are some examples of how to use this parameter: + + .. code-block:: python + + #Exclude by layer name: + names = ['layer_1', 'layer_2', 'layer_3'] + OpenVINOQuantizer(ignored_scope=nncf.IgnoredScope(names=names)) + + #Exclude by layer type: + types = ['Conv2d', 'Linear'] + OpenVINOQuantizer(ignored_scope=nncf.IgnoredScope(types=types)) + + #Exclude by regular expression: + regex = '.*layer_.*' + OpenVINOQuantizer(ignored_scope=nncf.IgnoredScope(patterns=regex)) + + #Exclude by subgraphs: + # In this case, all nodes along all simple paths in the graph + # from input to output nodes will be excluded from the quantization process. + subgraph = nncf.Subgraph(inputs=['layer_1', 'layer_2'], outputs=['layer_3']) + OpenVINOQuantizer(ignored_scope=nncf.IgnoredScope(subgraphs=[subgraph])) + + +* ``target_device`` - defines the target device, the specificity of which will be taken into account during optimization. The following values are supported: ``ANY`` (default), ``CPU``, ``CPU_SPR``, ``GPU``, and ``NPU``. + + .. code-block:: python + + OpenVINOQuantizer(target_device=nncf.TargetDevice.CPU) + +For further details on `OpenVINOQuantizer` please see the `documentation `_. + +After we import the backend-specific Quantizer, we will prepare the model for post-training quantization. +``prepare_pt2e`` folds BatchNorm operators into preceding Conv2d operators, and inserts observers in appropriate places in the model. + +.. code-block:: python + + prepared_model = prepare_pt2e(exported_model, quantizer) + +Now, we will calibrate the ``prepared_model`` after the observers are inserted in the model. + +.. code-block:: python + + # We use the dummy data as an example here + prepared_model(*example_inputs) + +Finally, we will convert the calibrated Model to a quantized Model. ``convert_pt2e`` takes a calibrated model and produces a quantized model. + +.. code-block:: python + + quantized_model = convert_pt2e(prepared_model, fold_quantize=False) + +After these steps, we finished running the quantization flow, and we will get the quantized model. + + +3. Lower into OpenVINO representation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +After that the FX Graph can utilize OpenVINO optimizations using `torch.compile(…, backend=”openvino”) `_ functionality. + +.. code-block:: python + + with torch.no_grad(), nncf.torch.disable_patching(): + optimized_model = torch.compile(quantized_model, backend="openvino") + + # Running some benchmark + optimized_model(*example_inputs) + + + +The optimized model is using low-level kernels designed specifically for Intel CPU. +This should significantly speed up inference time in comparison with the eager model. + +4. Optional: Improve quantized model metrics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +NNCF implements advanced quantization algorithms like `SmoothQuant `_ and `BiasCorrection `_, which help +to improve the quantized model metrics while minimizing the output discrepancies between the original and compressed models. +These advanced NNCF algorithms can be accessed via the NNCF `quantize_pt2e` API: + +.. code-block:: python + + from nncf.experimental.torch.fx import quantize_pt2e + + calibration_loader = torch.utils.data.DataLoader(...) 
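+    # The DataLoader above supplies the calibration samples; `transform_fn` below
+    # extracts the model input from each data item (here assumed to be an
+    # (images, labels) pair), and `nncf.Dataset` ties the two together.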
+ + + def transform_fn(data_item): + images, _ = data_item + return images + + + calibration_dataset = nncf.Dataset(calibration_loader, transform_fn) + quantized_model = quantize_pt2e( + exported_model, quantizer, calibration_dataset, smooth_quant=True, fast_bias_correction=False + ) + + +For further details, please see the `documentation `_ +and a complete `example on Resnet18 quantization `_. + +Conclusion +------------ + +This tutorial introduces how to use torch.compile with the OpenVINO backend and the OpenVINO quantizer. +For more details on NNCF and the NNCF Quantization Flow for PyTorch models, refer to the `NNCF Quantization Guide `_. +For additional information, check out the `OpenVINO Deployment via torch.compile Documentation `_. diff --git a/unstable_source/python_extension_autoload.rst b/unstable_source/python_extension_autoload.rst new file mode 100644 index 00000000000..ee7af5d49ef --- /dev/null +++ b/unstable_source/python_extension_autoload.rst @@ -0,0 +1,184 @@ +Autoloading Out-of-Tree Extension +================================= + +**Author:** `Yuanhao Ji `__ + +The extension autoloading mechanism enables PyTorch to automatically +load out-of-tree backend extensions without explicit import statements. This +feature is beneficial for users as it enhances their +experience and enables them to follow the familiar PyTorch device +programming model without having to explicitly load or import device-specific +extensions. Additionally, it facilitates effortless +adoption of existing PyTorch applications with zero-code changes on +out-of-tree devices. For further details, refer to the +`[RFC] Autoload Device Extension `_. + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How to use out-of-tree extension autoloading in PyTorch + * Review examples with Intel Gaudi HPU, Huawei Ascend NPU + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch v2.5 or later + +.. note:: + + This feature is enabled by default and can be disabled by using + ``export TORCH_DEVICE_BACKEND_AUTOLOAD=0``. + If you get an error like this: "Failed to load the backend extension", + this error is independent with PyTorch, you should disable this feature + and ask the out-of-tree extension maintainer for help. + +How to apply this mechanism to out-of-tree extensions? +------------------------------------------------------ + +For instance, suppose you have a backend named ``foo`` and a corresponding package named ``torch_foo``. Ensure that +your package is compatible with PyTorch 2.5 or later and includes the following snippet in its ``__init__.py`` file: + +.. code-block:: python + + def _autoload(): + print("Check things are working with `torch.foo.is_available()`.") + +Then, the only thing you need to do is define an entry point within your Python package: + +.. code-block:: python + + setup( + name="torch_foo", + version="1.0", + entry_points={ + "torch.backends": [ + "torch_foo = torch_foo:_autoload", + ], + } + ) + +Now you can import the ``torch_foo`` module by simply adding the ``import torch`` statement without the need to add ``import torch_foo``: + +.. code-block:: python + + >>> import torch + Check things are working with `torch.foo.is_available()`. + >>> torch.foo.is_available() + True + +In some cases, you might encounter issues with circular imports. The examples below demonstrate how you can address them. 
+
+Examples
+^^^^^^^^
+
+In the following examples, we use Intel Gaudi HPU and Huawei Ascend NPU to show how to
+integrate an out-of-tree extension with PyTorch using the autoloading feature.
+
+`habana_frameworks.torch`_ is a Python package that enables users to run
+PyTorch programs on Intel Gaudi by using the PyTorch ``HPU`` device key.
+
+.. _habana_frameworks.torch: https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html
+
+Since ``habana_frameworks.torch`` is a submodule of ``habana_frameworks``, we add an entry point for
+``__autoload()`` in ``habana_frameworks/setup.py``:
+
+.. code-block:: diff
+
+     setup(
+         name="habana_frameworks",
+         version="2.5",
+    +    entry_points={
+    +        'torch.backends': [
+    +            "device_backend = habana_frameworks:__autoload",
+    +        ],
+    +    }
+     )
+
+In ``habana_frameworks/__init__.py``, we use a global variable to track whether our module has been loaded:
+
+.. code-block:: python
+
+    import os
+
+    is_loaded = False  # A member variable of the habana_frameworks module to track whether it has been imported
+
+    def __autoload():
+        # This is an entry point for the PyTorch autoload mechanism.
+        # If is_loaded is already True, our backend has been loaded, either explicitly
+        # or by the autoload mechanism, and importing it again is skipped to avoid circular imports.
+        global is_loaded
+        if is_loaded:
+            return
+        import habana_frameworks.torch
+
+In ``habana_frameworks/torch/__init__.py``, we prevent circular imports by updating the state of the global variable:
+
+.. code-block:: python
+
+    import os
+
+    # This prevents the torch autoload mechanism from causing circular imports
+    import habana_frameworks
+
+    habana_frameworks.is_loaded = True
+
+`torch_npu`_ enables users to run PyTorch programs on Huawei Ascend NPU. It
+leverages the ``PrivateUse1`` device key and exposes the device name
+as ``npu`` to end users.
+
+.. _torch_npu: https://github.com/Ascend/pytorch
+
+We define an entry point in `torch_npu/setup.py`_:
+
+.. _torch_npu/setup.py: https://github.com/Ascend/pytorch/blob/master/setup.py#L618
+
+.. code-block:: diff
+
+     setup(
+         name="torch_npu",
+         version="2.5",
+    +    entry_points={
+    +        'torch.backends': [
+    +            'torch_npu = torch_npu:_autoload',
+    +        ],
+    +    }
+     )
+
+Unlike ``habana_frameworks``, ``torch_npu`` uses the environment variable ``TORCH_DEVICE_BACKEND_AUTOLOAD``
+to control the autoloading process. For example, we set it to ``0`` to disable autoloading and prevent circular imports:
+
+.. code-block:: python
+
+    import os
+
+    # Disable autoloading before running 'import torch'
+    os.environ['TORCH_DEVICE_BACKEND_AUTOLOAD'] = '0'
+
+    import torch
+
+How it works
+------------
+
+.. image:: ../_static/img/python_extension_autoload_impl.png
+    :alt: Autoloading implementation
+    :align: center
+
+Autoloading is implemented based on Python's `Entrypoints
+`_
+mechanism. We discover and load, in ``torch/__init__.py``, all of the entry points
+that are defined by out-of-tree extensions.
+
+As shown above, after ``torch_foo`` is installed, its module can be imported when the
+entry point that you defined is loaded, and you can then do any necessary initialization
+work when it is called.
+
+See the implementation in this pull request: `[RFC] Add support for device extension autoloading
+`_.
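+
+To make this more concrete, the discovery step can be approximated with a few lines of standard-library code. This is only a simplified sketch based on ``importlib.metadata`` (Python 3.10+ form of ``entry_points``), not the actual PyTorch implementation, which additionally handles errors and honors ``TORCH_DEVICE_BACKEND_AUTOLOAD``:
+
+.. code-block:: python
+
+    from importlib.metadata import entry_points
+
+    def _autoload_backends():
+        # Iterate over every entry point registered under the "torch.backends" group
+        for backend_entry_point in entry_points(group="torch.backends"):
+            hook = backend_entry_point.load()  # resolves e.g. "torch_foo:_autoload" to a callable
+            hook()                             # the extension imports and initializes itself here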
+
+Conclusion
+----------
+
+In this tutorial, we learned about the out-of-tree extension autoloading mechanism in PyTorch, which automatically
+loads backend extensions, eliminating the need for explicit import statements. We also learned how to apply this
+mechanism to out-of-tree extensions by defining an entry point, how to prevent circular imports, and reviewed
+examples of using the autoloading mechanism with Intel Gaudi HPU and Huawei Ascend NPU.
diff --git a/unstable_source/tracing_based_selective_build.rst b/unstable_source/tracing_based_selective_build.rst
new file mode 100644
index 00000000000..a1b56072051
--- /dev/null
+++ b/unstable_source/tracing_based_selective_build.rst
@@ -0,0 +1,10 @@
+(prototype) Tracing-based Selective Build Mobile Interpreter in Android and iOS
+===============================================================================
+
+This tutorial has been replaced with a newer tutorial on this topic: https://pytorch.org/executorch/stable/kernel-library-selective-build.html
+
+Redirecting in 3 seconds...
+
+.. raw:: html
+
diff --git a/prototype_source/vmap_recipe.py b/unstable_source/vmap_recipe.py
similarity index 100%
rename from prototype_source/vmap_recipe.py
rename to unstable_source/vmap_recipe.py