Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
mikekgfb committed Apr 10, 2024
2 parents d4c597a + ebb44e0 commit e3b4080
Show file tree
Hide file tree
Showing 9 changed files with 893 additions and 208 deletions.
13 changes: 12 additions & 1 deletion .github/workflows/compile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
run-tinystories:
strategy:
matrix:
runner: [ubuntu-latest, macos-12, macos-14]
runner: [ubuntu-latest, macos-14]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout repo
Expand Down Expand Up @@ -95,6 +95,17 @@ jobs:
python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
cat ./output_aoti
echo "******************************************"
echo "******** INT4 group-wise quantized *******"
echo "******************************************"
python generate.py --quant '{"linear:int4" : {"group_size": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
cat ./output_eager
python generate.py --compile --quant '{"linear:int4" : {"group_size": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
cat ./output_compiled
python export.py --quant '{"linear:int4" : {"group_size": 32}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
cat ./output_aoti
echo "tests complete"
echo "******************************************"
# echo "********* EAGER vs TORCH.COMPILE *********"
Expand Down
79 changes: 79 additions & 0 deletions .github/workflows/macos12.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Runs the tinystories smoke tests on macOS 12 (Intel) runners.
# Name is suffixed so it is distinguishable from the other "Compile main"
# workflows in the Actions UI.
name: Compile main (macos-12)

on:
  push:
    branches:
      - main
  pull_request:
  workflow_dispatch:

jobs:
  run-tinystories:
    strategy:
      matrix:
        runner: [macos-12]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v2
      - name: Setup Python
        uses: actions/setup-python@v2
        with:
          # Quoted so YAML does not parse the version as a float.
          python-version: "3.11"
      - name: Print machine info
        run: |
          uname -a
          if [ $(uname -s) == Darwin ]; then
            sysctl machdep.cpu.brand_string
            sysctl machdep.cpu.core_count
          fi
      - name: Install requirements
        run: |
          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
          pip install -r requirements.txt
      - name: Download checkpoints
        run: |
          mkdir -p checkpoints/stories15M
          pushd checkpoints/stories15M
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
          popd
      - name: Run inference
        run: |
          export MODEL_PATH=checkpoints/stories15M/stories15M.pt
          export MODEL_NAME=stories15M
          export MODEL_DIR=/tmp
          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          cat ./output_eager
          python generate.py --compile --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
          cat ./output_compiled
          # AOTI export/run is disabled on this runner; the cat is commented out
          # as well so the step does not fail on a missing ./output_aoti
          # (run steps execute under bash -e -o pipefail).
          # python export.py --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
          # python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
          # cat ./output_aoti
          echo "******************************************"
          echo "******* Emb: channel-wise quantized ******"
          echo "******************************************"
          python generate.py --quant '{"embedding" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          cat ./output_eager
          echo "******************************************"
          echo "******** Emb: group-wise quantized *******"
          echo "******************************************"
          python generate.py --quant '{"embedding" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          cat ./output_eager
          echo "******************************************"
          echo "******* INT8 channel-wise quantized ******"
          echo "******************************************"
          python generate.py --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          cat ./output_eager
          echo "******************************************"
          echo "******** INT8 group-wise quantized *******"
          echo "******************************************"
          python generate.py --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          cat ./output_eager

          echo "tests complete"
          echo "******************************************"
65 changes: 65 additions & 0 deletions .github/workflows/runner_et.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Builds the ExecuTorch C++ runner and checks it can run an exported
# stories15M .pte model. Name reflects what the workflow does instead of
# duplicating the "Compile main" name used by compile.yml.
name: Compile main (runner-et)

on:
  push:
    branches:
      - main
  pull_request:
  workflow_dispatch:

jobs:
  run-tinystories:
    strategy:
      matrix:
        runner: [macos-14-xlarge]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v2
      - name: Setup Python
        uses: actions/setup-python@v2
        with:
          # Quoted so YAML does not parse the version as a float.
          python-version: "3.11"
      - name: Print machine info
        run: |
          uname -a
          if [ $(uname -s) == Darwin ]; then
            sysctl machdep.cpu.brand_string
            sysctl machdep.cpu.core_count
          fi
      - name: Install requirements
        run: |
          echo "Installing pip packages"
          pip install wheel
          pip install cmake
          pip install ninja
          pip install zstd
          pip install -r requirements.txt
          export LLAMA_FAST_ROOT=${PWD}
          export ET_NO_PYBIND=1
          ./scripts/install_et.sh
          cmake -S ./runner-et -B build/cmake-out -G Ninja
          cmake --build ./build/cmake-out
      - name: Download checkpoints
        run: |
          mkdir -p checkpoints/stories15M
          pushd checkpoints/stories15M
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
          popd
      - name: Run inference
        run: |
          export MODEL_DIR=${PWD}/checkpoints/stories15M
          export PROMPT="Once upon a time in a land far away"
          python generate.py --checkpoint-path ${MODEL_DIR}/stories15M.pt --temperature 0 --prompt "${PROMPT}" > ${PWD}/output_eager
          cat ${PWD}/output_eager
          python export.py --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-pte-path ${PWD}/stories15M.pte
          ./build/cmake-out/runner_et ${PWD}/stories15M.pte -z ${MODEL_DIR}/tokenizer.bin -i "${PROMPT}" > ${PWD}/output_et
          cat ${PWD}/output_et
          echo "Tests complete."
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,11 @@ format:

```
python utils/tokenizer.py --tokenizer-model=${MODEL_DIR}tokenizer.model
```

We will later discuss how to use this model, as described under *STANDALONE EXECUTION*, in a Python-free
environment:
```
./run ${MODEL_OUT}/model.{so,pte} -z ${MODEL_OUT}/tokenizer.bin
```

Expand Down Expand Up @@ -273,7 +278,7 @@ quantization options.

*Channelwise quantization*:

The simplest way to quantize embedding tables is with int8 groupwise
The simplest way to quantize embedding tables is with int8 "channelwise"
quantization, where each value is represented by an 8 bit integer, and
a floating point scale per group.

Expand Down
Loading

0 comments on commit e3b4080

Please sign in to comment.