Skip to content

Commit

Permalink
Migrate load clip to all clip module (#328)
Browse files Browse the repository at this point in the history
* Migrate load clip to all clip module

* Add related projects section in readme
  • Loading branch information
rom1504 authored Jan 7, 2024
1 parent d622697 commit 0fb54aa
Show file tree
Hide file tree
Showing 9 changed files with 22 additions and 255 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ Also see [laion5B](https://laion.ai/laion-5b-a-new-era-of-open-large-scale-multi

If you believe in making reusable tools to make data easy to use for ML and you would like to contribute, please join the [DataToML](https://discord.gg/ep8yUUtCnp) chat.

## Related projects

* [all_clip](https://github.com/rom1504/all_clip) to load any clip model
* [img2dataset](https://github.com/rom1504/img2dataset) to download images from urls
* [open_clip](https://github.com/mlfoundations/open_clip) to train clip models
* [CLIP_benchmark](https://github.com/LAION-AI/CLIP_benchmark) to evaluate clip models

## Who is using clip retrieval?

* [cah-prepro](https://github.com/rom1504/cah-prepro) preprocesses the 400M image+text crawling at home dataset. clip-retrieval is used to compute the 400M clip embeddings and the indices
Expand Down
6 changes: 2 additions & 4 deletions clip_retrieval/clip_back.py
Original file line number Diff line number Diff line change
Expand Up @@ -862,12 +862,10 @@ def encode_texts(text):
def load_clip_index(clip_options):
"""load the clip index"""
import torch # pylint: disable=import-outside-toplevel
from clip_retrieval.load_clip import load_clip, get_tokenizer # pylint: disable=import-outside-toplevel
from all_clip import load_clip # pylint: disable=import-outside-toplevel

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = load_clip(clip_options.clip_model, use_jit=clip_options.use_jit, device=device)

tokenizer = get_tokenizer(clip_options.clip_model)
model, preprocess, tokenizer = load_clip(clip_options.clip_model, use_jit=clip_options.use_jit, device=device)

if clip_options.enable_mclip_option:
model_txt_mclip = load_mclip(clip_options.clip_model)
Expand Down
4 changes: 2 additions & 2 deletions clip_retrieval/clip_inference/mapper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""mapper module transform images and text to embeddings"""

import torch
from clip_retrieval.load_clip import load_clip
from all_clip import load_clip
from sentence_transformers import SentenceTransformer


Expand Down Expand Up @@ -33,7 +33,7 @@ def __init__(
self.enable_metadata = enable_metadata
self.use_mclip = use_mclip
self.device = "cuda" if torch.cuda.is_available() else "cpu"
model, _ = load_clip(
model, _, _ = load_clip(
clip_model=clip_model,
use_jit=use_jit,
warmup_batch_size=warmup_batch_size,
Expand Down
4 changes: 2 additions & 2 deletions clip_retrieval/clip_inference/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from clip_retrieval.clip_inference.writer import NumpyWriter
from clip_retrieval.clip_inference.logger import LoggerWriter
from clip_retrieval.clip_inference.reader import FilesReader, WebdatasetReader
from clip_retrieval.load_clip import load_clip
from all_clip import load_clip


def worker(
Expand Down Expand Up @@ -49,7 +49,7 @@ def worker(
print(f"dataset is {len(input_dataset)}", flush=True)

def reader_builder(sampler):
_, preprocess = load_clip(
_, preprocess, _ = load_clip(
clip_model=clip_model,
use_jit=use_jit,
warmup_batch_size=batch_size,
Expand Down
239 changes: 0 additions & 239 deletions clip_retrieval/load_clip.py

This file was deleted.

1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ multilingual-clip>=1.0.10,<2
transformers
urllib3<2
scipy<1.9.2
all_clip<2
8 changes: 4 additions & 4 deletions tests/test_clip_inference/playground.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@
"from clip_retrieval.clip_inference.runner import Sampler\n",
"import os\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"\"\n",
"from clip_retrieval.clip_inference.load_clip import load_clip\n",
"from all_clip import load_clip\n",
"images = \"test_images\"\n",
"tars = \"test_tars\"\n",
"folder = images\n",
"\n",
"batch_size=2\n",
"num_prepro_workers=2\n",
"_, preprocess = load_clip()\n"
"_, preprocess, _ = load_clip()\n"
]
},
{
Expand Down Expand Up @@ -323,7 +323,7 @@
"from clip_retrieval.clip_inference.reader import FilesReader, WebdatasetReader\n",
"from clip_retrieval.clip_inference.mapper import ClipMapper\n",
"from clip_retrieval.clip_inference.writer import NumpyWriter\n",
"from clip_retrieval.clip_inference.load_clip import load_clip\n",
"from all_clip import load_clip\n",
"import os\n",
"import numpy as np\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"\"\n",
Expand Down Expand Up @@ -398,7 +398,7 @@
"from clip_retrieval.clip_inference.reader import FilesReader, WebdatasetReader\n",
"from clip_retrieval.clip_inference.mapper import ClipMapper\n",
"from clip_retrieval.clip_inference.writer import NumpyWriter\n",
"from clip_retrieval.clip_inference.load_clip import load_clip\n",
"from all_clip import load_clip\n",
"from clip_retrieval.clip_inference.distributor import SequentialDistributor, PysparkDistributor\n",
"import os\n",
"import numpy as np\n",
Expand Down
4 changes: 2 additions & 2 deletions tests/test_clip_inference/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from clip_retrieval.clip_inference.runner import Sampler
import os

from clip_retrieval.load_clip import load_clip
from all_clip import load_clip


@pytest.mark.parametrize("file_format", ["files", "webdataset"])
Expand All @@ -17,7 +17,7 @@ def test_reader(file_format):
input_dataset = [tar_folder + "/image1.tar", tar_folder + "/image2.tar"]
batch_size = 2
num_prepro_workers = 2
_, preprocess = load_clip(warmup_batch_size=batch_size)
_, preprocess, _ = load_clip(warmup_batch_size=batch_size)

output_partition_count = 2
actual_values = []
Expand Down
4 changes: 2 additions & 2 deletions tests/test_clip_inference/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from clip_retrieval.clip_inference.reader import FilesReader
from clip_retrieval.clip_inference.mapper import ClipMapper
from clip_retrieval.clip_inference.writer import NumpyWriter
from clip_retrieval.load_clip import load_clip
from all_clip import load_clip
import os
import numpy as np
import tempfile
Expand All @@ -21,7 +21,7 @@ def test_runner():
with tempfile.TemporaryDirectory() as tmpdir:

def reader_builder(sampler):
_, preprocess = load_clip(warmup_batch_size=batch_size)
_, preprocess, _ = load_clip(warmup_batch_size=batch_size)
return FilesReader(
sampler,
preprocess,
Expand Down

0 comments on commit 0fb54aa

Please sign in to comment.