From 604cc4ad183201ef3907f31476dc2d6d245b35cb Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Sat, 13 Jan 2024 03:38:32 +0200
Subject: [PATCH] Adapt semantic-image-search-client to use SigLIP model

---
 .../package-lock.json                         | 17 +++++++++++++----
 .../semantic-image-search-client/package.json |  2 +-
 .../src/app/worker.js                         | 16 ++++++++--------
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/examples/semantic-image-search-client/package-lock.json b/examples/semantic-image-search-client/package-lock.json
index 7c06d25f3..ee4488ae3 100644
--- a/examples/semantic-image-search-client/package-lock.json
+++ b/examples/semantic-image-search-client/package-lock.json
@@ -8,7 +8,7 @@
       "name": "semantic-image-search-client",
       "version": "0.1.0",
       "dependencies": {
-        "@xenova/transformers": "^2.6.1",
+        "@xenova/transformers": "^2.14.0",
         "autoprefixer": "10.4.14",
         "blurhash": "^2.0.5",
         "eslint": "8.45.0",
@@ -102,6 +102,14 @@
         "node": "^12.22.0 || ^14.17.0 || >=16.0.0"
       }
     },
+    "node_modules/@huggingface/jinja": {
+      "version": "0.1.2",
+      "resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.1.2.tgz",
+      "integrity": "sha512-x5mpbfJt1nKmVep5WNP5VjNsjWApWNj8pPYI+uYMkBWH9bWUJmQmHt2lbf0VCoQd54Oq3XuFEh/UyoVh7rPxmg==",
+      "engines": {
+        "node": ">=18"
+      }
+    },
     "node_modules/@humanwhocodes/config-array": {
       "version": "0.11.10",
       "resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.11.10.tgz",
@@ -553,10 +561,11 @@
       }
     },
     "node_modules/@xenova/transformers": {
-      "version": "2.6.1",
-      "resolved": "https://registry.npmjs.org/@xenova/transformers/-/transformers-2.6.1.tgz",
-      "integrity": "sha512-fK1SkZUCvTdH1gEWmBUU5rvugZBqqu0ibkaBmUIr5t9Kf+Z8W4n0IszSRS2+M5ZHxRKS3SE7pFpsMDXByIzmQw==",
+      "version": "2.14.0",
+      "resolved": "https://registry.npmjs.org/@xenova/transformers/-/transformers-2.14.0.tgz",
+      "integrity": "sha512-rQ3O7SW5EM64b6XFZGx3XQ2cfiroefxUwU9ShfSpEZyhd082GvwNJJKndxgaukse1hZP1JUDoT0DfjDiq4IZiw==",
       "dependencies": {
+        "@huggingface/jinja": "^0.1.0",
         "onnxruntime-web": "1.14.0",
         "sharp": "^0.32.0"
       },
diff --git a/examples/semantic-image-search-client/package.json b/examples/semantic-image-search-client/package.json
index 0b542d3cc..cc575faf5 100644
--- a/examples/semantic-image-search-client/package.json
+++ b/examples/semantic-image-search-client/package.json
@@ -9,7 +9,7 @@
     "lint": "next lint"
   },
   "dependencies": {
-    "@xenova/transformers": "^2.6.1",
+    "@xenova/transformers": "^2.14.0",
     "autoprefixer": "10.4.14",
     "blurhash": "^2.0.5",
     "eslint": "8.45.0",
diff --git a/examples/semantic-image-search-client/src/app/worker.js b/examples/semantic-image-search-client/src/app/worker.js
index 977edf846..a2beffdab 100644
--- a/examples/semantic-image-search-client/src/app/worker.js
+++ b/examples/semantic-image-search-client/src/app/worker.js
@@ -2,14 +2,14 @@
 import { env, AutoTokenizer, CLIPTextModelWithProjection } from '@xenova/transformers';
 import { getCachedFile, getCachedJSON } from './utils.js';
 
-const EMBED_DIM = 512;
-
+const EMBED_DIM = 768;
+const DB_SIZE = 10; // or 25
 // Skip local model check
 env.allowLocalModels = false;
 
 class ApplicationSingleton {
-    static model_id = 'Xenova/clip-vit-base-patch16';
-    static BASE_URL = 'https://huggingface.co/datasets/Xenova/semantic-image-search-assets/resolve/main/';
+    static model_id = 'Xenova/siglip-base-patch16-224';
+    static BASE_URL = 'https://huggingface.co/datasets/Xenova/siglip-semantic-image-search-assets/resolve/main/';
 
     static tokenizer = null;
     static text_model = null;
@@ -25,12 +25,12 @@ class ApplicationSingleton {
             this.text_model = CLIPTextModelWithProjection.from_pretrained(this.model_id, { progress_callback });
         }
         if (this.metadata === null) {
-            this.metadata = getCachedJSON(this.BASE_URL + 'image-embeddings.json');
+            this.metadata = getCachedJSON(`${this.BASE_URL}metadata_${DB_SIZE}k.json`);
         }
         if (this.embeddings === null) {
             this.embeddings = new Promise(
                 (resolve, reject) => {
-                    getCachedFile(this.BASE_URL + 'image-embeddings_25k-512-32bit.bin')
+                    getCachedFile(`${this.BASE_URL}image-embeddings_${DB_SIZE}k-768-32bit.bin`)
                         .then((buffer) => {
                             resolve(new Float32Array(buffer));
                         })
@@ -80,10 +80,10 @@ self.addEventListener('message', async (event) => {
     self.postMessage({ status: 'ready' });
 
     // Run tokenization
-    const text_inputs = tokenizer(event.data.text, { padding: true, truncation: true });
+    const text_inputs = tokenizer(event.data.text, { padding: 'max_length', truncation: true });
 
     // Compute embeddings
-    const { text_embeds } = await text_model(text_inputs);
+    const { pooler_output: text_embeds } = await text_model(text_inputs);
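
Notes on the worker.js changes: SigLIP's text encoder was trained on fixed-length token sequences, which is why tokenization switches from `padding: true` to `padding: 'max_length'`, and the sentence embedding is read from the model's `pooler_output` (renamed back to `text_embeds` via destructuring) rather than CLIP's `text_embeds` output. A minimal sketch of the patched query-embedding path outside the worker; the query string and the logged shape are illustrative, everything else is taken directly from the diff:

import { AutoTokenizer, CLIPTextModelWithProjection } from '@xenova/transformers';

const model_id = 'Xenova/siglip-base-patch16-224';
const tokenizer = await AutoTokenizer.from_pretrained(model_id);
const text_model = await CLIPTextModelWithProjection.from_pretrained(model_id);

// Pad to the tokenizer's fixed maximum length, as SigLIP expects.
const text_inputs = tokenizer('a photo of a cat', { padding: 'max_length', truncation: true });

// The SigLIP text encoder exposes its sentence embedding as `pooler_output`.
const { pooler_output: text_embeds } = await text_model(text_inputs);
console.log(text_embeds.dims); // expected [1, 768], matching the new EMBED_DIM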
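
The `cosineSimilarity` helper called at the end of the last hunk is not part of this diff. The sketch below is a hypothetical implementation consistent with how the worker calls it: one query embedding of EMBED_DIM floats scored against a flat Float32Array packing DB_SIZE thousand image embeddings of EMBED_DIM floats each.

const EMBED_DIM = 768;

// Hypothetical sketch; the actual helper lives elsewhere in worker.js.
function cosineSimilarity(query, embeddings) {
    const numEmbeddings = embeddings.length / EMBED_DIM;
    const scores = new Array(numEmbeddings);

    // The query norm only needs to be computed once.
    let queryNorm = 0;
    for (let i = 0; i < EMBED_DIM; ++i) queryNorm += query[i] * query[i];
    queryNorm = Math.sqrt(queryNorm);

    for (let e = 0; e < numEmbeddings; ++e) {
        const offset = e * EMBED_DIM;
        let dot = 0, norm = 0;
        for (let i = 0; i < EMBED_DIM; ++i) {
            const x = embeddings[offset + i];
            dot += query[i] * x;
            norm += x * x;
        }
        scores[e] = dot / (queryNorm * Math.sqrt(norm));
    }
    return scores;
}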