Commit d86f40e

mindsdb vs postgresml blog post (#704)

Authored by montanalow and levkk
Co-authored-by: Lev Kokotov <levkk@users.noreply.github.com>
1 parent 4b921f3, commit d86f40e

File tree

10 files changed: +345, -34 lines changed

‎pgml-dashboard/src/api/docs.rs

2 additions & 0 deletions

@@ -80,6 +80,8 @@ async fn blog_handler<'a>(path: PathBuf, cluster: Cluster) -> Result<ResponseOk,
         cluster,
         &path,
         vec![
+            NavLink::new("MindsDB vs PostgresML")
+                .href("/blog/mindsdb-vs-postgresml"),
             NavLink::new("Introducing PostgresML Python SDK: Build End-to-End Vector Search Applications without OpenAI and Pinecone")
                 .href("/blog/introducing-postgresml-python-sdk-build-end-to-end-vector-search-applications-without-openai-and-pinecone"),
             NavLink::new("PostgresML raises $4.7M to launch serverless AI application databases based on Postgres")

‎pgml-dashboard/static/blog/mindsdb-vs-postgresml.md

313 additions & 0 deletions
Large diffs are not rendered by default.

Binary file (145 KB) not shown.

‎pgml-dashboard/templates/layout/nav/top.html

1 addition & 1 deletion

@@ -21,7 +21,7 @@
           <a class="nav-link" href="/docs/guides/setup/quick_start_with_docker/">Docs</a>
         </li>
         <li class="nav-item d-flex align-items-center">
-          <a class="nav-link" href="/blog/postgresml-raises-4.7M-to-launch-serverless-ai-application-databases-based-on-postgres">Blog</a>
+          <a class="nav-link" href="/blog/mindsdb-vs-postgresml">Blog</a>
         </li>
         <li class="nav-item d-flex align-items-center">
           <a class="nav-link" href="https://github.com/postgresml/postgresml" target="_blank">Open Source</a>

‎pgml-extension/requirements.txt

1 addition & 0 deletions

@@ -4,6 +4,7 @@ deepspeed==0.9.2
 huggingface-hub==0.14.1
 InstructorEmbedding==1.0.0
 lightgbm==3.3.5
+orjson==3.9.0
 pandas==2.0.1
 rich==13.3.5
 rouge==1.0.1
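
Note (not part of the diff): orjson, pinned above, behaves differently from the standard library json module in two ways the Python changes below account for: orjson.dumps() returns bytes rather than str, and custom types are handled through a default= callable instead of a cls= encoder class. A minimal sketch of the bytes behavior, which is why the call sites below add .decode():

import orjson

# orjson.dumps returns bytes, unlike json.dumps which returns str.
payload = orjson.dumps({"data": [1, 2, 3]})
assert isinstance(payload, bytes)

# Decoding yields the usual compact JSON text; orjson.loads accepts str or bytes.
text = payload.decode()
assert orjson.loads(text) == {"data": [1, 2, 3]}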

‎pgml-extension/src/api.rs

5 additions & 5 deletions

@@ -577,7 +577,7 @@ pub fn embed_batch(
     inputs: Vec<&str>,
     kwargs: default!(JsonB, "'{}'"),
 ) -> Vec<Vec<f32>> {
-    crate::bindings::transformers::embed(transformer, &inputs, &kwargs.0)
+    crate::bindings::transformers::embed(transformer, inputs, &kwargs.0)
 }
 
 #[pg_extern(immutable, parallel_safe)]
@@ -602,11 +602,11 @@ pub fn chunk(
 pub fn transform_json(
     task: JsonB,
     args: default!(JsonB, "'{}'"),
-    inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
+    inputs: default!(Vec<&str>, "ARRAY[]::TEXT[]"),
     cache: default!(bool, false),
 ) -> JsonB {
     JsonB(crate::bindings::transformers::transform(
-        &task.0, &args.0, &inputs,
+        &task.0, &args.0, inputs,
     ))
 }
 
@@ -616,14 +616,14 @@ pub fn transform_json(
 pub fn transform_string(
     task: String,
     args: default!(JsonB, "'{}'"),
-    inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
+    inputs: default!(Vec<&str>, "ARRAY[]::TEXT[]"),
     cache: default!(bool, false),
 ) -> JsonB {
     let mut task_map = HashMap::new();
     task_map.insert("task", task);
     let task_json = json!(task_map);
     JsonB(crate::bindings::transformers::transform(
-        &task_json, &args.0, &inputs,
+        &task_json, &args.0, inputs,
     ))
 }
 
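
Aside (not part of the diff): swapping Vec<String> for Vec<&str> keeps the SQL-facing signature the same (a TEXT[] argument with an ARRAY[]::TEXT[] default) but lets the extension borrow each array element instead of allocating an owned String per input, and the vector is then passed by value down to the bindings. A minimal sketch of the same pattern with a hypothetical function name (join_words is illustrative only, not part of this commit):

#[pg_extern(immutable, parallel_safe)]
fn join_words(words: default!(Vec<&str>, "ARRAY[]::TEXT[]")) -> String {
    // Each element is borrowed from the TEXT[] argument for the duration of
    // the call, so no per-element String allocation is needed.
    words.join(" ")
}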

‎pgml-extension/src/bindings/transformers.py

20 additions & 24 deletions

@@ -1,12 +1,12 @@
-import os
-import json
 import math
+import os
 import shutil
 import time
-import numpy as np
 
 import datasets
 from InstructorEmbedding import INSTRUCTOR
+import numpy
+import orjson
 from rouge import Rouge
 from sacrebleu.metrics import BLEU
 from sentence_transformers import SentenceTransformer
@@ -42,7 +42,6 @@
 __cache_sentence_transformer_by_name = {}
 __cache_transform_pipeline_by_task = {}
 
-
 DTYPE_MAP = {
     "uint8": torch.uint8,
     "int8": torch.int8,
@@ -58,6 +57,10 @@
     "bool": torch.bool,
 }
 
+def orjson_default(obj):
+    if isinstance(obj, numpy.float32):
+        return float(obj)
+    raise TypeError
 
 def convert_dtype(kwargs):
     if "torch_dtype" in kwargs:
@@ -78,18 +81,10 @@ def ensure_device(kwargs):
     else:
         kwargs["device"] = "cpu"
 
-
-class NumpyJSONEncoder(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, np.float32):
-            return float(obj)
-        return super().default(obj)
-
-
 def transform(task, args, inputs):
-    task = json.loads(task)
-    args = json.loads(args)
-    inputs = json.loads(inputs)
+    task = orjson.loads(task)
+    args = orjson.loads(args)
+    inputs = orjson.loads(inputs)
 
     key = ",".join([f"{key}:{val}" for (key, val) in sorted(task.items())])
     if key not in __cache_transform_pipeline_by_task:
@@ -103,17 +98,18 @@ def transform(task, args, inputs):
     pipe = __cache_transform_pipeline_by_task[key]
 
     if pipe.task == "question-answering":
-        inputs = [json.loads(input) for input in inputs]
+        inputs = [orjson.loads(input) for input in inputs]
 
     convert_eos_token(pipe.tokenizer, args)
 
-    return json.dumps(pipe(inputs, **args), cls=NumpyJSONEncoder)
+    results = pipe(inputs, **args)
+
+    return orjson.dumps(results, default=orjson_default).decode()
 
 
 def embed(transformer, inputs, kwargs):
-
-    inputs = json.loads(inputs)
-    kwargs = json.loads(kwargs)
+    kwargs = orjson.loads(kwargs)
+
     ensure_device(kwargs)
     instructor = transformer.startswith("hkunlp/instructor")
 
@@ -137,7 +133,7 @@ def embed(transformer, inputs, kwargs):
 
 
 def load_dataset(name, subset, limit: None, kwargs: "{}"):
-    kwargs = json.loads(kwargs)
+    kwargs = orjson.loads(kwargs)
 
     if limit:
         dataset = datasets.load_dataset(
@@ -164,7 +160,7 @@ def load_dataset(name, subset, limit: None, kwargs: "{}"):
     else:
         raise PgMLException(f"Unhandled dataset type: {type(dataset)}")
 
-    return json.dumps({"data": data, "types": types})
+    return orjson.dumps({"data": data, "types": types}).decode()
 
 
 def tokenize_text_classification(tokenizer, max_length, x, y):
@@ -421,7 +417,7 @@ def compute_metrics_text_generation(model, tokenizer, hyperparams, y):
 
 
 def tune(task, hyperparams, path, x_train, x_test, y_train, y_test):
-    hyperparams = json.loads(hyperparams)
+    hyperparams = orjson.loads(hyperparams)
     model_name = hyperparams.pop("model_name")
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 
@@ -562,7 +558,7 @@ def generate(model_id, data, config):
     result = get_transformer_by_model_id(model_id)
     tokenizer = result["tokenizer"]
     model = result["model"]
-    config = json.loads(config)
+    config = orjson.loads(config)
     all_preds = []
 
     batch_size = 1 # TODO hyperparams
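
Aside (not part of the diff): the orjson_default hook takes over the numpy handling that NumpyJSONEncoder used to provide. Pipeline outputs containing numpy.float32 values are converted to plain Python floats, and anything else orjson cannot serialize still raises TypeError. A standalone usage sketch of the new pattern (the sample result list is illustrative only):

import numpy
import orjson

def orjson_default(obj):
    if isinstance(obj, numpy.float32):
        return float(obj)
    raise TypeError

result = [{"label": "POSITIVE", "score": numpy.float32(0.5)}]
text = orjson.dumps(result, default=orjson_default).decode()
# text == '[{"label":"POSITIVE","score":0.5}]'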

‎pgml-extension/src/bindings/transformers.rs

3 additions & 4 deletions

@@ -24,13 +24,13 @@ static PY_MODULE: Lazy<Py<PyModule>> = Lazy::new(|| {
 pub fn transform(
     task: &serde_json::Value,
     args: &serde_json::Value,
-    inputs: &Vec<String>,
+    inputs: Vec<&str>,
 ) -> serde_json::Value {
     crate::bindings::venv::activate();
 
     let task = serde_json::to_string(task).unwrap();
     let args = serde_json::to_string(args).unwrap();
-    let inputs = serde_json::to_string(inputs).unwrap();
+    let inputs = serde_json::to_string(&inputs).unwrap();
 
     let results = Python::with_gil(|py| -> String {
         let transform: Py<PyAny> = PY_MODULE.getattr(py, "transform").unwrap().into();
@@ -56,11 +56,10 @@ pub fn transform(
     serde_json::from_str(&results).unwrap()
 }
 
-pub fn embed(transformer: &str, inputs: &[&str], kwargs: &serde_json::Value) -> Vec<Vec<f32>> {
+pub fn embed(transformer: &str, inputs: Vec<&str>, kwargs: &serde_json::Value) -> Vec<Vec<f32>> {
     crate::bindings::venv::activate();
 
     let kwargs = serde_json::to_string(kwargs).unwrap();
-    let inputs = serde_json::to_string(&inputs).unwrap();
     Python::with_gil(|py| -> Vec<Vec<f32>> {
         let embed: Py<PyAny> = PY_MODULE.getattr(py, "embed").unwrap().into();
         embed
0 commit comments
