Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 6998ae4

Browse filesBrowse files
authored
Dont materialize snapshots by default (#458)
1 parent 813d491 commit 6998ae4
Copy full SHA for 6998ae4

File tree

8 files changed

+127
-37
lines changed
Filter options

8 files changed

+127
-37
lines changed

‎pgml-dashboard/app/models.py

Copy file name to clipboardExpand all lines: pgml-dashboard/app/models.py
+18-11Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ class Snapshot(models.Model):
6767
analysis = models.JSONField(null=True)
6868
created_at = models.DateTimeField(auto_now_add=True)
6969
updated_at = models.DateTimeField(auto_now=True)
70+
materialized = models.BooleanField()
7071

7172
class Meta:
7273
db_table = '"pgml"."snapshots"'
@@ -96,15 +97,6 @@ def table_name(self):
9697
return self.relation_name.split(".")[1]
9798
return self.relation_name
9899

99-
@property
100-
def table_type(self):
101-
with connection.cursor() as cursor:
102-
cursor.execute(
103-
f"SELECT pg_size_pretty(pg_total_relation_size(%s))",
104-
[self.snapshot_name],
105-
)
106-
return cursor.fetchone()[0]
107-
108100
@property
109101
def table_size(self):
110102
"""How big is the snapshot according to Postgres."""
@@ -114,7 +106,19 @@ def table_size(self):
114106
f"SELECT pg_size_pretty(pg_total_relation_size(%s))",
115107
[self.snapshot_name],
116108
)
117-
return cursor.fetchone()[0]
109+
size = cursor.fetchone()[0]
110+
return size
111+
# TODO:
112+
# Figure out if it's a view (not a table) and
113+
# report accordingly.
114+
#
115+
# This is not a good heuristic:
116+
#
117+
# if size == "0 bytes":
118+
# # Likely a view that doesn't report size
119+
# return "Unknown"
120+
# else:
121+
# return size
118122
except:
119123
return 0
120124

@@ -125,7 +129,10 @@ def feature_size(self):
125129

126130
@property
127131
def snapshot_name(self):
128-
return f"snapshot_{self.id}"
132+
if self.materialized:
133+
return f"snapshot_{self.id}"
134+
else:
135+
return self.relation_name
129136

130137

131138
class Model(models.Model):

‎pgml-extension/Cargo.lock

Copy file name to clipboardExpand all lines: pgml-extension/Cargo.lock
+1-1Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎pgml-extension/Cargo.toml

Copy file name to clipboardExpand all lines: pgml-extension/Cargo.toml
+1-1Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "pgml"
3-
version = "2.0.2"
3+
version = "2.1.0"
44
edition = "2021"
55

66
[lib]
+38Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
ALTER TABLE pgml.snapshots ADD COLUMN materialized BOOLEAN DEFAULT false;
2+
3+
DROP FUNCTION IF EXISTS pgml.train(
4+
project_name text,
5+
task pgml.task,
6+
relation_name text,
7+
y_column_name text,
8+
algorithm pgml.algorithm,
9+
hyperparams jsonb,
10+
search pgml.search,
11+
search_params jsonb,
12+
search_args jsonb,
13+
test_size real,
14+
test_sampling pgml.sampling,
15+
runtime pgml.runtime,
16+
automatic_deploy
17+
);
18+
19+
CREATE OR REPLACE FUNCTION pgml.train(
20+
project_name text,
21+
task pgml.task DEFAULT NULL::pgml.task,
22+
relation_name text DEFAULT NULL::text,
23+
y_column_name text DEFAULT NULL::text,
24+
algorithm pgml.algorithm DEFAULT 'linear'::pgml.algorithm,
25+
hyperparams jsonb DEFAULT '{}'::jsonb,
26+
search pgml.search DEFAULT NULL::pgml.search,
27+
search_params jsonb DEFAULT '{}'::jsonb,
28+
search_args jsonb DEFAULT '{}'::jsonb,
29+
test_size real DEFAULT 0.25,
30+
test_sampling pgml.sampling DEFAULT 'last'::pgml.sampling,
31+
runtime pgml.runtime DEFAULT NULL::pgml.runtime,
32+
automatic_deploy boolean DEFAULT true,
33+
materialize_snapshot boolean DEFAULT false
34+
)
35+
36+
RETURNS TABLE(project text, task text, algorithm text, deployed boolean)
37+
LANGUAGE c
38+
AS '$libdir/pgml', $function$train_wrapper$function$

‎pgml-extension/sql/schema.sql

Copy file name to clipboardExpand all lines: pgml-extension/sql/schema.sql
+2-1Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,8 @@ CREATE TABLE IF NOT EXISTS pgml.snapshots(
7070
columns JSONB,
7171
analysis JSONB,
7272
created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT clock_timestamp(),
73-
updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT clock_timestamp()
73+
updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT clock_timestamp(),
74+
materialized BOOLEAN DEFAULT false
7475
);
7576
SELECT pgml.auto_updated_at('pgml.snapshots');
7677

‎pgml-extension/src/api.rs

Copy file name to clipboardExpand all lines: pgml-extension/src/api.rs
+23-7Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ fn train(
112112
test_sampling: default!(Sampling, "'last'"),
113113
runtime: Option<default!(Runtime, "NULL")>,
114114
automatic_deploy: Option<default!(bool, true)>,
115+
materialize_snapshot: default!(bool, false),
115116
) -> impl std::iter::Iterator<
116117
Item = (
117118
name!(project, String),
@@ -134,6 +135,7 @@ fn train(
134135
test_sampling,
135136
runtime,
136137
automatic_deploy,
138+
materialize_snapshot,
137139
)
138140
}
139141

@@ -153,6 +155,7 @@ fn train_joint(
153155
test_sampling: default!(Sampling, "'last'"),
154156
runtime: Option<default!(Runtime, "NULL")>,
155157
automatic_deploy: Option<default!(bool, true)>,
158+
materialize_snapshot: default!(bool, false),
156159
) -> impl std::iter::Iterator<
157160
Item = (
158161
name!(project, String),
@@ -196,13 +199,16 @@ fn train_joint(
196199
.expect("You must pass a `y_column_name` when you pass a `relation_name`"),
197200
test_size,
198201
test_sampling,
202+
materialize_snapshot,
199203
);
200204

201-
info!(
202-
"Snapshot of table \"{}\" created and saved in {}",
203-
relation_name,
204-
snapshot.snapshot_name(),
205-
);
205+
if materialize_snapshot {
206+
info!(
207+
"Snapshot of table \"{}\" created and saved in {}",
208+
relation_name,
209+
snapshot.snapshot_name(),
210+
);
211+
}
206212

207213
snapshot
208214
}
@@ -421,6 +427,7 @@ fn snapshot(
421427
vec![y_column_name.to_string()],
422428
test_size,
423429
test_sampling,
430+
true,
424431
);
425432
vec![(relation_name.to_string(), y_column_name.to_string())].into_iter()
426433
}
@@ -544,6 +551,7 @@ mod tests {
544551
vec!["target".to_string()],
545552
0.5,
546553
Sampling::last,
554+
true,
547555
);
548556
assert!(snapshot.id > 0);
549557
}
@@ -554,8 +562,13 @@ mod tests {
554562
load_diabetes(Some(25));
555563

556564
let result = std::panic::catch_unwind(|| {
557-
let _snapshot =
558-
Snapshot::create("diabetes", vec!["target".to_string()], 0.5, Sampling::last);
565+
let _snapshot = Snapshot::create(
566+
"diabetes",
567+
vec!["target".to_string()],
568+
0.5,
569+
Sampling::last,
570+
true,
571+
);
559572
});
560573

561574
assert!(result.is_err());
@@ -587,6 +600,7 @@ mod tests {
587600
Sampling::last,
588601
Some(runtime),
589602
Some(true),
603+
false,
590604
)
591605
.collect();
592606

@@ -624,6 +638,7 @@ mod tests {
624638
Sampling::last,
625639
Some(runtime),
626640
Some(true),
641+
false,
627642
)
628643
.collect();
629644

@@ -661,6 +676,7 @@ mod tests {
661676
Sampling::last,
662677
Some(runtime),
663678
Some(true),
679+
true,
664680
)
665681
.collect();
666682

‎pgml-extension/src/bindings/mod.rs

Copy file name to clipboardExpand all lines: pgml-extension/src/bindings/mod.rs
+2Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,15 @@ mod tests {
5151
vec!["target".to_string()],
5252
0.5,
5353
Sampling::last,
54+
false,
5455
);
5556
let classification = Project::create("classification", Task::classification);
5657
let breast_cancer = Snapshot::create(
5758
"pgml.breast_cancer",
5859
vec!["malignant".to_string()],
5960
0.5,
6061
Sampling::last,
62+
false,
6163
);
6264

6365
let mut regressors = Vec::new();

‎pgml-extension/src/orm/snapshot.rs

Copy file name to clipboardExpand all lines: pgml-extension/src/orm/snapshot.rs
+42-16Lines changed: 42 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ pub struct Snapshot {
5959
pub analysis: Option<JsonB>,
6060
pub created_at: Timestamp,
6161
pub updated_at: Timestamp,
62+
pub materialized: bool,
6263
}
6364

6465
// Manually implemented since pgx::JsonB doesn't impl Clone
@@ -81,6 +82,7 @@ impl Clone for Snapshot {
8182
.map(|analysis| JsonB(analysis.0.clone())),
8283
created_at: self.created_at,
8384
updated_at: self.updated_at,
85+
materialized: self.materialized,
8486
}
8587
}
8688
}
@@ -107,7 +109,8 @@ impl Snapshot {
107109
snapshots.columns,
108110
snapshots.analysis,
109111
snapshots.created_at,
110-
snapshots.updated_at
112+
snapshots.updated_at,
113+
snapshots.materialized
111114
FROM pgml.snapshots
112115
WHERE id = $1
113116
ORDER BY snapshots.id DESC
@@ -129,6 +132,7 @@ impl Snapshot {
129132
analysis: result.get_datum(8),
130133
created_at: result.get_datum(9).unwrap(),
131134
updated_at: result.get_datum(10).unwrap(),
135+
materialized: result.get_datum(11).unwrap(),
132136
});
133137
}
134138
Ok(Some(1))
@@ -151,7 +155,8 @@ impl Snapshot {
151155
snapshots.columns,
152156
snapshots.analysis,
153157
snapshots.created_at,
154-
snapshots.updated_at
158+
snapshots.updated_at,
159+
snapshots.materialized
155160
FROM pgml.snapshots
156161
JOIN pgml.models
157162
ON models.snapshot_id = snapshots.id
@@ -178,6 +183,7 @@ impl Snapshot {
178183
analysis: result.get_datum(8),
179184
created_at: result.get_datum(9).unwrap(),
180185
updated_at: result.get_datum(10).unwrap(),
186+
materialized: result.get_datum(11).unwrap(),
181187
});
182188
}
183189
Ok(Some(1))
@@ -190,6 +196,7 @@ impl Snapshot {
190196
y_column_name: Vec<String>,
191197
test_size: f32,
192198
test_sampling: Sampling,
199+
materialized: bool,
193200
) -> Snapshot {
194201
let mut snapshot: Option<Snapshot> = None;
195202
let status = Status::in_progress;
@@ -198,14 +205,15 @@ impl Snapshot {
198205
let (_schema_name, _table_name) = Self::fully_qualified_table(relation_name);
199206

200207
Spi::connect(|client| {
201-
let result = client.select("INSERT INTO pgml.snapshots (relation_name, y_column_name, test_size, test_sampling, status) VALUES ($1, $2, $3, $4::pgml.sampling, $5::pgml.status) RETURNING id, relation_name, y_column_name, test_size, test_sampling::TEXT, status::TEXT, columns, analysis, created_at, updated_at;",
208+
let result = client.select("INSERT INTO pgml.snapshots (relation_name, y_column_name, test_size, test_sampling, status, materialized) VALUES ($1, $2, $3, $4::pgml.sampling, $5::pgml.status, $6) RETURNING id, relation_name, y_column_name, test_size, test_sampling::TEXT, status::TEXT, columns, analysis, created_at, updated_at;",
202209
Some(1),
203210
Some(vec![
204211
(PgBuiltInOids::TEXTOID.oid(), relation_name.into_datum()),
205212
(PgBuiltInOids::TEXTARRAYOID.oid(), y_column_name.into_datum()),
206213
(PgBuiltInOids::FLOAT4OID.oid(), test_size.into_datum()),
207214
(PgBuiltInOids::TEXTOID.oid(), test_sampling.to_string().into_datum()),
208215
(PgBuiltInOids::TEXTOID.oid(), status.to_string().into_datum()),
216+
(PgBuiltInOids::BOOLOID.oid(), materialized.clone().into_datum()),
209217
])
210218
).first();
211219
let mut s = Snapshot {
@@ -219,15 +227,18 @@ impl Snapshot {
219227
analysis: None, // 8
220228
created_at: result.get_datum(9).unwrap(),
221229
updated_at: result.get_datum(10).unwrap(),
230+
materialized,
222231
};
223-
let mut sql = format!(
224-
r#"CREATE TABLE "pgml"."snapshot_{}" AS SELECT * FROM {}"#,
225-
s.id, s.relation_name
226-
);
227-
if s.test_sampling == Sampling::random {
228-
sql += " ORDER BY random()";
232+
if materialized {
233+
let mut sql = format!(
234+
r#"CREATE TABLE "pgml"."snapshot_{}" AS SELECT * FROM {}"#,
235+
s.id, s.relation_name
236+
);
237+
if s.test_sampling == Sampling::random {
238+
sql += " ORDER BY random()";
239+
}
240+
client.select(&sql, None, None);
229241
}
230-
client.select(&sql, None, None);
231242
s.analyze();
232243
snapshot = Some(s);
233244
Ok(Some(1))
@@ -424,10 +435,7 @@ impl Snapshot {
424435
}
425436

426437
let stats = stats.join(", ");
427-
let sql = format!(
428-
r#"SELECT {stats} FROM "pgml"."snapshot_{}" {laterals}"#,
429-
self.id
430-
);
438+
let sql = format!(r#"SELECT {stats} FROM {} {laterals}"#, self.relation_name(),);
431439
let result = client.select(&sql, Some(1), None).first();
432440
let mut analysis = HashMap::new();
433441
for (i, field) in fields.iter().enumerate() {
@@ -476,13 +484,24 @@ impl Snapshot {
476484
let mut columns: Vec<Column> = serde_json::from_value(json).unwrap();
477485
columns.sort();
478486
let sql = format!(
479-
"SELECT {} FROM {}",
487+
"SELECT {} FROM {} {}",
480488
columns
481489
.iter()
482490
.map(|c| c.quoted_name())
483491
.collect::<Vec<String>>()
484492
.join(", "),
485-
self.snapshot_name()
493+
self.relation_name(),
494+
match self.materialized {
495+
// If the snapshot is materialized, we already randomized it.
496+
true => "",
497+
false => {
498+
if self.test_sampling == Sampling::random {
499+
"ORDER BY random()"
500+
} else {
501+
""
502+
}
503+
}
504+
},
486505
);
487506

488507
let mut num_labels: usize = 0;
@@ -624,4 +643,11 @@ impl Snapshot {
624643
pub fn snapshot_name(&self) -> String {
625644
format!("\"pgml\".\"snapshot_{}\"", self.id)
626645
}
646+
647+
pub fn relation_name(&self) -> String {
648+
match self.materialized {
649+
true => self.snapshot_name(),
650+
false => self.relation_name.clone(),
651+
}
652+
}
627653
}

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.