class HDFTableCopy(BaseCopy):
+    """
+    Class for handling a standard case of reading a table from an HDF file into a pandas
+    DataFrame, iterating over it in chunks, and COPYing to PostgreSQL via StringIO CSV
+    """
+
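The docstring above describes the core pattern this class implements. As a minimal, self-contained sketch of the StringIO-CSV COPY step (not the library's internals; the table, columns, and DSN are hypothetical, and psycopg2 is assumed as the driver):

import io

import pandas as pd
import psycopg2

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})  # stand-in for one HDF chunk

buf = io.StringIO()
df.to_csv(buf, index=False, header=False)  # serialize the chunk to CSV in memory
buf.seek(0)                                # rewind so COPY reads from the start

conn = psycopg2.connect("dbname=example")  # hypothetical DSN
with conn, conn.cursor() as cur:
    cur.copy_expert("COPY example_table (id, name) FROM STDIN WITH CSV", buf)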
    def __init__(
        self,
        hdf_tables: List[str],
@@ -24,6 +29,19 @@ def __init__(
        sql_table: str = None,
        csv_chunksize: int = 10 ** 6,
    ):
+        """
+        Parameters
+        ----------
+        hdf_tables: list of HDF keys with data corresponding to destination SQL table
+            (assumption being that HDF tables:SQL tables is many:one)
+        hdf_meta: HDFMetadata object with information from the store
+        defer_sql_objs: multiprocessing has issues passing SQLAlchemy objects, so if
+            True, defer attaching these to the object until after it is pickled by Pool
+        conn: SQLAlchemy connection managed outside of the object
+        table_obj: SQLAlchemy object for the destination SQL Table
+        sql_table: string of SQL table name
+        csv_chunksize: max rows to keep in memory when generating CSV for COPY
+        """
        super().__init__(defer_sql_objs, conn, table_obj, sql_table, csv_chunksize)

        self.hdf_tables = hdf_tables
@@ -34,6 +52,17 @@ def __init__(
        self.hdf_chunksize = hdf_meta.chunksize
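A note on defer_sql_objs: SQLAlchemy connections and Table objects do not survive pickling, which multiprocessing.Pool requires, so the copier can be constructed bare and have its SQL objects attached inside the worker process. A sketch of that pattern under assumed names (the sql_to_hdf mapping and engine URL are hypothetical, and a real worker would also attach table_obj):

from multiprocessing import Pool

from sqlalchemy import create_engine

def run_copy(copier):
    # Child process: the copier arrived via pickle without SQLAlchemy state,
    # so build the connection here before copying.
    engine = create_engine("postgresql:///example")  # hypothetical URL
    copier.conn = engine.connect()
    copier.copy()

copiers = [
    HDFTableCopy(keys, hdf_meta, defer_sql_objs=True, sql_table=table)
    for table, keys in sql_to_hdf.items()  # hypothetical {sql_table: hdf_keys} map
]
with Pool(processes=4) as pool:
    pool.map(run_copy, copiers)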

    def copy(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
+        """
+        Run the full sequence to COPY data to the PostgreSQL table: drop Primary
+        and Foreign Keys to optimize speed, TRUNCATE the table, COPY the data,
+        recreate the keys, and run ANALYZE.
+
+        Parameters
+        ----------
+        data_formatters: list of functions to apply to df during the sequence. Note that
+            each of these functions should be able to handle kwargs for one another
+        data_formatter_kwargs: dict of kwargs to pass to the data_formatters functions
+        """
        self.drop_fks()
        self.drop_pk()
@@ -50,6 +79,15 @@ def copy(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
        self.analyze()
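In plain SQL terms, the sequence copy() runs corresponds roughly to the following (an illustration against a hypothetical table; helper steps beyond the drop_fks/drop_pk/analyze calls shown above are assumed from the docstring):

def copy_sequence(cur, csv_buf):
    # Illustrative psycopg2 version of the drop-keys / TRUNCATE / COPY /
    # rebuild-keys / ANALYZE sequence; all identifiers are hypothetical.
    cur.execute("ALTER TABLE example DROP CONSTRAINT example_ref_fkey")  # drop_fks()
    cur.execute("ALTER TABLE example DROP CONSTRAINT example_pkey")      # drop_pk()
    cur.execute("TRUNCATE example")                                      # clear old rows
    cur.copy_expert("COPY example FROM STDIN WITH CSV", csv_buf)         # COPY the data
    cur.execute("ALTER TABLE example ADD PRIMARY KEY (id)")              # recreate PK
    cur.execute(
        "ALTER TABLE example ADD CONSTRAINT example_ref_fkey "
        "FOREIGN KEY (ref_id) REFERENCES other_table (id)"
    )                                                                    # recreate FKs
    cur.execute("ANALYZE example")                                       # refresh stats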

    def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
+        """
+        Copy each HDF table related to this SQL table to the database
+
+        Parameters
+        ----------
+        data_formatters: list of functions to apply to df during the sequence. Note that
+            each of these functions should be able to handle kwargs for one another
+        data_formatter_kwargs: dict of kwargs to pass to the data_formatters functions
+        """
        if self.hdf_tables is None:
            logger.warn(f"No HDF table found for SQL table {self.sql_table}")
            return
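Since every formatter in data_formatters receives the same combined data_formatter_kwargs, each one should accept **kwargs to absorb arguments intended for the others. A pair of made-up formatters illustrating that contract (cast_pandas is the library's real default; these two are hypothetical):

def add_constant_column(df, col_name=None, col_value=None, **kwargs):
    # **kwargs swallows keyword arguments aimed at other formatters
    if col_name is not None:
        df[col_name] = col_value
    return df

def drop_null_rows(df, subset=None, **kwargs):
    return df.dropna(subset=subset)

# Both tolerate each other's kwargs in one combined dict:
# copier.hdf_to_pg(
#     data_formatters=[add_constant_column, drop_null_rows],
#     data_formatter_kwargs={"col_name": "year", "col_value": 2018, "subset": ["id"]},
# )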
@@ -81,6 +119,11 @@ def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):


class SmallHDFTableCopy(HDFTableCopy):
+    """
+    Class for handling the case where the table is small enough to be stored completely
+    in-memory for both reading from the HDF as well as COPYing using StringIO.
+    """
+
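For this small-table case the read step needs no chunking; each HDF key is loaded whole before going through the usual formatter and StringIO COPY path. A sketch with a hypothetical store path and key:

import pandas as pd

df = pd.read_hdf("./data.h5", key="/example_table")  # entire table in one read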
    def __init__(
        self,
        hdf_tables: List[str],
@@ -91,6 +134,19 @@ def __init__(
        sql_table: str = None,
        csv_chunksize: int = 10 ** 6,
    ):
+        """
+        Parameters
+        ----------
+        hdf_tables: list of HDF keys with data corresponding to destination SQL table
+            (assumption being that HDF tables:SQL tables is many:one)
+        hdf_meta: HDFMetadata object with information from the store
+        defer_sql_objs: multiprocessing has issues passing SQLAlchemy objects, so if
+            True, defer attaching these to the object until after it is pickled by Pool
+        conn: SQLAlchemy connection managed outside of the object
+        table_obj: SQLAlchemy object for the destination SQL Table
+        sql_table: string of SQL table name
+        csv_chunksize: max rows to keep in memory when generating CSV for COPY
+        """
        super().__init__(
            hdf_tables,
            hdf_meta,
@@ -102,6 +158,15 @@ def __init__(
        )

    def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
+        """
+        Copy each HDF table related to this SQL table to the database
+
+        Parameters
+        ----------
+        data_formatters: list of functions to apply to df during the sequence. Note that
+            each of these functions should be able to handle kwargs for one another
+        data_formatter_kwargs: dict of kwargs to pass to the data_formatters functions
+        """
        if self.hdf_tables is None:
            logger.warn(f"No HDF table found for SQL table {self.sql_table}")
            return
@@ -129,6 +194,14 @@ def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):


class BigHDFTableCopy(HDFTableCopy):
+    """
+    Class for handling the special case of particularly large tables. For these, we
+    iterate both over reading the table from the HDF and again over each of those
+    chunks, in order to keep the number of rows held in memory to a reasonable size.
+    Note that these are iterated using pd.read_hdf(..., start, stop) rather than
+    pd.read_hdf(..., iterator=True) because we found the performance was much better.
+    """
+
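A sketch of the start/stop iteration the docstring describes, with a hypothetical store path, key, and chunk size; the row count comes from the store's metadata (via HDFMetadata in the library, via HDFStore.get_storer here):

import pandas as pd

path, key, hdf_chunksize = "./data.h5", "/big_table", 10 ** 7  # hypothetical values

with pd.HDFStore(path, mode="r") as store:
    nrows = store.get_storer(key).nrows  # row count without loading the data

for start in range(0, nrows, hdf_chunksize):
    df = pd.read_hdf(path, key, start=start, stop=start + hdf_chunksize)
    # each df is then chunked again by csv_chunksize for the StringIO COPY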
    def __init__(
        self,
        hdf_tables: List[str],
@@ -139,6 +212,19 @@ def __init__(
        sql_table: str = None,
        csv_chunksize: int = 10 ** 6,
    ):
+        """
+        Parameters
+        ----------
+        hdf_tables: list of HDF keys with data corresponding to destination SQL table
+            (assumption being that HDF tables:SQL tables is many:one)
+        hdf_meta: HDFMetadata object with information from the store
+        defer_sql_objs: multiprocessing has issues passing SQLAlchemy objects, so if
+            True, defer attaching these to the object until after it is pickled by Pool
+        conn: SQLAlchemy connection managed outside of the object
+        table_obj: SQLAlchemy object for the destination SQL Table
+        sql_table: string of SQL table name
+        csv_chunksize: max rows to keep in memory when generating CSV for COPY
+        """
        super().__init__(
            hdf_tables,
            hdf_meta,
@@ -150,6 +236,15 @@ def __init__(
        )

    def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
+        """
+        Copy each HDF table related to this SQL table to the database
+
+        Parameters
+        ----------
+        data_formatters: list of functions to apply to df during the sequence. Note that
+            each of these functions should be able to handle kwargs for one another
+        data_formatter_kwargs: dict of kwargs to pass to the data_formatters functions
+        """
        if self.hdf_tables is None:
            logger.warn(f"No HDF table found for SQL table {self.sql_table}")
            return