Commit 4c41d08: HDF copy object docstrings
1 parent: 2b40269
1 file changed: +95, -0 lines changed

pandas_to_postgres/copy_hdf.py

@@ -14,6 +14,11 @@
 
 
 class HDFTableCopy(BaseCopy):
+    """
+    Class for handling a standard case of reading a table from an HDF file into a pandas
+    DataFrame, iterating over it in chunks, and COPYing to PostgreSQL via StringIO CSV
+    """
+
     def __init__(
         self,
         hdf_tables: List[str],
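
For orientation, here is a minimal, stand-alone sketch of the chunk-to-StringIO-CSV-to-COPY pattern that this class docstring describes. It is not the library's implementation; the connection string, HDF file name, key, and table name are placeholders.

import io
import pandas as pd
import psycopg2

conn = psycopg2.connect("dbname=example")      # placeholder connection
df = pd.read_hdf("data.h5", key="my_table")    # placeholder HDF file and key
csv_chunksize = 10 ** 6                        # max rows per in-memory CSV buffer

with conn, conn.cursor() as cur:
    for start in range(0, len(df), csv_chunksize):
        buf = io.StringIO()
        # Write one slice of the DataFrame as CSV into memory, then COPY it.
        df.iloc[start:start + csv_chunksize].to_csv(buf, index=False, header=False)
        buf.seek(0)
        cur.copy_expert("COPY my_table FROM STDIN WITH CSV", buf)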
@@ -24,6 +29,19 @@ def __init__(
         sql_table: str = None,
         csv_chunksize: int = 10 ** 6,
     ):
+        """
+        Parameters
+        ----------
+        hdf_tables: list of HDF keys with data corresponding to destination SQL table
+            (assumption being that HDF tables:SQL tables is many:one)
+        hdf_meta: HDFMetadata object with information from the store
+        defer_sql_objs: multiprocessing has issue with passing SQLAlchemy objects, so if
+            True, defer attributing these to the object until after pickled by Pool
+        conn: SQLAlchemy connection managed outside of the object
+        table_obj: SQLAlchemy object for the destination SQL Table
+        sql_table: string of SQL table name
+        csv_chunksize: max rows to keep in memory when generating CSV for COPY
+        """
         super().__init__(defer_sql_objs, conn, table_obj, sql_table, csv_chunksize)
 
         self.hdf_tables = hdf_tables
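
The defer_sql_objs note above refers to the general problem that SQLAlchemy engines and connections do not pickle cleanly. A rough sketch of the pattern, with a placeholder URI and worker body rather than this library's API:

from multiprocessing import Pool
from sqlalchemy import create_engine

def run_copy(copier):
    # Each worker builds its own engine/connection after the copier object
    # (constructed with defer_sql_objs=True) has been pickled into the Pool.
    engine = create_engine("postgresql://localhost/example")  # placeholder URI
    with engine.connect() as conn:
        ...  # attach conn and the table object to the copier here, then run copy()

if __name__ == "__main__":
    copiers = []  # e.g. copy objects built with defer_sql_objs=True
    with Pool(4) as pool:
        pool.map(run_copy, copiers)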
@@ -34,6 +52,17 @@ def __init__(
         self.hdf_chunksize = hdf_meta.chunksize
 
     def copy(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
+        """
+        Go through sequence to COPY data to PostgreSQL table, including dropping Primary
+        and Foreign Keys to optimize speed, TRUNCATE table, COPY data, recreate keys,
+        and run ANALYZE.
+
+        Parameters
+        ----------
+        data_formatters: list of functions to apply to df during sequence. Note that
+            each of these functions should be able to handle kwargs for one another
+        data_formatter_kwargs: list of kwargs to pass to data_formatters functions
+        """
         self.drop_fks()
         self.drop_pk()
 
@@ -50,6 +79,15 @@ def copy(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
         self.analyze()
 
     def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
+        """
+        Copy each HDF table that relates to SQL table to database
+
+        Parameters
+        ----------
+        data_formatters: list of functions to apply to df during sequence. Note that
+            each of these functions should be able to handle kwargs for one another
+        data_formatter_kwargs: list of kwargs to pass to data_formatters functions
+        """
         if self.hdf_tables is None:
             logger.warn(f"No HDF table found for SQL table {self.sql_table}")
             return
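
The copy() sequence documented above (drop primary and foreign keys, TRUNCATE, COPY, recreate keys, ANALYZE) corresponds roughly to the raw SQL below, shown with a hypothetical table and key column rather than the library's drop_pk/drop_fks/analyze helpers.

from sqlalchemy import create_engine, text

engine = create_engine("postgresql://localhost/example")  # placeholder URI
with engine.begin() as conn:
    # Drop constraints so the bulk load does not pay per-row index/FK costs.
    conn.execute(text("ALTER TABLE my_table DROP CONSTRAINT IF EXISTS my_table_pkey"))
    # ... likewise drop any foreign keys on my_table ...
    conn.execute(text("TRUNCATE my_table"))
    # ... COPY the StringIO CSV chunks here (see the sketch above) ...
    conn.execute(text("ALTER TABLE my_table ADD PRIMARY KEY (id)"))  # assumed key column
    conn.execute(text("ANALYZE my_table"))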
@@ -81,6 +119,11 @@ def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
 
 
 class SmallHDFTableCopy(HDFTableCopy):
+    """
+    Class for handling the case where the table is small enough to be stored completely
+    in-memory for both reading from the HDF as well as COPYing using StringIO.
+    """
+
     def __init__(
         self,
         hdf_tables: List[str],
@@ -91,6 +134,19 @@ def __init__(
         sql_table: str = None,
         csv_chunksize: int = 10 ** 6,
     ):
+        """
+        Parameters
+        ----------
+        hdf_tables: list of HDF keys with data corresponding to destination SQL table
+            (assumption being that HDF tables:SQL tables is many:one)
+        hdf_meta: HDFMetadata object with information from the store
+        defer_sql_objs: multiprocessing has issue with passing SQLAlchemy objects, so if
+            True, defer attributing these to the object until after pickled by Pool
+        conn: SQLAlchemy connection managed outside of the object
+        table_obj: SQLAlchemy object for the destination SQL Table
+        sql_table: string of SQL table name
+        csv_chunksize: max rows to keep in memory when generating CSV for COPY
+        """
         super().__init__(
             hdf_tables,
             hdf_meta,
@@ -102,6 +158,15 @@ def __init__(
         )
 
     def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
+        """
+        Copy each HDF table that relates to SQL table to database
+
+        Parameters
+        ----------
+        data_formatters: list of functions to apply to df during sequence. Note that
+            each of these functions should be able to handle kwargs for one another
+        data_formatter_kwargs: list of kwargs to pass to data_formatters functions
+        """
         if self.hdf_tables is None:
             logger.warn("No HDF table found for SQL table {self.sql_table}")
             return
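
A sketch of the data_formatters convention these docstrings describe: each formatter takes the DataFrame plus **kwargs so it can ignore keyword arguments meant for the other formatters. The formatter names here are invented for illustration; only cast_pandas exists in the library.

import pandas as pd

def strip_strings(df, **kwargs):
    # Trim whitespace in object (string) columns; ignore kwargs meant for others.
    return df.apply(lambda col: col.str.strip() if col.dtype == object else col)

def tag_source(df, source=None, **kwargs):
    # Add a provenance column if a source was supplied.
    return df.assign(source=source) if source is not None else df

def apply_formatters(df, data_formatters, data_formatter_kwargs):
    for formatter in data_formatters:
        df = formatter(df, **data_formatter_kwargs)
    return df

df = pd.DataFrame({"name": [" a ", "b "]})
df = apply_formatters(df, [strip_strings, tag_source], {"source": "hdf"})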
@@ -129,6 +194,14 @@ def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
 
 
 class BigHDFTableCopy(HDFTableCopy):
+    """
+    Class for handling the special case of particularly large tables. For these, we
+    iterate over reading the table in the HDF as well as iterating again over each of
+    those chunks in order to keep the number of rows stored in-memory to a reasonable
+    size. Note that these are iterated using pd.read_hdf(..., start, stop) rather than
+    pd.read_hdf(..., iterator=True) because we found the performance was much better.
+    """
+
     def __init__(
         self,
         hdf_tables: List[str],
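
The start/stop style of chunked reading that the BigHDFTableCopy docstring mentions looks roughly like the sketch below; the file name, key, and chunk size are placeholders.

import pandas as pd

hdf_path, key, hdf_chunksize = "data.h5", "/big_table", 10 ** 7  # placeholders

with pd.HDFStore(hdf_path, mode="r") as store:
    nrows = store.get_storer(key).nrows  # table-format HDF stores expose row counts

for start in range(0, nrows, hdf_chunksize):
    # Read one bounded slice of the HDF table rather than the whole thing.
    chunk = pd.read_hdf(hdf_path, key=key, start=start, stop=start + hdf_chunksize)
    # ... apply data_formatters and COPY this chunk in csv_chunksize batches ...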
@@ -139,6 +212,19 @@ def __init__(
         sql_table: str = None,
         csv_chunksize: int = 10 ** 6,
     ):
+        """
+        Parameters
+        ----------
+        hdf_tables: list of HDF keys with data corresponding to destination SQL table
+            (assumption being that HDF tables:SQL tables is many:one)
+        hdf_meta: HDFMetadata object with information from the store
+        defer_sql_objs: multiprocessing has issue with passing SQLAlchemy objects, so if
+            True, defer attributing these to the object until after pickled by Pool
+        conn: SQLAlchemy connection managed outside of the object
+        table_obj: SQLAlchemy object for the destination SQL Table
+        sql_table: string of SQL table name
+        csv_chunksize: max rows to keep in memory when generating CSV for COPY
+        """
         super().__init__(
             hdf_tables,
             hdf_meta,
@@ -150,6 +236,15 @@ def __init__(
         )
 
     def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
+        """
+        Copy each HDF table that relates to SQL table to database
+
+        Parameters
+        ----------
+        data_formatters: list of functions to apply to df during sequence. Note that
+            each of these functions should be able to handle kwargs for one another
+        data_formatter_kwargs: list of kwargs to pass to data_formatters functions
+        """
         if self.hdf_tables is None:
             logger.warn(f"No HDF table found for SQL table {self.sql_table}")
             return

0 commit comments
