This is my solution to a "vacancy test" task. I'm not at all sure that I implemented the task correctly, but here is my solution.
Goals of the code:
- Parse the rows of a table from a URL and extract some data (build a list of JSON strings from this data; an example entry is shown after this list).
- Parse the rows of a table from a URL and download the PDF whose link is placed inside each row.
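For clarity, one element of the resulting list would look something like this (the values are made up for illustration; the keys match `parse_page_forms_json` in the code below):

```python
# Hypothetical example entry: a JSON string as produced by json_dumps
# (the form number and title here are illustrative, not scraped output)
'{"form_number": "Form W-2", "form_title": "Wage and Tax Statement", "min_year": 2018, "max_year": 2020}'
```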
Code:
```python
import asyncio
from json import dumps as json_dumps
from pathlib import Path as pathlib_Path
from sys import platform as sys_platform

import aiofiles
from aiohttp import ClientSession as aiohttp_ClientSession
from bs4 import BeautifulSoup
from loguru import logger

# # # HARDCODED # # #
MIN_YEAR = 2018
MAX_YEAR = 2020


def is_key_in_dict(dict_key: str, array: list):
    # Returns the index of the dict whose 'form_number' equals dict_key,
    # or None if there is no such dict.
    for i, dict_ in enumerate(array):
        if dict_['form_number'] == dict_key:
            return i


async def load_page(url: str, session):
    async with session.get(url=url) as resp:
        if 200 <= resp.status <= 299:
            return await resp.read()
        else:
            logger.error(f'Can not access the page, returned status code: {resp.status}')


async def parse_page_forms_json(rows: list) -> list[str]:  # Every page contains a set of forms
    page_forms = []
    for row in rows:
        try:
            form_number = row.findChild(['LeftCellSpacer', 'a']).string.split(' (', 1)[0]  # split to drop the non-English version
            current_year = int(row.find(name='td', attrs={'class': 'EndCellSpacer'}).string.strip())
            index = is_key_in_dict(dict_key=form_number, array=page_forms)
            if index is None:
                page_forms.append({
                    'form_number': form_number,  # Title in reality
                    'form_title': row.find(name='td', attrs={'class': 'MiddleCellSpacer'}).string.strip(),
                    'min_year': current_year,
                    'max_year': current_year,
                })
            else:  # If the form already exists, widen its year range
                if page_forms[index]['min_year'] > current_year:
                    page_forms[index]['min_year'] = current_year
                elif page_forms[index]['max_year'] < current_year:
                    page_forms[index]['max_year'] = current_year
        except Exception as e:
            logger.error(f'Error: {e}')
    return [json_dumps(page_form) for page_form in page_forms]  # What to do with this data?


async def save_page_form_pdf(rows, session):
    for row in rows:
        try:
            current_year = int(row.find(name='td', attrs={'class': 'EndCellSpacer'}).string.strip())
            form_number_elem = row.findChild(['LeftCellSpacer', 'a'])  # link and name
            form_number = form_number_elem.string.split(' (', 1)[0]  # split to drop the non-English version
            if MIN_YEAR <= current_year <= MAX_YEAR:
                resp = await load_page(url=form_number_elem.attrs['href'], session=session)
                # See https://docs.python.org/3/library/pathlib.html#pathlib.Path.mkdir
                pathlib_Path(form_number).mkdir(parents=True, exist_ok=True)  # exist_ok - skip FileExistsError
                filename = f"{form_number}/{form_number}_{current_year}.pdf"
                async with aiofiles.open(file=filename, mode='wb') as f:
                    await f.write(resp)
        except Exception as e:
            logger.error(f'Can not save file, error: {e}')


async def main():
    async with aiohttp_ClientSession() as session:
        tasks = []
        pagination = 0
        while True:
            url = (f'https://apps.irs.gov/app/picklist/list/priorFormPublication.html?'
                   f'indexOfFirstRow={pagination}&'
                   f'sortColumn=sortOrder&'
                   f'value=&criteria=&'
                   f'resultsPerPage=200&'
                   f'isDescending=false')
            page = await load_page(url=url, session=session)
            soup = BeautifulSoup(page, 'html.parser')
            table = soup.find(name='table', attrs={'class': 'picklist-dataTable'})  # Target
            rows = table.find_all(name='tr')[1:]  # [1:] - skip the header row (just wrong HTML)
            if rows:
                tasks.append(await parse_page_forms_json(rows=rows))  # Task 1
                tasks.append(await save_page_form_pdf(rows=rows, session=session))  # Task 2
                pagination += 200
            else:  # Stop pagination
                break
        await asyncio.gather(*tasks)


if __name__ == '__main__':
    # See https://github.com/encode/httpx/issues/914#issuecomment-622586610
    # (exit code is 0, but an error still shows up)
    if sys_platform.startswith('win'):
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    logger.add('errors.txt', level='ERROR', rotation="30 days", backtrace=True)
    asyncio.run(main())
```
The logic of the code:
- Asynchronously get a page with table rows (tax forms).
- Do some actions with the forms (blocking and non-blocking).
- Repeat.
P.S. You can skip the code of all functions except `main`, `load_page` and `save_page_form_pdf` (the others don't matter).
Questions:
- Have I picked a good architecture? I'm especially curious about the self-implemented `if item not in list` Python pattern: it is fine for a simple object inside a list, but I have a list of `dict`s and check by a `dict` key. After some experiments, I didn't find a better realization than the one presented here.
- Is the asynchronous solution good enough? This is my first experience with asynchronous web scraping.
- Maybe I need to optimize the check for whether a form already exists in the list of forms, and not iterate over the entire list every time, but apply some sorting algorithm or something else? (A sketch of the dict-based variant I have in mind follows this list.)
- Do I need the `await` keyword in the `tasks.append(await save_page_form_pdf(rows=rows, session=session))` line? (The second sketch below contrasts the two options.)
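To illustrate the third question, here is the kind of dict-based lookup I have in mind. It is only a sketch: the `add_row` helper is hypothetical and not part of my code; only the field names mirror my `page_forms` entries.

```python
# Hypothetical sketch: accumulate forms in a dict keyed by form_number,
# so "does this form already exist?" becomes an O(1) lookup instead of
# a linear scan over a list of dicts.
def add_row(page_forms: dict, form_number: str, form_title: str, current_year: int) -> None:
    form = page_forms.get(form_number)
    if form is None:  # first time this form is seen
        page_forms[form_number] = {
            'form_number': form_number,
            'form_title': form_title,
            'min_year': current_year,
            'max_year': current_year,
        }
    else:  # already seen: just widen the year range
        form['min_year'] = min(form['min_year'], current_year)
        form['max_year'] = max(form['max_year'], current_year)
```

`page_forms.values()` would then yield the same dicts that my list-based version builds, without the linear scan.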
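And to make the fourth question concrete, here is a self-contained toy (the names `work`, `sequential` and `concurrent` are made up for this example) showing the two variants I'm deciding between:

```python
import asyncio

async def work(n: int) -> int:
    await asyncio.sleep(1)
    return n

async def sequential():
    tasks = []
    for n in range(3):
        tasks.append(await work(n))  # runs the coroutine right here; appends its *result* (an int)
    # asyncio.gather(*tasks) would now raise TypeError: ints are not awaitable

async def concurrent():
    tasks = []
    for n in range(3):
        tasks.append(asyncio.create_task(work(n)))  # only schedules it; appends a Task
    results = await asyncio.gather(*tasks)  # the three sleeps overlap: ~1 s total, not ~3 s
    return results
```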