Amazing-Python-Scripts/IMDB-Scraper/scraper.py at master · puzzithinker/Amazing-Python-Scripts

History

86 lines (74 loc) · 3.5 KB

Raw

import requests

from bs4 import BeautifulSoup as bs

import argparse

# URL prefix for a title page; a title id path is appended to it.
base_id = "https://www.imdb.com/title"

# URL prefix for a title search; the text to search for is appended to it.
base = "https://www.imdb.com/find?s=tt&q="

# Command-line interface: a single mandatory option carrying the movie title.
parser = argparse.ArgumentParser(description='IMDB Scraper')
parser.add_argument(
    '--t',
    action='store',
    type=str,
    required=True,
    help='Enter the title of the movie',
)

def _tag_text(locate):
    """Return the stripped text of the tag produced by ``locate()``.

    ``locate`` is a zero-argument callable that walks the parsed page down to
    one tag.  When any step of that walk hits a missing element the result is
    ``None`` and the next attribute access fails; that failure is translated
    into a ``None`` return so a single absent element only loses its own field.
    """
    try:
        return locate().get_text(strip=True)
    except (AttributeError, TypeError):
        return None


def _extract_details(soup, info):
    """Add the fields found in the ``titleDetails`` section of *soup* to *info*."""
    try:
        blocks = soup.find('div', attrs={"id": "titleDetails"}).find_all(
            'div', attrs={"class": "txt-block"})
    except (AttributeError, TypeError):
        # No details section on this page: nothing to add.
        return

    # Headings whose value is the text of the first link in the block.
    # "Language" without the colon is kept for backward compatibility; the
    # colon form matches the spelling of every other heading handled here.
    link_fields = {
        "Country:": "country",
        "Language:": "language",
        "Language": "language",
    }
    # Headings whose value is the block text with the heading removed.
    text_fields = {
        "Budget:": "budget",
        "Cumulative Worldwide Gross:": "gross",
        "Gross USA:": "gross_usa",
        "Opening Weekend USA:": "opening_week_usa",
    }

    for block in blocks:
        heading_tag = block.h4
        if heading_tag is None:
            # Blocks without a heading carry none of the fields we collect.
            continue
        heading = heading_tag.get_text(strip=True)

        if heading == "Release Date:":
            info["date"] = block.get_text(strip=True).replace(
                "See more»", '').replace(heading, '')
        elif heading in link_fields:
            if block.a is not None:
                info[link_fields[heading]] = block.a.get_text(strip=True)
        elif heading in text_fields:
            info[text_fields[heading]] = block.get_text(
                strip=True).replace(heading, '')


def _extract_info(soup):
    """Collect every field that can be located in *soup* into a new dict.

    Keys are inserted in the order title, year, rating, genre, plot, followed
    by the detail fields in the order their blocks appear on the page.  Fields
    whose element cannot be found are simply left out.
    """
    info = {}

    # Each header field is looked up on its own so that one missing element
    # does not prevent the remaining fields from being collected.
    header_fields = (
        ("title", lambda: soup.find(
            'div', attrs={"class": "title_wrapper"}).h1),
        ("year", lambda: soup.find(
            'span', attrs={"id": "titleYear"}).a),
        ("rating", lambda: soup.find(
            'span', attrs={"itemprop": "ratingValue"})),
        ("genre", lambda: soup.find(
            "div", attrs={"class": "subtext"}).a),
        ("plot", lambda: soup.find(
            'div', attrs={"id": "titleStoryLine"}).find(
                'div', attrs={"class": "canwrap"}).p.span),
    )
    for key, locate in header_fields:
        text = _tag_text(locate)
        if text is not None:
            info[key] = text

    _extract_details(soup, info)
    return info


def get_info(soup):
    """Print the details scraped from a parsed title page.

    Args:
        soup: Parsed title page; it is only used through ``find`` and the
            tag-navigation attributes of the objects that call returns.

    Returns:
        None.  The collected dict is printed, followed by three newlines, and
        only when more than four fields were found.

    Raises:
        AssertionError: With the message "No info found" when not a single
            field could be extracted.
    """
    info = _extract_info(soup)

    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped when Python runs with -O.
    if not info:
        raise AssertionError("No info found")

    if len(info) > 4:
        print(info, end="\n\n\n")

def find_movie(query, limit=5, timeout=10):
    """Search for *query* and print the details of the top matching titles.

    Args:
        query: Title text to search for, as typed by the user.
        limit: Maximum number of search results to follow (default 5).
        timeout: Seconds to wait for each HTTP response before giving up
            (default 10).

    Returns:
        None.  Details are printed by ``get_info``; "No results found" is
        printed when the search page contains no result rows.
    """
    # Imported here so the function carries its own dependency.
    from urllib.parse import quote_plus

    # Encode the query: characters such as '&', '#' or '+' in a title would
    # otherwise be read as URL syntax and truncate or alter the search.
    search_url = base + quote_plus(query)
    resp = requests.get(search_url, timeout=timeout)

    # The lxml parser is used for speed; if lxml is not available, replace
    # 'lxml' with 'html.parser'.
    search_page = bs(resp.text, 'lxml')

    # A search yields many rows; only the first `limit` are followed.
    movie_list = search_page.find_all(
        "tr", attrs={"class": "findResult"})[:limit]

    if not movie_list:
        print("No results found")
        return

    prefix = "/title"
    for movie in movie_list:
        # The title id is taken from the 'href' of the <a> tag in the
        # result_text cell; rows without such a link are skipped.
        cell = movie.find('td', attrs={"class": "result_text"})
        link = cell.a if cell is not None else None
        href = link.attrs.get("href", "") if link is not None else ""
        if not href.startswith(prefix):
            continue

        # base_id already ends with "/title", so drop that prefix from href.
        title_url = base_id + href[len(prefix):]
        respo = requests.get(title_url, timeout=timeout)
        get_info(bs(respo.text, 'lxml'))

if __name__ == "__main__":
    # Script entry point: read the title given through --t and look it up.
    find_movie(parser.parse_args().t)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Expand file tree

Search code, repositories, users, issues, pull requests...

FilesExpand file tree

scraper.py

Latest commit

History

scraper.py

File metadata and controls

Expand file tree