Links

Top IMDB Movie

Tags: #imdb #python #webscraping #analytics #operations #csv
This notebook scrapes the top movies on IMDb for several genres and saves the results to a CSV file.

Input

Import libraries

try:
import scrapy
except:
!pip install scrapy
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.crawler import CrawlerRunner
try:
from crochet import setup, wait_for
except:
!pip install crochet
from crochet import setup, wait_for
setup()

Model

class IMDB(scrapy.Spider):
    """Spider that scrapes IMDb feature-film search results for several genres.

    Every movie found is yielded as a dict; the feed configured in
    ``custom_settings`` exports those dicts to ``sample.csv``.
    """

    name = "movies"
    # Write the output to a CSV file named sample.csv, overwriting any previous file.
    custom_settings = {'FEEDS': {'sample.csv': {"format": 'csv', 'overwrite': True}}}

    def start_requests(self):
        """Yield one request per genre search URL, each handled by ``parse``."""
        urls = [
            "https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=action",
            "https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=adventure",
            "https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=animation",
            "https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=fantasy",
            "https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=romance",
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract one dict per movie on the page, then follow the next page.

        Yields:
            A dict with the keys ``title``, ``year``, ``rating``, ``duration``,
            ``genre``, ``Total vote rating``, ``Number of votes`` and
            ``Director`` for each ``div.lister-item-content`` element, followed
            by a request for the next results page when one is linked.
        """
        movies = response.css("div.lister-item-content")
        for movie in movies:
            # default="" keeps a movie with no year or genre element from
            # raising AttributeError on None and aborting the rest of the page.
            raw_year = movie.css(
                "span.lister-item-year.text-muted.unbold::text"
            ).get(default="")
            raw_genre = movie.css("span.genre::text").get(default="")
            items = {
                "title": movie.css("h3.lister-item-header").css("a::text").get(),
                # Drop the parentheses and any "I" characters, then trim the
                # whitespace left behind (e.g. "(I) (2019)" -> "2019").
                "year": raw_year.replace("(", "").replace(")", "").replace("I", "").strip(),
                "rating": movie.css("span.certificate::text").get(),
                "duration": movie.css("span.runtime::text").get(),
                "genre": raw_genre.strip(),
                "Total vote rating": movie.css("div.inline-block.ratings-imdb-rating>strong::text").get(),
                "Number of votes": movie.css("p.sort-num_votes-visible>span:nth-of-type(2)::text").get(),
                "Director": movie.css("p:nth-of-type(3)>a:nth-of-type(1)::text").get(),
            }
            yield items
        # You can delete the next lines if you need just the first page and not all pages.
        # Select the "Next" link explicitly: taking the first link in div.desc
        # would pick a "Previous" link on later pages and end the crawl early.
        # NOTE(review): assumes IMDb marks that link with class
        # "lister-page-next" -- confirm against the live markup.
        next_page = response.css("div.desc>a.lister-page-next::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
@wait_for(10)  # crochet wrapper; 10 is presumably the timeout in seconds -- see crochet docs
def run_spider():
    """Start a crawl of the IMDB spider on a new CrawlerRunner and return its result."""
    return CrawlerRunner().crawl(IMDB)

Output

# Run the crawl; the IMDB spider's feed settings export the scraped movies to sample.csv.
run_spider()
Last modified 1mo ago