Top IMDB Movie
Tags: #python #webscraping #imdb #analytics
This notebook helps you get the top movies on IMDB, by genre, by scraping the IMDB search pages and saving the results to a CSV file.

Input

Import Library

1
try:
2
import scrapy
3
except:
4
!pip install scrapy
5
import scrapy
6
from scrapy.crawler import CrawlerProcess
7
from scrapy.crawler import CrawlerRunner
8
try:
9
from crochet import setup, wait_for
10
except:
11
!pip install crochet
12
from crochet import setup, wait_for
13
setup()
Copied!

Model

1
class IMDB(scrapy.Spider):
    """Spider that scrapes feature films from IMDB genre search pages.

    Every movie found is yielded as a dict; the feed configuration in
    ``custom_settings`` writes those dicts to ``sample.csv``.
    """

    name = "movies"
    # Writing the output to a csv file and saving it as sample.csv
    custom_settings = {'FEEDS': {'sample.csv': {"format": 'csv', 'overwrite': True}}}

    def start_requests(self):
        """Yield one request per genre search URL, each handled by ``parse``."""
        urls = [
            "https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=action",
            "https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=adventure",
            "https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=animation",
            "https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=fantasy",
            "https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=romance",
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract one dict per movie from a search page, then follow pagination.

        Yields:
            A dict per ``div.lister-item-content`` element with the keys
            ``title``, ``year``, ``rating``, ``duration``, ``genre``,
            ``Total vote rating``, ``Number of votes`` and ``Director``,
            followed by a request for the next page when a link is found.
        """
        movies = response.css("div.lister-item-content")
        for movie in movies:
            # .get() returns None when the selector matches nothing. Fall back
            # to "" so the string clean-up below cannot raise AttributeError
            # and abort the rest of the page (including pagination).
            raw_year = movie.css("span.lister-item-year.text-muted.unbold::text").get() or ""
            raw_genre = movie.css("span.genre::text").get() or ""
            items = {
                "title": movie.css("h3.lister-item-header").css("a::text").get(),
                # Drop the parentheses and any "I" characters, then trim the
                # whitespace that removal leaves behind.
                "year": raw_year.replace("(", "").replace(")", "").replace("I", "").strip(),
                "rating": movie.css("span.certificate::text").get(),
                "duration": movie.css("span.runtime::text").get(),
                "genre": raw_genre.strip(),
                "Total vote rating": movie.css("div.inline-block.ratings-imdb-rating>strong::text").get(),
                "Number of votes": movie.css("p.sort-num_votes-visible>span:nth-of-type(2)::text").get(),
                "Director": movie.css("p:nth-of-type(3)>a:nth-of-type(1)::text").get(),
            }
            yield items
        # You can delete the next 3 lines if you need just the first page and not all pages.
        # NOTE(review): this takes the FIRST link inside div.desc. If that
        # container also holds a "previous" link on later pages, the crawl
        # would point backwards - confirm against the page markup.
        next_page = response.css("div.desc>a::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
36
# To avoid reactor time error; 10 is the timeout value handed to wait_for.
# NOTE(review): presumably seconds - confirm it is long enough for a full crawl.
@wait_for(10)
def run_spider():
    """Schedule the IMDB spider on a fresh CrawlerRunner and return the crawl result."""
    runner = CrawlerRunner()
    return runner.crawl(IMDB)
Copied!

Output

1
# Launch the crawl; the spider's FEEDS setting writes the scraped rows to sample.csv.
run_spider()
Copied!
Last modified 2mo ago
Copy link
Edit on GitHub