IMDB Movie Scraping gives blank csv using scrapy - python

I am getting a blank CSV, although the code does not show any error.
The spider is unable to crawl the web page.
This is the code I wrote, following a YouTube tutorial:
import scrapy
from Example.items import MovieItem
class ThirdSpider(scrapy.Spider):
    """Spider exactly as posted in the question, restored to a valid layout.

    Only extraction damage is repaired here (lost indentation and the XPath
    ``@`` markers). The asker's own mistakes are intentionally left in place,
    because they are the subject of the question; they are flagged with
    ``NOTE(review)`` comments.
    """

    name = "imdbtestspider"
    allowed_domains = ["imdb.com"]
    # NOTE(review): Scrapy reads ``start_urls``; with ``start_url`` no request
    # is scheduled, which matches the "Crawled 0 pages" log.
    start_url = ('http://www.imdb.com/chart/top',)

    def parse(self, response):
        links = response.xpath('//tbody[@class="lister-list"]/tr/td[@class="titleColumn"]/a/@href').extract()
        i = 1
        for link in links:
            abs_url = response.urljoin(link)
            #
            url_next = '//*[@id="main"]/div/span/div/div/div[2]/table/tbody/tr['+str(i)+']/td[3]/strong/text()'
            # NOTE(review): ``extact`` is a typo for ``extract``.
            rating = response.xpath(url_next).extact()
            # NOTE(review): ``len(link)`` is the length of one URL string;
            # ``len(links)`` was presumably intended.
            if (i <= len(link)):
                i=i+1
            yield scrapy.Request(abs_url, callback = self.parse_indetail, meta = {'rating': rating})

    def parse_indetail(self, response):
        item = MovieItem()
        #
        # NOTE(review): the stray ``)`` after ``]`` makes this XPath invalid,
        # and ``extract`` is indexed without being called.
        item['title'] = response.xpath('//div[@class="title_wrapper"])/h1/text()').extract[0][:-1]
        item['directors'] = response.xpath('//div[@class="credit_summary_items"]/span[@itemprop="director"]/a/span/text()').extract()[0]
        item['writers'] = response.xpath('//div[@class="credit_summary_items"]/span[@itemprop="creator"]/a/span/text()').extract()
        item['stars'] = response.xpath('//div[@class="credit_summary_items"]/span[@itemprop="actors"]/a/span/text()').extract()
        item['popularity'] = response.xpath('//div[@class="titleReviewBarSubItem"]/div/span/text()').extract()[2][21:-8]
        return item
This is the output I get when running the code with:
scrapy crawl imdbtestspider -o example.csv -t csv
2019-01-17 18:44:34 [scrapy.core.engine] INFO: Spider opened
2019-01-17 18:44:34 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)

This is another way you might give a try with. I used css selector instead of xpath to make the script less verbose.
import scrapy
class ImbdsdpyderSpider(scrapy.Spider):
    """Crawl the IMDb Top chart and yield one plain-dict item per movie."""

    name = 'imbdspider'
    start_urls = ['http://www.imdb.com/chart/top']

    def parse(self, response):
        """Follow every ``/title/...`` link found in the chart's title column."""
        for link in response.css(".titleColumn a[href^='/title/']::attr(href)").extract():
            yield scrapy.Request(response.urljoin(link), callback=self.get_info)

    def get_info(self, response):
        """Scrape a movie page; fields whose node is missing become None or []."""
        item = {}

        title = response.css(".title_wrapper h1::text").extract_first()
        # split()/join collapses all whitespace runs (including non-breaking
        # spaces) and trims both ends.
        item['title'] = ' '.join(title.split()) if title else None

        # ``h4:contains(...) ~ a`` selects the links that follow the matching
        # heading inside the same credit block.
        item['directors'] = response.css(".credit_summary_item h4:contains('Director') ~ a::text").extract()
        item['writers'] = response.css(".credit_summary_item h4:contains('Writer') ~ a::text").extract()
        item['stars'] = response.css(".credit_summary_item h4:contains('Stars') ~ a::text").extract()

        popularity = response.css(".titleReviewBarSubItem:contains('Popularity') .subText::text").extract_first()
        item['popularity'] = ' '.join(popularity.split()).strip("(") if popularity else None

        item['rating'] = response.css(".ratingValue span::text").extract_first()
        yield item

I have tested the XPaths you gave; I don't know whether they were mistyped or are actually wrong.
For example:
xpath = '//*[@id="main"]/div/span/div/div/div[2]/table/tbody/tr['+str(i)+']/td[3]/strong/text()'
# There is no table when you reach div[2]
//div[@class="title_wrapper"])/h1/text() # here the `)` after `]` is a syntax error
Plus, your XPaths are not yielding any results.

As to why the log says 0 pages were crawled: without recreating your case, I have to assume that your method of page iteration is not building the page URLs correctly.
I'm having trouble understanding why you build a list of all the "follow links" and then use len() while sending them to parse_indetail(), but here are a couple of things to note.
When you're using "meta" to pass items from one function to the next, you have the right idea, but you are missing some of the instantiation in the function you're passing it to (you should also use a standard naming convention for simplicity).
It should be something like this:
def parse(self, response):
    # If you are going to capture a field on the first request, you must
    # instantiate your item class here.
    item = MovieItem()
    # ... (link extraction elided) ...
    # You want to pass the rating on to the next callback, so make sure the
    # field is declared in items.py and set it on the item. PATH stands for
    # your rating XPath; the separate url_next variable is not needed.
    item['rating'] = response.xpath(PATH).extract()
    # ... (abs_url is built in the elided part) ...
    # Standard convention for passing data to a callback: put the whole item
    # in ``meta`` so any number of fields travels with the request.
    yield scrapy.Request(abs_url, callback=self.parse_indetail, meta={'item': item})

def parse_indetail(self, response):
    # Retrieve the item that the previous callback passed through ``meta``.
    item = response.meta['item']
    # Then you can continue scraping into the same item.
You should not complicate the page iteration logic. You seem to understand how it works but need help fine-tuning this aspect. I have recreated your use case and optimized it.
#items.py file
import scrapy
class TestimbdItem(scrapy.Item):
    """Item holding the fields scraped from a single movie page."""

    title = scrapy.Field()
    directors = scrapy.Field()
    writers = scrapy.Field()
    stars = scrapy.Field()
    popularity = scrapy.Field()
    rating = scrapy.Field()
# The spider file
import scrapy
from testimbd.items import TestimbdItem
class ImbdsdpyderSpider(scrapy.Spider):
    """Crawl the IMDb Top chart and yield one ``TestimbdItem`` per movie."""

    name = 'imbdsdpyder'
    allowed_domains = ['imdb.com']
    start_urls = ['http://www.imdb.com/chart/top']

    def parse(self, response):
        """Request every movie page linked from the chart's title column."""
        for href in response.css("td.titleColumn a::attr(href)").extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_movie)

    def parse_movie(self, response):
        """Fill a ``TestimbdItem`` from a movie page.

        Missing nodes yield ``None`` (title, rating) or an empty list (all
        other fields) instead of raising ``IndexError``.
        """
        item = TestimbdItem()

        # extract_first() returns None when the heading is absent, whereas
        # indexing the extracted list with [0] would raise.
        title = response.css(".title_wrapper h1::text").extract_first()
        item['title'] = title.replace('\xa0', '') if title else None

        item['directors'] = response.xpath('//div[@class="credit_summary_item"]/h4[contains(., "Director")]/following-sibling::a/text()').extract()
        # "Writer" matches both the singular and the plural heading.
        item['writers'] = response.xpath('//div[@class="credit_summary_item"]/h4[contains(., "Writer")]/following-sibling::a/text()').extract()
        item['stars'] = response.xpath('//div[@class="credit_summary_item"]/h4[contains(., "Stars")]/following-sibling::a/text()').extract()

        # Anchor on the "Popularity" block rather than a fixed position [2];
        # .re() on an empty selection simply returns [].
        item['popularity'] = response.css(".titleReviewBarSubItem:contains('Popularity') span.subText::text").re('([0-9]+)')
        item['rating'] = response.css(".ratingValue span::text").extract_first()
        yield item
Notice two things:
First, the parse() function. All I'm doing here is looping over the links with a for loop, referring to each one as href, and passing the urljoined href to the parser function. Given your use case, this is more than enough. In a situation where there is a next page, you just create a variable for the "next" page link and call back to parse; it will keep doing that until it can't find a "next" page.
Secondly, use XPath only when items in the HTML have the same tag with different content. This is more of a personal opinion, but I tell people that XPath selectors are like a scalpel and CSS selectors are like a butcher knife. You can get damn accurate with a scalpel, but it takes more time, and in many cases it may be easier to go with a CSS selector to get the same result.

Related

How to scrape webpage of items, each item has link to new page

I'm creating a web scraper with scrapy and python. The page I'm scraping has each item structured as a card, I'm able to scrape some info from these cards (name, location), but I also want to get info that is reached by clicking on card > new page > click button on new page that opens form > scrape value from the form. How should I structure the parse function, do I need nested loops or separate functions ..?
class StackSpider(Spider):
    """Spider exactly as posted in the question, restored to a valid layout.

    Only extraction damage is repaired (lost indentation and the XPath ``@``
    markers). The asker's own mistakes are intentionally kept and flagged with
    ``NOTE(review)`` comments, since they are what the question is about.
    """

    name = "stack"
    allowed_domains = ["example.com"]
    start_urls = ["example.com/page"]

    def parse(self, response):
        for page_url in response.css('a[class ~= search- card]::attr(href)').extract():
            page_url = response.urljoin(page_url)
            # NOTE(review): the detail page is sent back to this same method.
            yield scrapy.Request(url=page_url, callback=self.parse)
        # NOTE(review): ``.extract()`` returns strings, so ``vc`` has no
        # ``xpath`` method; ``expath`` is also a typo for ``xpath``.
        for vc in response.css('div#vc-profile.container').extract():
            item = StackItem()
            item['name'] = vc.xpath('//*[@id="vc-profile"]/div/div[2]/div[1]/div[1]/h1/text()').extract()
            item['firm'] = vc.expath('//*[@id="vc-profile"]/div/div[2]/div[1]/div[2]/h2/text()[1]').extract()
            item['pos'] = vc.expath('//*[@id="vc-profile"]/div/div[2]/div[1]/div[2]/h2/text()[2]').extract()
            em = vc.xpath('/*[@id="vc-profile"]/div/div[1]/div[2]/div[2]/div/div[1]/button').extract()
            item['email'] = em.xpath('//*[@id="email"]/value').extract()
            yield item
the scraper is crawling, but outputting nothing
The best approach is creating an item object on the first page, scrape the needed data and save to the item. Again make a request to the new URL (card > new page > click the button to form) and pass the same item in there. Yielding the output from here will fix the issue.
You should probably split the scraper into 1 'parse' method and 1 'parse_item' method.
Your parse method goes through the page and yields the urls of the items for which you want to get the details. The parse_item method will get back the response from the parse function, and get the details for the specific item.
Difficult to say what it will look like without knowing the website, but it'll probably look more or less like this:
class StackSpider(Spider):
    """Two-step spider: ``parse`` collects card links, ``parse_item`` scrapes each one."""

    name = "stack"
    allowed_domains = ["example.com"]
    # Scrapy rejects start URLs that have no scheme.
    start_urls = ["http://example.com/page"]

    def parse(self, response):
        """Yield one request per card link found on the listing page."""
        # NOTE(review): the class name in the original selector was garbled
        # ("search- card"); "search-card" is assumed here - confirm against
        # the real page.
        for page_url in response.css('a[class~="search-card"]::attr(href)').extract():
            page_url = response.urljoin(page_url)
            yield scrapy.Request(url=page_url, callback=self.parse_item)

    def parse_item(self, response):
        """Scrape one detail page into a ``StackItem``."""
        item = StackItem()
        # Query the response itself: no ``vc`` variable exists in this method.
        item['name'] = response.xpath('//*[@id="vc-profile"]/div/div[2]/div[1]/div[1]/h1/text()').extract()
        item['firm'] = response.xpath('//*[@id="vc-profile"]/div/div[2]/div[1]/div[2]/h2/text()[1]').extract()
        item['pos'] = response.xpath('//*[@id="vc-profile"]/div/div[2]/div[1]/div[2]/h2/text()[2]').extract()
        # NOTE(review): assumes the e-mail is the ``value`` attribute of the
        # element with id "email" and is present in the downloaded HTML; if
        # the form is only rendered after the button click (JavaScript), a
        # further request is needed - verify on the site.
        item['email'] = response.xpath('//*[@id="email"]/@value').extract()
        yield item

scrapy : scrape multiple items from 2 levels

I'm fairly new to scrapy and I'm looking for a solution for my personal exercise. What I'm trying to do is to crawl IMDB top chart movies to get the ranking, the title, the year, and the plot.
I manage to go through the links and crawl the movie pages but I can't find a way to get the ranking for each movie.
Currently my code looks like this :
import scrapy
from tutorial.items import IMDB_dict # We need this so that Python knows about the item object
class MppaddressesSpider(scrapy.Spider):
    """Spider exactly as posted in the question, restored to a valid layout.

    Only extraction damage is repaired (lost indentation and the XPath ``@``
    markers). The asker's logic is intentionally unchanged; the open problem
    (the rank never reaches the detail item) is flagged with ``NOTE(review)``.
    """

    name = "mppaddresses"  # The name of this spider

    # The allowed domain and the URLs where the spider should start crawling:
    allowed_domains = ["imdb.com"]
    start_urls = ['https://www.imdb.com/chart/top/']

    def parse(self, response):
        # The main method of the spider. It scrapes the URL(s) specified in the
        # 'start_url' argument above. The content of the scraped URL is passed on
        # as the 'response' object.
        # NOTE(review): the items built in this loop are never yielded or
        # passed on, so the rank is lost.
        for rank in response.xpath(".//tbody[@class='lister-list']/tr/td[@class='titleColumn']/text()").extract():
            rank=" ".join(rank.split())
            item = IMDB_dict()
            item['rank'] = rank
        for url in response.xpath(".//tbody[@class='lister-list']/tr/td[@class='titleColumn']/a/@href").extract():
            # This loops through all the URLs found inside an element of class 'mppcell'
            # Constructs an absolute URL by combining the response’s URL with a possible relative URL:
            full_url = response.urljoin(url)
            print("FOOOOOOOOOnd URL: "+full_url)
            # The following tells Scrapy to scrape the URL in the 'full_url' variable
            # and calls the 'get_details() method below with the content of this
            # URL:
            #yield {'namyy' : response.xpath(".//tbody[@class='lister-list']/tr/td[@class='titleColumn']/text()").extract().strip("\t\r\n '\""),}
            yield scrapy.Request(full_url, callback=self.get_details)

    def get_details(self, response):
        # This method is called on by the 'parse' method above. It scrapes the URLs
        # that have been extracted in the previous step.
        #item = OntariomppsItem() # Creating a new Item object
        # Store scraped data into that item:
        item = IMDB_dict()
        item['name'] = response.xpath(".//div[@class='title_bar_wrapper']/div[@class='titleBar']/div[@class='title_wrapper']/h1/text()").extract_first().strip("\t\r\n '\"")
        item['phone'] = response.xpath(".//div[@class='titleBar']/div[@class='title_wrapper']/h1/span[@id='titleYear']/a/text()").extract_first().strip("\t\r\n '\"")
        item['email'] = response.xpath(".//div[@class='plot_summary ']/div[@class='summary_text']/text()").extract_first().strip("\t\r\n '\"")
        # Return that item to the main spider method:
        yield item
Besides, my item.py has:
import scrapy
class IMDB_dict(scrapy.Item):
    """Item with the four fields the spider fills in."""

    # define the fields for your item here like:
    rank = scrapy.Field()
    name = scrapy.Field()
    phone = scrapy.Field()
    email = scrapy.Field()
Main question: How can I get the ranking associated with the title?
Last question (if possible): I can access URLs like I did when URLs are relative (with urljoin), but I can't find a way to access URLs when they are absolute...
Many thanks for your help.
Best,
You need to send rank to your get_details callback using meta:
def parse(self, response):
    """Yield one detail-page request per chart row, carrying its rank in ``meta``."""
    for movie in response.xpath(".//tbody[@class='lister-list']/tr/td[@class='titleColumn']"):
        # Rank and link are read from the same cell, so they stay paired.
        movie_rank = movie.xpath('./text()').re_first(r'(\d+)')
        movie_url = movie.xpath('./a/@href').extract_first()
        if not movie_url:
            # No link in this cell: nothing to follow.
            continue
        movie_full_url = response.urljoin(movie_url)
        print("FOOOOOOOOOnd URL: " + movie_url)
        yield scrapy.Request(movie_full_url, callback=self.get_details, meta={"rank": movie_rank})

def get_details(self, response):
    """Build the item for one movie page, using the rank passed via ``meta``.

    ``extract_first()`` returns None when a node is missing; the ``or ''``
    fallback keeps ``strip`` from raising AttributeError in that case.
    """
    item = IMDB_dict()
    item['rank'] = response.meta["rank"]
    item['name'] = (response.xpath(".//div[@class='title_bar_wrapper']/div[@class='titleBar']/div[@class='title_wrapper']/h1/text()").extract_first() or '').strip("\t\r\n '\"")
    item['phone'] = (response.xpath(".//div[@class='titleBar']/div[@class='title_wrapper']/h1/span[@id='titleYear']/a/text()").extract_first() or '').strip("\t\r\n '\"")
    item['email'] = (response.xpath(".//div[@class='plot_summary ']/div[@class='summary_text']/text()").extract_first() or '').strip("\t\r\n '\"")
    # Return that item to the main spider method:
    yield item
UPDATE
If you check logs you'll find this error
AttributeError: 'NoneType' object has no attribute 'strip'
Sometimes .extract_first() returns None and you can't strip() it. I recommend you to use Scrapy Item Loaders

IF Statement within Scrapy item declaration

I'm using scrapy to build a spider to monitor prices on a website. The website isn't consistent in how it displays its prices. For its standard price, it always uses the same CSS class; however, when a product goes on promotion, it uses one of two CSS classes. The CSS selectors for both are below:
response.css('span.price-num:last-child::text').extract_first()
response.css('.product-highlight-label')
Below is how my items currently look within my spider:
item = ScraperItem()
item['model'] = extract_with_css('.product-id::text')
item['link'] = extract_with_css('head meta[property="og:url"]::attr(content)')
item['price'] = extract_with_css('span.list-price:last-child::text')
item['promo_price'] = extract_with_css('span.price-num:last-child::text')
yield item`
I would like to have something like:
IF response.css('span.price-num:last-child::text') is true
item['promo_price'] = extract_with_css('span.price-num:last-child::text')
ELSE item['promo_price'] = extract_with_css('.product-highlight-label')
Each way I've tried this I have failed.
I got it to work. Here's my code:
item = ScraperItem()
item['model'] = extract_with_css('.product-id::text')
item['link'] = extract_with_css('head meta[property="og:url"]::attr(content)')
item['price'] = extract_with_css('span.list-price:last-child::text')
if response.css('span.price-num:last-child::text'):
item['promo_price'] = extract_with_css('span.price-num:last-child::text')
else:
item['promo_price'] = extract_with_css('.product-highlight-label::text')
yield item

ScraPy spider crawling but not exporting

I have a ScraPy Code that is running in shell, but when I try to export it to csv, it returns an empty file. It exports data when I do not go into a link and try to parse the description, but once I add the extra method of parsing the contents, it fails to work. Here is the code:
class MonsterSpider(CrawlSpider):
    """Spider exactly as posted in the question, restored to a valid layout.

    Only extraction damage is repaired (lost indentation and the XPath ``@``
    markers). The asker's own mistakes are intentionally kept and flagged with
    ``NOTE(review)`` comments, since they are what the question is about.
    """

    name = "monster"
    # NOTE(review): the followed job links live on other monster.com
    # subdomains, so this restriction filters those requests out.
    allowed_domains = ["jobs.monster.com"]
    base_url = "http://jobs.monster.com/v-technology.aspx?"
    start_urls = [
        "http://jobs.monster.com/v-technology.aspx"
    ]
    for i in range(1,5):
        start_urls.append(base_url + "page=" + str(i))
    rules = (Rule(SgmlLinkExtractor(allow=("jobs.monster.com",))
                  , callback = 'parse_items'),)

    def parse_items(self, response):
        sel = Selector(response)
        sites = sel.xpath('//div[@class="col-xs-12"]')
        #items = []
        for site in sites.xpath('.//article[@class="js_result_row"]'):
            item = MonsterItem()
            item['title'] = site.xpath('.//span[@itemprop = "title"]/text()').extract()
            item['company'] = site.xpath('.//span[@itemprop = "name"]/text()').extract()
            item['city'] = site.xpath('.//span[@itemprop = "addressLocality"]/text()').extract()
            item['state'] = site.xpath('.//span[@itemprop = "addressRegion"]/text()').extract()
            item['link'] = site.xpath('.//a[@data-m_impr_a_placement_id= "jsr"]/@href').extract()
            follow = ''.join(item["link"])
            request = Request(follow, callback = self.parse_dir_contents)
            request.meta["item"] = item
            yield request
            #items.append(item)
        #return items

    def parse_dir_contents(self, response):
        item = response.meta["item"]
        # NOTE(review): ``site`` is not defined in this method.
        item['desc'] = site.xpath('.//div[@itemprop = "description"]/text()').extract()
        return item
The original code was the same, but without parse_dir_contents and with the commented-out "items" list and "append" lines enabled.
Well, as @tayfun suggests, you should use response.xpath or define the site variable.
By the way, you do not need to use sel = Selector(response). Responses come with the xpath function, there is no need to cover it into another selector.
However the main problem is that you restrict the domain of the spider. You define allowed_domains = ["jobs.monster.com"] however if you look at the URL to follow of your custom Request you can see that they are something like http://jobview.monster.com/ or http://job-openings.monster.com. In this case your parse_dir_contents is not executed (the domain is not allowed) and your item does not get returned so you won't get any results.
Change allowed_domains = ["jobs.monster.com"] to
allowed_domains = ["monster.com"]
and you will be fine and your app will work and return items.
You have an error in your parse_dir_contents method:
def parse_dir_contents(self, response):
    """Add the job description to the item passed via ``meta`` and return it."""
    item = response.meta["item"]
    # Query the detail page's own response; the attribute value must be
    # quoted on both sides for the XPath to be valid.
    item['desc'] = response.xpath('.//div[@itemprop="description"]/text()').extract()
    return item
Note the use of response. I don't know where you got site that you are currently using from.
Also, try to provide the error details when you post a question. Writing "it fails to work" doesn't say much.

how to scrape Tripadvisor dynamically using scrapy and python

I am trying to scrape TripAdvisor's reviews, but I cannot find the Xpath to have it dynamically go through all the pages. I tried yield and callback but the thing is I cannot find the xpath for the line that goes to the next page. I am talking about This site
Here Is my code(UPDATED):
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapingtest.items import ScrapingTestingItem
class scrapingtestspider(Spider):
    """Spider exactly as posted in the question, restored to a valid layout.

    Only extraction damage is repaired (lost indentation and the XPath ``@``
    markers). The asker's own mistakes are intentionally kept and flagged with
    ``NOTE(review)`` comments, since they are what the question is about.
    """

    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"
    start_urls = [
        "http://www.tripadvisor.in/Hotel_Review-g297679-d300955-Reviews-Ooty_Fern_Hill_A_Sterling_Holidays_Resort-Ooty_Tamil_Nadu.html"]
    output_json_dict = {}

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        items = []
        i=0
        # NOTE(review): the loop variable shadows the list it iterates over.
        for sites in sites:
            item = ScrapingTestingItem()
            #item['reviews'] = sel.xpath('//p[@class="partial_entry"]/text()').extract()
            item['subjects'] = sel.xpath('//span[@class="noQuotes"]/text()').extract()
            item['stars'] = sel.xpath('//*[@class="rate sprite-rating_s rating_s"]/img/@alt').extract()
            item['names'] = sel.xpath('//*[@class="username mo"]/span/text()').extract()
            items.append(item)
            i+=1
        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        if(sites and len(sites) > 0):
            # NOTE(review): the URL has no scheme, and ``i`` equals
            # ``len(sites)`` here, so ``sites[i]`` is out of range.
            yield Request(url="tripadvisor.in" + sites[i], callback=self.parse)
        else:
            # NOTE(review): this yields a list rather than individual items.
            yield items
If you want to select the URL behind Next why don't you try something like this:
# Every href of a link whose text contains "Next".
next_url = response.xpath('//a[contains(text(), "Next")]/@href').extract()
And then yield a Request with this URL? With this you get always the next site to scrape and do not need the line containing the numbers.
Recently I did something similar on tripadvisor and this approach worked for me. If this doesn't work for you, update your question with the approach you are trying so we can see where it can be improved.
Update
And change your Request creation block to the following:
if(sites and len(sites) > 0):
for site in sites:
yield Request(url="http://tripadvisor.in" + site, callback=self.parse)
Remove the else part and yield items at the end of the loop when the method finished with every parsing.
I think it can only work if you make a list of the URLs you want to scrape in a .txt file.
class scrapingtestspider(Spider):
    """Spider whose start URLs are read from ``urls.txt``, one per line."""

    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"
    # The context manager closes the file even if reading fails. Blank lines
    # are skipped so they do not become empty start URLs.
    with open("urls.txt") as f:
        start_urls = [url.strip() for url in f if url.strip()]

Categories

Resources