I'm building a scraper for www.apkmirror.com using Scrapy (with the SitemapSpider spider). So far the following works:
DEBUG = True
from scrapy.spiders import SitemapSpider
from apkmirror_scraper.items import ApkmirrorScraperItem
class ApkmirrorSitemapSpider(SitemapSpider):
    """Sitemap-driven spider that scrapes APK download pages on apkmirror.com."""

    name = 'apkmirror-spider'
    sitemap_urls = ['http://www.apkmirror.com/sitemap_index.xml']
    # Only sitemap URLs ending in "-android-apk-download/" are routed to parse().
    sitemap_rules = [(r'.*-android-apk-download/$', 'parse')]

    if DEBUG:
        # Keep debug runs short by capping the number of crawled pages.
        custom_settings = {'CLOSESPIDER_PAGECOUNT': 20}

    def parse(self, response):
        """Build one item (url, title, developer) from a download page."""
        item = ApkmirrorScraperItem()
        item['url'] = response.url
        # XPath selects attributes with "@": h1/h3 elements carrying a title attribute.
        item['title'] = response.xpath('//h1[@title]/text()').extract_first()
        item['developer'] = response.xpath('//h3[@title]/a/text()').extract_first()
        return item
where the ApkMirrorScraperItem is defined in items.py as follows:
class ApkmirrorScraperItem(scrapy.Item):
    """Container for the data scraped from one APK download page."""

    # Address of the scraped page.
    url = scrapy.Field()
    # Text of the page's title heading.
    title = scrapy.Field()
    # Developer information taken from the page.
    developer = scrapy.Field()
The resulting JSON output if I run it from the project directory using the command
scrapy crawl apkmirror-spider -o data.json
is an array of JSON dictionaries with keys url, title, and developer, and the corresponding strings as values. I would like to modify this, however, so that the value of developer is itself a dictionary with a name field, so that I can populate it like this:
item['developer']['name'] = response.xpath('//h3[@title]/a/text()').extract_first()
However, if I try this I get KeyErrors, also if I initialize the developer's Field (which is a dict according to https://doc.scrapy.org/en/latest/topics/items.html#item-fields) as developer = scrapy.Field(name=None). How can I go about this?
Scrapy implements fields internally as dicts, but this does not mean they should be accessed as dicts. When you call item['developer'], what you are really doing is getting the value of the field, not the field itself. So, if the value has not been set yet, this will throw a KeyError.
Considering this, there are two ways you could go about your problem.
First one, just set the developer field value to a dict:
def parse(self, response):
    """Build an item whose ``developer`` value is a plain dict with a ``name`` key."""
    item = ApkmirrorScraperItem()
    item['url'] = response.url
    item['title'] = response.xpath('//h1[@title]/text()').extract_first()
    # Assign the whole dict in one step: item['developer'] has no value until it
    # is set, so indexing into it first would raise KeyError.
    item['developer'] = {'name': response.xpath('//h3[@title]/a/text()').extract_first()}
    return item
Second one, create a new Developer class and set the developer value to be an instance of this class:
# this can go to items.py
class Developer(scrapy.Item):
    """Nested item describing the developer of an app."""

    # Developer's display name.
    name = scrapy.Field()
def parse(self, response):
    """Build an item whose ``developer`` value is a nested ``Developer`` item."""
    item = ApkmirrorScraperItem()
    item['url'] = response.url
    item['title'] = response.xpath('//h1[@title]/text()').extract_first()

    # Populate the nested item first, then attach it as the field value.
    dev = Developer()
    dev['name'] = response.xpath('//h3[@title]/a/text()').extract_first()
    item['developer'] = dev
    return item
Hope it helps :)
Related
I am trying to figure out if my scrapy tool is correctly hitting the product_link for the request callback - 'yield scrapy.Request(product_link, callback=self.parse_new_item)'
product_link should be 'https://www.antaira.com/products/10-100Mbps/LNX-500A'
but I have not been able to confirm if my program is jumping into the next step created so that I can retrieve the correct yield return. Thank you!
# Import the required libraries
import scrapy
# Import the Item class with fields
# mentioned int he items.py file
from ..items import AntairaItem
# Spider class name
class productJumper(scrapy.Spider):
    """Spider from the question, with formatting and XPath ``@`` restored.

    NOTE(review): the defects flagged inline are the ones the answer below
    diagnoses, so they are deliberately left as the asker posted them.
    """

    # Name of the spider
    name = 'productJumper'
    # The domain to be scraped
    allowed_domains = ['antaira.com']
    # The URLs to be scraped from the domain
    start_urls = ['https://www.antaira.com/products/10-100Mbps']
    #target_url = ['https://www.antaira.com/products/10-100Mbps/LNX-500A']

    # First Step: Find every div with the class 'product-container' and step into the links
    def parse(self, response):
        #product_link = response.urljoin(rel_product_link)
        # creating items dictionary
        items = AntairaItem()
        rel_product_link = response.css('div.center767')
        for url in rel_product_link:
            # NOTE(review): the trailing commas on the next three lines turn each value into a 1-tuple.
            rel_product_link = response.xpath('//div[@class="product-container"]//a/@href').get(),
            # NOTE(review): 'rel_product_link' is a string literal here, not the variable.
            product_link = response.urljoin('rel_product_link'),
            items['rel_product_link'] = rel_product_link,
            items['product_link'] = product_link
            #yield items
            # 2nd Step: Return a list of the all products-links that will be scrapped
            #yield {
            # take the first relative product link
            # 'rel_product_link' : rel_product_link,
            # 'product_link' : product_link,
            #}
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    # Final Step: Run through each product and Yield the results
    def parse_new_item(self, response):
        for product in response.css('main.products'):
            # NOTE(review): .strip() is applied to the selector string, not to the extracted text.
            name = product.css(('h1.product-name::text').strip(' \t\n\r')).get()
            features = product.css('section.features h3 + ul').getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = product.css('div.selectors img::attr(src)').get()
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            # NOTE(review): `items` is not defined in this method; it was local to parse().
            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
You have a couple of issues:
scrapy items are essentially dictionaries and are therefore mutable. You need to create a unique item for each and every yield statement.
your second parse callback is referencing a variable items that it doesn't have access to, because it was defined in your first parse callback.
In your urljoin method you are using a string literal instead of a variable for rel_product_link
In the example below I fixed those issues and made some additional notes
import scrapy
from ..items import AntairaItem
class ProductJumper(scrapy.Spider):  # classes should be TitleCase
    """Follow every product link on the listing page and scrape each product."""

    name = 'productJumper'
    allowed_domains = ['antaira.com']
    start_urls = ['https://www.antaira.com/products/10-100Mbps']

    def parse(self, response):
        """Yield one request per relative product URL found on the listing page."""
        # iterate through each of the relative urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        """Yield one freshly created item per ``main.products`` element."""
        for product in response.css('main.products'):
            items = AntairaItem()  # Unique item for each iteration
            items['product_link'] = response.url  # get the product link from response
            # default='' keeps .strip() from failing when the heading is missing.
            items['name'] = product.css('h1.product-name::text').get(default='').strip()
            # No trailing commas: they would store 1-tuples instead of the values.
            items['features'] = product.css('section.features h3 + ul').getall()
            items['overview'] = product.css('.products .product-overview::text').getall()
            items['main_image'] = product.css('div.selectors img::attr(src)').get()
            items['rel_links'] = product.xpath(
                "//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]"
            ).getall()
            yield items
I am getting a blank CSV, though the code is not showing any error.
It is unable to crawl through web page.
This is the code which I have written referring youtube:-
import scrapy
from Example.items import MovieItem
class ThirdSpider(scrapy.Spider):
    """Spider from the question, with formatting and XPath ``@`` restored.

    NOTE(review): the asker's own defects are flagged inline but left in place,
    because the answers below discuss this code as posted.
    """

    name = "imdbtestspider"
    allowed_domains = ["imdb.com"]
    # NOTE(review): Scrapy reads `start_urls` (plural); with `start_url` nothing is
    # requested, which matches the "Crawled 0 pages" log below.
    start_url = ('http://www.imdb.com/chart/top',)

    def parse(self,response):
        links = response.xpath('//tbody[@class="lister-list"]/tr/td[@class="titleColumn"]/a/@href').extract()
        i = 1
        for link in links:
            abs_url = response.urljoin(link)
            #
            url_next = '//*[@id="main"]/div/span/div/div/div[2]/table/tbody/tr['+str(i)+']/td[3]/strong/text()'
            # NOTE(review): `.extact()` is a typo for `.extract()`.
            rating = response.xpath(url_next).extact()
            # NOTE(review): len(link) is the length of one URL string, not of `links`.
            if (i <= len(link)):
                i=i+1
            yield scrapy.Request(abs_url, callback = self.parse_indetail, meta = {'rating': rating})

    def parse_indetail(self,response):
        item = MovieItem()
        #
        # NOTE(review): stray `)` inside the XPath, and `.extract[0]` is missing its call parentheses.
        item['title'] = response.xpath('//div[@class="title_wrapper"])/h1/text()').extract[0][:-1]
        item['directors'] = response.xpath('//div[@class="credit_summary_items"]/span[@itemprop="director"]/a/span/text()').extract()[0]
        item['writers'] = response.xpath('//div[@class="credit_summary_items"]/span[@itemprop="creator"]/a/span/text()').extract()
        item['stars'] = response.xpath('//div[@class="credit_summary_items"]/span[@itemprop="actors"]/a/span/text()').extract()
        item['popularity'] = response.xpath('//div[@class="titleReviewBarSubItem"]/div/span/text()').extract()[2][21:-8]
        return item
This is output I am getting while running executing code with
scrapy crawl imdbtestspider -o example.csv -t csv
2019-01-17 18:44:34 [scrapy.core.engine] INFO: Spider opened
2019-01-17 18:44:34 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
This is another way you might give a try with. I used css selector instead of xpath to make the script less verbose.
import scrapy
class ImbdsdpyderSpider(scrapy.Spider):
    """Scrape the IMDb top chart using CSS selectors only."""

    name = 'imbdspider'
    start_urls = ['http://www.imdb.com/chart/top']

    def parse(self, response):
        """Request every title page linked from the chart."""
        hrefs = response.css(".titleColumn a[href^='/title/']::attr(href)").extract()
        for href in hrefs:
            yield scrapy.Request(response.urljoin(href),callback=self.get_info)

    def get_info(self, response):
        """Yield a dict of details scraped from one title page."""
        raw_title = response.css(".title_wrapper h1::text").extract_first()
        raw_popularity = response.css(
            ".titleReviewBarSubItem:contains('Popularity') .subText::text"
        ).extract_first()

        # Collapse runs of whitespace; a missing node stays None.
        if raw_title:
            clean_title = ' '.join(raw_title.split())
        else:
            clean_title = None

        if raw_popularity:
            clean_popularity = ' '.join(raw_popularity.split()).strip("(")
        else:
            clean_popularity = None

        yield {
            'title': clean_title,
            'directors': response.css(".credit_summary_item h4:contains('Director') ~ a::text").extract(),
            'writers': response.css(".credit_summary_item h4:contains('Writer') ~ a::text").extract(),
            'stars': response.css(".credit_summary_item h4:contains('Stars') ~ a::text").extract(),
            'popularity': clean_popularity,
            'rating': response.css(".ratingValue span::text").extract_first(),
        }
I have tested your given XPaths; I don't know whether they were mistyped or are actually wrong.
e.g;
xpath = //*[@id="main"]/div/span/div/div/div[2]/table/tbody/tr['+str(i)+']/td[3]/strong/text()
#There is no table when you reach div[2]
//div[@class="title_wrapper"])/h1/text() #here there is an error: the `)` after `]` is bad syntax
Plus your xpaths are not yielding any results.
As to why you are getting the error that says 0/pages crawled, despite not recreating your case, I have to assume that your method of page iteration is not building the page URLs correctly.
I'm having trouble understanding the use for creating the variable array of all the "follow links" and then using len to send them to the parse_indetail() but a couple things to note.
When you're using "meta" to pass items from one function to the next, you have the right idea, but you are missing the step that retrieves the item inside the function you're passing it to (you should also use a standard naming convention for simplicity).
Should be something like this...
def parse(self, response):
    # If you are going to capture an item at the first request, you must
    # instantiate your items class
    item = MovieItem()
    ...
    # You seem to want to pass ratings to the next function for itemization, so
    # make sure the field is listed in your items.py file and that you set it.
    # PATH stands in for the rating XPath. Why did you add the url_next?
    item['rating'] = response.xpath(PATH).extract()
    ...
    # Standard convention for passing meta through a callback looks like this;
    # it lets you hand over the whole item with every field set so far
    yield scrapy.Request(abs_url, callback=self.parse_indetail, meta={'item': item})


def parse_indetail(self, response):
    # Then you must pick the item back up from meta in the function you pass it to
    item = response.meta['item']
    # Then you can continue your scraping
You should not complicate the page iteration logic. You seem to get how it works but need help fine-tuning this aspect. I have recreated your use case and optimized it.
#items.py file
import scrapy
class TestimbdItem(scrapy.Item):
    """Fields collected for one movie."""

    title = scrapy.Field()
    directors = scrapy.Field()
    writers = scrapy.Field()
    stars = scrapy.Field()
    popularity = scrapy.Field()
    rating = scrapy.Field()
# The spider file
import scrapy
from testimbd.items import TestimbdItem
class ImbdsdpyderSpider(scrapy.Spider):
    """Scrape title, crew, popularity and rating for every movie on the top chart."""

    name = 'imbdsdpyder'
    allowed_domains = ['imdb.com']
    start_urls = ['http://www.imdb.com/chart/top']

    def parse(self, response):
        """Request each movie page linked from the chart's title column."""
        for href in response.css("td.titleColumn a::attr(href)").extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_movie)

    def parse_movie(self, response):
        """Yield one ``TestimbdItem`` built from a movie page."""
        item = TestimbdItem()

        # extract_first() returns None for a missing node instead of raising
        # IndexError the way indexing [0] on an empty list would.
        title = response.css(".title_wrapper h1::text").extract_first()
        item['title'] = title.replace('\xa0', '') if title is not None else None

        item['directors'] = response.xpath('//div[@class="credit_summary_item"]/h4[contains(., "Director")]/following-sibling::a/text()').extract()
        item['writers'] = response.xpath('//div[@class="credit_summary_item"]/h4[contains(., "Writers")]/following-sibling::a/text()').extract()
        item['stars'] = response.xpath('//div[@class="credit_summary_item"]/h4[contains(., "Stars")]/following-sibling::a/text()').extract()

        # The popularity figure is read from the third sub-text node; guard the
        # index so pages with fewer nodes give an empty result, not a crash.
        sub_texts = response.css(".titleReviewBarSubItem span.subText::text")
        item['popularity'] = sub_texts[2].re('([0-9]+)') if len(sub_texts) > 2 else []

        item['rating'] = response.css(".ratingValue span::text").extract_first()
        yield item
Notice two things:
First, the parse() function. All I'm doing here is using a for loop through the links, with each instance in the loop referred to as href, and passing the urljoined href to the parser function. Given your use case, this is more than enough. In a situation where you have a next page, you just create a variable for the "next" page and call back to parse; it will keep doing that until it can't find a "next" page.
Secondly, Use xpath only when in the HTML items have the same tagwith different content. This is more of a personal opinion but I tell people that xpath selectors is like scalpel and css selectors is like a butcher knife. You can get damn accurate with scalpel but it takes more time and in many cases may be just easier to go with CSS selector to get the same result.
I'm fairly new to scrapy and I'm looking for a solution for my personal exercise. What I'm trying to do is to crawl IMDB top chart movies to get the ranking, the title, the year, and the plot.
I manage to go through the links and crawl the movie pages but I can't find a way to get the ranking for each movie.
Currently my code looks like this :
import scrapy
from tutorial.items import IMDB_dict # We need this so that Python knows about the item object
class MppaddressesSpider(scrapy.Spider):
    """Spider from the question, with formatting and XPath ``@`` restored.

    NOTE(review): the logic is left as the asker posted it; the answer below
    shows how to carry the rank through to get_details().
    """

    name = "mppaddresses" # The name of this spider
    # The allowed domain and the URLs where the spider should start crawling:
    allowed_domains = ["imdb.com"]
    start_urls = ['https://www.imdb.com/chart/top/']

    def parse(self, response):
        # The main method of the spider. It scrapes the URL(s) specified in the
        # 'start_url' argument above. The content of the scraped URL is passed on
        # as the 'response' object.
        # NOTE(review): the `item` built in this loop is never yielded or passed on,
        # so the rank is lost. Loop layout was reconstructed; confirm against the post.
        for rank in response.xpath(".//tbody[@class='lister-list']/tr/td[@class='titleColumn']/text()").extract():
            rank=" ".join(rank.split())
            item = IMDB_dict()
            item['rank'] = rank
        for url in response.xpath(".//tbody[@class='lister-list']/tr/td[@class='titleColumn']/a/@href").extract():
            # This loops through all the URLs found inside an element of class 'mppcell'
            # Constructs an absolute URL by combining the response’s URL with a possible relative URL:
            full_url = response.urljoin(url)
            print("FOOOOOOOOOnd URL: "+full_url)
            # The following tells Scrapy to scrape the URL in the 'full_url' variable
            # and calls the 'get_details() method below with the content of this
            # URL:
            #yield {'namyy' : response.xpath(".//tbody[@class='lister-list']/tr/td[@class='titleColumn']/text()").extract().strip("\t\r\n '\""),}
            yield scrapy.Request(full_url, callback=self.get_details)

    def get_details(self, response):
        # This method is called on by the 'parse' method above. It scrapes the URLs
        # that have been extracted in the previous step.
        #item = OntariomppsItem() # Creating a new Item object
        # Store scraped data into that item:
        item = IMDB_dict()
        item['name'] = response.xpath(".//div[@class='title_bar_wrapper']/div[@class='titleBar']/div[@class='title_wrapper']/h1/text()").extract_first().strip("\t\r\n '\"")
        item['phone'] = response.xpath(".//div[@class='titleBar']/div[@class='title_wrapper']/h1/span[@id='titleYear']/a/text()").extract_first().strip("\t\r\n '\"")
        item['email'] = response.xpath(".//div[@class='plot_summary ']/div[@class='summary_text']/text()").extract_first().strip("\t\r\n '\"")
        # Return that item to the main spider method:
        yield item
Besides, my item.py has:
import scrapy
class IMDB_dict(scrapy.Item):
    """Fields stored for one movie scraped from the chart."""

    # define the fields for your item here like:
    rank = scrapy.Field()
    name = scrapy.Field()
    phone = scrapy.Field()
    email = scrapy.Field()
Main question: How can I get the ranking associated with the title?
Last question (if possible): I can access URLs like I did when URLs are relative (with urljoin), but I can't find a way to access URLs when they are absolute...
Many thanks for your help.
Best,
You need to send rank to your get_details callback using meta:
def parse(self, response):
    """Request every movie page, handing its chart rank over through ``meta``."""
    for movie in response.xpath(".//tbody[@class='lister-list']/tr/td[@class='titleColumn']"):
        # Rank and link are read from the same cell, so they stay paired.
        movie_rank = movie.xpath('./text()').re_first(r'(\d+)')
        movie_url = movie.xpath('./a/@href').extract_first()
        if movie_url is None:
            # A cell without a link has nothing to follow.
            continue
        movie_full_url = response.urljoin(movie_url)
        print("FOOOOOOOOOnd URL: " + movie_url)
        yield scrapy.Request(movie_full_url, callback=self.get_details, meta={"rank": movie_rank})
def get_details(self, response):
    """Yield an ``IMDB_dict`` holding the rank from ``meta`` plus page details."""
    strip_chars = "\t\r\n '\""
    item = IMDB_dict()
    item['rank'] = response.meta["rank"]
    # default='' keeps .strip() from raising AttributeError when a node is missing
    # and extract_first() would otherwise return None.
    item['name'] = response.xpath(".//div[@class='title_bar_wrapper']/div[@class='titleBar']/div[@class='title_wrapper']/h1/text()").extract_first(default='').strip(strip_chars)
    item['phone'] = response.xpath(".//div[@class='titleBar']/div[@class='title_wrapper']/h1/span[@id='titleYear']/a/text()").extract_first(default='').strip(strip_chars)
    item['email'] = response.xpath(".//div[@class='plot_summary ']/div[@class='summary_text']/text()").extract_first(default='').strip(strip_chars)
    # Return that item to the main spider method:
    yield item
UPDATE
If you check logs you'll find this error
AttributeError: 'NoneType' object has no attribute 'strip'
Sometimes .extract_first() returns None and you can't strip() it. I recommend you to use Scrapy Item Loaders
I have a ScraPy Code that is running in shell, but when I try to export it to csv, it returns an empty file. It exports data when I do not go into a link and try to parse the description, but once I add the extra method of parsing the contents, it fails to work. Here is the code:
class MonsterSpider(CrawlSpider):
    """Spider from the question, with formatting and XPath ``@`` restored.

    NOTE(review): the defects flagged inline are the ones the answers below
    diagnose, so they are deliberately left as posted.
    """

    name = "monster"
    # NOTE(review): the followed job links live on other monster.com subdomains,
    # so this restriction filters the detail requests out (see the answer below).
    allowed_domains = ["jobs.monster.com"]
    base_url = "http://jobs.monster.com/v-technology.aspx?"
    start_urls = [
        "http://jobs.monster.com/v-technology.aspx"
    ]
    for i in range(1,5):
        start_urls.append(base_url + "page=" + str(i))
    rules = (Rule(SgmlLinkExtractor(allow=("jobs.monster.com",))
        , callback = 'parse_items'),)

    def parse_items(self, response):
        sel = Selector(response)
        sites = sel.xpath('//div[@class="col-xs-12"]')
        #items = []
        for site in sites.xpath('.//article[@class="js_result_row"]'):
            item = MonsterItem()
            item['title'] = site.xpath('.//span[@itemprop = "title"]/text()').extract()
            item['company'] = site.xpath('.//span[@itemprop = "name"]/text()').extract()
            item['city'] = site.xpath('.//span[@itemprop = "addressLocality"]/text()').extract()
            item['state'] = site.xpath('.//span[@itemprop = "addressRegion"]/text()').extract()
            item['link'] = site.xpath('.//a[@data-m_impr_a_placement_id= "jsr"]/@href').extract()
            follow = ''.join(item["link"])
            request = Request(follow, callback = self.parse_dir_contents)
            request.meta["item"] = item
            yield request
            #items.append(item)
        #return items

    def parse_dir_contents(self, response):
        item = response.meta["item"]
        # NOTE(review): `site` is not defined in this method; it was local to parse_items().
        item['desc'] = site.xpath('.//div[@itemprop = "description"]/text()').extract()
        return item
Taking out the parse_dir_contents and uncommenting the empty "lists" list and "append" code was the original code.
Well, as @tayfun suggests, you should use response.xpath or define the site variable.
By the way, you do not need to use sel = Selector(response). Responses come with the xpath function, there is no need to cover it into another selector.
However the main problem is that you restrict the domain of the spider. You define allowed_domains = ["jobs.monster.com"] however if you look at the URL to follow of your custom Request you can see that they are something like http://jobview.monster.com/ or http://job-openings.monster.com. In this case your parse_dir_contents is not executed (the domain is not allowed) and your item does not get returned so you won't get any results.
Change allowed_domains = ["jobs.monster.com"] to
allowed_domains = ["monster.com"]
and you will be fine and your app will work and return items.
You have an error in your parse_dir_contents method:
def parse_dir_contents(self, response):
    """Add the job description to the item carried in ``response.meta``."""
    item = response.meta["item"]
    # Query the response itself; no `site` variable exists in this callback.
    item['desc'] = response.xpath('.//div[@itemprop="description"]/text()').extract()
    return item
Note the use of response. I don't know where you got site that you are currently using from.
Also, try to provide the error details when you post a question. Writing "it fails to work" doesn't say much.
I've built a crawler using scrapy to crawl into a sitemap and scrape required components from all the links in the sitemap.
class MySpider(SitemapSpider):
    """Sitemap spider that scrapes heading and form values from every listed page."""

    name = "functie"
    allowed_domains = ["xyz.nl"]
    sitemap_urls = ["http://www.xyz.nl/sitemap.xml"]

    def parse(self, response):
        """Yield one ``MyItem`` per page; every scraped field holds a list of strings."""
        item = MyItem()
        sel = Selector(response)
        item['url'] = response.url
        item['h1'] = sel.xpath("//h1[@class='no-bd']/text()").extract()
        item['jobtype'] = sel.xpath('//input[@name=".Keyword"]/@value').extract()
        item['count'] = sel.xpath('//input[@name="Count"]/@value').extract()
        item['location'] = sel.xpath('//input[@name="Location"]/@value').extract()
        yield item
The item['location'] can have null values at some cases. In that particular case i want to scrape other component and store it in item['location'].
The code i've tried is:
item['location'] = sel.xpath('//input[@name="Location"]/@value').extract()
if not item['location']:
    item['location'] = sel.xpath('//a[@class="location"]/text()').extract()
But the if-condition is never triggered, and the location comes back empty when the input field's value is empty. Any help would be much appreciated.
You may wish to check the length of item['location'] instead.
item['location'] = sel.xpath('//input[@name="Location"]/@value').extract()
# NOTE(review): an <input> whose value attribute is present but empty presumably
# yields [''] (length 1), which this check would not catch -- confirm on a real page.
if len(item['location']) < 1:
    item['location'] = sel.xpath('//a[@class="location"]/text()').extract()
Regardless, have you considered combining the two xpaths with a |?
item['location'] = sel.xpath('//input[@name="Location"]/@value | //a[@class="location"]/text()').extract()
Try this approach:
# extract() returns a list, so test for "no non-empty string in it" rather than
# comparing the list to "": this covers both [] and [''].
if not any(item['location']):
    item['location'] = sel.xpath('//a[@class="location"]/text()').extract()
I think what you are trying to achieve is best solved with a custom item pipeline.
1) Open pipelines.py and check your desired if condition within a Pipeline class:
class LocPipeline(object):
    """Item pipeline that tries a fallback XPath when ``location`` is missing."""

    def process_item(self, item, spider):
        """Return *item*, filling ``location`` only when it is absent or empty."""
        # A location was already found: pass the item through untouched.
        if item.get("location"):
            return item

        # NOTE(review): `sel` is not defined in this scope. process_item() receives
        # only the item and the spider, not the response, so this line raises
        # NameError as written; the fallback XPath has to be evaluated in the
        # spider's parse() where the response is available.
        item['location'] = sel.xpath('//a[@class="location"]/text()').extract()
        return item
2) The next step is to add the custom LocPipeline() to your settings.py file:
ITEM_PIPELINES = {'myproject.pipelines.LocPipeline': 300}
Adding the custom pipeline to your settings, scrapy will automatically call the LocPipeline().process_item() after MySpider().parse() and search for the alternative XPath if no location is found yet.