I've built a crawler using Scrapy that crawls a sitemap and scrapes the required components from all the links in the sitemap.
class MySpider(SitemapSpider):
    name = "functie"
    allowed_domains = ["xyz.nl"]
    sitemap_urls = ["http://www.xyz.nl/sitemap.xml"]

    def parse(self, response):
        item = MyItem()
        sel = Selector(response)
        item['url'] = response.url
        item['h1'] = sel.xpath("//h1[@class='no-bd']/text()").extract()
        item['jobtype'] = sel.xpath('//input[@name=".Keyword"]/@value').extract()
        item['count'] = sel.xpath('//input[@name="Count"]/@value').extract()
        item['location'] = sel.xpath('//input[@name="Location"]/@value').extract()
        yield item
item['location'] can be empty in some cases. In those cases I want to scrape a different component and store it in item['location'] instead.
The code I've tried is:
item['location'] = sel.xpath('//input[@name="Location"]/@value').extract()
if not item['location']:
    item['location'] = sel.xpath('//a[@class="location"]/text()').extract()
But it doesn't enter the if branch and returns an empty value when the location input field is empty. Any help would be highly appreciated.
You may wish to check the length of item['location'] instead.
item['location'] = sel.xpath('//input[@name="Location"]/@value').extract()
if len(item['location']) < 1:
    item['location'] = sel.xpath('//a[@class="location"]/text()').extract()
Regardless, have you considered combining the two xpaths with a |?
item['location'] = sel.xpath('//input[@name="Location"]/@value | //a[@class="location"]/text()').extract()
Try this approach:
if(item[location]==""):
item['location'] = sel.xpath('//a[#class="location"]/text()').extract()
I think what you are trying to achieve is best solved with a custom item pipeline.
1) Open pipelines.py and check your desired if condition within a Pipeline class:
class LocPipeline(object):
    def process_item(self, item, spider):
        # check if key "location" is in the item dict
        if not item.get("location"):
            # if not, try the alternative xpath
            item['location'] = sel.xpath('//a[@class="location"]/text()').extract()
        else:
            # if location was already found, do nothing
            pass
        return item
2) The next step is to add the custom LocPipeline() to your settings.py file:
ITEM_PIPELINES = {'myproject.pipelines.LocPipeline': 300}
With the custom pipeline added to your settings, Scrapy will automatically call LocPipeline().process_item() after MySpider().parse() and fall back to the alternative XPath if no location has been found yet.
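One caveat: a pipeline only receives the item and the spider, not the response, so there is no selector available inside process_item(). A minimal sketch of one way around that, assuming a hypothetical location_fallback field on the item (which would also have to be declared in items.py) that the spider fills in and the pipeline consumes:

# In the spider: extract both candidates and let the pipeline decide.
def parse(self, response):
    item = MyItem()
    sel = Selector(response)
    item['url'] = response.url
    item['location'] = sel.xpath('//input[@name="Location"]/@value').extract()
    # hypothetical extra field holding the alternative value
    item['location_fallback'] = sel.xpath('//a[@class="location"]/text()').extract()
    yield item

# In pipelines.py: pick whichever value is available.
class LocPipeline(object):
    def process_item(self, item, spider):
        if not item.get('location'):
            item['location'] = item.get('location_fallback')
        return item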
I am getting a blank CSV, though it's not showing any error in the code.
It is unable to crawl through the web page.
This is the code I have written following a YouTube tutorial:
import scrapy
from Example.items import MovieItem

class ThirdSpider(scrapy.Spider):
    name = "imdbtestspider"
    allowed_domains = ["imdb.com"]
    start_url = ('http://www.imdb.com/chart/top',)

    def parse(self, response):
        links = response.xpath('//tbody[@class="lister-list"]/tr/td[@class="titleColumn"]/a/@href').extract()
        i = 1
        for link in links:
            abs_url = response.urljoin(link)
            #
            url_next = '//*[@id="main"]/div/span/div/div/div[2]/table/tbody/tr['+str(i)+']/td[3]/strong/text()'
            rating = response.xpath(url_next).extact()
            if (i <= len(link)):
                i = i + 1
            yield scrapy.Request(abs_url, callback=self.parse_indetail, meta={'rating': rating})

    def parse_indetail(self, response):
        item = MovieItem()
        #
        item['title'] = response.xpath('//div[@class="title_wrapper"])/h1/text()').extract[0][:-1]
        item['directors'] = response.xpath('//div[@class="credit_summary_items"]/span[@itemprop="director"]/a/span/text()').extract()[0]
        item['writers'] = response.xpath('//div[@class="credit_summary_items"]/span[@itemprop="creator"]/a/span/text()').extract()
        item['stars'] = response.xpath('//div[@class="credit_summary_items"]/span[@itemprop="actors"]/a/span/text()').extract()
        item['popularity'] = response.xpath('//div[@class="titleReviewBarSubItem"]/div/span/text()').extract()[2][21:-8]
        return item
This is the output I am getting while executing the code with
scrapy crawl imdbtestspider -o example.csv -t csv
2019-01-17 18:44:34 [scrapy.core.engine] INFO: Spider opened
2019-01-17 18:44:34 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
Here is another way you might give a try. I used CSS selectors instead of XPath to make the script less verbose.
import scrapy

class ImbdsdpyderSpider(scrapy.Spider):
    name = 'imbdspider'
    start_urls = ['http://www.imdb.com/chart/top']

    def parse(self, response):
        for link in response.css(".titleColumn a[href^='/title/']::attr(href)").extract():
            yield scrapy.Request(response.urljoin(link), callback=self.get_info)

    def get_info(self, response):
        item = {}
        title = response.css(".title_wrapper h1::text").extract_first()
        item['title'] = ' '.join(title.split()) if title else None
        item['directors'] = response.css(".credit_summary_item h4:contains('Director') ~ a::text").extract()
        item['writers'] = response.css(".credit_summary_item h4:contains('Writer') ~ a::text").extract()
        item['stars'] = response.css(".credit_summary_item h4:contains('Stars') ~ a::text").extract()
        popularity = response.css(".titleReviewBarSubItem:contains('Popularity') .subText::text").extract_first()
        item['popularity'] = ' '.join(popularity.split()).strip("(") if popularity else None
        item['rating'] = response.css(".ratingValue span::text").extract_first()
        yield item
I have tested the xpaths you gave; I don't know whether they are mistyped or actually wrong, e.g.:
xpath = '//*[@id="main"]/div/span/div/div/div[2]/table/tbody/tr['+str(i)+']/td[3]/strong/text()'
# There is no table when you reach div[2]

//div[@class="title_wrapper"])/h1/text()  # there is an error after `]`: the `)` is bad syntax
Plus your xpaths are not yielding any results.
As to why you are getting the output that says 0 pages crawled: without recreating your case, I have to assume that your method of page iteration is not building the page URLs correctly.
I'm having trouble understanding the point of creating the variable array of all the "follow links" and then using len to send them to parse_indetail(), but there are a couple of things to note.
When you're using "meta" to pass items from one function to the next, though you have the right idea, you are missing some of the instantiation in the function you're passing it to (you should also use a standard naming convention for simplicity).
Should be something like this...
def parse(self, response):
    # If you are going to capture an item at the first request, you must
    # instantiate your items class
    item = MovieItem()
    ....
    # You seem to want to pass the rating to the next function for itemization,
    # so make sure that you have it listed in your items.py file and set it here
    item['rating'] = response.xpath(PATH).extract()  # Why did you add the url_next? huh?
    ....
    # The standard convention for passing meta with a callback is like this; this
    # way the whole itemized item gets passed along
    yield scrapy.Request(abs_url, callback=self.parse_indetail, meta={'item': item})

def parse_indetail(self, response):
    # Then you must read the meta again in the function you're passing it to
    item = response.meta['item']
    # Then you can continue your scraping
You should not overcomplicate the page iteration logic. You seem to get how it works but need help fine-tuning this aspect. I have recreated your use case and optimized it.
# items.py file
import scrapy

class TestimbdItem(scrapy.Item):
    title = scrapy.Field()
    directors = scrapy.Field()
    writers = scrapy.Field()
    stars = scrapy.Field()
    popularity = scrapy.Field()
    rating = scrapy.Field()

# The spider file
import scrapy
from testimbd.items import TestimbdItem

class ImbdsdpyderSpider(scrapy.Spider):
    name = 'imbdsdpyder'
    allowed_domains = ['imdb.com']
    start_urls = ['http://www.imdb.com/chart/top']

    def parse(self, response):
        for href in response.css("td.titleColumn a::attr(href)").extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_movie)

    def parse_movie(self, response):
        item = TestimbdItem()
        item['title'] = [x.replace('\xa0', '') for x in response.css(".title_wrapper h1::text").extract()][0]
        item['directors'] = response.xpath('//div[@class="credit_summary_item"]/h4[contains(., "Director")]/following-sibling::a/text()').extract()
        item['writers'] = response.xpath('//div[@class="credit_summary_item"]/h4[contains(., "Writers")]/following-sibling::a/text()').extract()
        item['stars'] = response.xpath('//div[@class="credit_summary_item"]/h4[contains(., "Stars")]/following-sibling::a/text()').extract()
        item['popularity'] = response.css(".titleReviewBarSubItem span.subText::text")[2].re('([0-9]+)')
        item['rating'] = response.css(".ratingValue span::text").extract_first()
        yield item
Notice two things:
First, the parse() function. All I'm doing here is looping over the links with a for loop, referring to each instance in the loop as href, and passing the urljoined href to the parser function. Given your use case, this is more than enough. In a situation where there is a next page, it's just a matter of creating a variable for the "next" page somehow and calling back to parse; it will keep doing that until it can't find a "next" page (see the sketch below).
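For reference, a minimal sketch of that next-page pattern, assuming a hypothetical selector for the "next" link (the IMDb top chart itself has no next page, so this is purely illustrative):

def parse(self, response):
    for href in response.css("td.titleColumn a::attr(href)").extract():
        yield scrapy.Request(response.urljoin(href), callback=self.parse_movie)

    # hypothetical "next page" link selector; adjust to whatever the target site uses
    next_page = response.css("a.next-page::attr(href)").extract_first()
    if next_page:
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)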
Secondly, use XPath only when the HTML items have the same tag with different content. This is more of a personal opinion, but I tell people that XPath selectors are like a scalpel and CSS selectors are like a butcher's knife. You can get damn accurate with a scalpel, but it takes more time, and in many cases it may just be easier to go with a CSS selector to get the same result.
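As a rough illustration with the selectors used above, both of the following match the same director links; the CSS form is shorter, while the XPath form gives finer control over text matching and axes:

# CSS: shorter; :contains() is a non-standard pseudo-class that Scrapy's selectors accept
response.css(".credit_summary_item h4:contains('Director') ~ a::text").extract()

# XPath: more verbose, but allows exact text matching, sibling axes, positions, etc.
response.xpath('//div[@class="credit_summary_item"]'
               '/h4[contains(., "Director")]/following-sibling::a/text()').extract()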
I'm building a scraper for www.apkmirror.com using Scrapy (with the SitemapSpider spider). So far the following works:
DEBUG = True

from scrapy.spiders import SitemapSpider
from apkmirror_scraper.items import ApkmirrorScraperItem

class ApkmirrorSitemapSpider(SitemapSpider):
    name = 'apkmirror-spider'
    sitemap_urls = ['http://www.apkmirror.com/sitemap_index.xml']
    sitemap_rules = [(r'.*-android-apk-download/$', 'parse')]

    if DEBUG:
        custom_settings = {'CLOSESPIDER_PAGECOUNT': 20}

    def parse(self, response):
        item = ApkmirrorScraperItem()
        item['url'] = response.url
        item['title'] = response.xpath('//h1[@title]/text()').extract_first()
        item['developer'] = response.xpath('//h3[@title]/a/text()').extract_first()
        return item
where the ApkMirrorScraperItem is defined in items.py as follows:
class ApkmirrorScraperItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    developer = scrapy.Field()
The resulting JSON output if I run it from the project directory using the command
scrapy crawl apkmirror-spider -o data.json
is an array of JSON dictionaries with keys url, title, and developer, and the corresponding strings as values. I would like to modify this, however, so that the value of developer is itself a dictionary with a name field, so that I can populate it like this:
item['developer']['name'] = response.xpath('//h3[@title]/a/text()').extract_first()
However, if I try this I get KeyErrors, even if I initialize the developer Field (which is a dict according to https://doc.scrapy.org/en/latest/topics/items.html#item-fields) as developer = scrapy.Field(name=None). How can I go about this?
Scrapy implements fields internally as dicts, but this does not mean they should be accessed as dicts. When you call item['developer'], what you are really doing is getting the value of the field, not the field itself. So, if the value has not been set yet, this will throw a KeyError.
Considering this, there are two ways you could go about your problem.
First one, just set the developer field value to a dict:
def parse(self, response):
    item = ApkmirrorScraperItem()
    item['url'] = response.url
    item['title'] = response.xpath('//h1[@title]/text()').extract_first()
    item['developer'] = {'name': response.xpath('//h3[@title]/a/text()').extract_first()}
    return item
Second one, create a new Developer class and set the developer value to be an instance of this class:
# this can go in items.py
class Developer(scrapy.Item):
    name = scrapy.Field()

def parse(self, response):
    item = ApkmirrorScraperItem()
    item['url'] = response.url
    item['title'] = response.xpath('//h1[@title]/text()').extract_first()
    dev = Developer()
    dev['name'] = response.xpath('//h3[@title]/a/text()').extract_first()
    item['developer'] = dev
    return item
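With either approach, each exported record should then contain a nested object, roughly like this (the values are made up for illustration):

{
    'url': 'http://www.apkmirror.com/apk/example-dev/example-app-android-apk-download/',
    'title': 'Example App 1.2.3',
    'developer': {'name': 'Example Developer'}
}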
Hope it helps :)
My Scrapy crawler is working fine; currently it is crawling some tables, but on some websites not all of the information I would like to insert into my MySQL table is available.
So I thought about adding those values myself, because on those websites the information for those fields is always the same, but I am not sure how to populate them in the spider.
Sure, I could determine the length of one of the lists in the pipeline and then use a while loop to add, for example, USA to the item['country'] list, but I want to do the same in the spider.
I would appreciate some help, thank you.
Current spider code for populating lists:
def parse(self, response):
    for sel in response.xpath('//div[@class="pagecontainer"]'):
        item = EbayItem()
        item['id'] = sel.xpath('div[2]/text()[2]').extract()
        item['user'] = sel.xpath('tr/td[2]/text()[1]').extract()
        item['string'] = sel.xpath('tr/td[2]/a/text()').extract()
        item['state'] = sel.xpath('tr/td[3]/b[3]/text()').extract()
        item['country'] = sel.xpath('tr/td[3]/b[1]/text()').extract()
        item['weight'] = sel.xpath('tr/td[3]/b[2]/text()').extract()
        item['position'] = sel.xpath('tr/td[4]/text()').re(r'[0-9,\-]+')
        item['old'] = sel.xpath('tr/td[5]/text()').extract()
        item['datetime'] = sel.xpath('tr/td[6]/text()').re('[0-9]{2}.[0-9]{2}.[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}')
        yield item
Greetings
P.Halmsich
You want to add things to MySQL. This means that your fields shouldn't be arrays (e.g. ['my-value']) but scalars (e.g. 'my-value'). The easiest way to do this is to use extract_first() instead of extract().
extract_first() allows you to set default values like this: .extract_first(default='my-default-value') or just .extract_first('my-default-value')
Cheers
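Applied to the country field from the spider above, that would look roughly like this ('USA' being the example default from the question):

# scalar value with a fallback default instead of a one-element list
item['country'] = sel.xpath('tr/td[3]/b[1]/text()').extract_first('USA')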
You can always check the scraped item for empty results using an if-else statement. Try the code below:
def parse(self, response):
    for sel in response.xpath('//div[@class="pagecontainer"]'):
        item = EbayItem()
        item['id'] = sel.xpath('div[2]/text()[2]').extract()
        item['user'] = sel.xpath('tr/td[2]/text()[1]').extract()
        item['string'] = sel.xpath('tr/td[2]/a/text()').extract()
        item['state'] = sel.xpath('tr/td[3]/b[3]/text()').extract()
        item['country'] = sel.xpath('tr/td[3]/b[1]/text()').extract()
        if item['country'] == []:
            item['country'] = 'USA'
        item['weight'] = sel.xpath('tr/td[3]/b[2]/text()').extract()
        item['position'] = sel.xpath('tr/td[4]/text()').re(r'[0-9,\-]+')
        item['old'] = sel.xpath('tr/td[5]/text()').extract()
        item['datetime'] = sel.xpath('tr/td[6]/text()').re('[0-9]{2}.[0-9]{2}.[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}')
        yield item
I have Scrapy code that works in the shell, but when I try to export to CSV, it returns an empty file. It exports data when I do not follow a link to parse the description, but once I add the extra method for parsing the contents, it fails to work. Here is the code:
class MonsterSpider(CrawlSpider):
    name = "monster"
    allowed_domains = ["jobs.monster.com"]
    base_url = "http://jobs.monster.com/v-technology.aspx?"
    start_urls = [
        "http://jobs.monster.com/v-technology.aspx"
    ]
    for i in range(1, 5):
        start_urls.append(base_url + "page=" + str(i))

    rules = (Rule(SgmlLinkExtractor(allow=("jobs.monster.com",)), callback='parse_items'),)

    def parse_items(self, response):
        sel = Selector(response)
        sites = sel.xpath('//div[@class="col-xs-12"]')
        #items = []
        for site in sites.xpath('.//article[@class="js_result_row"]'):
            item = MonsterItem()
            item['title'] = site.xpath('.//span[@itemprop="title"]/text()').extract()
            item['company'] = site.xpath('.//span[@itemprop="name"]/text()').extract()
            item['city'] = site.xpath('.//span[@itemprop="addressLocality"]/text()').extract()
            item['state'] = site.xpath('.//span[@itemprop="addressRegion"]/text()').extract()
            item['link'] = site.xpath('.//a[@data-m_impr_a_placement_id="jsr"]/@href').extract()
            follow = ''.join(item["link"])
            request = Request(follow, callback=self.parse_dir_contents)
            request.meta["item"] = item
            yield request
            #items.append(item)
        #return items

    def parse_dir_contents(self, response):
        item = response.meta["item"]
        item['desc'] = site.xpath('.//div[@itemprop="description"]/text()').extract()
        return item
Taking out parse_dir_contents and uncommenting the empty "items" list and the "append" code gives the original code.
Well, as @tayfun suggests, you should use response.xpath or define the site variable.
By the way, you do not need to use sel = Selector(response). Responses come with the xpath function; there is no need to wrap them in another selector.
However, the main problem is that you restrict the domain of the spider. You define allowed_domains = ["jobs.monster.com"], but if you look at the URLs followed by your custom Request you can see that they are something like http://jobview.monster.com/ or http://job-openings.monster.com. In this case your parse_dir_contents is not executed (the domain is not allowed) and your item does not get returned, so you won't get any results.
Change allowed_domains = ["jobs.monster.com"] to
allowed_domains = ["monster.com"]
and you will be fine; your app will work and return items.
You have an error in your parse_dir_contents method:
def parse_dir_contents(self, response):
    item = response.meta["item"]
    item['desc'] = response.xpath('.//div[@itemprop="description"]/text()').extract()
    return item
Note the use of response. I don't know where the site variable you are currently using came from.
Also, try to provide the error details when you post a question. Writing "it fails to work" doesn't say much.
So I have scrapy working really well. It's grabbing data out of a page, but the problem I'm running into is that sometimes the page's table order is different. For example, the first page it gets to:
Row name    Data
Name 1      data 1
Name 2      data 2
The next page it crawls might have a completely different order. Where Name 1 was the first row, on another page it might be the 3rd or 4th, etc. The row names are always the same. I was thinking of doing this in one of two ways, and I'm not sure which will work or which is better.
First option, use some if statements to find the row I need, and then grab the following column. This seems a little messy but could work.
Second option, grab all the data in the table regardless of order and put it in a dict. This way, I can grab the data I need based on row name. This seems like the cleanest approach.
Is there a 3rd option or a better way of doing either?
Here's my code in case it's helpful.
class pageSpider(Spider):
    name = "pageSpider"
    allowed_domains = ["domain.com"]
    start_urls = [
        "http://domain.com/stuffs/results",
    ]
    visitedURLs = Set()

    def parse(self, response):
        products = Selector(response).xpath('//*[@class="itemCell"]')
        for product in products:
            item = PageScraper()
            item['url'] = product.xpath('div[2]/div/a/@href').extract()[0]
            urls = Set([product.xpath('div[2]/div/a/@href').extract()[0]])
            print urls
            for url in urls:
                if url not in self.visitedURLs:
                    request = Request(url, callback=self.productpage)
                    request.meta['item'] = item
                    yield request

    def productpage(self, response):
        specs = Selector(response).xpath('//*[@id="Specs"]')
        item = response.meta['item']
        for spec in specs:
            item['make'] = spec.xpath('fieldset[1]/dl[1]/dd/text()').extract()[0].encode('utf-8', 'ignore')
            item['model'] = spec.xpath('fieldset[1]/dl[4]/dd/text()').extract()[0].encode('utf-8', 'ignore')
            item['price'] = spec.xpath('fieldset[2]/dl/dd/text()').extract()[0].encode('utf-8', 'ignore')
            yield item
The xpaths in productpage can contain data that doesn't correspond to what I need, because the order changed.
Edit:
I'm trying the dict approach and I think this is the best option.
def productpage(self, response):
    specs = Selector(response).xpath('//*[@id="Specs"]/fieldset')
    itemdict = {}
    for i in specs:
        test = i.xpath('dl')
        for t in test:
            itemdict[t.xpath('dt/text()').extract()[0]] = t.xpath('dd/text()').extract()[0]
    item = response.meta['item']
    item['make'] = itemdict['Brand']
    yield item
This seems like the best and cleanest approach (using a dict):
def productpage(self, response):
    specs = Selector(response).xpath('//*[@id="Specs"]/fieldset')
    itemdict = {}
    for i in specs:
        test = i.xpath('dl')
        for t in test:
            itemdict[t.xpath('dt/text()').extract()[0]] = t.xpath('dd/text()').extract()[0]
    item = response.meta['item']
    item['make'] = itemdict['Brand']
    yield item
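One small refinement, assuming some pages may be missing a row entirely: using dict.get() with a default avoids a KeyError when the row name isn't present:

item['make'] = itemdict.get('Brand', '')  # empty string if there is no 'Brand' row on this page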