I am working on a project in which I use Scrapy to scrape items from websites, but the problem is that the XPaths of the first two pages of the site are different from the XPaths of the other pages.
As a result, my spider only scrapes the items from the first two pages and then simply crawls over the other pages.
How can I make my spider scrape the items from those pages too?
I am also including my spider here so that you can look through it if needed.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from project2.items import Project2Item
from scrapy.http import Request

class ProjectSpider(BaseSpider):
    name = "project2spider"
    allowed_domains = ["http://directory.thesun.co.uk/"]
    current_page_no = 1
    start_urls = [
        'http://directory.thesun.co.uk/find/uk/computer-repair'
    ]

    def get_next_url(self, fired_url):
        if '/page/' in fired_url:
            url, page_no = fired_url.rsplit('/page/', 1)
        else:
            if self.current_page_no != 1:
                # end of scroll
                return
        self.current_page_no += 1
        return "http://directory.thesun.co.uk/find/uk/computer-repair/page/%s" % self.current_page_no
    # the parse procedure; this is the code that declares which fields to scrape
    def parse(self, response):
        fired_url = response.url
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="abTbl "]')
        for site in sites:
            item = Project2Item()
            item['Catogory'] = site.select('span[@class="icListBusType"]/text()').extract()
            item['Bussiness_name'] = site.select('a/@title').extract()
            item['Description'] = site.select('span[last()]/text()').extract()
            item['Number'] = site.select('span[@class="searchInfoLabel"]/span/@id').extract()
            item['Web_url'] = site.select('span[@class="searchInfoLabel"]/a/@href').extract()
            item['adress_name'] = site.select('span[@class="searchInfoLabel"]/span/text()').extract()
            item['Photo_name'] = site.select('img/@alt').extract()
            item['Photo_path'] = site.select('img/@src').extract()
            #items.append(item)
            yield item
        next_url = self.get_next_url(fired_url)
        if next_url:
            yield Request(next_url, self.parse, dont_filter=True)
For the other pages I need to use this: sites = hxs.select('//div[@class="icListItem"]')
How can I include this in my spider so that it can scrape items from the other pages too?
At present it just scrapes the first two pages and simply crawls over the others.
What have you tried so far?
One solution would be to use an index-like parameter passed as meta data when requesting the next page. Something like:
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    use_second_xpath = False
    try:
        if response.meta['index'] > 1:
            use_second_xpath = True
        index = response.meta['index']
    except KeyError:
        index = 0

    sites = (hxs.select('//div[@class="icListItem"]') if use_second_xpath
             else hxs.select('//div[@class="abTbl "]'))
    ...

    request = Request(next_url, self.parse, dont_filter=True)
    request.meta['index'] = index + 1
    yield request
That code sure as hell can be improved but you get the idea.
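For completeness, here is a rough sketch of how that idea could slot into the original spider. This is only a sketch: it reuses the OP's Project2Item, get_next_url() and field XPaths, and assumes (as described in the question) that the icListItem container applies from the third page onward:

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    index = response.meta.get('index', 0)
    # the first two pages use the "abTbl " container, later pages use "icListItem"
    if index > 1:
        sites = hxs.select('//div[@class="icListItem"]')
    else:
        sites = hxs.select('//div[@class="abTbl "]')
    for site in sites:
        item = Project2Item()
        # ... fill in the item fields exactly as in the original parse() ...
        yield item
    next_url = self.get_next_url(response.url)
    if next_url:
        request = Request(next_url, self.parse, dont_filter=True)
        request.meta['index'] = index + 1
        yield request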
I'm trying to create a spider that scrapes the results of this page.
When I run it with runspider, I get zero crawled pages returned.
There are further errors that I do not understand. I'm a beginner and this project is new to me.
My code is:
import scrapy

class SusSpider(scrapy.Spider):
    name = 'susManagement'
    allowed_domains = ['in.gov.br/']
    start_urls = ['https://www.in.gov.br/consulta/-/buscar/dou?q=*&s=do1&s=doe&exactDate=personalizado&sortType=0&delta=20&publishFrom=01%2F10%2F2021&publishTo=31%2F12%2F2021&orgPrin=Minist%C3%A9rio+da+Sa%C3%BAde']

    def parse(self, response):
        gates = response.xpath("//div[@class='resultado']//h5[@class='title-marker']//a")
        for gate in gates:
            gate_number = gate.xpath(".//text()").get()
            link_gate = gate.xpath(".//@href").get()
            yield response.follow(url=link_gate, callback=self.parse_text,
                                  meta={'gate_name': gate_number})

        next_page = response.xpath("//div//ul/li[@class='page-item active']/button")
        if next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_text(self, response):
        portaria = response.request.meta['gate_name']
        num_portaria = response.xpath("*//section//div//p[@class='identifica']/text()").re('.*')
        texto = response.xpath("//section//div//p[@class='texto-dou']/text()").re('.*')
        ementa = response.xpath("//article//div//p[@class='ementa']/text()").re('.*')
        rest_texto = texto - ementa - num_portaria
        yield {
            'port_name': portaria,
            'numero_port': num_portaria,
            'classified': ementa,
            'texto_integral': rest_texto
        }
I have tried changing the order of my parse functions, and clicking on each file after the run is finished.
At the moment I want to create a CSV file with the documents from the output, separated into columns.
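A minimal way to get that CSV, assuming the items are eventually yielded correctly, is Scrapy's built-in feed export, which writes one column per yielded field; for example (the output filename here is just a placeholder):

scrapy crawl susManagement -o portarias.csv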
I have an issue with my spider. I followed a tutorial to understand Scrapy a little better and extended it to also crawl subpages. The issue is that my spider only crawls one element of the entry page, not the 25 that should be there.
I have no clue where the mistake is. Perhaps somebody here can help me:
from datetime import datetime as dt
import scrapy
from reddit.items import RedditItem

class PostSpider(scrapy.Spider):
    name = 'post'
    allowed_domains = ['reddit.com']

    def start_requests(self):
        reddit_urls = [
            ('datascience', 'week')
        ]
        for sub, period in reddit_urls:
            url = 'https://www.reddit.com/r/' + sub + '/top/?sort=top&t=' + period
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # get the subreddit from the URL
        sub = response.url.split('/')[4]
        # parse thru each of the posts
        for post in response.css('div.thing'):
            item = RedditItem()
            item['title'] = post.css('a.title::text').extract_first()
            item['commentsUrl'] = post.css('a.comments::attr(href)').extract_first()
            ### scrap comments page.
            request = scrapy.Request(url=item['commentsUrl'], callback=self.parse_comments)
            request.meta['item'] = item
            return request

    def parse_comments(self, response):
        item = response.meta['item']
        item['commentsText'] = response.css('div.comment div.md p::text').extract()
        self.logger.info('Got successful response from {}'.format(response.url))
        yield item
Thanks for your help.
BR
Thanks for your comments:
Indeed, I have to yield the request rather than return it.
Now it is working.
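For anyone hitting the same issue, the corrected loop looks roughly like this; only the last line changes compared to the code above:

for post in response.css('div.thing'):
    item = RedditItem()
    item['title'] = post.css('a.title::text').extract_first()
    item['commentsUrl'] = post.css('a.comments::attr(href)').extract_first()
    # scrape the comments page for this post
    request = scrapy.Request(url=item['commentsUrl'], callback=self.parse_comments)
    request.meta['item'] = item
    yield request  # yield instead of return, so the loop runs for all 25 posts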
I'm trying to create a web scraper for a dynamic website. For this purpose I'm using Scrapy 1.2.1 and the scrapy-splash 0.7 library.
The problem appears when using the Splash server: most of the time it returns different data to Scrapy. From the log I can see that all pages are crawled. If I use scrapy.Request instead of SplashRequest, everything is OK (I get the same data each time).
My code:
import scrapy
import re
from scrapy_splash import SplashRequest
from scraper.items import ScraperRozetka

class RozetkaSpider(scrapy.Spider):
    name = "rozetka_laptops"
    start_urls = [
        'http://rozetka.com.ua/notebooks/c80004/filter/producer=dell;page=1/',
    ]

    def parse(self, response):
        last_page = response.xpath('//ul[@name="paginator"]//li[last()]//@id').extract_first()
        last_page_num = int(last_page[-1])
        i = 1
        while i <= last_page_num:
            url = re.sub(r'page=\d+', r'page={}'.format(i), response.url)
            i += 1
            yield SplashRequest(url, self.parse_results, endpoint='render.html', args={'wait': 0.5, 'timeout': 60})

    def parse_results(self, response):
        items = []
        records = response.css('div.g-i-tile-catalog')
        for record in records:
            item = ScraperRozetka()
            item['title'] = record.css('img::attr(title)').extract_first()
            item['price'] = record.css('div.g-price-uah::text').extract_first()
            item['link'] = record.css('div.g-i-tile-i-title a::attr(href)').extract_first()
            items.append(item)
        return items
I would be grateful if someone could help me.
Thanks.
In the parse() method the spider crawls 4 URLs and then sends them to parse_dir_contents() to scrape some data, but only the 4th URL is being scraped. I don't understand why it is not scraping the other 3 URLs.
import scrapy
from v_one.items import VOneItem
import json

class linkedin(scrapy.Spider):
    name = "linkedin"
    allowed_domains = ["linkedin.com"]
    start_urls = [
        "https://in.linkedin.com/directory/people-s-1-2-4/",
    ]

    def parse(self, response):
        for href in response.xpath('//*[@id="seo-dir"]/div/div/div/ul/li/a/@href'):
            url = response.urljoin(href.extract())
            print "________________" + url
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        for sel in response.xpath('//*[@id="profile"]'):
            url = response.url
            print "____________" + url
            item = VOneItem()
            item['name'] = sel.xpath('//*[@id="name"]/text()').extract()
            item['headline'] = sel.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
            item['current'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
            item['education'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
            item['link'] = url
            yield item
By inspecting the pages, I think there is no need for the for loop in the parse_dir_contents function. Make the function like this:
def parse_dir_contents(self, response):
    item = VOneItem()
    item['name'] = response.xpath('//*[@id="name"]/text()').extract()
    item['headline'] = response.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
    item['current'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
    item['education'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
    item['link'] = response.url
    return item
And check if this solves your issue.
I am trying to scrape TripAdvisor reviews, but I cannot find the XPath to have the spider dynamically go through all the pages. I tried yield and callback, but the thing is I cannot find the XPath for the link that goes to the next page. I am talking about this site.
Here is my code (UPDATED):
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapingtest.items import ScrapingTestingItem

class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"
    start_urls = [
        "http://www.tripadvisor.in/Hotel_Review-g297679-d300955-Reviews-Ooty_Fern_Hill_A_Sterling_Holidays_Resort-Ooty_Tamil_Nadu.html"]
    output_json_dict = {}

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        items = []
        i = 0
        for sites in sites:
            item = ScrapingTestingItem()
            #item['reviews'] = sel.xpath('//p[@class="partial_entry"]/text()').extract()
            item['subjects'] = sel.xpath('//span[@class="noQuotes"]/text()').extract()
            item['stars'] = sel.xpath('//*[@class="rate sprite-rating_s rating_s"]/img/@alt').extract()
            item['names'] = sel.xpath('//*[@class="username mo"]/span/text()').extract()
            items.append(item)
            i += 1

        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        if(sites and len(sites) > 0):
            yield Request(url="tripadvisor.in" + sites[i], callback=self.parse)
        else:
            yield items
If you want to select the URL behind Next, why don't you try something like this:
next_url = response.xpath('//a[contains(text(), "Next")]/@href').extract()
And then yield a Request with this URL? With this you always get the next page to scrape and do not need the line containing the page numbers.
Recently I did something similar on TripAdvisor and this approach worked for me. If this doesn't work for you, update your code with the approach you are trying, so we can see where it can be improved.
Update
And change your Request creation block to the following:
if(sites and len(sites) > 0):
    for site in sites:
        yield Request(url="http://tripadvisor.in" + site, callback=self.parse)
Remove the else part and yield the items at the end of the loop, once the method has finished all the parsing.
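Putting those suggestions together, the parse() method could end up looking roughly like this (just a sketch; it keeps the OP's item fields and assumes the Next href is relative, hence the prepended domain):

def parse(self, response):
    sel = Selector(response)

    # scrape the reviews on the current page
    item = ScrapingTestingItem()
    item['subjects'] = sel.xpath('//span[@class="noQuotes"]/text()').extract()
    item['stars'] = sel.xpath('//*[@class="rate sprite-rating_s rating_s"]/img/@alt').extract()
    item['names'] = sel.xpath('//*[@class="username mo"]/span/text()').extract()
    yield item

    # follow every "Next" link with the same callback
    sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
    for site in sites:
        yield Request(url="http://tripadvisor.in" + site, callback=self.parse)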
I think it can only work if you make a list of the URLs you want to scrape in a .txt file.
class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"

    f = open("urls.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()