Spider not following links - scrapy - python

I am trying to build a spider which follows links through 3 pages before getting to the page it scrapes. I have tested the responses in the shell; however, put together it doesn't seem to work and I am not sure why.
My code below:
# -*- coding: utf-8 -*-
import scrapy

class CollegiateSpider(scrapy.Spider):
    name = 'Collegiate'
    allowed_domains = ['collegiate-ac.com/uk-student-accommodation']
    start_urls = ['http://collegiate-ac.com/uk-student-accommodation/']

    # Step 1 - Get the area links
    def parse(self, response):
        for city in response.xpath('//*[@id="top"]/div[1]/div/div[1]/div/ul/li/a/text').extract():
            yield scrapy.Request(response.urljoin("/" + city), callback=self.parse_area_page)

    # Step 2 - Get the block links
    def parse_area_page(self, response):
        for url in response.xpath('//div[3]/div/div/div/a/@href').extract():
            yield scrapy.Request(response.urljoin(url), callback=self.parse_unitpage)

    # Step 3 - Get the room links
    def parse_unitpage(self, response):
        for url in response.xpath('//*[@id="subnav"]/div/div[2]/ul/li[5]/a/@href').extract():
            yield scrapy.Request(response.urljoin(final), callback=self.parse_final)

    # Step 4 - Scrape the data
    def parse_final(self, response):
        pass
I have tried changing to CrawlSpider as per this answer, but that didn't seem to help.
I am currently looking into how to debug spiders; however, I am struggling with that, so I thought it would be beneficial to get opinions here as well.

You forgot the () in text() in '//*[@id="top"]/div[1]/div/div[1]/div/ul/li/a/text()'.
But instead of text() I use @href to get the URL.
Joining urljoin('/' + city) creates a wrong URL because the leading / skips /uk-student-accommodation - you have to use urljoin(city).
There was also a problem with allowed_domains - it blocked most of the URLs.
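To illustrate the urljoin point: response.urljoin() delegates to urllib.parse.urljoin, and a quick check in a Python shell (the 'brighton' slug here is just a placeholder) shows how the leading slash resets the path:
from urllib.parse import urljoin

base = 'http://collegiate-ac.com/uk-student-accommodation/'
print(urljoin(base, '/brighton'))   # http://collegiate-ac.com/brighton  (the section is lost)
print(urljoin(base, 'brighton'))    # http://collegiate-ac.com/uk-student-accommodation/brighton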
Working example. You can run it without a project and it saves the final URLs in output.csv.
import scrapy

class CollegiateSpider(scrapy.Spider):
    name = 'Collegiate'
    allowed_domains = ['collegiate-ac.com']
    start_urls = ['https://collegiate-ac.com/uk-student-accommodation/']

    # Step 1 - Get the area links
    def parse(self, response):
        for url in response.xpath('//*[@id="top"]/div[1]/div/div[1]/div/ul/li/a/@href').extract():
            url = response.urljoin(url)
            #print('>>>', url)
            yield scrapy.Request(url, callback=self.parse_area_page)

    # Step 2 - Get the block links
    def parse_area_page(self, response):
        for url in response.xpath('//div[3]/div/div/div/a/@href').extract():
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_unitpage)

    # Step 3 - Get the room links
    def parse_unitpage(self, response):
        for url in response.xpath('//*[@id="subnav"]/div/div[2]/ul/li[5]/a/@href').extract():
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_final)

    # Step 4 - Scrape the data
    def parse_final(self, response):
        # show some information for test
        print('>>> parse_final:', response.url)
        # send url as item so it can be saved in the file
        yield {'final_url': response.url}

# --- run it without a project ---
import scrapy.crawler

c = scrapy.crawler.CrawlerProcess({
    "FEED_FORMAT": 'csv',
    "FEED_URI": 'output.csv',
})
c.crawl(CollegiateSpider)
c.start()
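Note (not part of the original answer): on Scrapy 2.1+ the FEED_FORMAT/FEED_URI settings are deprecated in favour of the FEEDS setting, so an equivalent process configuration would be:

c = scrapy.crawler.CrawlerProcess({
    "FEEDS": {
        "output.csv": {"format": "csv"},   # same output file and format as above
    },
})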

Related

Scrapy: Get specific part of a URL before redirection

Here's the code I'll be working with (I'm using scrapy)
def start_requests(self):
    start_urls = ['https://www.lowes.com/search?searchTerm=8654RM-42']
This is where I'm storing all my URLs.
Here is how I'm trying to print only everything after the '=':
productSKU = response.url.split("=")[-1]
item["productSKU"] = productSKU
Here is the output:
{'productPrice': '1,449.95',
'productSKU': 'https://www.lowes.com/pd/ZLINE-KITCHEN-BATH-Ducted-Red-Matte-Wall-Mounted-Range-Hood-Common-42-Inch-Actual-42-in/1001440644'}
So now here's the problem:
The URLs I'm inputting will eventually be populated with
https://www.lowes.com/search?searchTerm = {something}
and that's why I would like to use {something} to ensure I'll have every item that I attempted to scrape on the CSV (for sorting and matching purposes).
The URL I'm using redirects me to this URL:
(Input)https://www.lowes.com/search?searchTerm=8654RM-42
->
(Redirect) https://www.lowes.com/pd/ZLINE-KITCHEN-BATH-Ducted-Red-Matte-Wall-Mounted-Range-Hood-Common-42-Inch-Actual-42-in/1001440644
And so, my output for productSKU is the entire redirect URL instead of just whatever is after the '=' sign. The output I would like would be 8654RM-42.
And here is my whole program
# -*- coding: utf-8 -*-
import scrapy
from ..items import LowesspiderItem
from scrapy.http import Request

class LowesSpider(scrapy.Spider):
    name = 'lowes'

    def start_requests(self):
        start_urls = ['https://www.lowes.com/search?searchTerm=8654RM-42']
        for url in start_urls:
            yield Request(url, cookies={'sn': '2333'})  # Added cookie to bypass location req

    def parse(self, response):
        items = response.css('.grid-container')
        for product in items:
            item = LowesspiderItem()
            # get product price
            productPrice = product.css('.art-pd-price::text').get()
            productSKU = response.url.split("=")[-1]
            item["productSKU"] = productSKU
            item["productPrice"] = productPrice
            yield item
You need to use meta to pass the input URL along with the request, like this:
def start_requests(self):
    start_urls = ['https://www.lowes.com/search?searchTerm=8654RM-42']
    for url in start_urls:
        yield Request(url, cookies={'sn': '2333'}, meta={'url': url})

def parse(self, response):
    url = response.meta['url']  # your input url
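Putting it together, a sketch of the full change might look like this (keeping the rest of your parse logic and the 'sn' cookie from your own code; untested):

def start_requests(self):
    start_urls = ['https://www.lowes.com/search?searchTerm=8654RM-42']
    for url in start_urls:
        # carry the original (pre-redirect) URL along with the request
        yield Request(url, cookies={'sn': '2333'}, meta={'url': url})

def parse(self, response):
    original_url = response.meta['url']        # the input URL, not the redirected one
    productSKU = original_url.split("=")[-1]   # -> '8654RM-42'
    for product in response.css('.grid-container'):
        item = LowesspiderItem()
        item["productSKU"] = productSKU
        item["productPrice"] = product.css('.art-pd-price::text').get()
        yield item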

Scrapy Crawl Page and Subpage but crawls only one item

I have an issue with my spider. I tried to follow a tutorial to understand Scrapy a little better and extended the tutorial to also crawl subpages. The issue is that my spider only crawls one element of the entry page instead of the 25 that are on the page.
I have no clue where the failure is. Perhaps somebody can help me here:
from datetime import datetime as dt
import scrapy
from reddit.items import RedditItem

class PostSpider(scrapy.Spider):
    name = 'post'
    allowed_domains = ['reddit.com']

    def start_requests(self):
        reddit_urls = [
            ('datascience', 'week')
        ]
        for sub, period in reddit_urls:
            url = 'https://www.reddit.com/r/' + sub + '/top/?sort=top&t=' + period
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # get the subreddit from the URL
        sub = response.url.split('/')[4]

        # parse thru each of the posts
        for post in response.css('div.thing'):
            item = RedditItem()
            item['title'] = post.css('a.title::text').extract_first()
            item['commentsUrl'] = post.css('a.comments::attr(href)').extract_first()

            ### scrap comments page.
            request = scrapy.Request(url=item['commentsUrl'], callback=self.parse_comments)
            request.meta['item'] = item
            return request

    def parse_comments(self, response):
        item = response.meta['item']
        item['commentsText'] = response.css('div.comment div.md p::text').extract()
        self.logger.info('Got successful response from {}'.format(response.url))
        yield item
Thanks for your help.
BR
Thanks for your comments:
Indeed I have to yield the request rather than return it.
Now it is working.
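For reference, the fix is just in the loop body - yield the request instead of returning it, so every post on the page produces its own request:

for post in response.css('div.thing'):
    item = RedditItem()
    item['title'] = post.css('a.title::text').extract_first()
    item['commentsUrl'] = post.css('a.comments::attr(href)').extract_first()

    request = scrapy.Request(url=item['commentsUrl'], callback=self.parse_comments)
    request.meta['item'] = item
    yield request  # yield, not return, so the loop continues for all 25 posts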

How to do multiple page scraping using Scrapy?

#----\
#-----*-----\
#----/       \
              \
#----\         \
#-----*-------- * <-- START
#----/         /
              /
#----\       /
#-----*-----/
#----/
Here is the structure of a website I want to scrape with scrapy, where * is a page and --- indicates a link. I want to scrape the data of the # pages.
I have already written a scraper which can scrape data from a single # page.
import scrapy

class MyItem(scrapy.Item):
    topic = scrapy.Field()
    symptoms = scrapy.Field()

class QuotesSpider(scrapy.Spider):
    name = "medical"
    allowed_domains = ['medlineplus.gov']
    start_urls = ['https://medlineplus.gov/ency/article/000178.htm']

    def parse(self, response):
        item = MyItem()
        item["topic"] = response.css('h1.with-also::text').extract_first()
        item["symptoms"] = response.css("article div#section-2 li::text").extract()
        yield item
The starting webpage is https://medlineplus.gov/encyclopedia.html
I want to scrape info about all diseases in the encyclopedia.
You would need to start with the "encyclopedia.html" page, follow the "alpha" links (the A-Z articles links), then, for every followed page, follow the links to the articles.
You can do this with a CrawlSpider and the Link Extractors, but, since the crawling depth is small, we can do this with a regular Spider:
from urlparse import urljoin  # Python 2 only

import scrapy
from scrapy.http import Request

class MyItem(scrapy.Item):
    topic = scrapy.Field()
    symptoms = scrapy.Field()

class MedicalSpider(scrapy.Spider):
    name = "medical"
    allowed_domains = ['medlineplus.gov']
    start_urls = ['https://medlineplus.gov/encyclopedia.html']

    def parse(self, response):
        for link in response.css("ul.alpha-links li a::attr(href)").extract():
            yield Request(urljoin(response.url, link), callback=self.parse_alpha_page)

    def parse_alpha_page(self, response):
        for link in response.css("ul#index li a::attr(href)").extract():
            yield Request(urljoin(response.url, link), callback=self.parse_page)

    def parse_page(self, response):
        item = MyItem()
        item["topic"] = response.css('h1.with-also::text').extract_first()
        item["symptoms"] = response.css("article div#section-2 li::text").extract()
        yield item
Note that it looks like there is a better way to get the desired data from the MedlinePlus (check out the "For Developers" page).
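Side note (not part of the original answer): on Python 3 the same import lives in urllib.parse, or you can drop the import entirely and let the response join the URLs for you:

# Python 3 equivalent of the urljoin import
from urllib.parse import urljoin

# or, without any import, inside the callback:
for link in response.css("ul.alpha-links li a::attr(href)").extract():
    yield Request(response.urljoin(link), callback=self.parse_alpha_page)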

How to scrape TripAdvisor dynamically using scrapy and python

I am trying to scrape TripAdvisor's reviews, but I cannot find the XPath to have it dynamically go through all the pages. I tried yield and callback, but the thing is I cannot find the XPath for the link that goes to the next page. I am talking about this site.
Here is my code (UPDATED):
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapingtest.items import ScrapingTestingItem

class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"
    start_urls = [
        "http://www.tripadvisor.in/Hotel_Review-g297679-d300955-Reviews-Ooty_Fern_Hill_A_Sterling_Holidays_Resort-Ooty_Tamil_Nadu.html"]

    output_json_dict = {}

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        items = []
        i = 0
        for sites in sites:
            item = ScrapingTestingItem()
            #item['reviews'] = sel.xpath('//p[@class="partial_entry"]/text()').extract()
            item['subjects'] = sel.xpath('//span[@class="noQuotes"]/text()').extract()
            item['stars'] = sel.xpath('//*[@class="rate sprite-rating_s rating_s"]/img/@alt').extract()
            item['names'] = sel.xpath('//*[@class="username mo"]/span/text()').extract()
            items.append(item)
            i += 1

        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()

        if(sites and len(sites) > 0):
            yield Request(url="tripadvisor.in" + sites[i], callback=self.parse)
        else:
            yield items
If you want to select the URL behind Next, why don't you try something like this:
next_url = response.xpath('//a[contains(text(), "Next")]/@href').extract()
And then yield a Request with this URL? With this you always get the next page to scrape and do not need the line containing the numbers.
Recently I did something similar on TripAdvisor and this approach worked for me. If this doesn't work for you, update your question with the approach you are trying so we can see where it can be improved.
Update
And change your Request creation block to the following:
if(sites and len(sites) > 0):
    for site in sites:
        yield Request(url="http://tripadvisor.in" + site, callback=self.parse)
Remove the else part and yield the items at the end of the loop, once the method has finished parsing the page.
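Putting the two suggestions together, the end of parse could look roughly like this (a sketch, keeping the item-building code from the question unchanged):

def parse(self, response):
    sel = Selector(response)

    items = []
    # ... build the ScrapingTestingItem objects exactly as in the question ...

    # yield the items one by one instead of yielding the list in an else branch
    for item in items:
        yield item

    # then follow every "Next" link so the spider keeps paginating
    for site in sel.xpath('//a[contains(text(), "Next")]/@href').extract():
        yield Request(url="http://tripadvisor.in" + site, callback=self.parse)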
I think it can only work if you make a list of the URLs you want to scrape in a .txt file.
class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"

    f = open("urls.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()

Scrapy Spider just crawls and does not scrape

I am making a project in which I use Scrapy to scrape items from web sites, but the problem is that the XPaths of the first two pages of the site are different from the XPaths of the other pages.
As a result my spider only scrapes the items from the first two pages and simply crawls over the other pages.
How can I make my spider scrape the items of those pages too?
I am also including my spider here so that you can look through it if needed.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from project2.items import Project2Item
from scrapy.http import Request

class ProjectSpider(BaseSpider):
    name = "project2spider"
    allowed_domains = ["http://directory.thesun.co.uk/"]
    current_page_no = 1
    start_urls = [
        'http://directory.thesun.co.uk/find/uk/computer-repair'
    ]

    def get_next_url(self, fired_url):
        if '/page/' in fired_url:
            url, page_no = fired_url.rsplit('/page/', 1)
        else:
            if self.current_page_no != 1:
                #end of scroll
                return
        self.current_page_no += 1
        return "http://directory.thesun.co.uk/find/uk/computer-repair/page/%s" % self.current_page_no

    # the parse procedure, and here is the code which declares which fields to scrape.
    def parse(self, response):
        fired_url = response.url
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="abTbl "]')
        for site in sites:
            item = Project2Item()
            item['Catogory'] = site.select('span[@class="icListBusType"]/text()').extract()
            item['Bussiness_name'] = site.select('a/@title').extract()
            item['Description'] = site.select('span[last()]/text()').extract()
            item['Number'] = site.select('span[@class="searchInfoLabel"]/span/@id').extract()
            item['Web_url'] = site.select('span[@class="searchInfoLabel"]/a/@href').extract()
            item['adress_name'] = site.select('span[@class="searchInfoLabel"]/span/text()').extract()
            item['Photo_name'] = site.select('img/@alt').extract()
            item['Photo_path'] = site.select('img/@src').extract()
            #items.append(item)
            yield item

        next_url = self.get_next_url(fired_url)
        if next_url:
            yield Request(next_url, self.parse, dont_filter=True)
For the other pages I need to use this: sites = hxs.select('//div[@class="icListItem"]')
How can I include this in my spider so that it can scrape items from the other pages too?
At present it just scrapes the first two pages and simply crawls over the other pages.
What did you try so far?
One solution would be using an index-like parameter passed as meta data when requesting the next page. Something like:
def parse(self, response):
    hxs = HtmlXPathSelector(response)

    use_second_xpath = False
    try:
        if response.meta['index'] > 1:
            use_second_xpath = True
        index = response.meta['index']
    except KeyError:
        index = 0

    sites = (hxs.select('//div[@class="icListItem"]') if use_second_xpath
             else hxs.select('//div[@class="abTbl "]'))
    ...
    request = Request(next_url, self.parse, dont_filter=True)
    request.meta['index'] = index + 1

    yield request
That code sure as hell can be improved but you get the idea.
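A slightly condensed sketch of the same idea (using response.meta.get so no try/except is needed; the item-building lines from the question are elided):

def parse(self, response):
    hxs = HtmlXPathSelector(response)

    # index defaults to 0 on the very first page
    index = response.meta.get('index', 0)
    xpath = ('//div[@class="icListItem"]' if index > 1
             else '//div[@class="abTbl "]')
    for site in hxs.select(xpath):
        # ... build and yield the Project2Item exactly as in the question ...
        pass

    next_url = self.get_next_url(response.url)
    if next_url:
        request = Request(next_url, self.parse, dont_filter=True)
        request.meta['index'] = index + 1
        yield request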
