I am creating a web scraper.
There are 58 pages and each page has 12 products.
The data returned should be 58 x 12 = 696 product titles, but it returns data for only 404 products. Here is my code:
import scrapy
from fundrazr.items import FundrazrItem
from datetime import datetime
import re

class Fundrazr(scrapy.Spider):
    name = "my_scraper"

    # First Start Url
    start_urls = ["https://perfumehut.com.pk/shop/"]
    npages = 57

    # This mimics getting the pages using the next button.
    for i in range(2, npages + 1):
        start_urls.append("https://perfumehut.com.pk/shop/page/" + str(i) + "")

    def parse(self, response):
        for href in response.xpath("//h3[contains(@class, 'product-title')]/a/@href"):
            # add the scheme, eg http://
            url = "" + href.extract()
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        item = FundrazrItem()
        # Getting Campaign Title
        item['campaignTitle'] = response.xpath("//h1[contains(@class, 'entry-title')]/text()").extract()
        yield item
It's a WooCommerce website. The first page is
https://perfumehut.com.pk/shop/
and the other pages are paginated as
https://perfumehut.com.pk/shop/page/2/
https://perfumehut.com.pk/shop/page/3/
and so on, up to page 58.
I want to know what I did wrong with npages.
Regards
import scrapy
from fundrazr.items import FundrazrItem
from datetime import datetime
import re

class Fundrazr(scrapy.Spider):
    name = "my_scraper"

    # First Start Url
    start_urls = ["https://perfumehut.com.pk/shop/"]

    def parse(self, response):
        for item in response.xpath("//div[contains(@class, 'products elements-grid ')]/div[contains(@class, 'product-grid-item product ')]/h3/a"):
            # create a fresh item per product so each yield carries its own title
            data = FundrazrItem()
            data['campaignTitle'] = item.xpath("./text()").extract_first()
            yield data

        # follow the "next" link in the WooCommerce pagination bar
        next_page = response.xpath("//ul[@class='page-numbers']/li[last()]/a/@href").extract_first()
        if next_page is not None:
            yield scrapy.Request(next_page, callback=self.parse)
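As a cross-check on the original npages approach, here is a sketch that pre-generates all 58 listing-page URLs up front and scrapes the titles straight from the product grid instead of following each product's detail page. It assumes the product-title markup from the question's XPath is correct; the class name FundrazrPages is mine, purely for illustration.

import scrapy
from fundrazr.items import FundrazrItem

class FundrazrPages(scrapy.Spider):
    name = "my_scraper_pages"
    # page 1 plus /shop/page/2/ ... /shop/page/58/
    start_urls = ["https://perfumehut.com.pk/shop/"] + [
        "https://perfumehut.com.pk/shop/page/%d/" % i for i in range(2, 59)
    ]

    def parse(self, response):
        # one item per product title on the listing page itself
        for title in response.xpath("//h3[contains(@class, 'product-title')]/a/text()").extract():
            item = FundrazrItem()
            item['campaignTitle'] = title
            yield item

With 58 pages of 12 products each, both versions should land near 696 items; if the count still falls short, comparing the two runs helps show whether listing pages are being missed or product-detail requests are being dropped (for example, filtered as duplicate URLs).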
Related
I'm trying to extract webpage data and want to take the next few pages as well, up to a limit that I can alter. However, when I tested whether I can at least extract the next few pages using Scrapy (I'm trying to figure this out in Scrapy to learn it), it only returns the items on the first page.
How do I extract the next pages while setting a limit, e.g. 5 pages?
For example, here's what I have tried:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess

class StatisticsItem(scrapy.Item):
    ebay_div = Field(output_processor=TakeFirst())
    url = Field(output_processor=TakeFirst())

class StatisticsSpider(scrapy.Spider):
    name = 'ebay'
    start_urls = ['https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?rt=nc&LH_BIN=1' +
                  '&LH_PrefLoc=2&mag=1&_sop=16']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url
            )

    def parse(self, response):
        all_cards = response.xpath('//div[@class="s-item__wrapper clearfix"]')
        for card in all_cards:
            name = card.xpath('.//h3/text()').get()  # get name of product
            price = card.xpath('.//span[@class="s-item__price"]//text()').get()  # price
            product_url = card.xpath('.//a[@class="s-item__link"]//@href').get()  # link to product
            # now do whatever you want, append to dictionary, yield as item...
            summary_data = {
                "Name": name,
                "Price": price,
                "URL": product_url
            }
            data = {'summary_data': summary_data}
            yield scrapy.Request(product_url, meta=data, callback=self.parse_product_details)

            # get the next page
            next_page_url = card.xpath('.//a[@class="pagination__next icon-link"]/@href').extract_first()
            # The last page does not have a valid url and ends with '#'
            if next_page_url == None or str(next_page_url).endswith("#"):
                self.log("eBay products collected successfully !!!")
            else:
                print('\n' + '-' * 30)
                print('Next page: {}'.format(next_page_url))
                yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_product_details(self, response):
        # Get the summary data
        data = response.meta['summary_data']
        data['location'] = response.xpath('//span[@itemprop="availableAtOrFrom"]/text()').extract_first()
        yield data

process = CrawlerProcess(
    settings={
        'FEED_URI': 'collectible_cards.json',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(StatisticsSpider)
process.start()
You can try it like this: first build all the page URLs, then let start_requests work through them.
start_urls = ["https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?LH_BIN=1&LH_PrefLoc=2&mag=1&rt=nc&_pgn={}&_sop=16".format(i) for i in range(1,5)]
I want to scrape details that are served in JSON form using Scrapy. There are multiple start_urls, and each start_url has multiple pages to scrape. I just can't work out the logic of how to do so.
import scrapy
from scrapy.http import Request

BASE_URL = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
            ]

class ChangeSpider(scrapy.Spider):
    name = 'change'

    def start_requests(self):
        for i in range(len(BASE_URL)):
            yield Request(BASE_URL[i], callback=self.parse)

    pageNumber = 11

    def parse(self, response):
        data = response.json()
        for item in range(len(data['items'])):
            yield {
                "petition_id": data['items'][item]['petition']['id'],
            }

        next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
        if data['last_page'] == False:
            ChangeSpider.pageNumber += 1
            yield response.follow(next_page, callback=self.parse)
Try like this:
import scrapy
from scrapy.http import Request

class ChangeSpider(scrapy.Spider):
    name = 'change'
    start_urls = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
                  ]

    pageNumber = 11

    def parse(self, response):
        data = response.json()
        for item in range(len(data['items'])):
            yield {
                "petition_id": data['items'][item]['petition']['id'],
            }

        next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
        if data['last_page'] == False:
            ChangeSpider.pageNumber += 1
            yield response.follow(next_page, callback=self.parse)
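The answer above still paginates only the animals-19 tag, because next_page is hard-coded. Here is a sketch of one way to paginate every start URL independently, assuming each endpoint's offset advances by the limit value (8) and that last_page stays false until a tag is exhausted; TAG_URL, TAGS and the cb_kwargs bookkeeping are mine, not part of the original code.

import scrapy

TAG_URL = "https://www.change.org/api-proxy/-/tags/{}/petitions?offset={}&limit=8&show_promoted_cards=true"
TAGS = ["animals-19", "civic", "human-rights-en-in", "child-rights-2",
        "health-9", "environment-18", "education-en-in", "women-s-rights-13"]

class ChangeSpider(scrapy.Spider):
    name = 'change'

    def start_requests(self):
        # start every tag at offset 0 and remember the tag/offset for its own callback
        for tag in TAGS:
            yield scrapy.Request(TAG_URL.format(tag, 0), callback=self.parse,
                                 cb_kwargs={"tag": tag, "offset": 0})

    def parse(self, response, tag, offset):
        data = response.json()
        for entry in data['items']:
            yield {"petition_id": entry['petition']['id'], "tag": tag}

        # keep requesting the same tag until the API reports the last page
        if not data['last_page']:
            next_offset = offset + 8  # assumed to advance by the page limit
            yield scrapy.Request(TAG_URL.format(tag, next_offset), callback=self.parse,
                                 cb_kwargs={"tag": tag, "offset": next_offset})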
I have an issue with my spider. I tried to follow a tutorial to understand Scrapy a little better and extended the tutorial to also crawl subpages. The issue is that my spider crawls only one element of the entry page, not the 25 it should.
I have no clue where the failure is. Perhaps somebody here can help me:
from datetime import datetime as dt
import scrapy
from reddit.items import RedditItem

class PostSpider(scrapy.Spider):
    name = 'post'
    allowed_domains = ['reddit.com']

    def start_requests(self):
        reddit_urls = [
            ('datascience', 'week')
        ]
        for sub, period in reddit_urls:
            url = 'https://www.reddit.com/r/' + sub + '/top/?sort=top&t=' + period
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # get the subreddit from the URL
        sub = response.url.split('/')[4]

        # parse thru each of the posts
        for post in response.css('div.thing'):
            item = RedditItem()
            item['title'] = post.css('a.title::text').extract_first()
            item['commentsUrl'] = post.css('a.comments::attr(href)').extract_first()

            ### scrap comments page.
            request = scrapy.Request(url=item['commentsUrl'], callback=self.parse_comments)
            request.meta['item'] = item
            return request

    def parse_comments(self, response):
        item = response.meta['item']
        item['commentsText'] = response.css('div.comment div.md p::text').extract()
        self.logger.info('Got successful response from {}'.format(response.url))
        yield item
Thanks for your help.
BR
Thanks for your comments:
Indeed, I have to yield the request rather than return it.
Now it is working.
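For reference, the corrected parse method from the question, with yield in place of return so every post in the loop produces a comments request (the rest of the spider is unchanged):

    def parse(self, response):
        sub = response.url.split('/')[4]
        for post in response.css('div.thing'):
            item = RedditItem()
            item['title'] = post.css('a.title::text').extract_first()
            item['commentsUrl'] = post.css('a.comments::attr(href)').extract_first()

            request = scrapy.Request(url=item['commentsUrl'], callback=self.parse_comments)
            request.meta['item'] = item
            # yield keeps the loop running; return ended parse() after the first post
            yield request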
I want to make the crawler go to the next page to extract data; any help on what to do would be appreciated. I am a little lost. I tried Scrapy, but it is kind of complicated, and bs4 is more convenient.
import bs4 as bs
import urllib.request
import pandas as pd
import re

source = urllib.request.urlopen('https://messageboards.webmd.com/').read()
soup = bs.BeautifulSoup(source, 'lxml')

df = pd.DataFrame(columns=['link'], data=[url.a.get('href') for url in soup.find_all('div', class_="link")])

lists = []
for i in range(0, 33):
    link = (df.link.iloc[i])
    source1 = urllib.request.urlopen(link).read()
    soup1 = bs.BeautifulSoup(source1, 'lxml')
    for url1 in soup1.find_all('a', class_="next"):
        next_link = soup1.find('a', href=True, text=re.compile("next"))
        if next_link:
            lists.append(link + url1.get('href'))
So it looks like you're storing hrefs in a list:

for url1 in soup1.find_all('a', class_="next"):
    next_link = soup1.find('a', href=True, text=re.compile("next"))
    if next_link:
        lists.append(link + url1.get('href'))
Now you actually have to do something with them. In this case I'm assuming you want to navigate to each href in your list.
for href in lists:
    new_page = urllib.request.urlopen(href).read()
And then you can scrape whatever data you want out of new_page.
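For example, a short sketch of that last step, assuming you want link text from each followed page; the "thread-title" class below is hypothetical, so inspect the real pages for the selector you need.

for href in lists:
    new_page = urllib.request.urlopen(href).read()
    new_soup = bs.BeautifulSoup(new_page, 'lxml')
    # pull whatever you need from the followed page; this selector is a placeholder
    for title in new_soup.find_all('a', class_="thread-title"):
        print(title.get_text(strip=True))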
I had the same problem. Here is my code example for a page I crawled as an exercise. I've chained multiple requests to get detailed information.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from capterra.items import CapterraItem

class CapterraCatSpider(CrawlSpider):
    name = 'capterra_cat'
    #allowed_domains = ['http://www.capterra.com/categories']
    start_urls = ['http://www.capterra.com/categories']
    # rules = (
    #     Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    # )

    def parse(self, response):
        #TEMP
        for category in response.css('ol.browse-group-list'):
            #Debug: only elements of one category
            if category.css('a::text').extract_first() == 'Yoga Studio':
                i = CapterraItem()
                #Get link to detail page
                i['cat_name'] = category.css('a::text').extract_first()
                #join link to detail page with base url
                i['cat_link'] = response.urljoin(category.css('a::attr(href)').extract_first())
                cat_link = i['cat_link']
                print(cat_link)
                #call request to detail page and pass response to parse_details method with callback method
                request = scrapy.Request(cat_link, callback=self.parse_details)
                request.meta['item'] = i
                yield request

    def parse_details(self, response):
        #Debug print
        print('DETAILS!')
        #read your items from response meta
        item = response.meta['item']
        #iterate over listings
        for detail in response.css('p.listing-description.milli'):
            item['profile_link'] = response.urljoin(detail.css('a.spotlight-link::attr(href)').extract_first())
            #call request to profile page to get more information for listing
            request = scrapy.Request(item['profile_link'], callback=self.parse_profile)
            #set your item to request metadata
            request.meta['item'] = item
            yield request

    def parse_profile(self, response):
        #Debug print
        print('PROFILE')
        item = response.meta['item']
        item['product_name'] = response.css('h1.beta.no-margin-bottom::text').extract_first()
        item['who_uses_software'] = response.css('div.spotlight-target > p.epsilon > i::text').extract_first()
        item['vendor_name'] = response.css('h2.spotlight-vendor-name > span::text').extract_first()
        return item
I am trying to make a Scrapy bot that uses pagination, but I'm having no success...
The bot crawls through all of the links on the first page but never goes on to the next page. I have read a ton of different threads and I can't figure this out at all. I am very new to web scraping, so please feel free to hammer the crap out of my code.
import time
from scrapy.spiders import CrawlSpider, Rule
#from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http.request import Request
from tutorial.items import TutorialItem
#from scrapy_tutorial.items import ScrapyTutorialItem

class raytheonJobsPageSpider(CrawlSpider):
    name = "raytheonJobsStart"
    allowed_domains = ["jobs.raytheon.com"]
    start_urls = [
        "https://jobs.raytheon.com/search-jobs"
    ]

    rules = (Rule(LinkExtractor(restrict_xpaths=('//div[@class="next"]',)), callback='parse_listings', follow=True),)

    def parse_start_url(self, response):
        '''
        Crawl start URLs
        '''
        return self.parse_listings(response)

    def parse_listings(self, response):
        '''
        Extract data from listing pages
        '''
        sel = Selector(response)
        jobs = response.xpath(
            '//*[@id="search-results-list"]/ul/*/a/@href'
        ).extract()
        nextLink = response.xpath('//a[@class="next"]').extract()
        print "This is just the next page link - ", nextLink
        for job_url in jobs:
            job_url = self.__normalise(job_url)
            job_url = self.__to_absolute_url(response.url, job_url)
            yield Request(job_url, callback=self.parse_details)

    def parse_details(self, response):
        '''
        Extract data from details pages
        '''
        sel = Selector(response)
        job = sel.xpath('//*[@id="content"]')
        item = TutorialItem()
        # Populate job fields
        item['title'] = job.xpath('//*[@id="content"]/section[1]/div/h1/text()').extract()
        jobTitle = job.xpath('//*[@id="content"]/section[1]/div/h1/text()').extract()
        item['reqid'] = job.xpath('//*[@id="content"]/section[1]/div/span[1]/text()').extract()
        item['location'] = job.xpath('//*[@id="content"]/section[1]/div/span[last()]/text()').extract()
        item['applink'] = job.xpath('//*[@id="content"]/section[1]/div/a[2]/@href').extract()
        item['description'] = job.xpath('//*[@id="content"]/section[1]/div/div').extract()
        item['clearance'] = job.xpath('//*[@id="content"]/section[1]/div/*/text()').extract()
        #item['page_url'] = response.url
        item = self.__normalise_item(item, response.url)
        time.sleep(1)
        return item

    def __normalise_item(self, item, base_url):
        '''
        Standardise and format item fields
        '''
        # Loop item fields to sanitise data and standardise data types
        for key, value in vars(item).values()[0].iteritems():
            item[key] = self.__normalise(item[key])

        # Convert job URL from relative to absolute URL
        #item['job_url'] = self.__to_absolute_url(base_url, item['job_url'])
        return item

    def __normalise(self, value):
        print self, value
        # Convert list to string
        value = value if type(value) is not list else ' '.join(value)
        # Trim leading and trailing special characters (Whitespaces, newlines, spaces, tabs, carriage returns)
        value = value.strip()
        return value

    def __to_absolute_url(self, base_url, link):
        '''
        Convert relative URL to absolute URL
        '''
        import urlparse
        link = urlparse.urljoin(base_url, link)
        return link

    def __to_int(self, value):
        '''
        Convert value to integer type
        '''
        try:
            value = int(value)
        except ValueError:
            value = 0
        return value

    def __to_float(self, value):
        '''
        Convert value to float type
        '''
        try:
            value = float(value)
        except ValueError:
            value = 0.0
        return value
You don't need PhantomJS or Splash.
By inspecting the AJAX calls I found that the jobs are loaded via AJAX calls to this URL.
You can see the CurrentPage parameter at the end of the URL.
The result is returned in JSON format, and all jobs are under the key named results.
I created a project on my side and wrote fully working code for you. Here is the link on GitHub; just download and run it... you don't have to do anything at all :P
Download the whole working project from here: https://github.com/mani619cash/raytheon_pagination
The basic logic is here:
# imports needed by the snippet below
import json

from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider

class RaytheonspiderSpider(CrawlSpider):
    name = "raytheonJobsStart"
    page = 180
    ajaxURL = "https://jobs.raytheon.com/search-jobs/results?ActiveFacetID=0&RecordsPerPage=15&Distance=50&RadiusUnitType=0&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=5&SortDirection=1&SearchType=5&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=&CurrentPage="

    def start_requests(self):
        yield Request(self.ajaxURL + str(self.page), callback=self.parse_listings)

    def parse_listings(self, response):
        resp = json.loads(response.body)
        # the "results" key holds an HTML fragment with the job links
        response = Selector(text=resp['results'])
        jobs = response.xpath('//*[@id="search-results-list"]/ul/*/a/@href').extract()
        if jobs:
            for job_url in jobs:
                job_url = "https://jobs.raytheon.com" + self.__normalise(job_url)
                #job_url = self.__to_absolute_url(response.url, job_url)
                yield Request(url=job_url, callback=self.parse_details)
        else:
            raise CloseSpider("No more pages... exiting...")

        # go to next page...
        self.page = self.page + 1
        yield Request(self.ajaxURL + str(self.page), callback=self.parse_listings)
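Assuming this snippet lives in a standard Scrapy project and keeps parse_details plus the __normalise helper from the question's spider, it runs like any other spider, for example:

scrapy crawl raytheonJobsStart -o raytheon_jobs.json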
Change
restrict_xpaths=('//div[@class="next"]',)) to
restrict_xpaths=('//a[@class="next"]',))
If this does not work, then make a recursive call to the parse_listings function:
def parse_listings(self, response):
    '''
    Extract data from listing pages
    '''
    sel = Selector(response)
    jobs = response.xpath(
        '//*[@id="search-results-list"]/ul/*/a/@href'
    ).extract()
    nextLink = response.xpath('//a[@class="next"]').extract()
    print "This is just the next page link - ", nextLink
    for job_url in jobs:
        job_url = self.__normalise(job_url)
        job_url = self.__to_absolute_url(response.url, job_url)
        yield Request(job_url, callback=self.parse_details)

    yield Request(pagination link here, callback=self.parse_listings)
I am on mobile so I can't type code. I hope the logic I described makes sense.
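For reference, a sketch of what that pagination call could look like when dropped into the question's parse_listings, assuming the next-page URL is the href of the //a[@class="next"] element that is already being selected there:

    def parse_listings(self, response):
        jobs = response.xpath('//*[@id="search-results-list"]/ul/*/a/@href').extract()
        for job_url in jobs:
            job_url = self.__to_absolute_url(response.url, self.__normalise(job_url))
            yield Request(job_url, callback=self.parse_details)

        # follow the "next" link, if any, and run this same method on the next listing page
        next_href = response.xpath('//a[@class="next"]/@href').extract_first()
        if next_href:
            yield Request(self.__to_absolute_url(response.url, next_href),
                          callback=self.parse_listings)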