Duplicated data when scraping a JSON API with Scrapy (Python)
I have this script:
import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os

if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")

class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'
    #start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
    custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}

    def start_requests(self):
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
            callback=self.parse,
            method="GET"
        )

    def parse(self, response):
        resp = response.json()
        #print(resp)
        for item in range(0, 576, 32):
            resp['recordsFiltered'] = item
            for result in resp['data']['productSearch']['products']:
                yield {
                    'Casa': 'Just_For_Sports',
                    'Sku': result['productReference'],
                    'Name': result['productName'],
                    'precio': result['priceRange']['sellingPrice']['highPrice'],
                    'Link': 'https://www.justforsport.com.ar' + result['link'],
                    'Date': datetime.today().strftime('%Y-%m-%d')
                }

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(JfsSpider_hombre)
    process.start()
It works fine and gets 576 rows, but the problem is that they are duplicated: when I drop the duplicates I am left with only 32 unique values, so I think I am only getting values from one page (32 products per page). How could I iterate through all the elements? I think it has something to do with this line:

for item in range(0, 576, 32):

Thanks in advance.
You are using 'Casa': 'Just_For_Sports', which is not correct; it would have to be result['Just_For_Sports'], but the more important question is where you got "Just_For_Sports" from in the first place. I didn't find it in the product list, and you can't include a key that doesn't exist in the products. The same goes for 'Date': datetime.today().strftime('%Y-%m-%d'); you won't find it as a key in the products list either. Now you can check whether duplicated values still exist or not.
import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os

if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")

class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'
    #start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
    custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}

    def start_requests(self):
        headers = {"content-type": "application/json"}
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
            callback=self.parse,
            method="GET",
            headers=headers,
            dont_filter=True
        )

    def parse(self, response):
        resp = response.json()
        #print(resp)
        for item in range(0, 576, 32):
            resp['data']['productSearch']['recordsFiltered'] = item
            for result in resp['data']['productSearch']['products']:
                yield {
                    #'Casa': 'Just_For_Sports',
                    'Sku': result['productReference'],
                    'Name': result['productName'],
                    'precio': result['priceRange']['sellingPrice']['highPrice'],
                    'Link': 'https://www.justforsport.com.ar' + result['link'],
                    #'Date': datetime.today().strftime('%Y-%m-%d')
                }

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(JfsSpider_hombre)
    process.start()
Proven by collecting the SKUs in a set():
import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os

if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")

class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'
    unique_data = set()
    #start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
    custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}

    def start_requests(self):
        headers = {"content-type": "application/json"}
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
            callback=self.parse,
            method="GET",
            headers=headers,
            dont_filter=True
        )

    def parse(self, response):
        resp = response.json()
        #print(resp)
        for item in range(0, 576, 32):
            resp['data']['productSearch']['recordsFiltered'] = item
            for result in resp['data']['productSearch']['products']:
                s = result['productReference']
                self.unique_data.add(s)
                yield {
                    #'Casa': 'Just_For_Sports',
                    'Sku': s,
                    'Name': result['productName'],
                    'precio': result['priceRange']['sellingPrice']['highPrice'],
                    'Link': 'https://www.justforsport.com.ar' + result['link'],
                    #'Date': datetime.today().strftime('%Y-%m-%d')
                }

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(JfsSpider_hombre)
    process.start()
Output:
'item_scraped_count': 576,
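The duplication itself happens because parse() only rewrites a field of the in-memory dict and then yields the same 32 products 18 times; no second request is ever sent. To really walk the catalogue, the "from"/"to" values inside the base64-encoded "variables" query parameter have to change on every request. Below is a minimal sketch of that idea, not a verified solution: the decoded variables and the sha256Hash, sender and provider values are taken from the URL in the question, the window size of 32 and total of 576 come from the question, and it assumes the endpoint accepts a re-encoded variables blob.

import base64
import json
from urllib.parse import urlencode

import scrapy
from scrapy.crawler import CrawlerProcess


class JfsSpiderHombrePaginado(scrapy.Spider):
    # Sketch only: issue one request per 32-product window instead of
    # re-reading a single response 18 times.
    name = 'jfs_hombre_paginado'

    # Decoded from the base64 "variables" blob of the original request URL;
    # only "from"/"to" change per request.
    base_variables = {
        "hideUnavailableItems": False,
        "skusFilter": "FIRST_AVAILABLE",
        "simulationBehavior": "default",
        "installmentCriteria": "MAX_WITHOUT_INTEREST",
        "productOriginVtex": False,
        "map": "c",
        "query": "hombre",
        "orderBy": "OrderByReleaseDateDESC",
        "from": 0,
        "to": 31,
        "selectedFacets": [{"key": "c", "value": "hombre"}],
        "operator": "and",
        "fuzzy": "0",
        "searchState": None,
        "facetsBehavior": "Static",
        "categoryTreeBehavior": "default",
        "withFacets": False,
    }

    def start_requests(self):
        for offset in range(0, 576, 32):  # 576 products reported in the question
            variables = dict(self.base_variables, **{"from": offset, "to": offset + 31})
            extensions = {
                "persistedQuery": {
                    "version": 1,
                    "sha256Hash": "6869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5",
                    "sender": "vtex.store-resources@0.x",
                    "provider": "vtex.search-graphql@0.x",
                },
                "variables": base64.b64encode(
                    json.dumps(variables, separators=(",", ":")).encode()
                ).decode(),
            }
            params = {
                "workspace": "master",
                "maxAge": "short",
                "appsEtag": "remove",
                "domain": "store",
                "locale": "es-AR",
                "__bindingId": "e841e6ce-1216-4569-a2ad-0188ba5a92fc",
                "operationName": "productSearchV3",
                "variables": "{}",
                "extensions": json.dumps(extensions, separators=(",", ":")),
            }
            url = "https://www.justforsport.com.ar/_v/segment/graphql/v1?" + urlencode(params)
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        for result in response.json()['data']['productSearch']['products']:
            yield {
                'Sku': result['productReference'],
                'Name': result['productName'],
                'precio': result['priceRange']['sellingPrice']['highPrice'],
                'Link': 'https://www.justforsport.com.ar' + result['link'],
            }


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(JfsSpiderHombrePaginado)
    process.start()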
Related
scrapy pagination is duplicating lines and stops after 9k rows
How are you? I created this code, but some lines are being duplicated. For example, out of 9k rows, 3k are doubled or tripled. And after 9,112 rows I get a "not allowed". Is it some glue-api restriction? This code runs up to 9,112 lines and still comes with 30% repeated lines. Does anyone know how I can fix this?

import scrapy
import json
from scrapy.exceptions import CloseSpider

class ImoveisSpider(scrapy.Spider):
    name = 'teste'

    def start_requests(self):
        yield scrapy.Request(
            url='https://glue-api.zapimoveis.com.br/v2/listings?0=U&1=n&2=i&3=t&4=T&5=y&6=p&7=e&8=_&9=N&10=O&11=N&12=E&categoryPage=RESULT&business=SALE&listingType=USED&portal=ZAP&size=34&from=0&page=1&includeFields=search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount),page,facets,fullUriFragments,developments(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),superPremium(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),premiere(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),schema&developmentsSize=3&superPremiumSize=3&levels=LANDING&ref=&__zt=mtc:deduplication',
            headers={'x-domain': 'www.zapimoveis.com.br'},
            callback=self.parse
        )

    prm = 0
    page = 1

    def parse(self, response):
        if len(json.loads(response.body)) == 0:
            raise CloseSpider('No more products to scrape...')
        resp = json.loads(response.body)
        listings = resp.get('search').get('result').get('listings')
        for info in listings:
            yield {
                'link': info.get('link').get('href'),
                'city': info.get('link').get('data').get('city')
            }
        self.prm += 34
        self.page += 1
        yield scrapy.Request(
            url=f'https://glue-api.zapimoveis.com.br/v2/listings?0=U&1=n&2=i&3=t&4=T&5=y&6=p&7=e&8=_&9=N&10=O&11=N&12=E&categoryPage=RESULT&business=SALE&listingType=USED&portal=ZAP&size=34&from={self.prm}&page={self.page}&includeFields=search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount),page,facets,fullUriFragments,developments(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),superPremium(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),premiere(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),schema&developmentsSize=3&superPremiumSize=3&levels=LANDING&ref=&__zt=mtc:deduplication',
            headers={'x-domain': 'www.zapimoveis.com.br'},
            callback=self.parse
        )
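One general way to keep repeated rows out of the output, whatever makes the API return the same listing on several pages, is an item pipeline that drops items whose key has already been seen. A minimal sketch, assuming the 'link' field yielded above identifies a listing uniquely; itemadapter ships with recent Scrapy, and the pipeline still has to be enabled under ITEM_PIPELINES with whatever module path your project uses:

from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem


class DuplicateLinkPipeline:
    """Drop any item whose 'link' value was already seen during this crawl."""

    def __init__(self):
        self.seen_links = set()

    def process_item(self, item, spider):
        link = ItemAdapter(item).get('link')
        if link in self.seen_links:
            raise DropItem(f"Duplicate listing: {link}")
        self.seen_links.add(link)
        return item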
Scrapy tracking and scraping third page
After trying to add a third page to this shenanigans I got the error "You can't mix str and non-str arguments". My goal is to use the url from 'website' and scrape data from it. How do I do it? Here is my code:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider

class RynekMainSpider(scrapy.Spider):
    name = "RynekMain"
    start_urls = ['https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        websites = response.css('div#root')[0]
        PAGETEST = response.xpath('//a[contains(@class,"rp-173nt6g")]/../following-sibling::li').css('a::attr(href)').get()
        for website in websites.css('li.rp-np9kb1'):
            page = website.css('a::attr(href)').get()
            address = website.css('address.rp-o9b83y::text').get()
            name = website.css('h2.rp-69f2r4::text').get()
            params = {
                'address': address,
                'name': name,
                'href': page,
            }
            url = response.urljoin(page)
            urlem = response.urljoin(website)
            yield Request(url=url, cb_kwargs={'params': params}, callback=self.parseMain)
            yield Request(url=urlem, cb_kwargs={'params': params}, callback=self.parseEmail)
        yield Request(url=response.urljoin(PAGETEST), callback=self.parse)

    def parseMain(self, response, params=None):
        # print(response.url)
        website = response.css('div.rp-l0pkv6 a::attr(href)').get()
        params['website'] = website
        yield params

    def parseEmail(self, response, params=None):
        hps = HtmlXPathSelector(response)
        email = hxs.xpath('//body').re('([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)')

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(RynekMainSpider)
    process.start()

Thanks for help in advance.
A simple debugging session pointed me to the offending line:

urlem = response.urljoin(website)  # You can't mix str and non-str arguments

website is a Selector, and urljoin needs a string. Perhaps what you are looking for is this:

urlem = response.urljoin(website.xpath('.//a/@href').get())
Ok, I solved it. I just moved the yield a bit. yield can't take a string that doesn't exist yet; the string needs to be created first, which is why I had problems before: the website url is scraped in parseMain, not in parse.

# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider

class RynekMainSpider(scrapy.Spider):
    name = "RynekMain"
    start_urls = ['https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        websites = response.css('div#root')[0]
        PAGETEST = response.xpath('//a[contains(@class,"rp-173nt6g")]/../following-sibling::li').css('a::attr(href)').get()
        for website in websites.css('li.rp-np9kb1'):
            page = website.css('a::attr(href)').get()
            address = website.css('address.rp-o9b83y::text').get()
            name = website.css('h2.rp-69f2r4::text').get()
            params = {
                'address': address,
                'name': name,
                'href': page,
            }
            url = response.urljoin(page)
            yield Request(url=url, cb_kwargs={'params': params}, callback=self.parseMain)
        yield Request(url=response.urljoin(PAGETEST), callback=self.parse)

    def parseMain(self, response, params=None):
        # print(response.url)
        website = response.css('div.rp-l0pkv6 a::attr(href)').get()
        params['website'] = website
        urlem = response.urljoin(website)
        yield Request(url=urlem, cb_kwargs={'params': params}, callback=self.parseEmail)

    def parseEmail(self, response, params=None):
        email = response.css('div.m-Footer__company a::attr(href)').get()
        params['email'] = email
        yield params

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(RynekMainSpider)
    process.start()
Scraping multiple pages with multiple start_urls
I want to scrape the details, which are returned in JSON form, using scrapy. There are multiple start_urls and each start_url has multiple pages to scrape. I am just not able to get the logic of how to do so.

import scrapy
from scrapy.http import Request

BASE_URL = [
    "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
    "https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
    "https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
    "https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
    "https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
    "https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
    "https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
    "https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
]

class ChangeSpider(scrapy.Spider):
    name = 'change'

    def start_requests(self):
        for i in range(len(BASE_URL)):
            yield Request(BASE_URL[i], callback=self.parse)

    pageNumber = 11

    def parse(self, response):
        data = response.json()
        for item in range(len(data['items'])):
            yield {
                "petition_id": data['items'][item]['petition']['id'],
            }
        next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
        if data['last_page'] == False:
            ChangeSpider.pageNumber += 1
            yield response.follow(next_page, callback=self.parse)
Try like this:

import scrapy
from scrapy.http import Request

class ChangeSpider(scrapy.Spider):
    name = 'change'
    start_urls = [
        "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
        "https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
        "https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
        "https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
        "https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
        "https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
        "https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
        "https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
    ]

    pageNumber = 11

    def parse(self, response):
        data = response.json()
        for item in range(len(data['items'])):
            yield {
                "petition_id": data['items'][item]['petition']['id'],
            }
        next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
        if data['last_page'] == False:
            ChangeSpider.pageNumber += 1
            yield response.follow(next_page, callback=self.parse)
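One caveat with both versions above: next_page is hard-coded to the animals-19 feed and pageNumber is a single counter shared by every response, so all tags end up re-requesting the same URL. Here is a sketch of per-tag pagination; it assumes Scrapy 1.7+ (for cb_kwargs), that the API pages by the offset parameter in steps of limit=8 until last_page is true, and that the {} placeholder in the question's URLs is meant to receive the initial offset:

import scrapy
from w3lib.url import add_or_replace_parameter  # w3lib is installed alongside Scrapy

# First two tag feeds from the question; the remaining six URLs go here as well.
BASE_URLS = [
    "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
    "https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
]


class ChangePerTagSpider(scrapy.Spider):
    # Sketch: every tag paginates its own URL, and the offset travels with the
    # request via cb_kwargs instead of a single class-level counter.
    name = 'change_per_tag'

    def start_requests(self):
        for url in BASE_URLS:
            yield scrapy.Request(url.format(0), callback=self.parse, cb_kwargs={'offset': 0})

    def parse(self, response, offset):
        data = response.json()
        for entry in data['items']:
            yield {"petition_id": entry['petition']['id']}
        if data['last_page'] is False:
            next_offset = offset + 8  # the feeds are requested with limit=8
            next_url = add_or_replace_parameter(response.url, 'offset', str(next_offset))
            yield response.follow(next_url, callback=self.parse, cb_kwargs={'offset': next_offset})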
Access multiple pages with pagination in Scrapy
I have urls with multiple pages. I try to paginate to extract data from these urls, but it works only one time (just one next_page). What's wrong?

import json
import scrapy
import re
import pkgutil
from scrapy.loader import ItemLoader
from rzc_spider.items import AnnonceItem

class AnnonceSpider(scrapy.Spider):
    name = 'rzc_results'

    def __init__(self, *args, **kwargs):
        data_file = pkgutil.get_data("rzc_spider", "json/input/test_tt.json")
        self.data = json.loads(data_file)

    def start_requests(self):
        for item in self.data:
            request = scrapy.Request(item['rzc_url'], callback=self.parse)
            request.meta['item'] = item
            yield request

    def parse(self, response):
        item = response.meta['item']
        item['results'] = []
        item["car_number"] = response.css("h2.sub::text").extract_first()
        for caritem in response.css("div.ad > div[itemtype='https://schema.org/Vehicle']"):
            data = AnnonceItem()
            # model
            data["model"] = caritem.css("em.title::text").extract_first()
            item['results'].append(data)
        yield item

        next_page = response.css('a.link::attr(href)').extract_first()
        if next_page is not None:
            url_pagination = 'https://www.websiteexample.com' + next_page
            meta = {'item': response.meta['item']}
            yield scrapy.Request(url=url_pagination, callback=self.parse, meta=meta)

    # ban proxies reaction
    def response_is_ban(self, request, response):
        return b'banned' in response.body

    def exception_is_ban(self, request, exception):
        return None

The json file with the url (a sample in this case):

[{
    "rzc_url": "https://www.websiteexample.com/model"
}]
Try checking the URL. Sometimes sites set traps so that only next_page is an absolute URL and the others are relative URLs. Instead of concatenating a hard-coded domain with next_page, use urljoin. Import it, then:

yield scrapy.Request(urljoin(response.url, next_page), callback=self.parse, meta=meta)
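For reference, a small sketch of that suggestion written as a hypothetical helper using the same selectors as the question; urljoin comes from the standard library, and response.urljoin is Scrapy's built-in shortcut that does the same resolution against the current page:

from urllib.parse import urljoin

import scrapy


def follow_next_page(self, response):
    # Hypothetical helper name; this is the tail of AnnonceSpider.parse rewritten
    # so the href is resolved correctly whether the site returns it relative or absolute.
    next_page = response.css('a.link::attr(href)').extract_first()
    if next_page is not None:
        url_pagination = urljoin(response.url, next_page)
        # equivalent Scrapy shortcut:
        # url_pagination = response.urljoin(next_page)
        yield scrapy.Request(url=url_pagination, callback=self.parse,
                             meta={'item': response.meta['item']})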
Scrapy yield items from multiple requests
I am trying to yield items from different requests as shown here. If I add items = PrintersItem() to each request I get endless loops; if I take it out, other errors occur. I am not sure how to combine yield request with yield items for each.

import scrapy
from scrapy.http import Request, FormRequest
from ..items import PrintersItem
from scrapy.utils.response import open_in_browser

class PrinterSpider(scrapy.Spider):
    name = 'printers'
    start_urls = ['http://192.168.137.9', 'http://192.168.137.35', 'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21']

    def parse(self, response):
        items = PrintersItem()
        token = response.xpath('//*[@name="CSRFToken"]/@value').extract_first()
        print(token)
        yield FormRequest.from_response(response, formnumber=1, formdata={
            'CSRFToken': token,
            'B55d': 'password',
            'loginurl': '/general/status.html'
        }, callback=self.postlogin2)

    def postlogin2(self, response):
        items = PrintersItem()
        contact = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[1]/text()[last()]').extract()
        location = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[2]/text()[last()]').extract()
        items['contact'] = contact
        items['location'] = location
        yield Request(
            url=response.url.split('/general')[0] + "/general/information.html?kind=item",
            callback=self.action)
        for items in self.postlogin2(response):
            yield items

    def action(self, response):
        drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
        items['drum'] = drum
        print(drum)
        printermodel = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/text()').extract()
        items['printermodel'] = printermodel
        yield Request(
            url=response.url.split('/general')[0] + "/net/wired/tcpip.html",
            callback=self.action2)
        for items in self.action(response):
            yield items

    def action2(self, response):
        tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/@value').extract()
        items['tcpip'] = tcpip
        for items in self.action2(response):
            yield items
If you want to send items from parse to postlogin2, etc., then add it as meta data in the Request:

yield Request(..., meta={"items": items})

get it back in the other function:

items = response.meta["items"]

and yield it only in the last function:

yield items

Doc: Request and Response, Request.meta special keys.

class PrinterSpider(scrapy.Spider):
    name = 'printers'
    start_urls = ['http://192.168.137.9', 'http://192.168.137.35', 'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21']

    def parse(self, response):
        token = response.xpath('//*[@name="CSRFToken"]/@value').extract_first()
        print(token)
        yield FormRequest.from_response(response, formnumber=1, formdata={
            'CSRFToken': token,
            'B55d': 'password',
            'loginurl': '/general/status.html'
        }, callback=self.postlogin2)

    def postlogin2(self, response):
        items = PrintersItem()
        contact = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[1]/text()[last()]').extract()
        location = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[2]/text()[last()]').extract()
        items['contact'] = contact
        items['location'] = location
        yield Request(
            #url=response.urljoin("/general/information.html?kind=item"),
            url=response.url.split('/general')[0] + "/general/information.html?kind=item",
            callback=self.action,
            meta={"items": items})

    def action(self, response):
        items = response.meta["items"]
        drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
        items['drum'] = drum
        print(drum)
        printermodel = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/text()').extract()
        items['printermodel'] = printermodel
        yield Request(
            #url=response.urljoin("/net/wired/tcpip.html"),
            url=response.url.split('/general')[0] + "/net/wired/tcpip.html",
            callback=self.action2,
            meta={"items": items})

    def action2(self, response):
        items = response.meta["items"]
        tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/@value').extract()
        items['tcpip'] = tcpip
        yield items
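On Scrapy 1.7 and newer the same hand-off can use cb_kwargs, which keeps meta free for Scrapy's own keys and turns the carried data into a normal callback argument. A minimal, self-contained sketch of the pattern (the URLs, paths and fields here are placeholders rather than the printer pages above):

import scrapy


class CbKwargsSketchSpider(scrapy.Spider):
    # Hypothetical spider that only demonstrates passing a partially built item
    # between callbacks with cb_kwargs (Scrapy 1.7+).
    name = "cb_kwargs_sketch"
    start_urls = ["http://192.168.137.9"]  # placeholder host, as in the question

    def parse(self, response):
        items = {"contact": response.xpath('//title/text()').get()}
        yield scrapy.Request(
            response.urljoin("/general/information.html?kind=item"),
            callback=self.enrich,
            cb_kwargs={"items": items},  # delivered to enrich() as a keyword argument
        )

    def enrich(self, response, items):
        items["drum"] = response.xpath('//dd[1]/text()').get()
        yield items  # yield only once, from the last callback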