duplicated data scraper json api

duplicated data scraper json api - python

I have this script:
import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
if os.path.exists('jfs_hombre.csv'):
os.remove('jfs_hombre.csv')
print("The file has been deleted successfully")
else:
print("The file does not exist!")
class JfsSpider_hombre(scrapy.Spider):
name = 'jfs_hombre'
#start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}
def start_requests(self):
yield scrapy.Request(
url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
callback=self.parse,
method="GET"
)
def parse(self, response):
resp = response.json()
#print(resp)
for item in range(0,576,32):
resp['recordsFiltered']=item
for result in resp['data']['productSearch']['products']:
yield {
'Casa':'Just_For_Sports',
'Sku' :result['productReference'],
'Name': result['productName'],
'precio': result['priceRange']['sellingPrice']['highPrice'],
'Link': 'https://www.justforsport.com.ar' + result['link'],
'Date':datetime.today().strftime('%Y-%m-%d')
}
if __name__ == "__main__":
process =CrawlerProcess()
process.crawl(JfsSpider_hombre)
process.start()
It works fine and gets 576 rows but the problem is that they are duplicated. When I drop duplicated data I get only 32 unique values, I think I m getting values from only one page ( 32 products per page) How could I iterate throuh all the elements I think it has something to do with the line:
for item in range(0,576,32):
Thanks in advance

You are using 'Casa':'Just_For_Sports', which is not correct, it would be result['Just_For_Sports'] but the most important thing is that from where you have got the "Just_For_Sports". I didn't find it in product list. Actually,you can't include the key that didn't exist in products. 'Date':datetime.today().strftime('%Y-%m-%d') you also will not find in products list as key. Now you can try whether dublicated value exist or not.
import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
if os.path.exists('jfs_hombre.csv'):
os.remove('jfs_hombre.csv')
print("The file has been deleted successfully")
else:
print("The file does not exist!")
class JfsSpider_hombre(scrapy.Spider):
name = 'jfs_hombre'
#start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}
def start_requests(self):
headers = {"content-type": "application/json"}
yield scrapy.Request(
url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
callback=self.parse,
method="GET",
headers=headers,
dont_filter=True
)
def parse(self, response):
resp = response.json()
#print(resp)
for item in range(0,576,32):
resp['data']['productSearch']['recordsFiltered']=item
for result in resp['data']['productSearch']['products']:
yield {
#'Casa':'Just_For_Sports',
'Sku' :result['productReference'],
'Name': result['productName'],
'precio': result['priceRange']['sellingPrice']['highPrice'],
'Link': 'https://www.justforsport.com.ar' + result['link'],
# 'Date':datetime.today().strftime('%Y-%m-%d')
}
if __name__ == "__main__":
process =CrawlerProcess()
process.crawl(JfsSpider_hombre)
process.start()
Proven by set()
import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
if os.path.exists('jfs_hombre.csv'):
os.remove('jfs_hombre.csv')
print("The file has been deleted successfully")
else:
print("The file does not exist!")
class JfsSpider_hombre(scrapy.Spider):
name = 'jfs_hombre'
unique_data = set()
#start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}
def start_requests(self):
headers = {"content-type": "application/json"}
yield scrapy.Request(
url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
callback=self.parse,
method="GET",
headers=headers,
dont_filter=True
)
def parse(self, response):
resp = response.json()
#print(resp)
for item in range(0,576,32):
resp['data']['productSearch']['recordsFiltered']=item
for result in resp['data']['productSearch']['products']:
s=result['productReference']
self.unique_data.add(s)
yield {
#'Casa':'Just_For_Sports',
'Sku' :s,
'Name': result['productName'],
'precio': result['priceRange']['sellingPrice']['highPrice'],
'Link': 'https://www.justforsport.com.ar' + result['link'],
# 'Date':datetime.today().strftime('%Y-%m-%d')
}
if __name__ == "__main__":
process =CrawlerProcess()
process.crawl(JfsSpider_hombre)
process.start()
Output:
'item_scraped_count': 576,

Related

scrapy pagination is duplicating lines and stops after 9k rows

how are you?
I created this code, but some lines are being duplicated. For example, out of 9k rows, 3k is doubled or tripled. And after 9.112 I get a "not allowed". Is it some glueapi restriction?
This code runs up to 9,112 lines and still comes with 30% of repeated lines. Does anyone know how I can fix this?
import scrapy
import json
from scrapy.exceptions import CloseSpider
class ImoveisSpider(scrapy.Spider):
name = 'teste'
def start_requests(self):
yield scrapy.Request(
url = 'https://glue-api.zapimoveis.com.br/v2/listings?0=U&1=n&2=i&3=t&4=T&5=y&6=p&7=e&8=_&9=N&10=O&11=N&12=E&categoryPage=RESULT&business=SALE&listingType=USED&portal=ZAP&size=34&from=0&page=1&includeFields=search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount),page,facets,fullUriFragments,developments(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),superPremium(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),premiere(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),schema&developmentsSize=3&superPremiumSize=3&levels=LANDING&ref=&__zt=mtc:deduplication',
headers = {
'x-domain': 'www.zapimoveis.com.br'
},
callback=self.parse
)
prm = 0
page = 1
def parse(self, response):
if len(json.loads(response.body)) == 0:
raise CloseSpider('No more products to scrape...')
resp = json.loads(response.body)
listings = resp.get('search').get('result').get('listings')
for info in listings:
yield {
'link': info.get('link').get('href'),
'city': info.get('link').get('data').get('city')
}
self.prm += 34
self.page += 1
yield scrapy.Request(
url=f'https://glue-api.zapimoveis.com.br/v2/listings?0=U&1=n&2=i&3=t&4=T&5=y&6=p&7=e&8=_&9=N&10=O&11=N&12=E&categoryPage=RESULT&business=SALE&listingType=USED&portal=ZAP&size=34&from={self.prm}&page={self.page}&includeFields=search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount),page,facets,fullUriFragments,developments(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),superPremium(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),premiere(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),schema&developmentsSize=3&superPremiumSize=3&levels=LANDING&ref=&__zt=mtc:deduplication',
headers = {
'x-domain': 'www.zapimoveis.com.br'
},
callback=self.parse
)

Scrapy tracking and scraping third page

after trying to add third page to this shenanigas i got an error "You can't mix str and non-str arguments". My goal is to use url from 'website' and scrap data from it. How do i do it?
Here is my code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider
class RynekMainSpider(scrapy.Spider):
name = "RynekMain"
start_urls = [
'https://rynekpierwotny.pl/deweloperzy/?page=1']
def parse(self, response):
websites = response.css('div#root')[0]
PAGETEST = response.xpath('//a[contains(#class,"rp-173nt6g")]/../following-sibling::li').css('a::attr(href)').get()
for website in websites.css('li.rp-np9kb1'):
page = website.css('a::attr(href)').get()
address = website.css('address.rp-o9b83y::text').get()
name = website.css('h2.rp-69f2r4::text').get()
params = {
'address' : address,
'name' : name,
'href' : page,
}
url = response.urljoin(page)
urlem = response.urljoin(website)
yield Request(url=url, cb_kwargs={'params': params}, callback=self.parseMain)
yield Request(url=urlem, cb_kwargs={'params': params}, callback=self.parseEmail)
yield Request(url=response.urljoin(PAGETEST), callback=self.parse)
def parseMain(self, response, params=None):
# print(response.url)
website = response.css('div.rp-l0pkv6 a::attr(href)').get()
params['website'] = website
yield params
def parseEmail(self,response, params=None):
hps = HtmlXPathSelector(response)
email = hxs.xpath('//body').re('([a-zA-Z0-9_.+-]+#[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)')
if __name__ == "__main__":
process =CrawlerProcess()
process.crawl(RynekMainSpider)
process.start()
Thanks for help in advance.

A simple debugging pointed me to the error line:
urlem = response.urljoin(website) # You can't mix str and non-str arguments
website is a Selector, and urljoin needs a string.
Perhaps what you are looking for is this:
urlem = response.urljoin(website.xpath('.//a/#href').get())

Ok i solved it.
I just moved yield a bit.
Yield can't just take non existent strings, string needs to be created first,
that's why i got problems before.
Website url was scraped in parseMain not in parse.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider
class RynekMainSpider(scrapy.Spider):
name = "RynekMain"
start_urls = [
'https://rynekpierwotny.pl/deweloperzy/?page=1']
def parse(self, response):
websites = response.css('div#root')[0]
PAGETEST = response.xpath('//a[contains(#class,"rp-173nt6g")]/../following-sibling::li').css('a::attr(href)').get()
for website in websites.css('li.rp-np9kb1'):
page = website.css('a::attr(href)').get()
address = website.css('address.rp-o9b83y::text').get()
name = website.css('h2.rp-69f2r4::text').get()
params = {
'address' : address,
'name' : name,
'href' : page,
}
url = response.urljoin(page)
yield Request(url=url, cb_kwargs={'params': params}, callback=self.parseMain)
yield Request(url=response.urljoin(PAGETEST), callback=self.parse)
def parseMain(self, response, params=None):
# print(response.url)
website = response.css('div.rp-l0pkv6 a::attr(href)').get()
params['website'] = website
urlem = response.urljoin(website)
yield Request(url=urlem, cb_kwargs={'params': params}, callback=self.parseEmail)
def parseEmail(self,response, params=None):
email = response.css('div.m-Footer__company a::attr(href)').get()
params['email'] = email
yield params
if __name__ == "__main__":
process =CrawlerProcess()
process.crawl(RynekMainSpider)
process.start()

Scraping multiple pages with multiple start_urls

I want to scrape the details present in json form using scrapy. They are multiple start_urls and each start_url have multiple pages to scrape with. I am just not able to get the logic of how to do so.
import scrapy
from scrapy.http import Request
BASE_URL = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
]
class ChangeSpider(scrapy.Spider):
name = 'change'
def start_requests(self):
for i in range(len(BASE_URL)):
yield Request(BASE_URL[i], callback = self.parse)
pageNumber = 11
def parse(self, response):
data = response.json()
for item in range(len(data['items'])):
yield {
"petition_id": data['items'][item]['petition']['id'],
}
next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
if data['last_page'] == False:
ChangeSpider.pageNumber += 1
yield response.follow(next_page, callback=self.parse)

Try like this:
import scrapy
from scrapy.http import Request
class ChangeSpider(scrapy.Spider):
name = 'change'
start_urls = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
]
pageNumber = 11
def parse(self, response):
data = response.json()
for item in range(len(data['items'])):
yield {
"petition_id": data['items'][item]['petition']['id'],
}
next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
if data['last_page'] == False:
ChangeSpider.pageNumber += 1
yield response.follow(next_page, callback=self.parse)

Access multiple pages with pagination in Scrapy

I have urls with multiple pages. I try to paginate to extract datas from theses urls hut it works only one time (juste one next_page). What's wrong ?
import json
import scrapy
import re
import pkgutil
from scrapy.loader import ItemLoader
from rzc_spider.items import AnnonceItem
class AnnonceSpider(scrapy.Spider):
name = 'rzc_results'
def __init__(self, *args, **kwargs):
data_file = pkgutil.get_data("rzc_spider", "json/input/test_tt.json")
self.data = json.loads(data_file)
def start_requests(self):
for item in self.data:
request = scrapy.Request(item['rzc_url'], callback=self.parse)
request.meta['item'] = item
yield request
def parse(self, response):
item = response.meta['item']
item['results'] = []
item["car_number"] = response.css(
"h2.sub::text").extract_first()
for caritem in response.css("div.ad > div[itemtype='https://schema.org/Vehicle']"):
data = AnnonceItem()
#model
data["model"] = caritem.css(
"em.title::text").extract_first()
item['results'].append(data)
yield item
next_page = response.css(
'a.link::attr(href)').extract_first()
if next_page is not None:
url_pagination = 'https://www.websiteexample.com' + next_page
meta = {'item': response.meta['item']}
yield scrapy.Request(url=url_pagination, callback=self.parse, meta=meta)
#ban proxies reaction
def response_is_ban(self, request, response):
return b'banned' in response.body
def exception_is_ban(self, request, exception):
return None
The json file with the url (a sample in this case):
[{
"rzc_url": "https://www.websiteexample.com/model"
}]

Try and check the URL. Sometimes they set traps so only next_page has a absolute URL and another one has a relative URL. Instead of combining url_pagination with next_page use urljoin. Import it
yield scrapy.Request(urljoin(response.url, item), callback=self.parse, meta=meta)

Scrapy yeild items from multiple requests

I am trying to yield items from different requests as shown here. If I add items = PrintersItem() to each request I get endless loops.. It I take it out other errors occur. Not sure how to combine yield request with yield items for each
import scrapy
from scrapy.http import Request, FormRequest
from ..items import PrintersItem
from scrapy.utils.response import open_in_browser
class PrinterSpider(scrapy.Spider):
name = 'printers'
start_urls = ['http://192.168.137.9', 'http://192.168.137.35', 'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21' ]
def parse(self, response):
items = PrintersItem()
token = response.xpath('//*[#name="CSRFToken"]/#value').extract_first()
print(token)
yield FormRequest.from_response(response, formnumber=1, formdata={
'CSRFToken' : token,
'B55d' : 'password',
'loginurl' : '/general/status.html'
}, callback=self.postlogin2)
def postlogin2(self,response):
items = PrintersItem()
contact = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[1]/text()[last()]').extract()
location = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[2]/text()[last()]').extract()
items['contact'] = contact
items['location'] = location
yield Request(
url = response.url.split('/general')[0] + "/general/information.html?kind=item",
callback=self.action)
for items in self.postlogin2(response):
yield items
def action(self,response):
drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
items['drum'] = drum
print(drum)
printermodel = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/text()').extract()
items['printermodel'] = printermodel
yield Request(
url = response.url.split('/general')[0] + "/net/wired/tcpip.html",
callback=self.action2)
for items in self.action(response):
yield items
def action2(self, response):
tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/#value').extract()
items['tcpip'] = tcpip
for items in self.action2(response):
yield items

If you want to send items from parse to postlogin2, etc. then add it as meta data in Request
yield Request( ..., meta={"items": items})
and get it in other function
items = response.meta["items"]
and yield it only in the last function
yield items
Doc: Request and Response, Request.meta special keys
class PrinterSpider(scrapy.Spider):
name = 'printers'
start_urls = ['http://192.168.137.9', 'http://192.168.137.35',
'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21' ]
def parse(self, response):
token = response.xpath('//*[#name="CSRFToken"]/#value').extract_first()
print(token)
yield FormRequest.from_response(response, formnumber=1, formdata={
'CSRFToken' : token,
'B55d' : 'password',
'loginurl' : '/general/status.html'
}, callback=self.postlogin2)
def postlogin2(self, response):
items = PrintersItem()
contact = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[1]/text()[last()]').extract()
location = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[2]/text()[last()]').extract()
items['contact'] = contact
items['location'] = location
yield Request(
#url=response.urljoin("/general/information.html?kind=item"),
url=response.url.split('/general')[0] + "/general/information.html?kind=item",
callback=self.action,
meta={"items": items})
def action(self, response):
items = response.meta["items"]
drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
items['drum'] = drum
print(drum)
printermodel = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/text()').extract()
items['printermodel'] = printermodel
yield Request(
#url=response.urljoin("/net/wired/tcpip.html"),
url=response.url.split('/general')[0] + "/net/wired/tcpip.html",
callback=self.action2,
meta={"items": items})
def action2(self, response):
items = response.meta["items"]
tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/#value').extract()
items['tcpip'] = tcpip
yield items

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

duplicated data scraper json api - python

Related

scrapy pagination is duplicating lines and stops after 9k rows

Scrapy tracking and scraping third page

Scraping multiple pages with multiple start_urls

Access multiple pages with pagination in Scrapy

Scrapy yeild items from multiple requests

Categories

Resources