scrapy pagination is duplicating lines and stops after 9k rows

scrapy pagination is duplicating lines and stops after 9k rows - python

how are you?
I created this code, but some lines are being duplicated. For example, out of 9k rows, 3k is doubled or tripled. And after 9.112 I get a "not allowed". Is it some glueapi restriction?
This code runs up to 9,112 lines and still comes with 30% of repeated lines. Does anyone know how I can fix this?
import scrapy
import json
from scrapy.exceptions import CloseSpider
class ImoveisSpider(scrapy.Spider):
name = 'teste'
def start_requests(self):
yield scrapy.Request(
url = 'https://glue-api.zapimoveis.com.br/v2/listings?0=U&1=n&2=i&3=t&4=T&5=y&6=p&7=e&8=_&9=N&10=O&11=N&12=E&categoryPage=RESULT&business=SALE&listingType=USED&portal=ZAP&size=34&from=0&page=1&includeFields=search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount),page,facets,fullUriFragments,developments(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),superPremium(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),premiere(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),schema&developmentsSize=3&superPremiumSize=3&levels=LANDING&ref=&__zt=mtc:deduplication',
headers = {
'x-domain': 'www.zapimoveis.com.br'
},
callback=self.parse
)
prm = 0
page = 1
def parse(self, response):
if len(json.loads(response.body)) == 0:
raise CloseSpider('No more products to scrape...')
resp = json.loads(response.body)
listings = resp.get('search').get('result').get('listings')
for info in listings:
yield {
'link': info.get('link').get('href'),
'city': info.get('link').get('data').get('city')
}
self.prm += 34
self.page += 1
yield scrapy.Request(
url=f'https://glue-api.zapimoveis.com.br/v2/listings?0=U&1=n&2=i&3=t&4=T&5=y&6=p&7=e&8=_&9=N&10=O&11=N&12=E&categoryPage=RESULT&business=SALE&listingType=USED&portal=ZAP&size=34&from={self.prm}&page={self.page}&includeFields=search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount),page,facets,fullUriFragments,developments(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),superPremium(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),premiere(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),schema&developmentsSize=3&superPremiumSize=3&levels=LANDING&ref=&__zt=mtc:deduplication',
headers = {
'x-domain': 'www.zapimoveis.com.br'
},
callback=self.parse
)

Related

duplicated data scraper json api

I have this script:
import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
if os.path.exists('jfs_hombre.csv'):
os.remove('jfs_hombre.csv')
print("The file has been deleted successfully")
else:
print("The file does not exist!")
class JfsSpider_hombre(scrapy.Spider):
name = 'jfs_hombre'
#start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}
def start_requests(self):
yield scrapy.Request(
url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
callback=self.parse,
method="GET"
)
def parse(self, response):
resp = response.json()
#print(resp)
for item in range(0,576,32):
resp['recordsFiltered']=item
for result in resp['data']['productSearch']['products']:
yield {
'Casa':'Just_For_Sports',
'Sku' :result['productReference'],
'Name': result['productName'],
'precio': result['priceRange']['sellingPrice']['highPrice'],
'Link': 'https://www.justforsport.com.ar' + result['link'],
'Date':datetime.today().strftime('%Y-%m-%d')
}
if __name__ == "__main__":
process =CrawlerProcess()
process.crawl(JfsSpider_hombre)
process.start()
It works fine and gets 576 rows but the problem is that they are duplicated. When I drop duplicated data I get only 32 unique values, I think I m getting values from only one page ( 32 products per page) How could I iterate throuh all the elements I think it has something to do with the line:
for item in range(0,576,32):
Thanks in advance

You are using 'Casa':'Just_For_Sports', which is not correct, it would be result['Just_For_Sports'] but the most important thing is that from where you have got the "Just_For_Sports". I didn't find it in product list. Actually,you can't include the key that didn't exist in products. 'Date':datetime.today().strftime('%Y-%m-%d') you also will not find in products list as key. Now you can try whether dublicated value exist or not.
import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
if os.path.exists('jfs_hombre.csv'):
os.remove('jfs_hombre.csv')
print("The file has been deleted successfully")
else:
print("The file does not exist!")
class JfsSpider_hombre(scrapy.Spider):
name = 'jfs_hombre'
#start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}
def start_requests(self):
headers = {"content-type": "application/json"}
yield scrapy.Request(
url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
callback=self.parse,
method="GET",
headers=headers,
dont_filter=True
)
def parse(self, response):
resp = response.json()
#print(resp)
for item in range(0,576,32):
resp['data']['productSearch']['recordsFiltered']=item
for result in resp['data']['productSearch']['products']:
yield {
#'Casa':'Just_For_Sports',
'Sku' :result['productReference'],
'Name': result['productName'],
'precio': result['priceRange']['sellingPrice']['highPrice'],
'Link': 'https://www.justforsport.com.ar' + result['link'],
# 'Date':datetime.today().strftime('%Y-%m-%d')
}
if __name__ == "__main__":
process =CrawlerProcess()
process.crawl(JfsSpider_hombre)
process.start()
Proven by set()
import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
if os.path.exists('jfs_hombre.csv'):
os.remove('jfs_hombre.csv')
print("The file has been deleted successfully")
else:
print("The file does not exist!")
class JfsSpider_hombre(scrapy.Spider):
name = 'jfs_hombre'
unique_data = set()
#start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}
def start_requests(self):
headers = {"content-type": "application/json"}
yield scrapy.Request(
url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
callback=self.parse,
method="GET",
headers=headers,
dont_filter=True
)
def parse(self, response):
resp = response.json()
#print(resp)
for item in range(0,576,32):
resp['data']['productSearch']['recordsFiltered']=item
for result in resp['data']['productSearch']['products']:
s=result['productReference']
self.unique_data.add(s)
yield {
#'Casa':'Just_For_Sports',
'Sku' :s,
'Name': result['productName'],
'precio': result['priceRange']['sellingPrice']['highPrice'],
'Link': 'https://www.justforsport.com.ar' + result['link'],
# 'Date':datetime.today().strftime('%Y-%m-%d')
}
if __name__ == "__main__":
process =CrawlerProcess()
process.crawl(JfsSpider_hombre)
process.start()
Output:
'item_scraped_count': 576,

Extracting next page and setting a break

I'm trying to extract webpage data and wished to take the next few pages also but up to a limit, which I can alter. However, I've tested to see if I can at least extract the next few web-pages using Scrapy (As I'm trying to figure this out in Scrapy to learn it), but It only returns the items within the first page.
How do I extract the next pages while setting a limit i.e. 5 pages
For example, here's what I have tried:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
class StatisticsItem(scrapy.Item):
ebay_div = Field(output_processor=TakeFirst())
url = Field(output_processor=TakeFirst())
class StatisticsSpider(scrapy.Spider):
name = 'ebay'
start_urls = ['https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?rt=nc&LH_BIN=1' +
'&LH_PrefLoc=2&mag=1&_sop=16']
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url
)
def parse(self, response):
all_cards = response.xpath('//div[#class="s-item__wrapper clearfix"]')
for card in all_cards:
name = card.xpath('.//h3/text()').get() #get name of product
price = card.xpath('.//span[#class="s-item__price"]//text()').get() #price
product_url = card.xpath('.//a[#class="s-item__link"]//#href').get() #link to product
# now do whatever you want, append to dictionary, yield as item...
summary_data = {
"Name": name,
"Price": price,
"URL": product_url
}
data = {'summary_data': summary_data}
yield scrapy.Request(product_url, meta=data, callback=self.parse_product_details)
# get the next page
next_page_url = card.xpath('.//a[#class="pagination__next icon-link"]/#href').extract_first()
# The last page do not have a valid url and ends with '#'
if next_page_url == None or str(next_page_url).endswith("#"):
self.log("eBay products collected successfully !!!")
else:
print('\n' + '-' * 30)
print('Next page: {}'.format(next_page_url))
yield scrapy.Request(next_page_url, callback=self.parse)
def parse_product_details(self, response):
# Get the summary data
data = response.meta['summary_data']
data['location'] = response.xpath('//span[#itemprop="availableAtOrFrom"]/text()').extract_first()
yield data
process = CrawlerProcess(
settings={
'FEED_URI': 'collectible_cards.json',
'FEED_FORMAT': 'jsonlines'
}
)
process.crawl(StatisticsSpider)
process.start()

You can try like this first make urls then start start_requests
start_urls = ["https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?LH_BIN=1&LH_PrefLoc=2&mag=1&rt=nc&_pgn={}&_sop=16".format(i) for i in range(1,5)]

Scraping multiple pages with multiple start_urls

I want to scrape the details present in json form using scrapy. They are multiple start_urls and each start_url have multiple pages to scrape with. I am just not able to get the logic of how to do so.
import scrapy
from scrapy.http import Request
BASE_URL = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
]
class ChangeSpider(scrapy.Spider):
name = 'change'
def start_requests(self):
for i in range(len(BASE_URL)):
yield Request(BASE_URL[i], callback = self.parse)
pageNumber = 11
def parse(self, response):
data = response.json()
for item in range(len(data['items'])):
yield {
"petition_id": data['items'][item]['petition']['id'],
}
next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
if data['last_page'] == False:
ChangeSpider.pageNumber += 1
yield response.follow(next_page, callback=self.parse)

Try like this:
import scrapy
from scrapy.http import Request
class ChangeSpider(scrapy.Spider):
name = 'change'
start_urls = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
]
pageNumber = 11
def parse(self, response):
data = response.json()
for item in range(len(data['items'])):
yield {
"petition_id": data['items'][item]['petition']['id'],
}
next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
if data['last_page'] == False:
ChangeSpider.pageNumber += 1
yield response.follow(next_page, callback=self.parse)

Scrapy yeild items from multiple requests

I am trying to yield items from different requests as shown here. If I add items = PrintersItem() to each request I get endless loops.. It I take it out other errors occur. Not sure how to combine yield request with yield items for each
import scrapy
from scrapy.http import Request, FormRequest
from ..items import PrintersItem
from scrapy.utils.response import open_in_browser
class PrinterSpider(scrapy.Spider):
name = 'printers'
start_urls = ['http://192.168.137.9', 'http://192.168.137.35', 'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21' ]
def parse(self, response):
items = PrintersItem()
token = response.xpath('//*[#name="CSRFToken"]/#value').extract_first()
print(token)
yield FormRequest.from_response(response, formnumber=1, formdata={
'CSRFToken' : token,
'B55d' : 'password',
'loginurl' : '/general/status.html'
}, callback=self.postlogin2)
def postlogin2(self,response):
items = PrintersItem()
contact = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[1]/text()[last()]').extract()
location = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[2]/text()[last()]').extract()
items['contact'] = contact
items['location'] = location
yield Request(
url = response.url.split('/general')[0] + "/general/information.html?kind=item",
callback=self.action)
for items in self.postlogin2(response):
yield items
def action(self,response):
drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
items['drum'] = drum
print(drum)
printermodel = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/text()').extract()
items['printermodel'] = printermodel
yield Request(
url = response.url.split('/general')[0] + "/net/wired/tcpip.html",
callback=self.action2)
for items in self.action(response):
yield items
def action2(self, response):
tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/#value').extract()
items['tcpip'] = tcpip
for items in self.action2(response):
yield items

If you want to send items from parse to postlogin2, etc. then add it as meta data in Request
yield Request( ..., meta={"items": items})
and get it in other function
items = response.meta["items"]
and yield it only in the last function
yield items
Doc: Request and Response, Request.meta special keys
class PrinterSpider(scrapy.Spider):
name = 'printers'
start_urls = ['http://192.168.137.9', 'http://192.168.137.35',
'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21' ]
def parse(self, response):
token = response.xpath('//*[#name="CSRFToken"]/#value').extract_first()
print(token)
yield FormRequest.from_response(response, formnumber=1, formdata={
'CSRFToken' : token,
'B55d' : 'password',
'loginurl' : '/general/status.html'
}, callback=self.postlogin2)
def postlogin2(self, response):
items = PrintersItem()
contact = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[1]/text()[last()]').extract()
location = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[2]/text()[last()]').extract()
items['contact'] = contact
items['location'] = location
yield Request(
#url=response.urljoin("/general/information.html?kind=item"),
url=response.url.split('/general')[0] + "/general/information.html?kind=item",
callback=self.action,
meta={"items": items})
def action(self, response):
items = response.meta["items"]
drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
items['drum'] = drum
print(drum)
printermodel = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/text()').extract()
items['printermodel'] = printermodel
yield Request(
#url=response.urljoin("/net/wired/tcpip.html"),
url=response.url.split('/general')[0] + "/net/wired/tcpip.html",
callback=self.action2,
meta={"items": items})
def action2(self, response):
items = response.meta["items"]
tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/#value').extract()
items['tcpip'] = tcpip
yield items

Scrapy multiple next page

I want to scrape every next page. I've found a way to do it with scrapy shell but I don't know if my spider will iterate through every page or just the next one; I'm not too sure how to implement that.
alphabet = string.ascii_uppercase
each_link = '.' + alphabet
each_url = ["https://myanimelist.net/anime.php?letter={0}".format(i) for i in each_link]
#sub_page_of_url = [[str(url)+"&show{0}".format(i) for i in range(50, 2000, 50)] for url in each_url] #start/stop/steps
#full_url = each_url + sub_page_of_url
class AnimeScraper_Spider(scrapy.Spider):
name = "Anime"
def start_requests(self):
for url in each_url:
yield scrapy.Request(url=url, callback= self.parse)
def parse(self, response):
next_page_url = response.xpath(
"//div[#class='bgColor1']//a[text()='Next']/#href").extract_first()
for href in response.css('#content > div.normal_header.clearfix.pt16 > div > div > span > a:nth-child(1)') :
url = response.urljoin(href.extract())
yield Request(url, callback = self.parse_anime)
yield Request(next_page_url, callback=self.parse)
def parse_anime(self, response):
for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
return {
"title" : tr_sel.css('a[id] strong::text').extract_first().strip(),
"synopsis" : tr_sel.css("div.pt4::text").extract_first(),
"type_" : tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
"episodes" : tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
"rating" : tr_sel.css('td:nth-child(5)::text').extract_first().strip()
}

I think that you're trying something too complicated, it should be as simple as:
Start from the main page
Identify all the pages that start with a particular letter
For each of these pages, take all the next links and repeat
It looks something like that:
import string
import scrapy
from scrapy import Request
class AnimeSpider(scrapy.Spider):
name = "Anime"
start_urls = ['https://myanimelist.net/anime.php']
def parse(self, response):
xp = "//div[#id='horiznav_nav']//li/a/#href"
return (Request(url, callback=self.parse_anime_list_page) for url in response.xpath(xp).extract())
def parse_anime_list_page(self, response):
for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
yield {
"title": tr_sel.css('a[id] strong::text').extract_first().strip(),
"synopsis": tr_sel.css("div.pt4::text").extract_first(),
"type_": tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
"episodes": tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
"rating": tr_sel.css('td:nth-child(5)::text').extract_first().strip(),
}
next_urls = response.xpath("//div[#class='spaceit']//a/#href").extract()
for next_url in next_urls:
yield Request(response.urljoin(next_url), callback=self.parse_anime_list_page)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

scrapy pagination is duplicating lines and stops after 9k rows - python

Related

duplicated data scraper json api

Extracting next page and setting a break

Scraping multiple pages with multiple start_urls

Scrapy yeild items from multiple requests

Scrapy multiple next page

Categories

Resources