Scrapy pagination is duplicating rows and stops after ~9k rows - python
I wrote the spider below, but some rows come out duplicated: out of roughly 9k rows, about 3k are doubled or tripled. And after row 9,112 I get a "not allowed" response. Is that some restriction of the glue-api endpoint?
The spider runs up to 9,112 rows and still produces about 30% repeated rows. Does anyone know how I can fix this?
import scrapy
import json
from scrapy.exceptions import CloseSpider


class ImoveisSpider(scrapy.Spider):
    name = 'teste'

    def start_requests(self):
        yield scrapy.Request(
url = 'https://glue-api.zapimoveis.com.br/v2/listings?0=U&1=n&2=i&3=t&4=T&5=y&6=p&7=e&8=_&9=N&10=O&11=N&12=E&categoryPage=RESULT&business=SALE&listingType=USED&portal=ZAP&size=34&from=0&page=1&includeFields=search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount),page,facets,fullUriFragments,developments(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),superPremium(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),premiere(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),schema&developmentsSize=3&superPremiumSize=3&levels=LANDING&ref=&__zt=mtc:deduplication',
            headers = {
                'x-domain': 'www.zapimoveis.com.br'
            },
            callback=self.parse
        )

    prm = 0
    page = 1
    def parse(self, response):
        if len(json.loads(response.body)) == 0:
            raise CloseSpider('No more products to scrape...')
        resp = json.loads(response.body)
        listings = resp.get('search').get('result').get('listings')
        for info in listings:
            yield {
                'link': info.get('link').get('href'),
                'city': info.get('link').get('data').get('city')
            }
        self.prm += 34
        self.page += 1
        yield scrapy.Request(
url=f'https://glue-api.zapimoveis.com.br/v2/listings?0=U&1=n&2=i&3=t&4=T&5=y&6=p&7=e&8=_&9=N&10=O&11=N&12=E&categoryPage=RESULT&business=SALE&listingType=USED&portal=ZAP&size=34&from={self.prm}&page={self.page}&includeFields=search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount),page,facets,fullUriFragments,developments(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),superPremium(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),premiere(search(result(listings(listing(listingsCount,sourceId,displayAddressType,amenities,usableAreas,constructionStatus,listingType,description,title,stamps,createdAt,floors,unitTypes,nonActivationReason,providerId,propertyType,unitSubTypes,unitsOnTheFloor,legacyId,id,portal,unitFloor,parkingSpaces,updatedAt,address,suites,publicationType,externalId,bathrooms,usageTypes,totalAreas,advertiserId,advertiserContact,whatsappNumber,bedrooms,acceptExchange,pricingInfos,showPrice,resale,buildings,capacityLimit,status,priceSuggestion),account(id,name,logoUrl,licenseNumber,showAddress,legacyVivarealId,legacyZapId,createdDate,minisite),medias,accountLink,link)),totalCount)),schema&developmentsSize=3&superPremiumSize=3&levels=LANDING&ref=&__zt=mtc:deduplication',
            headers = {
                'x-domain': 'www.zapimoveis.com.br'
            },
            callback=self.parse
        )
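
Not a definitive fix, but two things are worth checking. The duplicated rows most likely come from the server returning overlapping result windows (the query also pulls superPremium/premiere sections, which can repeat regular listings), so deduplicating by a stable listing id on the client side is a reasonable guard. The "not allowed" right after row 9,112 looks like a cap on how deep the offset (from) is allowed to go; that is an assumption based on the error, not documented behavior. A minimal sketch of both ideas, reusing the structure above (the long glue-api URL is omitted for readability, and the path to the listing id is assumed):

import json

import scrapy
from scrapy.exceptions import CloseSpider

PAGE_SIZE = 34
HARD_CAP = 10_000   # assumption: the public search appears to reject offsets past ~10k


class ImoveisDedupSpider(scrapy.Spider):
    """Sketch only: same requests as the spider above, but repeated listings are
    skipped and the crawl stops at totalCount (or the assumed offset cap)."""
    name = 'teste_dedup'
    prm = 0
    page = 1
    seen_ids = set()

    # start_requests() would be identical to the spider above; the very long
    # glue-api URL is left out here only to keep the sketch readable.

    def parse(self, response):
        resp = json.loads(response.body)
        result = resp.get('search', {}).get('result', {})
        listings = result.get('listings') or []
        if not listings:
            raise CloseSpider('No more products to scrape...')

        for info in listings:
            # assumed path to a stable id; adjust to the real payload
            listing_id = info.get('listing', {}).get('id')
            if listing_id in self.seen_ids:
                continue                      # drop rows already emitted
            self.seen_ids.add(listing_id)
            yield {
                'link': info.get('link', {}).get('href'),
                'city': info.get('link', {}).get('data', {}).get('city')
            }

        total = result.get('totalCount') or 0
        self.prm += PAGE_SIZE
        self.page += 1
        if self.prm >= min(total, HARD_CAP):
            raise CloseSpider('Reached totalCount (or the assumed offset cap).')
        # ...otherwise yield the same request as above with the new from/page values

If the cap is real, the usual workaround is to narrow the search (for example by city or price band) so each query stays under the limit; again, that is an assumption about how this API behaves.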
Related
Duplicated data from a JSON API scraper
I have this script:

import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os

if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")


class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'
    #start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
    custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}

    def start_requests(self):
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
            callback=self.parse,
            method="GET"
        )

    def parse(self, response):
        resp = response.json()
        #print(resp)
        for item in range(0, 576, 32):
            resp['recordsFiltered'] = item
            for result in resp['data']['productSearch']['products']:
                yield {
                    'Casa': 'Just_For_Sports',
                    'Sku': result['productReference'],
                    'Name': result['productName'],
                    'precio': result['priceRange']['sellingPrice']['highPrice'],
                    'Link': 'https://www.justforsport.com.ar' + result['link'],
                    'Date': datetime.today().strftime('%Y-%m-%d')
                }


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(JfsSpider_hombre)
    process.start()

It works fine and returns 576 rows, but the problem is that they are duplicated: when I drop the duplicates I am left with only 32 unique values, so I think I am only getting values from one page (32 products per page). How can I iterate through all the elements? I think it has something to do with the line for item in range(0, 576, 32):. Thanks in advance.
You are using 'Casa': 'Just_For_Sports', which is not correct; it would have to be result['Just_For_Sports'], and the more important question is where "Just_For_Sports" comes from in the first place. I didn't find it in the product list; actually, you can't include a key that doesn't exist in products. 'Date': datetime.today().strftime('%Y-%m-%d') is also not a key you will find in the products list. Now you can check whether duplicated values exist or not.

import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os

if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")


class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'
    #start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
    custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}

    def start_requests(self):
        headers = {"content-type": "application/json"}
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
            callback=self.parse,
            method="GET",
            headers=headers,
            dont_filter=True
        )

    def parse(self, response):
        resp = response.json()
        #print(resp)
        for item in range(0, 576, 32):
            resp['data']['productSearch']['recordsFiltered'] = item
            for result in resp['data']['productSearch']['products']:
                yield {
                    #'Casa':'Just_For_Sports',
                    'Sku': result['productReference'],
                    'Name': result['productName'],
                    'precio': result['priceRange']['sellingPrice']['highPrice'],
                    'Link': 'https://www.justforsport.com.ar' + result['link'],
                    #'Date': datetime.today().strftime('%Y-%m-%d')
                }


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(JfsSpider_hombre)
    process.start()

Proven by set():

import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os

if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")


class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'
    unique_data = set()
    #start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
    custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}

    def start_requests(self):
        headers = {"content-type": "application/json"}
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
            callback=self.parse,
            method="GET",
            headers=headers,
            dont_filter=True
        )

    def parse(self, response):
        resp = response.json()
        #print(resp)
        for item in range(0, 576, 32):
            resp['data']['productSearch']['recordsFiltered'] = item
            for result in resp['data']['productSearch']['products']:
                s = result['productReference']
                self.unique_data.add(s)
                yield {
                    #'Casa':'Just_For_Sports',
                    'Sku': s,
                    'Name': result['productName'],
                    'precio': result['priceRange']['sellingPrice']['highPrice'],
                    'Link': 'https://www.justforsport.com.ar' + result['link'],
                    #'Date': datetime.today().strftime('%Y-%m-%d')
                }


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(JfsSpider_hombre)
    process.start()

Output:

'item_scraped_count': 576,
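
Note that neither version above requests more than one page: the loop over range(0, 576, 32) re-reads the single response that was fetched once, which is why only 32 unique products come back. If the goal is all 576 products, one possible approach (a sketch, not verified against this site) is to decode the base64 variables blob inside the extensions query parameter, rewrite its from/to window, and issue one request per window. The persistedQuery hash and the decoded field values below are taken from the URL in the question; everything else is an assumption.

import base64
import json
from urllib.parse import quote

import scrapy

# Decoded from the base64 blob in the question's URL: the "from"/"to" fields
# select the product window (the original request asked for products 64-95).
BASE_VARIABLES = {
    "hideUnavailableItems": False, "skusFilter": "FIRST_AVAILABLE",
    "simulationBehavior": "default", "installmentCriteria": "MAX_WITHOUT_INTEREST",
    "productOriginVtex": False, "map": "c", "query": "hombre",
    "orderBy": "OrderByReleaseDateDESC", "from": 0, "to": 31,
    "selectedFacets": [{"key": "c", "value": "hombre"}], "operator": "and",
    "fuzzy": "0", "searchState": None, "facetsBehavior": "Static",
    "categoryTreeBehavior": "default", "withFacets": False,
}

# Everything before the extensions value, copied from the question's URL.
URL_PREFIX = ('https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master'
              '&maxAge=short&appsEtag=remove&domain=store&locale=es-AR'
              '&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc'
              '&operationName=productSearchV3&variables=%7B%7D&extensions=')


def build_url(start, stop):
    # Re-encode the variables for one window and splice them into the URL.
    variables = dict(BASE_VARIABLES, **{"from": start, "to": stop})
    blob = base64.b64encode(json.dumps(variables).encode()).decode()
    extensions = {
        "persistedQuery": {
            "version": 1,
            "sha256Hash": "6869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5",
            "sender": "vtex.store-resources@0.x",
            "provider": "vtex.search-graphql@0.x",
        },
        "variables": blob,
    }
    return URL_PREFIX + quote(json.dumps(extensions, separators=(',', ':')), safe='')


class JfsHombrePagedSpider(scrapy.Spider):
    # Sketch: one request per 32-product window instead of looping in parse().
    name = 'jfs_hombre_paged'

    def start_requests(self):
        for start in range(0, 576, 32):      # 576 total reported by the question
            yield scrapy.Request(build_url(start, start + 31), callback=self.parse)

    def parse(self, response):
        for result in response.json()['data']['productSearch']['products']:
            yield {
                'Sku': result['productReference'],
                'Name': result['productName'],
                'precio': result['priceRange']['sellingPrice']['highPrice'],
                'Link': 'https://www.justforsport.com.ar' + result['link'],
            }

Whether the endpoint accepts a re-encoded blob exactly like this is untested; treat it purely as a starting point for paging the API rather than looping over one response.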
Extracting next page and setting a break
I'm trying to extract webpage data and wished to take the next few pages as well, up to a limit which I can alter. However, I've tested to see if I can at least extract the next few web pages using Scrapy (as I'm trying to figure this out in Scrapy to learn it), but it only returns the items within the first page. How do I extract the next pages while setting a limit, i.e. 5 pages? For example, here's what I have tried:

import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess


class StatisticsItem(scrapy.Item):
    ebay_div = Field(output_processor=TakeFirst())
    url = Field(output_processor=TakeFirst())


class StatisticsSpider(scrapy.Spider):
    name = 'ebay'
    start_urls = ['https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?rt=nc&LH_BIN=1' +
                  '&LH_PrefLoc=2&mag=1&_sop=16']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url
            )

    def parse(self, response):
        all_cards = response.xpath('//div[@class="s-item__wrapper clearfix"]')
        for card in all_cards:
            name = card.xpath('.//h3/text()').get()  # get name of product
            price = card.xpath('.//span[@class="s-item__price"]//text()').get()  # price
            product_url = card.xpath('.//a[@class="s-item__link"]//@href').get()  # link to product
            # now do whatever you want, append to dictionary, yield as item...
            summary_data = {
                "Name": name,
                "Price": price,
                "URL": product_url
            }
            data = {'summary_data': summary_data}
            yield scrapy.Request(product_url, meta=data, callback=self.parse_product_details)

            # get the next page
            next_page_url = card.xpath('.//a[@class="pagination__next icon-link"]/@href').extract_first()
            # The last page does not have a valid url and ends with '#'
            if next_page_url == None or str(next_page_url).endswith("#"):
                self.log("eBay products collected successfully !!!")
            else:
                print('\n' + '-' * 30)
                print('Next page: {}'.format(next_page_url))
                yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_product_details(self, response):
        # Get the summary data
        data = response.meta['summary_data']
        data['location'] = response.xpath('//span[@itemprop="availableAtOrFrom"]/text()').extract_first()
        yield data


process = CrawlerProcess(
    settings={
        'FEED_URI': 'collectible_cards.json',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(StatisticsSpider)
process.start()
You can try it like this: first build the URLs, then let start_requests work through them, as in the sketch below.

start_urls = ["https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?LH_BIN=1&LH_PrefLoc=2&mag=1&rt=nc&_pgn={}&_sop=16".format(i) for i in range(1, 6)]  # pages 1-5
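
For completeness, a sketch of how that list could plug into the spider from the question; the _pgn paging parameter comes from the answer above, the per-card selectors from the question, and the page_limit attribute is an added assumption:

import scrapy


class EbayPagedSpider(scrapy.Spider):
    name = 'ebay_paged'
    page_limit = 5   # assumed knob: how many listing pages to request

    def start_requests(self):
        base = ('https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/'
                'bn_1852210?LH_BIN=1&LH_PrefLoc=2&mag=1&rt=nc&_pgn={}&_sop=16')
        for page in range(1, self.page_limit + 1):
            yield scrapy.Request(base.format(page), callback=self.parse)

    def parse(self, response):
        # same per-card extraction as in the question; only the paging changed
        for card in response.xpath('//div[@class="s-item__wrapper clearfix"]'):
            yield {
                'Name': card.xpath('.//h3/text()').get(),
                'Price': card.xpath('.//span[@class="s-item__price"]//text()').get(),
                'URL': card.xpath('.//a[@class="s-item__link"]//@href').get(),
            }

Keeping the limit in a single attribute makes it easy to alter, which was the original requirement.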
Scraping multiple pages with multiple start_urls
I want to scrape details that are served as JSON using Scrapy. There are multiple start_urls, and each start_url has multiple pages to scrape. I just can't work out the logic for how to do it.

import scrapy
from scrapy.http import Request

BASE_URL = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
            ]


class ChangeSpider(scrapy.Spider):
    name = 'change'

    def start_requests(self):
        for i in range(len(BASE_URL)):
            yield Request(BASE_URL[i], callback=self.parse)

    pageNumber = 11

    def parse(self, response):
        data = response.json()
        for item in range(len(data['items'])):
            yield {
                "petition_id": data['items'][item]['petition']['id'],
            }

        next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
        if data['last_page'] == False:
            ChangeSpider.pageNumber += 1
            yield response.follow(next_page, callback=self.parse)
Try like this:

import scrapy
from scrapy.http import Request


class ChangeSpider(scrapy.Spider):
    name = 'change'

    start_urls = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
                  ]

    pageNumber = 11

    def parse(self, response):
        data = response.json()
        for item in range(len(data['items'])):
            yield {
                "petition_id": data['items'][item]['petition']['id'],
            }

        next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
        if data['last_page'] == False:
            ChangeSpider.pageNumber += 1
            yield response.follow(next_page, callback=self.parse)
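
One thing to watch in both versions: pageNumber is a single class-level counter and next_page always points at the animals-19 tag, so all eight tags advance one shared offset and only the animals feed is actually paged. A sketch of per-tag offsets, carrying each tag's URL template and offset through cb_kwargs; the limit=8 step and the last_page field come from the question, the rest is an assumption:

import scrapy

# URL templates from the question; the offset is filled in per request.
TAG_TEMPLATES = [
    "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
    "https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
    # ...remaining tag URLs from the question...
]


class ChangePerTagSpider(scrapy.Spider):
    # Sketch: each tag keeps its own offset instead of sharing one counter.
    name = 'change_per_tag'
    page_size = 8

    def start_requests(self):
        for template in TAG_TEMPLATES:
            yield scrapy.Request(
                template.format(0),
                callback=self.parse,
                cb_kwargs={'template': template, 'offset': 0},
            )

    def parse(self, response, template, offset):
        data = response.json()
        for item in data['items']:
            yield {'petition_id': item['petition']['id']}

        # 'last_page' is the flag used in the question's JSON; stop when it flips.
        if not data.get('last_page'):
            next_offset = offset + self.page_size
            yield scrapy.Request(
                template.format(next_offset),
                callback=self.parse,
                cb_kwargs={'template': template, 'offset': next_offset},
            )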
Scrapy yield items from multiple requests
I am trying to yield items from different requests as shown here. If I add items = PrintersItem() to each request I get endless loops; if I take it out, other errors occur. I'm not sure how to combine yield request with yield items for each one.

import scrapy
from scrapy.http import Request, FormRequest
from ..items import PrintersItem
from scrapy.utils.response import open_in_browser


class PrinterSpider(scrapy.Spider):
    name = 'printers'
    start_urls = ['http://192.168.137.9', 'http://192.168.137.35', 'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21']

    def parse(self, response):
        items = PrintersItem()
        token = response.xpath('//*[@name="CSRFToken"]/@value').extract_first()
        print(token)
        yield FormRequest.from_response(response, formnumber=1, formdata={
            'CSRFToken': token,
            'B55d': 'password',
            'loginurl': '/general/status.html'
        }, callback=self.postlogin2)

    def postlogin2(self, response):
        items = PrintersItem()
        contact = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[1]/text()[last()]').extract()
        location = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[2]/text()[last()]').extract()
        items['contact'] = contact
        items['location'] = location
        yield Request(
            url=response.url.split('/general')[0] + "/general/information.html?kind=item",
            callback=self.action)
        for items in self.postlogin2(response):
            yield items

    def action(self, response):
        drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
        items['drum'] = drum
        print(drum)
        printermodel = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/text()').extract()
        items['printermodel'] = printermodel
        yield Request(
            url=response.url.split('/general')[0] + "/net/wired/tcpip.html",
            callback=self.action2)
        for items in self.action(response):
            yield items

    def action2(self, response):
        tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/@value').extract()
        items['tcpip'] = tcpip
        for items in self.action2(response):
            yield items
If you want to send items from parse to postlogin2, etc., then add them as meta data in the Request:

yield Request(..., meta={"items": items})

get them back in the next callback with

items = response.meta["items"]

and yield the item only in the last function:

yield items

Docs: Request and Response, Request.meta special keys.

class PrinterSpider(scrapy.Spider):
    name = 'printers'
    start_urls = ['http://192.168.137.9', 'http://192.168.137.35', 'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21']

    def parse(self, response):
        token = response.xpath('//*[@name="CSRFToken"]/@value').extract_first()
        print(token)
        yield FormRequest.from_response(response, formnumber=1, formdata={
            'CSRFToken': token,
            'B55d': 'password',
            'loginurl': '/general/status.html'
        }, callback=self.postlogin2)

    def postlogin2(self, response):
        items = PrintersItem()
        contact = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[1]/text()[last()]').extract()
        location = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[2]/text()[last()]').extract()
        items['contact'] = contact
        items['location'] = location
        yield Request(
            #url=response.urljoin("/general/information.html?kind=item"),
            url=response.url.split('/general')[0] + "/general/information.html?kind=item",
            callback=self.action,
            meta={"items": items})

    def action(self, response):
        items = response.meta["items"]
        drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
        items['drum'] = drum
        print(drum)
        printermodel = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/text()').extract()
        items['printermodel'] = printermodel
        yield Request(
            #url=response.urljoin("/net/wired/tcpip.html"),
            url=response.url.split('/general')[0] + "/net/wired/tcpip.html",
            callback=self.action2,
            meta={"items": items})

    def action2(self, response):
        items = response.meta["items"]
        tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/@value').extract()
        items['tcpip'] = tcpip
        yield items
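
A side note, not part of the answer above: newer Scrapy versions (1.7+) can pass the same data through cb_kwargs, which delivers it as named callback arguments instead of going through response.meta. A compressed sketch with illustrative XPaths (the real ones are the long form paths used above):

import scrapy
from scrapy.http import Request


class PrinterCbKwargsSpider(scrapy.Spider):
    # Sketch: same hand-off as the meta-based answer, using cb_kwargs instead.
    name = 'printers_cb_kwargs'
    start_urls = ['http://192.168.137.9']          # one printer is enough for the sketch

    def parse(self, response):
        item = {'contact': response.xpath('//li[1]/text()').get()}   # illustrative XPath
        yield Request(
            response.urljoin('/general/information.html?kind=item'),
            callback=self.parse_information,
            cb_kwargs={'item': item},              # arrives as a keyword argument
        )

    def parse_information(self, response, item):
        item['drum'] = response.xpath('//dd[1]/text()').get()        # illustrative XPath
        yield item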
Scrapy multiple next page
I want to scrape every next page. I've found a way to do it with scrapy shell, but I don't know if my spider will iterate through every page or just the next one; I'm not too sure how to implement that.

import string

import scrapy
from scrapy import Request

alphabet = string.ascii_uppercase
each_link = '.' + alphabet
each_url = ["https://myanimelist.net/anime.php?letter={0}".format(i) for i in each_link]
#sub_page_of_url = [[str(url)+"&show{0}".format(i) for i in range(50, 2000, 50)] for url in each_url]  #start/stop/steps
#full_url = each_url + sub_page_of_url


class AnimeScraper_Spider(scrapy.Spider):
    name = "Anime"

    def start_requests(self):
        for url in each_url:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        next_page_url = response.xpath(
            "//div[@class='bgColor1']//a[text()='Next']/@href").extract_first()
        for href in response.css('#content > div.normal_header.clearfix.pt16 > div > div > span > a:nth-child(1)'):
            url = response.urljoin(href.extract())
            yield Request(url, callback=self.parse_anime)
        yield Request(next_page_url, callback=self.parse)

    def parse_anime(self, response):
        for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
            return {
                "title": tr_sel.css('a[id] strong::text').extract_first().strip(),
                "synopsis": tr_sel.css("div.pt4::text").extract_first(),
                "type_": tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
                "episodes": tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
                "rating": tr_sel.css('td:nth-child(5)::text').extract_first().strip()
            }
I think you're trying something too complicated; it should be as simple as:

1. Start from the main page.
2. Identify all the pages that start with a particular letter.
3. For each of these pages, take all the next links and repeat.

It looks something like this:

import string

import scrapy
from scrapy import Request


class AnimeSpider(scrapy.Spider):
    name = "Anime"
    start_urls = ['https://myanimelist.net/anime.php']

    def parse(self, response):
        xp = "//div[@id='horiznav_nav']//li/a/@href"
        return (Request(url, callback=self.parse_anime_list_page) for url in response.xpath(xp).extract())

    def parse_anime_list_page(self, response):
        for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
            yield {
                "title": tr_sel.css('a[id] strong::text').extract_first().strip(),
                "synopsis": tr_sel.css("div.pt4::text").extract_first(),
                "type_": tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
                "episodes": tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
                "rating": tr_sel.css('td:nth-child(5)::text').extract_first().strip(),
            }

        next_urls = response.xpath("//div[@class='spaceit']//a/@href").extract()
        for next_url in next_urls:
            yield Request(response.urljoin(next_url), callback=self.parse_anime_list_page)