I am using scrapy and running this script:
import scrapy
from ..items import SizeerItem
from scrapy.http.request import Request

class SizeerSpiderSpider(scrapy.Spider):
    name = 'sizeer'
    pg = 0
    currentPg = 2
    start_urls = [
        'https://sizeer.lt/moterims'
    ]

    def parse(self, response):
        items = SizeerItem()

        pages = response.xpath("//nav[@class='m-pagination']//span[3]/text()").extract()
        pages = list(dict.fromkeys(pages))

        if self.pg == 0:
            pages = list(int(s) for s in pages[0].split() if s.isdigit())
            self.pg = pages[0]

        name = response.xpath("//div[@class='b-productList_content']//a/@href").extract()
        items['name'] = list(dict.fromkeys(name))

        while self.currentPg <= self.pg:
            url = response.request.url + "?sort=default&limit=60&page=" + str(self.currentPg)
            self.currentPg += 1
            yield Request(url, callback=self.parse)
This way:
scrapy crawl sizeer -s FEED_URI='mydata.json' -s FEED_FORMAT=json
But afterwards my mydata.json is empty. This is my first time trying to 'play' with Scrapy and I can't really work out where the issue is.
You also need to yield the items you scrape so the Scrapy engine will run them through the pipelines and through the feed export (which is what actually writes them to the file).
Since yield is non-blocking, you can add it right after populating the item and the function will still yield your requests afterwards:
...
name = response.xpath("//div[@class='b-productList_content']//a/@href").extract()
items['name'] = list(dict.fromkeys(name))
yield items  # <<< here, for example

while self.currentPg <= self.pg:
    ...
As @yordan pointed out, you can simplify the way you execute the spider like this (however, it's not the solution to the problem):
scrapy crawl sizeer -o mydata.json
Try this one:
Scrapy use item and save data in a json file
Pay attention to the yielding and the calling of the spider.
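For reference, this is the spider from the question with that fix applied; the only new line relative to the original code is the yield items:

import scrapy
from ..items import SizeerItem
from scrapy.http.request import Request

class SizeerSpiderSpider(scrapy.Spider):
    name = 'sizeer'
    pg = 0
    currentPg = 2
    start_urls = [
        'https://sizeer.lt/moterims'
    ]

    def parse(self, response):
        items = SizeerItem()

        pages = response.xpath("//nav[@class='m-pagination']//span[3]/text()").extract()
        pages = list(dict.fromkeys(pages))

        if self.pg == 0:
            pages = list(int(s) for s in pages[0].split() if s.isdigit())
            self.pg = pages[0]

        name = response.xpath("//div[@class='b-productList_content']//a/@href").extract()
        items['name'] = list(dict.fromkeys(name))
        yield items  # without this, nothing ever reaches the feed exporter

        while self.currentPg <= self.pg:
            url = response.request.url + "?sort=default&limit=60&page=" + str(self.currentPg)
            self.currentPg += 1
            yield Request(url, callback=self.parse)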
I am trying to parse a public forum that contains multiple threads, and I need to store each thread's metadata. This metadata appears before entering the thread, i.e. on the page that displays the list of discussion threads.
In my Scrapy code below, I need to access values from the parse() method inside the parse_contents() method. I am storing those values in class variables, but parse_contents() only picks up the value that was assigned the very first time, even though a new value is assigned before parse_contents() is called.
Here is my spider class
import scrapy
import re
import pandas as pd
import time
from functools import reduce
from ..items import PostsItem

class SpiderSpider(scrapy.Spider):
    name = 'posts'
    page_count = 1
    forum_count = 0

    # Create an item container to store all this data
    post_item = PostsItem()

    # I want these variables to be available in the parse_contents() method
    post_subject_last_message_date = ""
    total_posts = 0

    start_urls = [
        # 'https://www.dcurbanmom.com/jforum/posts/list/150/946237.page'
        'https://www.dcurbanmom.com/jforum/forums/show/32.page'
    ]

    # Grabs the list of threads in the DCPS forum
    def parse(self, response):
        for next_forum in response.xpath('//span[@class="topictitle"]'):
            next_forum_link = next_forum.xpath('.//a/@href')
            next_forum_url = response.urljoin(next_forum_link.extract_first())
            last_message = next_forum.xpath('.//ancestor::td[1]/following-sibling::td[4]/span/text()')
            self.post_subject_last_message_date = last_message.get()  # This needs to be picked up by parse_contents
            yield scrapy.Request(url=next_forum_url, callback=self.parse_contents)

        # Get next page of discussion threads list
        # Some code here

    # Parses an individual discussion thread
    def parse_contents(self, response):
        all_posts = response.xpath('//table[@class="forumline"]//tr')
        post_text = ""
        for post in all_posts:
            post_text_response = post.xpath(".//div[@class='postbody']/br/following-sibling::text()[1] | .//div[@class='postbody']/br/following-sibling::a[1]/text() | .//div[@class='postbody']/text() | .//div[@class='postbody']/a/text()")
            if len(post_text_response.getall()) > 0:
                post_text = "".join(re.sub('\r', '', x) for x in post_text_response.getall()).strip()

            # Populate the item container
            if bool(re.search(r'^\s*$', post_text)) == False:
                self.post_item['post_message'] = post_text
                # !!! This is not picking up the value updated in the parse method !!!
                self.post_item['post_subject_last_message_date'] = self.post_subject_last_message_date
                post_text = ""
                yield self.post_item

        # Go to next page in this discussion thread
        # Some code here
How can I fix this?
Edit: removed some lines of code to make it easier to read
Replacing yield scrapy.Request(url=next_forum_url, callback=self.parse_contents) with the following fixed it for me:
yield scrapy.Request(url=next_forum_url, callback=self.parse_contents, cb_kwargs={
    'post_subject_answers': post_subject_answer,
    'post_subject_first_post_date': post_subject_first_post_date,
    'post_subject_views': post_subject_views,
    'post_subject_last_message_date': post_subject_last_message_date
})
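For completeness, the callback then has to accept those keys as keyword arguments; a minimal sketch of the receiving side (the fields besides the last-message date come from code the poster elided, so they are only shown being passed through):

# Sketch: each cb_kwargs key arrives as a keyword argument of the callback.
def parse_contents(self, response, post_subject_answers, post_subject_first_post_date,
                   post_subject_views, post_subject_last_message_date):
    post_item = PostsItem()  # a fresh item per thread, instead of one shared on the class
    post_item['post_subject_last_message_date'] = post_subject_last_message_date
    # ... extract post_message from the response as in the original parse_contents ...
    yield post_item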
First of all, this is my code:
from twisted.internet import reactor
from scrapy.crawler import CrawlerProcess, CrawlerRunner
import scrapy
#from scrapy import log, signals
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from scrapy.settings import Settings
import datetime
from multiprocessing import Process, Queue
import os
from scrapy.http import Request
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.signalmanager import SignalManager
import re

#query=raw_input("Enter a product to search for= ")
query = 'apple'
query1 = query.replace(" ", "+")


class DmozItem(scrapy.Item):
    productname = scrapy.Field()
    product_link = scrapy.Field()
    current_price = scrapy.Field()
    mrp = scrapy.Field()
    offer = scrapy.Field()
    imageurl = scrapy.Field()
    outofstock_status = scrapy.Field()
    add = scrapy.Field()


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["http://www.bestmercato.com"]

    def start_requests(self):
        task_urls = [
        ]
        i = 1
        for i in range(1, 2):
            temp = ("https://www.bestmercato.com/index.php?route=product/search&search=" + query1 + "&page=" + str(i))
            task_urls.append(temp)
            i = i + 1

        start_urls = (task_urls)
        # p=len(task_urls)
        return [Request(url=start_url) for start_url in start_urls]

    def parse(self, response):
        items = []
        for sel in response.xpath('//html/body/div/div/div[4]/div/div/div[5]/div'):
            item = DmozItem()
            item['productname'] = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[@class="name"]/a/text()').extract())[3:-2]
            item['product_link'] = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[@class="name"]/a/@href').extract())[3:-2]

            point1 = sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[4]').extract()
            point = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[4]/@class').extract())[3:-2]
            checker = "options" in point
            item['current_price'] = ""
            if checker:
                i = 1
                p = 1
                while i == 1:
                    t = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[4]/div/select/option[' + str(p) + ']/text()').extract())[3:-2]
                    #print t
                    if 'Rs' not in t:
                        i = 2
                    elif 'Rs' in t:
                        i = 1
                    t = " ".join(t)
                    s = t.translate(None, '\ t')[:-2]
                    item['current_price'] = item['current_price'] + ' ; ' + s
                    p = p + 1
                item['current_price'] = item['current_price'][3:-3]
            else:
                item['current_price'] = 'Rs. ' + str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[not (@class="name") or not(@class="description") or not(@class="qty") or not(@class="box_btn_icon")]/text()').extract())[46:-169]
                re.findall(r"[-+]?\d*\.\d+|\d+", item["current_price"])

            try:
                test1 = str(sel.xpath('div/div[2]/div[3]/span[1]/text()').extract())[3:-2]
                _digits = re.compile('\d')
                if bool(_digits.search(test1)):
                    print 'hi'
                    test1 = test1[:2] + '. ' + test1[3:]
                    item['mrp'] = test1
                    #item['mrp'][2:2]='.'
                    test2 = str(sel.xpath('div/div[2]/div[3]/span[2]/text()').extract())[3:-2]
                    test2 = test2[:2] + '. ' + test2[3:]
                    item['current_price'] = test2
                else:
                    item['mrp'] = item['current_price']
            except:
                item['mrp'] = item['current_price']

            item['offer'] = 'No additional offer available'
            item['imageurl'] = str(sel.xpath('div[@class="product-thumb"]/div[@class="image"]/a[not (@class="sft_quickshop_icon")]/img[@class="img-responsive"]/@src').extract())[3:-2]
            item['outofstock_status'] = str('In Stock')

            request = Request(str(item['product_link']), callback=self.parse2, dont_filter=True)
            request.meta['item'] = item
            # print item
            items.append(item)
            return request

        print (items)

    def parse2(self, response):
        item = response.meta['item']
        item['add'] = response.url
        return item


spider1 = DmozSpider()
settings = Settings()
settings.set("PROJECT", "dmoz")
settings.set("CONCURRENT_REQUESTS", 100)
#)
#settings.set( "DEPTH_PRIORITY" , 1)
#settings.set("SCHEDULER_DISK_QUEUE" , "scrapy.squeues.PickleFifoDiskQueue")
#settings.set( "SCHEDULER_MEMORY_QUEUE" , "scrapy.squeues.FifoMemoryQueue")
crawler = CrawlerProcess(settings)
crawler.crawl(spider1)
crawler.start()
Now, these are the issues that I am facing.
1. There are numerous divs that match this XPath: '//html/body/div/div/div[4]/div/div/div[5]/div'. However, the code above scrapes the contents of only the first div, i.e. the one with the XPath 'html/body/div/div/div[4]/div/div/div[5]/div[1]', and not all of them.
The moment I comment out these three lines, the scraper scrapes everything, but then, of course, I am not able to add the 'add' field to the item:
request = Request(str(item['product_link']),callback=self.parse2, dont_filter=True)
request.meta['item'] = item
return request
So, I want to scrape all the divs, together with the 'add' field in my item class (notice the class DmozItem). How do I do that? Please give corrected code for my SPECIFIC case; it would be best that way!
2. Secondly, as I said, when I comment out the three lines mentioned above, the program scrapes everything in close to 5 seconds (around 4.9 seconds).
But as soon as I un-comment those 3 lines (again, the ones mentioned above), the program's run time increases drastically, to close to 9 seconds (around 8.8-8.9 seconds). Why does this happen? Is it because of dont_filter=True? Please suggest ways to overcome this, as the run time can be a very big problem for me. Also, can I decrease the initial time of about 5 seconds (around 4.9) somehow?
Use html/body/div/div/div[4]/div/div/div[5]//div to get all divs after div[5].
EDIT:
This is the correct XPath - //html/body/div/div/div[4]/div/div/div[5]/div - which gave all the divs after div[5]. The one mentioned previously gave multiple errors!
If you do a return statement inside the loop, you end the whole method execution. So if you enable those three lines, you end the execution of your method (and the for loop) after the first element.
This means you should yield your request instead of returning it.
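A minimal sketch of that change, keeping the rest of the posted parse() body unchanged (only the tail of the loop and parse2 are shown):

def parse(self, response):
    for sel in response.xpath('//html/body/div/div/div[4]/div/div/div[5]/div'):
        item = DmozItem()
        # ... populate item exactly as in the question ...
        request = Request(str(item['product_link']), callback=self.parse2, dont_filter=True)
        request.meta['item'] = item
        yield request  # yield lets the for loop continue; return would stop after the first div

def parse2(self, response):
    item = response.meta['item']
    item['add'] = response.url
    yield item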
I am using Scrapy to grab stock data from Yahoo! Finance.
Sometimes I need to loop over several pages (19 in this example) in order to get all of the stock data.
Previously (when I knew there would only be two pages), I would use one function for each page, like so:
def stocks_page_1(self, response):
    returns_page1 = []
    # Grabs data here...
    current_page = response.url
    next_page = current_page + "&z=66&y=66"
    yield Request(next_page, self.stocks_page_2, meta={'returns_page1': returns_page1})

def stocks_page_2(self, response):
    # Grab data again...
Now, instead of writing 19 or more functions, I was wondering if there was a way I could loop through an iteration using one function to grab all data from all pages available for a given stock.
Something like this:
for x in range(30):  # 30 was randomly selected
    current_page = response.url
    # Grabs Data

    # Check if there is a 'next' page:
    if response.xpath('//td[@align="right"]/a[@rel="next"]').extract() != ' ':
        u = x * 66
        next_page = current_page + "&z=66&y={0}".format(u)
        # Go to the next page somehow within the function???
Updated Code:
Works, but only returns one page of data.
class DmozSpider(CrawlSpider):
    name = "dnot"
    allowed_domains = ["finance.yahoo.com", "http://eoddata.com/"]
    start_urls = ['http://finance.yahoo.com/q?s=CAT']
    rules = [
        Rule(LinkExtractor(restrict_xpaths='//td[@align="right"]/a[@rel="next"]'),
             callback='stocks1',
             follow=True),
    ]

    def stocks1(self, response):
        returns = []
        rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
        for row in rows:
            cells = row.xpath('.//td/text()').extract()
            try:
                values = cells[-1]
                try:
                    float(values)
                    returns.append(values)
                except ValueError:
                    continue
            except ValueError:
                continue

        unformatted_returns = response.meta.get('returns_pages')
        returns = [float(i) for i in returns]
        global required_amount_of_returns, counter
        if counter == 1 and "CAT" in response.url:
            required_amount_of_returns = len(returns)
        elif required_amount_of_returns == 0:
            raise CloseSpider("'Error with initiating required amount of returns'")

        counter += 1
        print counter

        # Iterator to calculate Rate of return
        # ====================================
        if data_intervals == "m":
            k = 12
        elif data_intervals == "w":
            k = 4
        else:
            k = 30

        sub_returns_amount = required_amount_of_returns - k
        sub_returns = returns[:sub_returns_amount]
        rate_of_return = []

        if len(returns) == required_amount_of_returns or "CAT" in response.url:
            for number in sub_returns:
                numerator = number - returns[k]
                rate = numerator / returns[k]
                if rate == '':
                    rate = 0
                rate_of_return.append(rate)
                k += 1

        item = Website()
        items = []
        item['url'] = response.url
        item['name'] = response.xpath('//div[@class="title"]/h2/text()').extract()
        item['avg_returns'] = numpy.average(rate_of_return)
        item['var_returns'] = numpy.cov(rate_of_return)
        item['sd_returns'] = numpy.std(rate_of_return)
        item['returns'] = returns
        item['rate_of_returns'] = rate_of_return
        item['exchange'] = response.xpath('//span[@class="rtq_exch"]/text()').extract()
        item['ind_sharpe'] = ((numpy.average(rate_of_return) - RFR) / numpy.std(rate_of_return))
        items.append(item)
        yield item
You see, a parse callback is just a function that takes the response and returns or yields either Items or Requests or both. There is no issue at all with reusing these callbacks, so you can just pass the same callback for every request.
Now, you could pass the current page info using the Request meta, but instead I'd leverage the CrawlSpider to crawl across every page. It's really easy; start by generating the spider from the command line:
scrapy genspider --template crawl finance finance.yahoo.com
Then write it like this:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
Scrapy 1.0 has deprecated the scrapy.contrib namespace for the modules above, but if you're stuck with 0.24, use scrapy.contrib.linkextractors and scrapy.contrib.spiders.
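If that applies, the equivalent imports would be:

# Scrapy 0.24-era imports (the scrapy.contrib paths mentioned above)
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule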
from yfinance.items import YfinanceItem


class FinanceSpider(CrawlSpider):
    name = 'finance'
    allowed_domains = ['finance.yahoo.com']
    start_urls = ['http://finance.yahoo.com/q/hp?s=PWF.TO&a=04&b=19&c=2005&d=04&e=19&f=2010&g=d&z=66&y=132']

    rules = (
        Rule(LinkExtractor(restrict_css='[rel="next"]'),
             callback='parse_items',
             follow=True),
    )
LinkExtractor will pick up the links in the response to follow, but it can be limited with XPath (or CSS) and regular expressions. See documentation for more.
Rules will follow the links and call the callback on every response. follow=True will keep extracting links on every new response, but it can be limited by depth. See documentation again.
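For instance, a depth cap would go in the project settings (DEPTH_LIMIT is a standard Scrapy setting; the value here is arbitrary):

# settings.py -- stop following pagination links after this many hops
DEPTH_LIMIT = 20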
    def parse_items(self, response):
        for line in response.css('.yfnc_datamodoutline1 table tr')[1:-1]:
            yield YfinanceItem(date=line.css('td:first-child::text').extract()[0])
Just yield the Items, since Requests for the next pages will be handled by the CrawlSpider Rules.
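With the Item yielded from parse_items, running something like scrapy crawl finance -o prices.json (the output file name here is just an example) should export one record per table row without any manual page handling.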
Just started toying around with scrapy for a bit to help scrape some fantasy basketball stats. My main problem is in my spider: how do I scrape the href attribute of a link and then call back another parser on that URL?
I looked into link extractors, and I think they might be my solution, but I'm not sure. I've re-read it over and over again and am still confused about where to start. The following is the code I have so far.
def parse_player(self, response):
    player_name = "Steven Adams"
    sel = Selector(response)
    player_url = sel.xpath("//a[text()='%s']/@href" % player_name).extract()
    return Request("http://sports.yahoo.com/'%s'" % player_url, callback=self.parse_curr_stats)

def parse_curr_stats(self, response):
    sel = Selector(response)
    stats = sel.xpath("//div[@id='mediasportsplayercareerstats']//table[@summary='Player']/tbody/tr[last()-1]")
    items = []
    for stat in stats:
        item = player_item()
        item['fgper'] = stat.xpath("td[@title='Field Goal Percentage']/text()").extract()
        item['ftper'] = stat.xpath("td[@title='Free Throw Percentage']/text()").extract()
        item['treys'] = stat.xpath("td[@title='3-point Shots Made']/text()").extract()
        item['pts'] = stat.xpath("td[@title='Points']/text()").extract()
        item['reb'] = stat.xpath("td[@title='Total Rebounds']/text()").extract()
        item['ast'] = stat.xpath("td[@title='Assists']/text()").extract()
        item['stl'] = stat.xpath("td[@title='Steals']/text()").extract()
        item['blk'] = stat.xpath("td[@title='Blocked Shots']/text()").extract()
        item['tov'] = stat.xpath("td[@title='Turnovers']/text()").extract()
        item['fga'] = stat.xpath("td[@title='Field Goals Attempted']/text()").extract()
        item['fgm'] = stat.xpath("td[@title='Field Goals Made']/text()").extract()
        item['fta'] = stat.xpath("td[@title='Free Throws Attempted']/text()").extract()
        item['ftm'] = stat.xpath("td[@title='Free Throws Made']/text()").extract()
        items.append(item)
    return items
So as you can see, in the first parse function, you're given a name, and you look for the link on the page that will guide you to their individual page, which is stored in "player_url". How do I then go to that page and run the 2nd parser on it?
I feel as if I am completely glossing over something and if someone could shed some light it would be greatly appreciated!
When you want to send a Request object, just use yield rather than return like this:
def parse_player(self, response):
    ......
    yield Request(......)
If there are many Request objects that you want to send from a single parse method, a best practice is like this:
def parse_player(self, response):
    ......
    res_objs = []
    # then add every Request object into the 'res_objs' list,
    # and at the end of the method, do the following:
    for req in res_objs:
        yield req
I think when the scrapy spider is running, it will handle requests under the hood like this:
# handle requests
for req_obj in self.parse_player():
    # do something with the Request object
So just remember to use yield to send Request objects.
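Applied to the posted parse_player, a minimal sketch (assuming a Scrapy version with response.xpath() and extract_first(); note that extract() returns a list, so a single href is pulled out and joined into an absolute URL first):

def parse_player(self, response):
    player_name = "Steven Adams"
    # extract_first() gives one relative href (or None) instead of a list
    player_url = response.xpath("//a[text()='%s']/@href" % player_name).extract_first()
    if player_url:
        # urljoin builds the absolute URL; yield hands the Request to the engine
        yield Request(response.urljoin(player_url), callback=self.parse_curr_stats)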
I'm trying to scrape projecteuler.net with Python's scrapy library, just to practice with it. I've seen more than one existing implementation of such a scraper online, but they seem far too elaborate for me. I simply want to save the problems (titles, ids, contents) to JSON and then load them with AJAX in a local web page on my PC.
I'm implementing my own solution, which I will finish anyway, but since I want to discover the smartest way to use the library, I'm asking you to propose the most intelligent scrapy programs for doing this job (if you want to avoid the JSON route and save directly to HTML, that may be even better for me).
This is my first approach (doesn't work):
# -*- coding: utf-8 -*-
import httplib2
import requests
import scrapy
from eulerscraper.items import Problem
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider, Rule


def start_urls_detection():
    # su = ['https://projecteuler.net/archives', 'https://projecteuler.net/archives;page=2']
    # i = 1
    #
    # while True:
    #     request = requests.get(su[i])
    #
    #     if request.status_code != 200:
    #         break
    #
    #     i += 1
    #     su.append('https://projecteuler.net/archives;page=' + str(i + 1))

    return ["https://projecteuler.net/"]


class EulerSpider(CrawlSpider):
    name = 'euler'
    allowed_domains = ['projecteuler.net']
    start_urls = start_urls_detection()

    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        # Rule(LinkExtractor(allow=('category\.php',), deny=('subsection\.php',))),
        Rule(LinkExtractor(allow=('problem=\d*',)), callback="parse_problems"),
        Rule(LinkExtractor(allow=('archives;page=\d*',), unique=True), follow=True)
    )

    def start_requests(self):
        # su = ['https://projecteuler.net/archives', 'https://projecteuler.net/archives;page=2']
        # i = 1
        #
        # while True:
        #     request = requests.get(su[i])
        #
        #     if request.status_code != 200:
        #         break
        #
        #     i += 1
        #     su.append('https://projecteuler.net/archives;page=' + str(i + 1))

        return [scrapy.Request("https://projecteuler.net/archives", self.parse)]

    def parse_problems(self, response):
        l = ItemLoader(item=Problem(), response=response)
        l.add_css("title", "h2")
        l.add_css("id", "#problem_info")
        l.add_css("content", ".problem_content")

        yield l.load_item()

    # def parse_content(self, response):
    #     # return response.css("div.problem_content::text").extract()
    #     next_page = "https://projecteuler.net/archives;page=2"
    #     n = 3
    #
    #     while n < 14:
    #         next_page = response.urljoin(next_page)
    #         yield scrapy.Request(next_page, callback=self.parse)
    #         next_page = next_page[0:len(next_page) - 1] + str(n)
    #         n += 1
Now I will try some LinkExtractor + manual requests combo. In the meantime, I'll hopefully wait for your solutions...
I think I have found a simple yet fitting solution (at least for my purpose), compared with existing code written to scrape projecteuler:
# -*- coding: utf-8 -*-
import scrapy
from eulerscraper.items import Problem
from scrapy.loader import ItemLoader


class EulerSpider(scrapy.Spider):
    name = 'euler'
    allowed_domains = ['projecteuler.net']
    start_urls = ["https://projecteuler.net/archives"]

    def parse(self, response):
        numpag = response.css("div.pagination a[href]::text").extract()
        maxpag = int(numpag[len(numpag) - 1])

        for href in response.css("table#problems_table a::attr(href)").extract():
            next_page = "https://projecteuler.net/" + href
            yield response.follow(next_page, self.parse_problems)

        for i in range(2, maxpag + 1):
            next_page = "https://projecteuler.net/archives;page=" + str(i)
            yield response.follow(next_page, self.parse_next)

    def parse_next(self, response):
        for href in response.css("table#problems_table a::attr(href)").extract():
            next_page = "https://projecteuler.net/" + href
            yield response.follow(next_page, self.parse_problems)

    def parse_problems(self, response):
        l = ItemLoader(item=Problem(), response=response)
        l.add_css("title", "h2")
        l.add_css("id", "#problem_info")
        l.add_css("content", ".problem_content")

        yield l.load_item()
From the start page (archives) I follow every single link to a problem, scraping the data that I need with parse_problems. Then I launch the scraper for the other pages of the site, with the same procedure for every list of links.
The Item definition, with its pre- and post-processors, is also very clean:
import re

import scrapy
from scrapy.loader.processors import MapCompose, Compose
from w3lib.html import remove_tags


def extract_first_number(text):
    i = re.search('\d+', text)
    return int(text[i.start():i.end()])


def array_to_value(element):
    return element[0]


class Problem(scrapy.Item):
    id = scrapy.Field(
        input_processor=MapCompose(remove_tags, extract_first_number),
        output_processor=Compose(array_to_value)
    )
    title = scrapy.Field(input_processor=MapCompose(remove_tags))
    content = scrapy.Field()
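As a quick illustration of what those processors do (the input strings below are made up for the example; the real values come from the scraped "#problem_info" and h2 elements):

# Hypothetical inputs, just to show the processing chain:
remove_tags("<h2>Multiples of 3 and 5</h2>")  # -> 'Multiples of 3 and 5'
extract_first_number("Problem 1")             # -> 1
array_to_value([1])                           # -> 1 (collapses the one-element list)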
I launch this with the command scrapy crawl euler -o euler.json and it outputs an array of unordered JSON objects, each corresponding to a single problem. This is fine for me because I'm going to process it with JavaScript, even though I think resolving the ordering problem via scrapy could be very simple.
EDIT: in fact it is simple, using this pipeline
import json


class JsonWriterPipeline(object):

    def open_spider(self, spider):
        self.list_items = []
        self.file = open('euler.json', 'w')

    def close_spider(self, spider):
        ordered_list = [None for i in range(len(self.list_items))]

        self.file.write("[\n")

        for i in self.list_items:
            ordered_list[int(i['id'] - 1)] = json.dumps(dict(i))

        # join with commas so there is no trailing comma before the closing bracket
        self.file.write(",\n".join(ordered_list))

        self.file.write("\n]\n")
        self.file.close()

    def process_item(self, item, spider):
        self.list_items.append(item)
        return item
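For the pipeline to run it still has to be enabled in the project settings; a sketch, assuming it lives in a module like eulerscraper.pipelines (the dotted path and the priority value are placeholders):

ITEM_PIPELINES = {
    'eulerscraper.pipelines.JsonWriterPipeline': 300,
}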
though the best solution may be to create a custom exporter:
from scrapy.exporters import JsonItemExporter
from scrapy.utils.python import to_bytes


class OrderedJsonItemExporter(JsonItemExporter):

    def __init__(self, file, **kwargs):
        # To initialize the object we use JsonItemExporter's constructor
        super().__init__(file)
        self.list_items = []

    def export_item(self, item):
        self.list_items.append(item)

    def finish_exporting(self):
        ordered_list = [None for i in range(len(self.list_items))]

        for i in self.list_items:
            ordered_list[int(i['id'] - 1)] = i

        for i in ordered_list:
            if self.first_item:
                self.first_item = False
            else:
                self.file.write(b',')
                self._beautify_newline()

            itemdict = dict(self._get_serialized_fields(i))
            data = self.encoder.encode(itemdict)
            self.file.write(to_bytes(data, self.encoding))

        self._beautify_newline()
        self.file.write(b"]")
and configure it in the settings so it is used for the json format:
FEED_EXPORTERS = {
    'json': 'eulerscraper.exporters.OrderedJsonItemExporter',
}
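With that setting in place, the same scrapy crawl euler -o euler.json command should pick up the custom exporter for any JSON feed, so the output ends up ordered without needing the pipeline above.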