.json export formatting in Scrapy - python

Just a quick question about json export formatting in Scrapy. My exported file looks like this.
{"pages": {"title": "x", "text": "x", "tags": "x", "url": "x"}}
{"pages": {"title": "x", "text": "x", "tags": "x", "url": "x"}}
{"pages": {"title": "x", "text": "x", "tags": "x", "url": "x"}}
But I would like it to be in this exact format. Somehow I need to get all the other information under "pages".
{"pages": [
{"title": "x", "text": "x", "tags": "x", "url": "x"},
{"title": "x", "text": "x", "tags": "x", "url": "x"},
{"title": "x", "text": "x", "tags": "x", "url": "x"}
]}
I'm not very experienced in scrapy or python, but I have gotten everything else done in my spider except the export format. This is my pipelines.py, which I just got working.
from scrapy.exporters import JsonItemExporter
import json

class RautahakuPipeline(object):

    def open_spider(self, spider):
        self.file = open('items.json', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item
These are the items in my spider.py I need to extract
items = []
for title, text, tags, url in zip(product_title, product_text, product_tags, product_url):
    item = TechbbsItem()
    item['pages'] = {}
    item['pages']['title'] = title
    item['pages']['text'] = text
    item['pages']['tags'] = tags
    item['pages']['url'] = url
    items.append(item)
return items
Any help is greatly appreciated, as this is the last obstacle in my project.
EDIT
items = {'pages': [{'title': title, 'text': text, 'tags': tags, 'url': url}
                   for title, text, tags, url in zip(product_title, product_text, product_tags, product_url)]}
This produces the .json in this format
{"pages": [{"title": "x", "text": "x", "tags": "x", "url": "x"}]}
{"pages": [{"title": "x", "text": "x", "tags": "x", "url": "x"}]}
{"pages": [{"title": "x", "text": "x", "tags": "x", "url": "x"}]}
This is getting better, but I would still need only one "pages" at the start of the file, with everything else inside an array under it.
EDIT 2
I think my spider.py is the reason why "pages" gets added to every line in the .json file, and I should have posted its whole code originally. Here it is.
# -*- coding: utf-8 -*-
import scrapy
from urllib.parse import urljoin

class TechbbsItem(scrapy.Item):
    pages = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    tags = scrapy.Field()
    url = scrapy.Field()

class TechbbsSpider(scrapy.Spider):
    name = 'techbbs'
    allowed_domains = ['bbs.io-tech.fi']
    start_urls = [
        'https://bbs.io-tech.fi/forums/prosessorit-emolevyt-ja-muistit.73/?prefix_id=1'  # This is a list page full of used pc-part listings
    ]

    def parse(self, response):  # This visits product links on the product list page
        links = response.css('a.PreviewTooltip::attr(href)').extract()
        for l in links:
            url = response.urljoin(l)
            yield scrapy.Request(url, callback=self.parse_product)
        next_page_url = response.xpath('//a[contains(.,"Seuraava ")]/@href').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_product(self, response):  # This extracts data from inside the links
        product_title = response.xpath('normalize-space(//h1/span/following-sibling::text())').extract()
        product_text = response.xpath('//b[contains(.,"Hinta:")]/following-sibling::text()[1]').re('([0-9]+)')
        tags = "tags"  # This is just a placeholder
        product_tags = tags
        product_url = response.xpath('//html/head/link[7]/@href').extract()
        items = []
        for title, text, tags, url in zip(product_title, product_text, product_tags, product_url):
            item = TechbbsItem()
            item['pages'] = {}
            item['pages']['title'] = title
            item['pages']['text'] = text
            item['pages']['tags'] = tags
            item['pages']['url'] = url
            items.append(item)
        return items
So my spider starts crawling from a page full of product listings. It visits every one of the 50 product links and scrapes 4 items: title, text, tags and url. After scraping every link on one page, it moves on to the next one, and so on. I suspect the loops in the code prevent your suggestions from working for me.
I would like to get the .json export in the exact form mentioned in the original question. So there would be {"pages": [ at the beginning of the file, then all the indented item lines
{"title": "x", "text": "x", "tags": "x", "url": "x"}, and at the end ]}

In terms of memory usage it's not good practice, but one option is to keep an object in memory and write it to the file at the end of the process:
import json

class RautahakuPipeline(object):

    def open_spider(self, spider):
        self.items = {"pages": []}
        self.file = None  # the file is only opened in close_spider

    def close_spider(self, spider):
        self.file = open('items.json', 'w')
        self.file.write(json.dumps(self.items))
        self.file.close()

    def process_item(self, item, spider):
        self.items["pages"].append(dict(item))
        return item
Then, if memory is an issue (it must be treated with care in any case), you can write the JSON file incrementally as follows:
import json

class RautahakuPipeline(object):

    def open_spider(self, spider):
        self.file = open('items.json', 'w')
        self.first_item = True
        self.file.write('{"pages": [')

    def close_spider(self, spider):
        self.file.write(']}')
        self.file.close()

    def process_item(self, item, spider):
        # Separate items with commas so the output stays valid JSON
        line = ("" if self.first_item else ",\n") + json.dumps(dict(item))
        self.first_item = False
        self.file.write(line)
        return item
I hope it helps.
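Note that a pipeline only runs if it is enabled in your project's settings.py. A minimal sketch, assuming the project package is named rautahaku (adjust the dotted path to your actual project):
ITEM_PIPELINES = {
    # 300 is an arbitrary priority between 0 and 1000
    'rautahaku.pipelines.RautahakuPipeline': 300,
}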

Using a list comprehension. I don't know what your data looks like, but using a toy example:
product_title = range(1,10)
product_text = range(10,20)
product_tags = range(20,30)
product_url = range(30,40)
item = {'pages': [{'title': title, 'text': text, 'tags': tags, 'url': url}
                  for title, text, tags, url in zip(product_title, product_text, product_tags, product_url)]}
I get this result:
{'pages': [{'tags': 20, 'text': 10, 'title': 1, 'url': 30},
{'tags': 21, 'text': 11, 'title': 2, 'url': 31},
{'tags': 22, 'text': 12, 'title': 3, 'url': 32},
{'tags': 23, 'text': 13, 'title': 4, 'url': 33},
{'tags': 24, 'text': 14, 'title': 5, 'url': 34},
{'tags': 25, 'text': 15, 'title': 6, 'url': 35},
{'tags': 26, 'text': 16, 'title': 7, 'url': 36},
{'tags': 27, 'text': 17, 'title': 8, 'url': 37},
{'tags': 28, 'text': 18, 'title': 9, 'url': 38}]}
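To confirm this serializes to the layout asked for in the question, you can dump the dict with the standard json module (a quick sketch; items.json is just an illustrative filename):
import json

with open('items.json', 'w') as f:
    json.dump(item, f, indent=4)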

items = {}
# item = TechbbsItem()  # not sure what this is doing?
items['pages'] = []
for title, text, tags, url in zip(product_title, product_text, product_tags, product_url):
    temp_dict = {}
    temp_dict['title'] = title
    temp_dict['text'] = text
    temp_dict['tags'] = tags
    temp_dict['url'] = url
    items["pages"].append(temp_dict)
return items

Related

How to match the ITEM with a record in the database?

I am storing URLs in a database table:
scrapy_id | scrapy_name | url
------------+---------------+-----------------
111 | aaa | http://url1.com
222 | bbb | http://url2.com
333 | ccc | http://url3.com
I need to start requests from those URLs, so I initialize the database connection in open_spider of the pipeline:
class PgsqlPipeline(object):
    ...
    def open_spider(self, spider):
        self.conn = psycopg2.connect(database=self.XXX, user=self.XXX, password=self.XXX)
        self.cur = self.conn.cursor()
        spider.myPipeline = self

    def get_urls(self):
        get_urls_sql = """
            SOME_SQL_STATEMENTS
        """
        self.cur.execute(get_urls_sql)
        rows = self.cur.fetchall()
        return rows
    ...
Then, in the spider:
....
class SephoraSpider(Spider):
    name = 'XXX'
    allowed_domains = ['XXX']

    def start_requests(self):
        for row in self.myPipeline.get_urls():
            self.item = SomeItem()
            url = str(row[2])
            self.item['id'] = row[0]
            self.item['name'] = row[1]
            yield Request(url, callback=self.parse_item)

    def parse_item(self, response):
        self.item['text'] = response.xpath('XXXX').get()
        return self.item
....
In items.py:
....
class SomeItem(Item):
    id = Field()
    name = Field()
    text = Field()
....
I want to get the following items:
{
    "id": 111,
    "name": "aaa",
    "text": response1,
},
{
    "id": 222,
    "name": "bbb",
    "text": response2,
},
{
    "id": 333,
    "name": "ccc",
    "text": response3,
}
But I get:
{
    "id": 333,
    "name": "ccc",
    "text": response1,
},
{
    "id": 333,
    "name": "ccc",
    "text": response2,
},
{
    "id": 333,
    "name": "ccc",
    "text": response3,
}
The problem may be that I put self.item = SomeItem() in start_requests(). But if I put self.item = SomeItem() in parse_item(), I cannot get id and name, which means I cannot match the parsed response with its record.
How to match the ITEM with a record in the database?
You can't use self to store the request metadata, because you are setting it only while starting the requests; the data needs to be persisted with the request, not with the SephoraSpider instance. In the parse_item callback it will just hold the values of the last request you started. Instead, use the Request.meta field:
class SephoraSpider(Spider):
    name = 'XXX'
    allowed_domains = ['XXX']

    def start_requests(self):
        for row in self.myPipeline.get_urls():
            url = str(row[2])
            item = {'id': row[0], 'name': row[1], 'url': row[2]}
            yield Request(url, callback=self.parse_item, meta={'item': item})

    def parse_item(self, response):
        item = response.meta['item']
        item['text'] = response.xpath('XXXX').get()
        return item
Details in the docs.
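As a side note, Scrapy 1.7 added cb_kwargs, which passes such values to the callback as keyword arguments and reads a bit more cleanly than meta. A sketch of the equivalent spider methods (same placeholder XXXX selector as above):
def start_requests(self):
    for row in self.myPipeline.get_urls():
        item = {'id': row[0], 'name': row[1], 'url': row[2]}
        # cb_kwargs entries become keyword arguments of parse_item
        yield Request(str(row[2]), callback=self.parse_item, cb_kwargs={'item': item})

def parse_item(self, response, item):
    item['text'] = response.xpath('XXXX').get()
    return item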

Python Scrapy Multilevel Request (Three Level)

I am new to Python and Scrapy, and I am trying to create a JSON file from 3 levels of nested pages. I have the following structure:
Page 1 (start): contains links of second page (called Mangas)
Page 2: Contains nested Volumes and Chapters
Page 3: Each Chapter contains multiple images
My Code
import scrapy
import time
import items
import json

class GmangaSpider(scrapy.Spider):
    name = "gmanga"
    start_urls = [
        "http://gmanga.me/mangas"
    ]

    def parse(self, response):
        # mangas = []
        for manga in response.css('div.manga-item'):
            link = manga.css('a.manga-item-content').xpath('@href').extract_first()
            if link:
                page_link = "http://gmanga.me%s" % link
                mangas = items.Manga()
                mangas['cover'] = manga.css('a.manga-item-content .manga-cover-container img').xpath('@src').extract_first()
                mangas['title'] = manga.css('a.manga-item-content .manga-cover-container img').xpath('@alt').extract_first()
                mangas['link'] = page_link
                mangas['volumes'] = []
                yield scrapy.Request(page_link, callback=self.parse_volumes, meta={"mangas": mangas})

    def parse_volumes(self, response):
        mangas = response.meta['mangas']
        for manga in response.css('div.panel'):
            volume = items.Volume()
            volume['name'] = manga.css('div.panel-heading .panel-title a::text').extract_first()
            volume['chapters'] = []
            for tr in manga.css('div.panel-collapse .panel-body table tbody tr'):
                chapter = items.Chapter()
                chapter['name'] = tr.css('td:nth-child(1) div::text').extract_first()
                chapter_link = tr.css('td:nth-child(3) a::attr("href")').extract_first()
                chapter['link'] = chapter_link
                request = scrapy.Request("http://gmanga.me%s" % chapter_link, callback=self.parse_images, meta={"chapter": chapter})
                yield request
                volume['chapters'].append(chapter)
            mangas['volumes'].append(volume)
        yield mangas

    def parse_images(self, response):
        chapter = response.meta['chapter']
        data = response.xpath("//script").re(r"alphanumSort\((.*])")
        if data:
            images = json.loads(data[0])
            chapter['images'] = images
        return chapter
My Items.py
from scrapy import Item, Field
class Manga(Item):
title = Field()
cover = Field()
link = Field()
volumes = Field()
class Volume(Item):
name = Field()
chapters = Field()
class Chapter(Item):
name = Field()
images = Field()
link = Field()
Now I am a bit confused about where to yield or return in the parse_volumes function to get the following structure in the JSON file.
Expected Result:
[{
    "cover": "http://media.gmanga.me/uploads/manga/cover/151/medium_143061.jpg",
    "link": "http://gmanga.me/mangas/gokko",
    "volumes": [{
        "name": "xyz",
        "chapters": [{
            "link": "/mangas/gokko/4/3asq",
            "name": "4",
            "images": ["img1.jpg", "img2.jpg"]
        }, {
            "link": "/mangas/gokko/3/3asq",
            "name": "3",
            "images": ["img1.jpg", "img2.jpg"]
        }]
    }],
    "title": "Gokko"
}]
But I am getting the images node as a separate item; it should be inside the chapters node of its volume:
[{
    "cover": "http://media.gmanga.me/uploads/manga/cover/10581/medium_I2.5HFzVh7e.png",
    "link": "http://gmanga.me/mangas/godess-creation-system",
    "volumes": [{
        "name": "\u0627\u0644\u0645\u062c\u0644\u062f ",
        "chapters": [{
            "link": "/mangas/godess-creation-system/1/ayou-cahn",
            "name": "1"
        }]
    }],
    "title": "Godess Creation System"
},
{
    "images": ["http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/01.jpg?ak=p0skml", "http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/02.jpg?ak=p0skml", "http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/03.jpg?ak=p0skml", "http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/04.jpg?ak=p0skml"],
    "link": "/mangas/reversal/1/Lolly-Pop",
    "name": "1"
}]
Each function individually fetches data properly; the only issue is the JSON formation. It is not writing to the JSON file properly. Please point out where I am wrong.
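One common pattern for this is to stop yielding the chapter from parse_images and instead yield the manga item only once every chapter request has finished, chaining the requests through meta. A minimal sketch of that idea, reusing the spider's imports and selectors (the pending list and the next_request helper are illustrative names, not part of the original code):
    def parse_volumes(self, response):
        mangas = response.meta['mangas']
        pending = []  # (chapter item, url) pairs still to be fetched
        for manga in response.css('div.panel'):
            volume = items.Volume()
            volume['name'] = manga.css('div.panel-heading .panel-title a::text').extract_first()
            volume['chapters'] = []
            mangas['volumes'].append(volume)
            for tr in manga.css('div.panel-collapse .panel-body table tbody tr'):
                chapter = items.Chapter()
                chapter['name'] = tr.css('td:nth-child(1) div::text').extract_first()
                chapter['link'] = tr.css('td:nth-child(3) a::attr("href")').extract_first()
                volume['chapters'].append(chapter)
                pending.append((chapter, "http://gmanga.me%s" % chapter['link']))
        if pending:
            yield self.next_request(mangas, pending)
        else:
            yield mangas

    def next_request(self, mangas, pending):
        # Pop one chapter and fetch it; the rest travel along in meta
        chapter, url = pending.pop()
        return scrapy.Request(url, callback=self.parse_images,
                              meta={"mangas": mangas, "chapter": chapter, "pending": pending})

    def parse_images(self, response):
        chapter = response.meta['chapter']
        data = response.xpath("//script").re(r"alphanumSort\((.*])")
        if data:
            chapter['images'] = json.loads(data[0])
        pending = response.meta['pending']
        if pending:
            yield self.next_request(response.meta['mangas'], pending)
        else:
            # Every chapter has been filled in, so the manga is complete
            yield response.meta['mangas']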

How to produce custom JSON output from Scrapy?

I am working on a Scrapy script which should produce output like:
{
    "state": "FL",
    "date": "2017-11-03T14:52:26.007Z",
    "games": [
        {
            "name": "Game1"
        },
        {
            "name": "Game2"
        }
    ]
}
But when I run scrapy crawl items -o data.json -t json, it produces the output below instead, with the state repeated for every item:
[
    {"state": "CA", "games": [], "crawlDate": "2014-10-04"},
    {"state": "CA", "games": [], "crawlDate": "2014-10-04"},
]
The code is given below:
items.py:
import scrapy

class Item(scrapy.Item):
    state = scrapy.Field()
    games = scrapy.Field()
In the spider file, the item class is used as:
item = Item()
item['state'] = state
item['Date'] = '2014-10-04'
item['games'] = games
I know this is not the complete code, but it should give an idea of what I am after.
Ref. https://stackoverflow.com/a/43698923/8964297
You could try to write your own pipeline like this:
Put this into your pipelines.py file:
import json

class JsonWriterPipeline(object):

    def open_spider(self, spider):
        # Your scraped items will be saved in the file 'scraped_items.json'.
        # You can change the filename to whatever you want.
        self.file = open('scraped_items.json', 'w')
        self.first_item = True
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # Prefix every item after the first with a comma, so the result
        # is a valid JSON array without a trailing comma.
        line = ("" if self.first_item else ",\n") + json.dumps(
            dict(item),
            indent=4,
            sort_keys=True,
            separators=(',', ': ')
        )
        self.first_item = False
        self.file.write(line)
        return item
Then modify your settings.py to include the following:
ITEM_PIPELINES = {
    'YourSpiderName.pipelines.JsonWriterPipeline': 300,
}
Change YourSpiderName to the name of your Scrapy project (the package that contains pipelines.py).
Note that the file gets written directly by the pipeline, so you don't have to specify file and format with the -o and -t command line parameters.
Hope this gets you closer to what you need.

Scrapy - Creating nested JSON Object

I'm learning how to work with Scrapy while refreshing my knowledge of Python and coding from school.
Currently I'm playing around with the IMDb top 250 list, but I'm struggling with the JSON output file.
My current code is:
# -*- coding: utf-8 -*-
import scrapy
from top250imdb.items import Top250ImdbItem

class ActorsSpider(scrapy.Spider):
    name = "actors"
    allowed_domains = ["imdb.com"]
    start_urls = ['http://www.imdb.com/chart/top']

    # Parsing each movie and preparing the url for the actors list
    def parse(self, response):
        for film in response.css('.titleColumn'):
            url = film.css('a::attr(href)').extract_first()
            actors_url = 'http://imdb.com' + url[:17] + 'fullcredits?ref_=tt_cl_sm#cast'
            yield scrapy.Request(actors_url, self.parse_actor)

    # Finding all actors and storing them on item
    # Refer to items.py
    def parse_actor(self, response):
        final_list = []
        item = Top250ImdbItem()
        item['poster'] = response.css('#main img::attr(src)').extract_first()
        item['title'] = response.css('h3[itemprop~=name] a::text').extract()
        item['photo'] = response.css('#fullcredits_content .loadlate::attr(loadlate)').extract()
        item['actors'] = response.css('td[itemprop~=actor] span::text').extract()
        final_list.append(item)
        updated_list = []
        for item in final_list:
            for i in range(len(item['title'])):
                sub_item = {}
                sub_item['movie'] = {}
                sub_item['movie']['poster'] = [item['poster']]
                sub_item['movie']['title'] = [item['title'][i]]
                sub_item['movie']['photo'] = [item['photo']]
                sub_item['movie']['actors'] = [item['actors']]
                updated_list.append(sub_item)
        return updated_list
and my output file is giving me this JSON composition:
[
{
"movie": {
"poster": ["https://images-na.ssl-images-amazon.com/poster..."],
"title": ["The Shawshank Redemption"],
"photo": [["https://images-na.ssl-images-amazon.com/photo..."]],
"actors": [["Tim Robbins","Morgan Freeman",...]]}
},{
"movie": {
"poster": ["https://images-na.ssl-images-amazon.com/poster..."],
"title": ["The Godfather"],
"photo": [["https://images-na.ssl-images-amazon.com/photo..."]],
"actors": [["Alexandre Rodrigues", "Leandro Firmino", "Phellipe Haagensen",...]]}
}
]
but I'm looking to achieve this:
{
"movies": [{
"poster": "https://images-na.ssl-images-amazon.com/poster...",
"title": "The Shawshank Redemption",
"actors": [
{"photo": "https://images-na.ssl-images-amazon.com/photo...",
"name": "Tim Robbins"},
{"photo": "https://images-na.ssl-images-amazon.com/photo...",
"name": "Morgan Freeman"},...
]
},{
"poster": "https://images-na.ssl-images-amazon.com/poster...",
"title": "The Godfather",
"actors": [
{"photo": "https://images-na.ssl-images-amazon.com/photo...",
"name": "Marlon Brando"},
{"photo": "https://images-na.ssl-images-amazon.com/photo...",
"name": "Al Pacino"},...
]
}]
}
in my items.py file I have the following:
import scrapy

class Top250ImdbItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Items from actors.py
    poster = scrapy.Field()
    title = scrapy.Field()
    photo = scrapy.Field()
    actors = scrapy.Field()
    movie = scrapy.Field()
I'm aware of the following things:
My results are not coming out in order: the first movie on the web page list is always the first movie in my output file, but the rest are not. I'm still working on that.
I could do the same thing working with Top250ImdbItem(); I'm still looking into how that is done in more detail.
This might not be the perfect layout for my JSON, so suggestions are welcome; if it is, let me know, even though I know there is no perfect or "only" way.
Some actors don't have a photo, and the page actually loads a different CSS selector for them. For now I would like to avoid reaching for the "no picture thumbnail", so it's OK to leave those items empty.
example:
{"photo": "", "name": "Al Pacino"}
Question: ... struggling with a JSON output file
Note: I can't use your ActorsSpider; I get the error: Pseudo-elements are not supported.
# Define a `dict` **once**
top250ImdbItem = {'movies': []}

def parse_actor(self, response):
    poster = response.css(...
    title = response.css(...
    photos = response.css(...
    actors = response.css(...

    # Assuming the list of actors is in sync with the list of photos
    actors_list = []
    for i, actor in enumerate(actors):
        actors_list.append({"name": actor, "photo": photos[i]})

    one_movie = {"poster": poster,
                 "title": title,
                 "actors": actors_list
                 }

    # Append one movie to the Top250 'movies' list
    top250ImdbItem['movies'].append(one_movie)
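With this approach the accumulated dict still has to be written out once at the end; one place to do that is the spider's closed() hook, which Scrapy calls when the spider finishes. A sketch, assuming top250ImdbItem is stored on the spider and top250.json is an arbitrary filename:
import json

def closed(self, reason):
    # Runs once when the spider finishes: dump the single top-level object
    with open('top250.json', 'w') as f:
        json.dump(self.top250ImdbItem, f, indent=4)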

how to retrieve data ... The page is loaded using ajax

I want to get the costs of mobile phones from this site:
http://www.univercell.in/buy/SMART
I tried to test it, so I used:
scrapy shell "http://www.univercell.in/control/AjaxCategoryDetail?productCategoryId=PRO-SMART&category_id=PRO-SMART&attrName=&min=&max=&sortSearchPrice=&VIEW_INDEX=2&VIEW_SIZE=15&serachupload=&sortupload="
But I am not able to connect to this site. As the page is loaded using AJAX, I found the start_url using Firebug. Can anyone suggest where I am going wrong?
How about writing a JavaScript script to perform the actions that are performed when clicking a page number, and then simply dumping the XML that is returned from the server? I mean, try to make the calls to the server as if the site were hosted on your desktop.
The JavaScript function called when you hit a page number is paginateList(numberOfPage), where numberOfPage is the page you want to visit.
The body of the function is
function paginateList(viewIndex) {
    var productCategoryId = document.pageSelect.category_id.value;
    var viewSize = document.pageSelect.VIEW_SIZE.value;
    var min = "";
    if (document.pageSelect.min != null)
        min = document.pageSelect.min.value;
    var max = "";
    if (document.pageSelect.max != null)
        max = document.pageSelect.max.value;
    var attrName = "";
    if (document.pageSelect.attrName != null)
        attrName = document.pageSelect.attrName.value;
    if (attrName == "") {
        var commaAttr = document.getElementById('commaAttr');
        attrName = commaAttr.value;
    }
    var limitView = 'true';
    var sortSearchPrice = "";
    if (document.pageSelect.sortSearchPrice != null)
        sortSearchPrice = document.pageSelect.sortSearchPrice.value;
    var url2 = "/control/AjaxCategoryDetail?productCategoryId=" + productCategoryId + "&category_id=" + productCategoryId + "&attrName=" + attrName + "&min=" + min + "&max=" + max + "&sortSearchPrice=" + sortSearchPrice + "&VIEW_INDEX=" + viewIndex + "&VIEW_SIZE=" + viewSize + "&serachupload=&sortupload=";
    pleaseWait('Y');
    jQuery.ajax({
        url: url2,
        data: null,
        type: 'post',
        async: false,
        success: function(data) {
            $('#searchResult').html(data);
            pleaseWait('N');
        },
        error: function(data) {
            alert("Error during product searching");
        }
    });
}
Use these to get the data from each page recursively.
Hope it helps!
Here's your spider:
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class UnivercellItem(Item):
    vendor = Field()
    model = Field()
    price = Field()

BASE_URL = "http://www.univercell.in/control/AjaxCategoryDetail?productCategoryId=PRO-SMART&category_id=PRO-SMART&attrName=&min=&max=&sortSearchPrice=&VIEW_INDEX=%s&VIEW_SIZE=15&serachupload=&sortupload="

class UnivercellSpider(BaseSpider):
    name = "univercell_spider"
    allowed_domains = ["www.univercell.in"]
    start_urls = [BASE_URL % index for index in range(1, 21)]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        mobiles = hxs.select("//div[@class='productsummary']")
        for mobile in mobiles:
            item = UnivercellItem()
            item['vendor'] = mobile.select('.//div[1]/div/text()').extract()[0].strip()
            item['model'] = mobile.select('.//div[3]/div[1]/a/text()').extract()[0].strip()
            item['price'] = mobile.select('.//span[@class="regularPrice"]/span/text()').extract()[0].strip()
            yield item
Save it to spider.py and run via scrapy runspider spider.py -o output.json. Then in output.json you will see:
{"model": "T375", "vendor": "LG", "price": "Special Price Click Here"}
{"model": "P725 Optimus 3D Max", "vendor": "LG", "price": "Special Price Click Here"}
{"model": "P705 Optimus L7", "vendor": "LG", "price": "Special Price Click Here"}
{"model": "9320 Curve", "vendor": "Blackberry", "price": "Special Price Click Here"}
{"model": "Xperia Sola", "vendor": "Sony", "price": "Rs.14,500.00"}
{"model": "Xperia U", "vendor": "Sony", "price": "Special Price Click Here"}
{"model": "Lumia 610", "vendor": "Nokia", "price": "Special Price Click Here"}
...
Hope that helps.
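As an aside, BaseSpider and HtmlXPathSelector come from very old Scrapy releases. A sketch of the same spider against the modern Scrapy API (the site and its markup may well have changed since this answer was written, so the selectors are unverified assumptions):
import scrapy

class UnivercellItem(scrapy.Item):
    vendor = scrapy.Field()
    model = scrapy.Field()
    price = scrapy.Field()

BASE_URL = "http://www.univercell.in/control/AjaxCategoryDetail?productCategoryId=PRO-SMART&category_id=PRO-SMART&attrName=&min=&max=&sortSearchPrice=&VIEW_INDEX=%s&VIEW_SIZE=15&serachupload=&sortupload="

class UnivercellSpider(scrapy.Spider):
    name = "univercell_spider"
    allowed_domains = ["www.univercell.in"]
    start_urls = [BASE_URL % index for index in range(1, 21)]

    def parse(self, response):
        for mobile in response.xpath("//div[@class='productsummary']"):
            item = UnivercellItem()
            # .get(default='') avoids an IndexError when a node is missing
            item['vendor'] = mobile.xpath('.//div[1]/div/text()').get(default='').strip()
            item['model'] = mobile.xpath('.//div[3]/div[1]/a/text()').get(default='').strip()
            item['price'] = mobile.xpath('.//span[@class="regularPrice"]/span/text()').get(default='').strip()
            yield item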
