I am new to Python Scrapy and I am trying to create a JSON file from 3 levels of nested pages. I have the following structure:
Page 1 (start): contains links to the second page (called Mangas)
Page 2: Contains nested Volumes and Chapters
Page 3: Each Chapter contains multiple images
My Code
import scrapy
import time
import items
import json


class GmangaSpider(scrapy.Spider):
    name = "gmanga"
    start_urls = [
        "http://gmanga.me/mangas"
    ]

    def parse(self, response):
        # mangas = []
        for manga in response.css('div.manga-item'):
            link = manga.css('a.manga-item-content').xpath('@href').extract_first()
            if link:
                page_link = "http://gmanga.me%s" % link
                mangas = items.Manga()
                mangas['cover'] = manga.css('a.manga-item-content .manga-cover-container img').xpath('@src').extract_first()
                mangas['title'] = manga.css('a.manga-item-content .manga-cover-container img').xpath('@alt').extract_first()
                mangas['link'] = page_link
                mangas['volumes'] = []
                yield scrapy.Request(page_link, callback=self.parse_volumes, meta={"mangas": mangas})

    def parse_volumes(self, response):
        mangas = response.meta['mangas']
        for manga in response.css('div.panel'):
            volume = items.Volume()
            volume['name'] = manga.css('div.panel-heading .panel-title a::text').extract_first()
            volume['chapters'] = []
            for tr in manga.css('div.panel-collapse .panel-body table tbody tr'):
                chapter = items.Chapter()
                chapter['name'] = tr.css('td:nth-child(1) div::text').extract_first()
                chapter_link = tr.css('td:nth-child(3) a::attr("href")').extract_first()
                chapter['link'] = chapter_link
                request = scrapy.Request("http://gmanga.me%s" % chapter_link, callback=self.parse_images, meta={"chapter": chapter})
                yield request
                volume['chapters'].append(chapter)
            mangas['volumes'].append(volume)
        yield mangas

    def parse_images(self, response):
        chapter = response.meta['chapter']
        data = response.xpath("//script").re(r"alphanumSort\((.*])")
        if data:
            images = json.loads(data[0])
            chapter['images'] = images
        return chapter
My Items.py
from scrapy import Item, Field


class Manga(Item):
    title = Field()
    cover = Field()
    link = Field()
    volumes = Field()


class Volume(Item):
    name = Field()
    chapters = Field()


class Chapter(Item):
    name = Field()
    images = Field()
    link = Field()
Now I am a bit confused about where to yield or return in the parse_volumes function to get the following structure in the JSON file.
Expected Result:
[{
    "cover": "http://media.gmanga.me/uploads/manga/cover/151/medium_143061.jpg",
    "link": "http://gmanga.me/mangas/gokko",
    "volumes": [{
        "name": "xyz",
        "chapters": [{
            "link": "/mangas/gokko/4/3asq",
            "name": "4",
            "images": ["img1.jpg", "img2.jpg"]
        }, {
            "link": "/mangas/gokko/3/3asq",
            "name": "3",
            "images": ["img1.jpg", "img2.jpg"]
        }]
    }],
    "title": "Gokko"
}]
But I am getting the images node as a separate node; it must be inside the chapters node of the volume:
[{
    "cover": "http://media.gmanga.me/uploads/manga/cover/10581/medium_I2.5HFzVh7e.png",
    "link": "http://gmanga.me/mangas/godess-creation-system",
    "volumes": [{
        "name": "\u0627\u0644\u0645\u062c\u0644\u062f ",
        "chapters": [{
            "link": "/mangas/godess-creation-system/1/ayou-cahn",
            "name": "1"
        }]
    }],
    "title": "Godess Creation System"
},
{
    "images": ["http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/01.jpg?ak=p0skml", "http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/02.jpg?ak=p0skml", "http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/03.jpg?ak=p0skml", "http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/04.jpg?ak=p0skml"],
    "link": "/mangas/reversal/1/Lolly-Pop",
    "name": "1"
}]
Each function is fetching its data properly on its own; the only issue is the JSON formation. It is not being written to the JSON file properly. Please point out where I am going wrong.
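For reference, a pattern that matches the meta-based answers further down is to thread the whole Manga item through every request and only yield it after the last chapter's images have been filled in, so the nested structure is complete before it reaches the feed exporter. A minimal sketch, assuming the same items and selectors as above (not tested against the live site):

def parse_volumes(self, response):
    mangas = response.meta['mangas']
    pending_chapters = []
    for panel in response.css('div.panel'):
        volume = items.Volume()
        volume['name'] = panel.css('div.panel-heading .panel-title a::text').extract_first()
        volume['chapters'] = []
        for tr in panel.css('div.panel-collapse .panel-body table tbody tr'):
            chapter = items.Chapter()
            chapter['name'] = tr.css('td:nth-child(1) div::text').extract_first()
            chapter['link'] = tr.css('td:nth-child(3) a::attr("href")').extract_first()
            volume['chapters'].append(chapter)
            pending_chapters.append(chapter)
        mangas['volumes'].append(volume)
    # do not yield mangas here; fetch the chapters one by one first
    yield from self.request_next_chapter(mangas, pending_chapters)

def request_next_chapter(self, mangas, pending):
    if not pending:
        yield mangas  # every chapter has its images now, emit the finished item
        return
    chapter = pending[0]
    yield scrapy.Request("http://gmanga.me%s" % chapter['link'],
                         callback=self.parse_images,
                         meta={"mangas": mangas, "chapter": chapter, "pending": pending[1:]})

def parse_images(self, response):
    chapter = response.meta['chapter']
    data = response.xpath("//script").re(r"alphanumSort\((.*])")
    if data:
        chapter['images'] = json.loads(data[0])
    # the chapter object is the same one stored in mangas['volumes'], so the
    # nested structure is updated in place; move on to the next chapter
    yield from self.request_next_chapter(response.meta['mangas'], response.meta['pending'])

Because the chapter requests are chained one after another per manga, the finished Manga item is yielded only once, with the images present under every chapter.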
Related
I have parsed XML from a website and found that it has two branches (children).
How do I separate the two branches into two lists of dictionaries?
Here's my code so far:
import pandas as pd
import xml.etree.ElementTree as ET
import requests

url = "http://cs.stir.ac.uk/~soh/BD2spring2022/assignmentdata.php"
params = {'data': 'spurpyr'}
response = requests.get(url, params=params)
tree = response.content

# extract the root element as a separate variable, and display the root tag
root = ET.fromstring(tree)
print(root.tag)

# get attributes of root
root_attr = root.attrib
print(root_attr)

# find the children of root
for child in root:
    print(child.tag, child.attrib)

# extract the two children of the root element into two separate variables, and display their tags as well
child_dict = []
for child in root:
    child_dict.append(child.tag)

tweets_branch = child_dict[0]
cities_branch = child_dict[1]

# the elements in the entire tree
[elem.tag for elem in root.iter()]

# specify both the encoding and decoding of the document you are displaying as the string
print(ET.tostring(root, encoding='utf8').decode('utf8'))
Using the BeautifulSoup module: to parse the tweets and cities into lists of dictionaries you can use this example:
import requests
from bs4 import BeautifulSoup

url = "http://cs.stir.ac.uk/~soh/BD2spring2022/assignmentdata.php"
params = {"data": "spurpyr"}

soup = BeautifulSoup(requests.get(url, params=params).content, "xml")

tweets = []
for t in soup.select("tweets > tweet"):
    tweets.append({"id": t["id"], **{x.name: x.text for x in t.find_all()}})

cities = []
for c in soup.select("cities > city"):
    cities.append({"id": c["id"], **{x.name: x.text for x in c.find_all()}})

print(tweets)
print(cities)
Prints:
[
    {
        "id": "16620625 5686",
        "Name": "Kenyon Conley",
        "Phone": "0327 103 9485",
        "Email": "malesuada@lobortisClassaptent.edu",
        "Location": "45.5333, -73.2833",
        "GenderID": "male",
        "Tweet": "#FollowFriday #DanielleMorrill - She's with #Seattle20 and #Twilio. Also fun to talk to. #entrepreneur",
        "City": "Saint-Basile-le-Grand",
        "Country": "Canada",
        "Age": "34",
    },
    {
        "id": "16310427-5502",
        "Name": "Griffin Norton",
        "Phone": "0306 178 7917",
        "Email": "in.dolor.Fusce@necmalesuadaut.ca",
        "Location": "52.0000, 84.9833",
        "GenderID": "male",
        "Tweet": "!!!Veryy Bored!!! ~~Craving Million's Of MilkShakes~~",
        "City": "Belokurikha",
        "Country": "Russia",
        "Age": "33",
    },
...
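For comparison, the same split can be done with the xml.etree.ElementTree API the question already imports. A rough sketch, assuming (as the selectors above suggest) the two branches are named tweets and cities and their children carry an id attribute:

import requests
import xml.etree.ElementTree as ET

url = "http://cs.stir.ac.uk/~soh/BD2spring2022/assignmentdata.php"
root = ET.fromstring(requests.get(url, params={"data": "spurpyr"}).content)

# each branch element is iterable over its children; turn every child tag into a dict key
tweets = [{"id": t.get("id"), **{e.tag: e.text for e in t}} for t in root.find("tweets")]
cities = [{"id": c.get("id"), **{e.tag: e.text for e in c}} for c in root.find("cities")]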
The problem I'm facing is that I'm trying to scrape this website of quotes.
The website: https://quotes.toscrape.com/
What I intend to do is scrape the author's name, quote, and tags, and at the same time follow the (about) link (which is a hyperlink) in each section, scrape the author's description and date of birth, and save them all into a CSV file.
I've seen some questions on how to do something similar, but couldn't understand them clearly.
I would love it if someone explained how to approach this problem, in particular how to use meta/cb_kwargs, etc.
Here's my code.
import scrapy


class QuoteSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        "https://quotes.toscrape.com/"
    ]

    def parse(self, response):
        for quote in response.css(".quote"):
            author_link = response.css(".quote span a::attr(href)")
            yield response.follow_all(author_link, callback=self.author_parse)
            yield {
                "author": quote.css(".author::text").get(),
                "text": quote.css(".text::text").get(),
                "tags": quote.css(".tags .tag::text").getall(),
            }

    def author_parse(self, response):
        yield {
            "dob": response.css(".author-born-date::text").get(),
            "bio": response.css(".author-description::text").get(),
        }
Using cb_kwargs is the preferred method right now:
def parse(self, response):
    for quote in response.css(".quote"):
        # select this quote's author link, not every author link on the page
        author_link = quote.css("span a::attr(href)").get()
        author = {
            "author": quote.css(".author::text").get(),
            "text": quote.css(".text::text").get(),
            "tags": quote.css(".tags .tag::text").getall(),
        }
        yield response.follow(author_link, callback=self.author_parse, cb_kwargs={'author': author})

def author_parse(self, response, author):
    author["dob"] = response.css(".author-born-date::text").get()
    author["bio"] = response.css(".author-description::text").get()
    yield author
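Note that several quotes can share the same author page; Scrapy's duplicate request filter would drop the repeated author requests (and with them those quotes) unless the follow call is made with dont_filter=True. With that in place, running the spider through the feed exports (for example scrapy crawl quotes -o quotes.csv) should write the merged rows to the CSV file the question asks for.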
I am storing URLs in a database table:
scrapy_id | scrapy_name | url
------------+---------------+-----------------
111 | aaa | http://url1.com
222 | bbb | http://url2.com
333 | ccc | http://url3.com
I need to start requests from those URLs, so I initialize the database connection in the pipeline's open_spider:
class PgsqlPipeline(object):
    ...

    def open_spider(self, spider):
        self.conn = psycopg2.connect(database=self.XXX, user=self.XXX, password=self.XXX)
        self.cur = self.conn.cursor()
        spider.myPipeline = self

    def get_urls(self):
        get_urls_sql = """
            SOME_SQL_STATMENTS
        """
        self.cur.execute(get_urls_sql)
        rows = self.cur.fetchall()
        return rows

    ...
Then, in the spider:
....
class SephoraSpider(Spider):
    name = 'XXX'
    allowed_domains = ['XXX']

    def start_requests(self):
        for row in self.myPipeline.get_urls():
            self.item = SomeItem()
            url = str(row[2])
            self.item['id'] = row[0]
            self.item['name'] = row[1]
            yield Request(self.url, callback=self.parse_item)

    def parse_item(self, response):
        self.item['text'] = response.xpath('XXXX').get()
        return self.item
....
in items:
....
class SomeItem(Item):
    id = Field()
    name = Field()
    text = Field()
....
I want to get the following items:
{
"id": 111,
"name": "aaa",
"text": response1,
},
{
"id": 222,
"name": "bbb",
"text": response2,
},
{
"id": 333,
"name": "ccc",
"text": response3,
}
But I get:
{
"id": 333,
"name": "ccc",
"text": response1,
},
{
"id": 333,
"name": "ccc",
"text": response2,
},
{
"id": 333,
"name": "ccc",
"text": response3,
}
The problem may be that I put self.item = SomeItem() in start_requests(). But if I put self.item = SomeItem() in parse_item(), I cannot get id and name, which means I cannot match the parsed response with its ID.
How do I match the item with its record in the database?
You can't use self to store the request metadata, because you are setting it while starting the requests; the data needs to travel with the request, not with the SephoraSpider instance. By the time the parse_item callback runs, self.item just holds the value from the last request you started. Instead, you can use the Request.meta field:
class SephoraSpider(Spider):
    name = 'XXX'
    allowed_domains = ['XXX']

    def start_requests(self):
        for row in self.myPipeline.get_urls():
            url = str(row[2])
            item = {'id': row[0], 'name': row[1], 'url': row[2]}
            yield Request(url, callback=self.parse_item, meta={'item': item})

    def parse_item(self, response):
        item = response.meta['item']
        item['text'] = response.xpath('XXXX').get()
        return item
Details in the docs.
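As an aside, the same wiring works with cb_kwargs, which the quotes answer above recommends over meta. A rough sketch of the equivalent start_requests/parse_item pair:

def start_requests(self):
    for row in self.myPipeline.get_urls():
        item = {'id': row[0], 'name': row[1], 'url': str(row[2])}
        yield Request(item['url'], callback=self.parse_item, cb_kwargs={'item': item})

def parse_item(self, response, item):
    # the dict passed via cb_kwargs arrives here as a keyword argument
    item['text'] = response.xpath('XXXX').get()
    return item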
I'm learning how to work with Scrapy while refreshing my knowledge of Python/coding from school.
Currently, I'm playing around with the IMDb Top 250 list but struggling with the JSON output file.
My current code is:
# -*- coding: utf-8 -*-
import scrapy
from top250imdb.items import Top250ImdbItem


class ActorsSpider(scrapy.Spider):
    name = "actors"
    allowed_domains = ["imdb.com"]
    start_urls = ['http://www.imdb.com/chart/top']

    # Parsing each movie and preparing the url for the actors list
    def parse(self, response):
        for film in response.css('.titleColumn'):
            url = film.css('a::attr(href)').extract_first()
            actors_url = 'http://imdb.com' + url[:17] + 'fullcredits?ref_=tt_cl_sm#cast'
            yield scrapy.Request(actors_url, self.parse_actor)

    # Finding all actors and storing them on the item
    # Refer to items.py
    def parse_actor(self, response):
        final_list = []
        item = Top250ImdbItem()
        item['poster'] = response.css('#main img::attr(src)').extract_first()
        item['title'] = response.css('h3[itemprop~=name] a::text').extract()
        item['photo'] = response.css('#fullcredits_content .loadlate::attr(loadlate)').extract()
        item['actors'] = response.css('td[itemprop~=actor] span::text').extract()
        final_list.append(item)

        updated_list = []
        for item in final_list:
            for i in range(len(item['title'])):
                sub_item = {}
                sub_item['movie'] = {}
                sub_item['movie']['poster'] = [item['poster']]
                sub_item['movie']['title'] = [item['title'][i]]
                sub_item['movie']['photo'] = [item['photo']]
                sub_item['movie']['actors'] = [item['actors']]
                updated_list.append(sub_item)
        return updated_list
and my output file is giving me this JSON composition:
[
    {
        "movie": {
            "poster": ["https://images-na.ssl-images-amazon.com/poster..."],
            "title": ["The Shawshank Redemption"],
            "photo": [["https://images-na.ssl-images-amazon.com/photo..."]],
            "actors": [["Tim Robbins", "Morgan Freeman",...]]}
    }, {
        "movie": {
            "poster": ["https://images-na.ssl-images-amazon.com/poster..."],
            "title": ["The Godfather"],
            "photo": [["https://images-na.ssl-images-amazon.com/photo..."]],
            "actors": [["Alexandre Rodrigues", "Leandro Firmino", "Phellipe Haagensen",...]]}
    }
]
but I'm looking to achieve this:
{
    "movies": [{
        "poster": "https://images-na.ssl-images-amazon.com/poster...",
        "title": "The Shawshank Redemption",
        "actors": [
            {"photo": "https://images-na.ssl-images-amazon.com/photo...",
             "name": "Tim Robbins"},
            {"photo": "https://images-na.ssl-images-amazon.com/photo...",
             "name": "Morgan Freeman"},...
        ]
    }, {
        "poster": "https://images-na.ssl-images-amazon.com/poster...",
        "title": "The Godfather",
        "actors": [
            {"photo": "https://images-na.ssl-images-amazon.com/photo...",
             "name": "Marlon Brando"},
            {"photo": "https://images-na.ssl-images-amazon.com/photo...",
             "name": "Al Pacino"},...
        ]
    }]
}
in my items.py file I have the following:
import scrapy


class Top250ImdbItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    # Items from actors.py
    poster = scrapy.Field()
    title = scrapy.Field()
    photo = scrapy.Field()
    actors = scrapy.Field()
    movie = scrapy.Field()
    pass
I'm aware of the following things:
My results are not coming out in order; the 1st movie on the web page list is always the first movie in my output file, but the rest are not. I'm still working on that.
I could do the same thing working with Top250ImdbItem(); I'm still looking into how that is done in more detail.
This might not be the perfect layout for my JSON; suggestions are welcome, or if it is, let me know, even though I know there is no perfect way or "the only way".
Some actors don't have a photo, and the page actually uses a different CSS selector for them. For now, I would like to avoid reaching for the "no picture" thumbnail, so it's OK to leave those items empty.
example:
{"photo": "", "name": "Al Pacino"}
Question: ... struggling with a JSON output file
Note: I can't use your ActorsSpider; I get the error "Pseudo-elements are not supported".
# Define a `dict` **once**
top250ImdbItem = {'movies': []}

def parse_actor(self, response):
    poster = response.css(...
    title = response.css(...
    photos = response.css(...
    actors = response.css(...

    # Assuming the list of actors is in sync with the list of photos
    actors_list = []
    for i, actor in enumerate(actors):
        actors_list.append({"name": actor, "photo": photos[i]})

    one_movie = {"poster": poster,
                 "title": title,
                 "actors": actors_list
                 }

    # Append one movie to the Top 250 'movies' list
    top250ImdbItem['movies'].append(one_movie)
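The snippet above only accumulates the dict in memory; something still has to write it out once the crawl finishes. One possibility (a sketch, assuming top250ImdbItem is kept as a class attribute of ActorsSpider, and the file name is just an example) is the spider's closed() hook:

import json

def closed(self, reason):
    # called automatically when the spider finishes
    with open('top250.json', 'w') as f:
        json.dump(self.top250ImdbItem, f, indent=2)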
I want to get the costs of mobile phones from this site:
http://www.univercell.in/buy/SMART
I tried to test it, so I used:
scrapy shell http://www.univercell.in/control/AjaxCategoryDetail?productCategoryId=PRO-SMART&category_id=PRO-SMART&attrName=&min=&max=&sortSearchPrice=&VIEW_INDEX=2&VIEW_SIZE=15&serachupload=&sortupload=
But I am not able to connect to this site. As the page is loaded using AJAX, I found the start_url using Firebug. Can anyone suggest where I am going wrong?
How about making the same calls to the server that are already performed when you click a page number, and then simply dumping the XML that is returned? In other words, make the calls to the server as if the site were hosted on your desktop.
The JavaScript function called when you hit a page number is paginateList('numberOfPage'), where numberOfPage is the page you want to visit.
The body of the function is:
function paginateList(viewIndex) {
    var productCategoryId = document.pageSelect.category_id.value;
    var viewSize = document.pageSelect.VIEW_SIZE.value;
    var min = "";
    if(document.pageSelect.min!=null)
        min = document.pageSelect.min.value;
    var max = "";
    if(document.pageSelect.max!=null)
        max = document.pageSelect.max.value;
    var attrName = "";
    if(document.pageSelect.attrName!=null)
        attrName = document.pageSelect.attrName.value;
    if(attrName==""){
        var commaAttr=document.getElementById('commaAttr');
        attrName=commaAttr.value;
    }
    var limitView = 'true';
    var sortSearchPrice = "";
    if(document.pageSelect.sortSearchPrice!=null)
        sortSearchPrice = document.pageSelect.sortSearchPrice.value;
    var url2="/control/AjaxCategoryDetail?productCategoryId="+productCategoryId+"&category_id="+productCategoryId+"&attrName="+attrName+"&min="+min+"&max="+max+"&sortSearchPrice="+sortSearchPrice+"&VIEW_INDEX="+viewIndex+"&VIEW_SIZE="+viewSize+"&serachupload=&sortupload=";
    pleaseWait('Y');
    jQuery.ajax({
        url: url2,
        data: null,
        type: 'post',
        async: false,
        success: function(data) {
            $('#searchResult').html(data);
            pleaseWait('N');
        },
        error: function(data) {
            alert("Error during product searching");
        }
    });
}
Use these to get the data from each page recursively.
Hope it helps!
Here's your spider:
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector


class UnivercellItem(Item):
    vendor = Field()
    model = Field()
    price = Field()


BASE_URL = "http://www.univercell.in/control/AjaxCategoryDetail?productCategoryId=PRO-SMART&category_id=PRO-SMART&attrName=&min=&max=&sortSearchPrice=&VIEW_INDEX=%s&VIEW_SIZE=15&serachupload=&sortupload="


class UnivercellSpider(BaseSpider):
    name = "univercell_spider"
    allowed_domains = ["www.univercell.in"]
    start_urls = [BASE_URL % index for index in range(1, 21)]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        mobiles = hxs.select("//div[@class='productsummary']")
        print mobiles
        for mobile in mobiles:
            item = UnivercellItem()
            item['vendor'] = mobile.select('.//div[1]/div/text()').extract()[0].strip()
            item['model'] = mobile.select('.//div[3]/div[1]/a/text()').extract()[0].strip()
            item['price'] = mobile.select('.//span[@class="regularPrice"]/span/text()').extract()[0].strip()
            yield item
Save it to spider.py and run via scrapy runspider spider.py -o output.json. Then in output.json you will see:
{"model": "T375", "vendor": "LG", "price": "Special Price Click Here"}
{"model": "P725 Optimus 3D Max", "vendor": "LG", "price": "Special Price Click Here"}
{"model": "P705 Optimus L7", "vendor": "LG", "price": "Special Price Click Here"}
{"model": "9320 Curve", "vendor": "Blackberry", "price": "Special Price Click Here"}
{"model": "Xperia Sola", "vendor": "Sony", "price": "Rs.14,500.00"}
{"model": "Xperia U", "vendor": "Sony", "price": "Special Price Click Here"}
{"model": "Lumia 610", "vendor": "Nokia", "price": "Special Price Click Here"}
...
Hope that helps.
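BaseSpider and HtmlXPathSelector have since been removed from Scrapy, so on a current version the spider needs a small rewrite. A rough modern equivalent (untested, reusing the BASE_URL defined above, and only useful if the AJAX endpoint still behaves the same):

import scrapy


class UnivercellSpider(scrapy.Spider):
    name = "univercell_spider"
    allowed_domains = ["www.univercell.in"]
    start_urls = [BASE_URL % index for index in range(1, 21)]

    def parse(self, response):
        for mobile in response.xpath("//div[@class='productsummary']"):
            yield {
                "vendor": mobile.xpath(".//div[1]/div/text()").get(default="").strip(),
                "model": mobile.xpath(".//div[3]/div[1]/a/text()").get(default="").strip(),
                "price": mobile.xpath(".//span[@class='regularPrice']/span/text()").get(default="").strip(),
            }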