scrapy spider not returning any results - python

This is my first attempt at creating a spider, so kindly bear with me if I have not done it properly.
Here is the link to the website I am trying to extract data from: http://www.4icu.org/in/. I want the entire list of colleges that is displayed on the page. But when I run the following spider I get back an empty JSON file.
my items.py
import scrapy

class CollegesItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
This is the spider
colleges.py
import scrapy
from scrapy.spider import Spider
from scrapy.http import Request

class CollegesItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()

class CollegesSpider(Spider):
    name = 'colleges'
    allowed_domains = ["4icu.org"]
    start_urls = ('http://www.4icu.org/in/',)

    def parse(self, response):
        return Request(
            url = "http://www.4icu.org/in/",
            callback = self.parse_fixtures
        )

    def parse_fixtures(self, response):
        sel = response.selector
        for div in sel.css("col span_2_of_2>div>tbody>tr"):
            item = Fixture()
            item['university.name'] = tr.xpath('td[@class="i"]/span /a/text()').extract()
            yield item

As stated in the comments on the question, there are some issues with your code.
First of all, you do not need two methods: in parse you request the same URL that is already in start_urls, so Scrapy has already downloaded that page and called parse with its response.
To get some information from the site, try using the following code:
def parse(self, response):
    for tr in response.xpath('//div[@class="section group"][5]/div[@class="col span_2_of_2"][1]/table//tr'):
        if tr.xpath(".//td[@class='i']"):
            name = tr.xpath('./td[1]/a/text()').extract()[0]
            location = tr.xpath('./td[2]//text()').extract()[0]
            print name, location
and adjust it to your needs to fill your item (or items).
As you can see, your browser displays an additional tbody element in the table which is not present in the HTML that Scrapy downloads. This means you cannot always trust the markup you see in the browser; verify what Scrapy actually receives, for example in scrapy shell.
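For example, checking both variants in scrapy shell http://www.4icu.org/in/ makes the difference visible (the selectors below mirror the code above and are only illustrative):

# The browser inserts <tbody> while rendering; the HTML Scrapy downloads may not contain it,
# so skip over it with "//" instead of hard-coding "/tbody/".
response.xpath('//div[@class="col span_2_of_2"][1]/table/tbody/tr')   # may return nothing
response.xpath('//div[@class="col span_2_of_2"][1]/table//tr')        # matches the rows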

Here is the working code
import scrapy
from scrapy.spider import Spider
from scrapy.http import Request

class CollegesItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    location = scrapy.Field()

class CollegesSpider(Spider):
    name = 'colleges'
    allowed_domains = ["4icu.org"]
    start_urls = ('http://www.4icu.org/in/',)

    def parse(self, response):
        for tr in response.xpath('//div[@class="section group"][5]/div[@class="col span_2_of_2"][1]/table//tr'):
            if tr.xpath(".//td[@class='i']"):
                item = CollegesItem()
                item['name'] = tr.xpath('./td[1]/a/text()').extract()[0]
                item['location'] = tr.xpath('./td[2]//text()').extract()[0]
                yield item
After running the spider with the command
>> scrapy crawl colleges -o mait.json
the following is a snippet of the results:
[{"name": "Indian Institute of Technology Bombay", "location": "Mumbai"},
{"name": "Indian Institute of Technology Madras", "location": "Chennai"},
{"name": "University of Delhi", "location": "Delhi"},
{"name": "Indian Institute of Technology Kanpur", "location": "Kanpur"},
{"name": "Anna University", "location": "Chennai"},
{"name": "Indian Institute of Technology Delhi", "location": "New Delhi"},
{"name": "Manipal University", "location": "Manipal ..."},
{"name": "Indian Institute of Technology Kharagpur", "location": "Kharagpur"},
{"name": "Indian Institute of Science", "location": "Bangalore"},
{"name": "Panjab University", "location": "Chandigarh"},
{"name": "National Institute of Technology, Tiruchirappalli", "location": "Tiruchirappalli"}, .........

Related

Scraping the data from the links simultaneously along with the data from the main page in scrapy

The problem I'm facing here is that I'm trying to scrape this website of quotes,
the website: https://quotes.toscrape.com/
What I intend to do is scrape the author's name, quote, and tags, and at the same time follow the (about) link in each section to scrape the author's description and date of birth, then save it all into a CSV file.
I've seen some questions on how to do something similar, but couldn't understand them clearly.
I would love it if someone explained how to approach this problem, in particular how to use meta/cb_kwargs, etc.
Here's my code.
class QuoteSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        "https://quotes.toscrape.com/"
    ]

    def parse(self, response):
        for quote in response.css(".quote"):
            author_link = response.css(".quote span a::attr(href)")
            yield response.follow_all(author_link, callback=self.author_parse)
            yield {
                "author": quote.css(".author::text").get(),
                "text": quote.css(".text::text").get(),
                "tags": quote.css(".tags .tag::text").getall(),
            }

    def author_parse(self, response):
        yield {
            "dob": response.css(".author-born-date::text").get(),
            "bio": response.css(".author-description::text").get(),
        }
Using cb_kwargs is the preferred method right now:
def parse(self, response):
    for quote in response.css(".quote"):
        # scope the selector to the current quote so each author link
        # is paired with that quote's data
        author_link = quote.css("span a::attr(href)")
        author = {
            "author": quote.css(".author::text").get(),
            "text": quote.css(".text::text").get(),
            "tags": quote.css(".tags .tag::text").getall(),
        }
        # follow_all returns an iterable of Requests, so yield each of them
        yield from response.follow_all(author_link, callback=self.author_parse, cb_kwargs={'author': author})

def author_parse(self, response, author):
    author["dob"] = response.css(".author-born-date::text").get()
    author["bio"] = response.css(".author-description::text").get()
    yield author
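Either version can then be exported with scrapy crawl quotes -o quotes.csv. The same hand-off also works through request meta, which is what older examples use; here is a rough equivalent, a sketch rather than part of the answer above:

def parse(self, response):
    for quote in response.css(".quote"):
        # build the partial record from this quote block
        author = {
            "author": quote.css(".author::text").get(),
            "text": quote.css(".text::text").get(),
            "tags": quote.css(".tags .tag::text").getall(),
        }
        # attach it to the request; the callback reads it back from response.meta
        yield response.follow(quote.css("span a::attr(href)").get(),
                              callback=self.author_parse,
                              meta={"author": author})

def author_parse(self, response):
    author = response.meta["author"]
    author["dob"] = response.css(".author-born-date::text").get()
    author["bio"] = response.css(".author-description::text").get()
    yield author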

Python Scrapy Multilevel Request (Three Level)

I am new to Python Scrapy and I am trying to create a JSON file from 3 levels of nested pages. I have the following structure:
Page 1 (start): contains links of second page (called Mangas)
Page 2: Contains nested Volumes and Chapters
Page 3: Each Chapter contains multiple images
My Code
import scrapy
import time
import items
import json

class GmangaSpider(scrapy.Spider):
    name = "gmanga"
    start_urls = [
        "http://gmanga.me/mangas"
    ]

    def parse(self, response):
        # mangas = []
        for manga in response.css('div.manga-item'):
            link = manga.css('a.manga-item-content').xpath('@href').extract_first()
            if link:
                page_link = "http://gmanga.me%s" % link
                mangas = items.Manga()
                mangas['cover'] = manga.css('a.manga-item-content .manga-cover-container img').xpath('@src').extract_first()
                mangas['title'] = manga.css('a.manga-item-content .manga-cover-container img').xpath('@alt').extract_first()
                mangas['link'] = page_link
                mangas['volumes'] = []
                yield scrapy.Request(page_link, callback=self.parse_volumes, meta = {"mangas": mangas})

    def parse_volumes(self, response):
        mangas = response.meta['mangas']
        for manga in response.css('div.panel'):
            volume = items.Volume()
            volume['name'] = manga.css('div.panel-heading .panel-title a::text').extract_first()
            volume['chapters'] = []
            for tr in manga.css('div.panel-collapse .panel-body table tbody tr'):
                chapter = items.Chapter()
                chapter['name'] = tr.css('td:nth_child(1) div::text').extract_first()
                chapter_link = tr.css('td:nth_child(3) a::attr("href")').extract_first()
                chapter['link'] = chapter_link
                request = scrapy.Request("http://gmanga.me%s" % chapter_link, callback = self.parse_images, meta = {"chapter": chapter})
                yield request
                volume['chapters'].append(chapter)
            mangas['volumes'].append(volume)
        yield mangas

    def parse_images(self, response):
        chapter = response.meta['chapter']
        data = response.xpath("//script").re("alphanumSort\((.*])")
        if data:
            images = json.loads(data[0])
            chapter['images'] = images
        return chapter
My Items.py
from scrapy import Item, Field

class Manga(Item):
    title = Field()
    cover = Field()
    link = Field()

class Volume(Item):
    name = Field()
    chapters = Field()

class Chapter(Item):
    name = Field()
    images = Field()
    link = Field()
Now I am a bit confused about where to yield or return in the parse_volumes function to get the following structure in the JSON file.
Expected Result:
[{
"cover": "http://media.gmanga.me/uploads/manga/cover/151/medium_143061.jpg",
"link": "http://gmanga.me/mangas/gokko",
"volumes": [{
"name": "xyz",
"chapters": [{
"link": "/mangas/gokko/4/3asq",
"name": "4",
"images": ["img1.jpg", "img2.jpg"]
}, {
"link": "/mangas/gokko/3/3asq",
"name": "3",
"images": ["img1.jpg", "img2.jpg"]
}]
}],
"title": "Gokko"
}]
But I am getting the images node as a separate item, when it should be inside the chapters node of the volume:
[{
"cover": "http://media.gmanga.me/uploads/manga/cover/10581/medium_I2.5HFzVh7e.png",
"link": "http://gmanga.me/mangas/godess-creation-system",
"volumes": [{
"name": "\u0627\u0644\u0645\u062c\u0644\u062f ",
"chapters": [{
"link": "/mangas/godess-creation-system/1/ayou-cahn",
"name": "1"
}]
}],
"title": "Godess Creation System"
},
{
"images": ["http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/01.jpg?ak=p0skml", "http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/02.jpg?ak=p0skml", "http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/03.jpg?ak=p0skml", "http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/04.jpg?ak=p0skml"],
"link": "/mangas/reversal/1/Lolly-Pop",
"name": "1"
}]
Each function individually fetches its data properly; the only issue is the JSON formation. It is not being written to the JSON file properly. Please point out where I am going wrong.
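One common pattern (a sketch only, with an added "state" counter that is not in the code above) is to keep every Chapter attached to its Volume, pass the parent Manga along with each image request, and yield the Manga exactly once, after the last image page has been parsed:

def parse_volumes(self, response):
    # Hedged sketch: yield the Manga item only after all of its chapters'
    # image pages have come back, so the images stay nested in place.
    mangas = response.meta['mangas']
    requests = []
    for panel in response.css('div.panel'):
        volume = items.Volume()
        volume['name'] = panel.css('div.panel-heading .panel-title a::text').extract_first()
        volume['chapters'] = []
        for tr in panel.css('div.panel-collapse .panel-body table tbody tr'):
            chapter = items.Chapter()
            chapter['name'] = tr.css('td:nth-child(1) div::text').extract_first()
            chapter['link'] = tr.css('td:nth-child(3) a::attr("href")').extract_first()
            volume['chapters'].append(chapter)
            requests.append(scrapy.Request(
                "http://gmanga.me%s" % chapter['link'],
                callback=self.parse_images,
                meta={"chapter": chapter, "mangas": mangas}))
        mangas['volumes'].append(volume)
    # shared counter of image pages still outstanding (assumes no duplicate
    # chapter links, otherwise the dupefilter would skip some requests)
    state = {"pending": len(requests)}
    if not requests:
        yield mangas
    for request in requests:
        request.meta["state"] = state
        yield request

def parse_images(self, response):
    chapter = response.meta['chapter']
    state = response.meta['state']
    data = response.xpath("//script").re(r"alphanumSort\((.*])")
    if data:
        chapter['images'] = json.loads(data[0])
    state["pending"] -= 1
    if state["pending"] == 0:
        # every chapter has been filled in; emit the complete Manga once
        yield response.meta['mangas']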

How to produce custom JSON output from Scrapy?

I am working on a Scrapy script which should make output like:
{
"state": "FL",
"date": "2017-11-03T14:52:26.007Z",
"games": [
{
"name":"Game1"
},
{
"name":"Game2"
}
]
}
But for me it produces the output below when I run scrapy crawl items -o data.json -t json, with the state repeated for every item:
[
{"state": "CA", "games": [], "crawlDate": "2014-10-04"},
{"state": "CA", "games": [], "crawlDate": "2014-10-04"},
]
The code is given below:
items.py
import scrapy

class Item(scrapy.Item):
    state = scrapy.Field()
    games = scrapy.Field()
In the spider file, the item class is used like this:
item = Item()
item['state'] = state
item['Date'] = '2014-10-04'
item['games'] = games
I know this is not the complete code, but it should give an idea of what I am after.
Ref. https://stackoverflow.com/a/43698923/8964297
You could try to write your own pipeline like this:
Put this into your pipelines.py file:
import json

class JsonWriterPipeline(object):

    def open_spider(self, spider):
        # Your scraped items will be saved in the file 'scraped_items.json'.
        # You can change the filename to whatever you want.
        self.file = open('scraped_items.json', 'w')
        self.file.write("[")
        self.first_item = True

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # Write a separator before every item except the first,
        # so the resulting file stays valid JSON (no trailing comma).
        if not self.first_item:
            self.file.write(",\n")
        self.first_item = False
        line = json.dumps(
            dict(item),
            indent=4,
            sort_keys=True,
            separators=(',', ': ')
        )
        self.file.write(line)
        return item
Then modify your settings.py to include the following:
ITEM_PIPELINES = {
    'YourSpiderName.pipelines.JsonWriterPipeline': 300,
}
Change YourSpiderName to the correct name of your spider.
Note that the file gets written directly by the pipeline, so you don't have to specify file and format with the -o and -t command line parameters.
Hope this gets you closer to what you need.
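If the goal is one object per state with its games nested inside, one further option (a sketch with made-up selector and literal values, not taken from the referenced answer) is to build that structure in the spider before yielding, so the pipeline above only has to serialize it:

def parse(self, response):
    # assemble the nested structure up front; '.game-name::text' and the
    # literal state/date values are illustrative placeholders
    yield {
        'state': 'FL',
        'date': '2017-11-03T14:52:26.007Z',
        'games': [{'name': name}
                  for name in response.css('.game-name::text').extract()],
    }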

Scrapy - Creating nested JSON Object

I'm learning how to work with Scrapy while refreshing my knowledge of Python/coding from school.
Currently, I'm playing around with the IMDb top 250 list but struggling with the JSON output file.
My current code is:
# -*- coding: utf-8 -*-
import scrapy
from top250imdb.items import Top250ImdbItem

class ActorsSpider(scrapy.Spider):
    name = "actors"
    allowed_domains = ["imdb.com"]
    start_urls = ['http://www.imdb.com/chart/top']

    # Parsing each movie and preparing the url for the actors list
    def parse(self, response):
        for film in response.css('.titleColumn'):
            url = film.css('a::attr(href)').extract_first()
            actors_url = 'http://imdb.com' + url[:17] + 'fullcredits?ref_=tt_cl_sm#cast'
            yield scrapy.Request(actors_url, self.parse_actor)

    # Finding all actors and storing them on item
    # Refer to items.py
    def parse_actor(self, response):
        final_list = []
        item = Top250ImdbItem()
        item['poster'] = response.css('#main img::attr(src)').extract_first()
        item['title'] = response.css('h3[itemprop~=name] a::text').extract()
        item['photo'] = response.css('#fullcredits_content .loadlate::attr(loadlate)').extract()
        item['actors'] = response.css('td[itemprop~=actor] span::text').extract()
        final_list.append(item)
        updated_list = []
        for item in final_list:
            for i in range(len(item['title'])):
                sub_item = {}
                sub_item['movie'] = {}
                sub_item['movie']['poster'] = [item['poster']]
                sub_item['movie']['title'] = [item['title'][i]]
                sub_item['movie']['photo'] = [item['photo']]
                sub_item['movie']['actors'] = [item['actors']]
                updated_list.append(sub_item)
        return updated_list
and my output file is giving me this JSON composition:
[
{
"movie": {
"poster": ["https://images-na.ssl-images-amazon.com/poster..."],
"title": ["The Shawshank Redemption"],
"photo": [["https://images-na.ssl-images-amazon.com/photo..."]],
"actors": [["Tim Robbins","Morgan Freeman",...]]}
},{
"movie": {
"poster": ["https://images-na.ssl-images-amazon.com/poster..."],
"title": ["The Godfather"],
"photo": [["https://images-na.ssl-images-amazon.com/photo..."]],
"actors": [["Alexandre Rodrigues", "Leandro Firmino", "Phellipe Haagensen",...]]}
}
]
but I'm looking to achieve this:
{
"movies": [{
"poster": "https://images-na.ssl-images-amazon.com/poster...",
"title": "The Shawshank Redemption",
"actors": [
{"photo": "https://images-na.ssl-images-amazon.com/photo...",
"name": "Tim Robbins"},
{"photo": "https://images-na.ssl-images-amazon.com/photo...",
"name": "Morgan Freeman"},...
]
},{
"poster": "https://images-na.ssl-images-amazon.com/poster...",
"title": "The Godfather",
"actors": [
{"photo": "https://images-na.ssl-images-amazon.com/photo...",
"name": "Marlon Brando"},
{"photo": "https://images-na.ssl-images-amazon.com/photo...",
"name": "Al Pacino"},...
]
}]
}
in my items.py file I have the following:
import scrapy

class Top250ImdbItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Items from actors.py
    poster = scrapy.Field()
    title = scrapy.Field()
    photo = scrapy.Field()
    actors = scrapy.Field()
    movie = scrapy.Field()
    pass
I'm aware of the following things:
My results are not coming out in order: the 1st movie on the web page list is always the first movie in my output file, but the rest are not. I'm still working on that.
I could do the same thing working with Top250ImdbItem(); I'm still looking into how that is done in a more detailed way.
This might not be the perfect layout for my JSON; suggestions are welcome, or if it is, let me know, even though I know there is no perfect way or "the only way".
Some actors don't have a photo, and in that case the page actually uses a different CSS selector. For now, I would like to avoid reaching for the "no picture thumbnail", so it's OK to leave those items empty.
example:
{"photo": "", "name": "Al Pacino"}
Question: ... struggling with a JSON output file
Note: I can't run your ActorsSpider; I get the error Pseudo-elements are not supported.
# Define a `dict` **once**
top250ImdbItem = {'movies': []}

def parse_actor(self, response):
    poster = response.css(...
    title = response.css(...
    photos = response.css(...
    actors = response.css(...

    # Assuming List of Actors are in sync with List of Photos
    actors_list = []
    for i, actor in enumerate(actors):
        actors_list.append({"name": actor, "photo": photos[i]})

    one_movie = {"poster": poster,
                 "title": title,
                 "actors": actors_list
                 }

    # Append One Movie to Top250 'movies' List
    top250ImdbItem['movies'].append(one_movie)
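Note that Scrapy only exports what the callbacks yield; if you yield each one_movie instead of appending it to a module-level dict (an assumption on my part, not something the snippet above spells out), a small pipeline can add the "movies" wrapper when the spider finishes:

# pipelines.py -- hedged sketch: collect every movie dict the spider yields
# and write the {"movies": [...]} wrapper once, when the spider closes.
# The output filename 'top250.json' is an arbitrary choice.
import json

class MoviesJsonPipeline(object):
    def open_spider(self, spider):
        self.movies = []

    def process_item(self, item, spider):
        self.movies.append(dict(item))
        return item

    def close_spider(self, spider):
        with open('top250.json', 'w') as f:
            json.dump({'movies': self.movies}, f, indent=4)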

how to retrieve data ... The page is loaded using ajax

I want to get the costs of mobile phones from this site:
http://www.univercell.in/buy/SMART
I tried to test it, so I used:
scrapy shell http://www.univercell.in/control/AjaxCategoryDetail?productCategoryId=PRO-SMART&category_id=PRO-SMART&attrName=&min=&max=&sortSearchPrice=&VIEW_INDEX=2&VIEW_SIZE=15&serachupload=&sortupload=
But I am not able to connect to this site. As the page is loaded using AJAX, I found the start_url using Firebug. Can anyone suggest where I am going wrong?
How about reproducing the call that is already made when you click a page number, and simply dumping the markup the server returns? In other words, make the calls to the server yourself, as if the site were hosted on your desktop.
The JavaScript function called when you hit a page number is paginateList(numberOfPage), where numberOfPage is the page you want to visit.
The body of the function is
function paginateList(viewIndex) {
    var productCategoryId = document.pageSelect.category_id.value;
    var viewSize = document.pageSelect.VIEW_SIZE.value;
    var min = "";
    if(document.pageSelect.min!=null)
        min = document.pageSelect.min.value;
    var max = "";
    if(document.pageSelect.max!=null)
        max = document.pageSelect.max.value;
    var attrName = "";
    if(document.pageSelect.attrName!=null)
        attrName = document.pageSelect.attrName.value;
    if(attrName==""){
        var commaAttr=document.getElementById('commaAttr');
        attrName=commaAttr.value;
    }
    var limitView = 'true';
    var sortSearchPrice = "";
    if(document.pageSelect.sortSearchPrice!=null)
        sortSearchPrice = document.pageSelect.sortSearchPrice.value;
    var url2="/control/AjaxCategoryDetail?productCategoryId="+productCategoryId+"&category_id="+productCategoryId+"&attrName="+attrName+"&min="+min+"&max="+max+"&sortSearchPrice="+sortSearchPrice+"&VIEW_INDEX="+viewIndex+"&VIEW_SIZE="+viewSize+"&serachupload=&sortupload=";
    pleaseWait('Y');
    jQuery.ajax({url: url2,
        data: null,
        type: 'post',
        async: false,
        success: function(data) {
            $('#searchResult').html(data);
            pleaseWait('N');
        },
        error: function(data) {
            alert("Error during product searching");
        }
    });
}
Use these to get the data from each page recursively.
Hope it helps!
Here's your spider:
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class UnivercellItem(Item):
    vendor = Field()
    model = Field()
    price = Field()

BASE_URL = "http://www.univercell.in/control/AjaxCategoryDetail?productCategoryId=PRO-SMART&category_id=PRO-SMART&attrName=&min=&max=&sortSearchPrice=&VIEW_INDEX=%s&VIEW_SIZE=15&serachupload=&sortupload="

class UnivercellSpider(BaseSpider):
    name = "univercell_spider"
    allowed_domains = ["www.univercell.in"]
    start_urls = [BASE_URL % index for index in range(1, 21)]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        mobiles = hxs.select("//div[@class='productsummary']")
        print mobiles
        for mobile in mobiles:
            item = UnivercellItem()
            item['vendor'] = mobile.select('.//div[1]/div/text()').extract()[0].strip()
            item['model'] = mobile.select('.//div[3]/div[1]/a/text()').extract()[0].strip()
            item['price'] = mobile.select('.//span[@class="regularPrice"]/span/text()').extract()[0].strip()
            yield item
Save it to spider.py and run via scrapy runspider spider.py -o output.json. Then in output.json you will see:
{"model": "T375", "vendor": "LG", "price": "Special Price Click Here"}
{"model": "P725 Optimus 3D Max", "vendor": "LG", "price": "Special Price Click Here"}
{"model": "P705 Optimus L7", "vendor": "LG", "price": "Special Price Click Here"}
{"model": "9320 Curve", "vendor": "Blackberry", "price": "Special Price Click Here"}
{"model": "Xperia Sola", "vendor": "Sony", "price": "Rs.14,500.00"}
{"model": "Xperia U", "vendor": "Sony", "price": "Special Price Click Here"}
{"model": "Lumia 610", "vendor": "Nokia", "price": "Special Price Click Here"}
...
Hope that helps.
