I want to get the prices of mobile phones from this site:
http://www.univercell.in/buy/SMART
To test it I used:
scrapy shell http://www.univercell.in/control/AjaxCategoryDetail?productCategoryId=PRO-SMART&category_id=PRO-SMART&attrName=&min=&max=&sortSearchPrice=&VIEW_INDEX=2&VIEW_SIZE=15&serachupload=&sortupload=
But I am not able to connect to the site. Since the page is loaded via AJAX, I found this start URL using Firebug. Can anyone suggest where I am going wrong?
How about reproducing the actions that are performed when you click a page number, and simply dumping the response the server returns? In other words, make the same calls to the server that the page itself makes.
The JavaScript function called when you hit a number is paginateList('numberOfPage') where numberOfPage is the page you want to visit.
The body of the function is
function paginateList(viewIndex) {
    var productCategoryId = document.pageSelect.category_id.value;
    var viewSize = document.pageSelect.VIEW_SIZE.value;
    var min = "";
    if (document.pageSelect.min != null)
        min = document.pageSelect.min.value;
    var max = "";
    if (document.pageSelect.max != null)
        max = document.pageSelect.max.value;
    var attrName = "";
    if (document.pageSelect.attrName != null)
        attrName = document.pageSelect.attrName.value;
    if (attrName == "") {
        var commaAttr = document.getElementById('commaAttr');
        attrName = commaAttr.value;
    }
    var limitView = 'true';
    var sortSearchPrice = "";
    if (document.pageSelect.sortSearchPrice != null)
        sortSearchPrice = document.pageSelect.sortSearchPrice.value;
    var url2 = "/control/AjaxCategoryDetail?productCategoryId=" + productCategoryId + "&category_id=" + productCategoryId + "&attrName=" + attrName + "&min=" + min + "&max=" + max + "&sortSearchPrice=" + sortSearchPrice + "&VIEW_INDEX=" + viewIndex + "&VIEW_SIZE=" + viewSize + "&serachupload=&sortupload=";
    pleaseWait('Y');
    jQuery.ajax({
        url: url2,
        data: null,
        type: 'post',
        async: false,
        success: function (data) {
            $('#searchResult').html(data);
            pleaseWait('N');
        },
        error: function (data) {
            alert("Error during product searching");
        }
    });
}
Use these to get the data from each page recursively.
Hope it helps!
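As a quick sanity check outside Scrapy, the same call can be replicated in Python with the requests library. This is only a sketch and assumes the endpoint accepts the same query string whether it is sent by the page's JavaScript or by a script:

import requests

BASE = ("http://www.univercell.in/control/AjaxCategoryDetail"
        "?productCategoryId=PRO-SMART&category_id=PRO-SMART&attrName=&min=&max="
        "&sortSearchPrice=&VIEW_INDEX={page}&VIEW_SIZE=15&serachupload=&sortupload=")

for page in range(1, 4):  # first three result pages
    resp = requests.post(BASE.format(page=page))  # the page's JS uses POST
    print(page, resp.status_code, len(resp.text))  # each response is an HTML fragment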
Here's your spider:
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector


class UnivercellItem(Item):
    vendor = Field()
    model = Field()
    price = Field()


BASE_URL = "http://www.univercell.in/control/AjaxCategoryDetail?productCategoryId=PRO-SMART&category_id=PRO-SMART&attrName=&min=&max=&sortSearchPrice=&VIEW_INDEX=%s&VIEW_SIZE=15&serachupload=&sortupload="


class UnivercellSpider(BaseSpider):
    name = "univercell_spider"
    allowed_domains = ["www.univercell.in"]
    start_urls = [BASE_URL % index for index in range(1, 21)]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        mobiles = hxs.select("//div[@class='productsummary']")
        for mobile in mobiles:
            item = UnivercellItem()
            item['vendor'] = mobile.select('.//div[1]/div/text()').extract()[0].strip()
            item['model'] = mobile.select('.//div[3]/div[1]/a/text()').extract()[0].strip()
            item['price'] = mobile.select('.//span[@class="regularPrice"]/span/text()').extract()[0].strip()
            yield item
Save it to spider.py and run via scrapy runspider spider.py -o output.json. Then in output.json you will see:
{"model": "T375", "vendor": "LG", "price": "Special Price Click Here"}
{"model": "P725 Optimus 3D Max", "vendor": "LG", "price": "Special Price Click Here"}
{"model": "P705 Optimus L7", "vendor": "LG", "price": "Special Price Click Here"}
{"model": "9320 Curve", "vendor": "Blackberry", "price": "Special Price Click Here"}
{"model": "Xperia Sola", "vendor": "Sony", "price": "Rs.14,500.00"}
{"model": "Xperia U", "vendor": "Sony", "price": "Special Price Click Here"}
{"model": "Lumia 610", "vendor": "Nokia", "price": "Special Price Click Here"}
...
Hope that helps.
Related
I am trying to scrape Amazon's products using Scrapy with the crawl template, but I found that Amazon uses JavaScript to render some blocks of the product details, so I decided to use Splash to render the JavaScript. It works fine in the shell command, but I can't figure out how to implement it in my code.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class AmazonCrawlerSpider(CrawlSpider):
    name = 'amazon_Crawler'
    allowed_domains = ['amazon.com']
    start_urls = ['https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A502394&ref=nav_em__nav_desktop_sa_intl_camera_and_photo_0_2_5_3']

    len_product_details = LinkExtractor(restrict_css='h2 > a')
    product_details = Rule(len_product_details,
                           callback='parse_item', follow=False)

    len_products_pagination = LinkExtractor(
        restrict_xpaths='//*[@id="search"]/div[1]/div[1]/div/span[3]/div[2]/div[37]/div/div/span/a[3]')
    products_pagination = Rule(len_products_pagination, follow=True)

    rules = (
        product_details, products_pagination
    )

    def parse_item(self, response):
        data = {
            "categorie_0": response.xpath('//*[@id="wayfinding-breadcrumbs_feature_div"]/ul/li[1]/span/a/text()').get(),
            "categorie_1": response.xpath('//*[@id="wayfinding-breadcrumbs_feature_div"]/ul/li[3]/span/a/text()').get(),
            "title": response.css('h1 > span ::text').get(),
            "price": response.xpath('//div[@id="corePrice_feature_div"]/div/span/span[1]//text()').get(),
            "amazon_globale_shiping": response.xpath('//*[@id="a-popover-content-2"]/table/tbody/tr[2]/td[3]/span/text()').get(),
            "estimated_import_fees_deposit": response.xpath('//*[@id="a-popover-content-2"]/table/tbody/tr[3]/td[3]/span/text()').get(),
            "total": response.xpath('//*[@id="a-popover-content-2"]/table/tbody/tr[5]/td[3]/span/text()').get(),
            "delevery_period": response.xpath('//*[@id="mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE"]/span/span/text()').get(),
            "delevery_destination": response.xpath('//*[@id="contextualIngressPtLabel_deliveryShortLine"]/span[2]/text()').get(),
            "in_stock": response.xpath('//*[@id="availability"]/span/text()').get(),
            "quantity": "not_exist",
            "ship_from": response.xpath('//*[@id="tabular-buybox"]/div[1]/div[2]/div/span/text()').get(),
            "sold_by": {
                "name": response.xpath('//*[@id="sellerProfileTriggerId"]/text()').get(),
                'store_url': response.xpath('//*[@id="sellerProfileTriggerId"]/@href').get(),
                'packaging': response.xpath('//*[@id="tabular-buybox"]/div[1]/div[6]/div/span/text()').get()
            },
            "description": response.xpath('//*[@id="productDescription"]/p/text()').get(),
            # "brand": response.xpath('//*[@id="productOverview_feature_div"]/div/table/tbody/tr[1]/td[2]/span/text()').get(),
            "is_returned": response.xpath('//*[@id="productSupportAndReturnPolicy-return-policy-popover-celWidget"]/div/div[1]/text()').get(),
            "extra_info": [],
            "details": [],
            "about_this_item": [],
            "note": response.xpath('//*[@id="universal-product-alert"]/div/span[2]/text()').get(),
            "Q_AW": [],
            "Customer_reviews": {
                "customer_rate": response.xpath('//*[@id="reviewsMedley"]/div/div[1]/div[2]/div[1]/div/div[2]/div/span/span/text()').get(),
                "total_rate": response.xpath('//*[@id="reviewsMedley"]/div/div[1]/div[2]/div[2]/span/text()').get(),
                "global_rate": {
                    "1_star": response.xpath('//*[@id="histogramTable"]/tbody/tr[5]/td[3]/span[2]/a/text()').get(),
                    "2_star": response.xpath('//*[@id="histogramTable"]/tbody/tr[4]/td[3]/span[2]/a/text()').get(),
                    "3_star": response.xpath('//*[@id="histogramTable"]/tbody/tr[3]/td[3]/span[2]/a/text()').get(),
                    "4_star": response.xpath('//*[@id="histogramTable"]/tbody/tr[2]/td[3]/span[2]/a/text()').get(),
                    "5_star": response.xpath('//*[@id="histogramTable"]/tbody/tr[1]/td[3]/span[2]/a/text()').get(),
                },
                "rate_by_feature": [],
                "product_reviews": []
            },
            "url": response.url
        }

        for reveiw in response.xpath('//*[@id="cm-cr-dp-review-list"]/div'):
            data["Customer_reviews"]["product_reviews"].append(
                {
                    "rate": reveiw.xpath('/div/div/div[2]/a/i/span/text()').get(),
                    "feature": reveiw.xpath('div/div/div[2]/a[2]/span/text()').get(),
                    "date_from": reveiw.xpath('div/div/span/text()').get(),
                    "verified": reveiw.xpath('div/div/div[3]/span[2]/text()').get(),
                    "review": reveiw.xpath('div/div/div[4]/span/div/div[1]/span/text()').get(),
                    'view_reaction': reveiw.xpath('div/div/div[5]/span[1]/div[1]/span/text()').get()
                }
            )

        for cr_rf in response.xpath('//*[@id="cr-summarization-attributes-list"]/div'):
            data["Customer_reviews"]["rate_by_feature"].append(
                {
                    "key": cr_rf.xpath('div/div/div/div/span/text()').get(),
                    "value": response.xpath('div/div/div[2]/span[2]/text()').get()
                }
            )

        for Q_AW in response.xpath('//*[@id="ask-btf-container"]/div/div/div[2]/span/div/div'):
            data["Q_AW"].append(
                {
                    "Question": Q_AW.xpath('div/div[2]/div/div/div[2]/a/span/text()').get(),
                    "Answer": Q_AW.xpath('div/div[2]/div[2]/div/div[2]/span/span[2]/text()').get(),
                    "vote": Q_AW.xpath('div/div/ul/li[2]/span[1]/text()').get(),
                    "date_answer": Q_AW.xpath('div/div[2]/div[2]/div/div[2]/span[3]/text()').get()
                }
            )

        for extra_info in response.xpath('//*[@id="productDetails_detailBullets_sections1"]/tbody/tr'):
            data["extra_info"].append(
                {
                    "1": extra_info.css('th::text').get(),
                    "2": extra_info.css('td::text').get()
                }
            )

        for index, about_this_item in enumerate(response.xpath('//*[@id="feature-bullets"]/ul/li')):
            data["about_this_item"].append(
                {
                    index + 1: about_this_item.xpath('span/text()').get(),
                }
            )

        for extra in response.xpath('//*[@id="productOverview_feature_div"]/div/table/tbody/tr'):
            data['details'].append(
                {
                    extra.xpath('td[1]/span/text()').get(): extra.css('td[2]/span/text()').get()
                }
            )

        yield data
I think you have a problem on line 20: you forgot to define the correct function, given that the previous loop was undefined.
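As for the original Splash question, below is a minimal sketch of how scrapy-splash is commonly wired into a spider. It assumes the scrapy-splash package is installed and a Splash instance is running on localhost:8050; the start URL, selectors, and wait times are illustrative only, and it uses a plain Spider instead of the CrawlSpider rules:

import scrapy
from scrapy_splash import SplashRequest

class AmazonSplashSpider(scrapy.Spider):
    name = "amazon_splash"
    start_urls = ["https://www.amazon.com/s?i=specialty-aps&bbn=16225009011"]

    custom_settings = {
        "SPLASH_URL": "http://localhost:8050",
        "DOWNLOADER_MIDDLEWARES": {
            "scrapy_splash.SplashCookiesMiddleware": 723,
            "scrapy_splash.SplashMiddleware": 725,
            "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 810,
        },
        "SPIDER_MIDDLEWARES": {
            "scrapy_splash.SplashDeduplicateArgsMiddleware": 100,
        },
        "DUPEFILTER_CLASS": "scrapy_splash.SplashAwareDupeFilter",
    }

    def start_requests(self):
        for url in self.start_urls:
            # render.html returns the page after its JavaScript has run
            yield SplashRequest(url, self.parse, endpoint="render.html", args={"wait": 2})

    def parse(self, response):
        # follow each product link through Splash as well
        for href in response.css("h2 > a::attr(href)").getall():
            yield SplashRequest(response.urljoin(href), self.parse_item, args={"wait": 2})

    def parse_item(self, response):
        yield {"title": response.css("h1 > span ::text").get()}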
I'm trying to learn how to log in with my spider. For that, I created the code below. The expected outcome is:
{
"username": "willingc",
"email": "carolcode#willingconsulting.com",
"url": "https://www.willingconsulting.com",
}
However, the actual outcome is:
{
"username": "willingc",
"email": None,
"url": "https://www.willingconsulting.com",
}
None usually occurs if the browser isn't logged in. Do you see any errors in my code? The only indicator I see that something is wrong is the following WARNING:
WARNING:py.warnings:/workspace/.pip-modules/lib/python3.8/site-packages/scrapy/spidermiddlewares/referer.py:287:
RuntimeWarning: Could not load referrer policy
'origin-when-cross-origin, strict-origin-when-cross-origin'
import scrapy
from scrapy.http import FormRequest


class GitHubSpider(scrapy.Spider):
    name = "github"
    allowed_domains = ["github.com"]
    start_urls = ["https://github.com/login"]

    def parse(self, response):
        token = response.xpath('//form/input[@name="authenticity_token"]/@value').get()
        return FormRequest.from_response(
            response,
            formdata={
                "authenticity_token": token,
                "login": "mygithub@gmail.com",
                "password": "12345",
            },
            callback=self.parse_after_login,
        )

    def parse_after_login(self, response):
        yield scrapy.Request(
            url="https://github.com/willingc",
            callback=self.parse_engineer,
        )

    def parse_engineer(self, response):
        yield {
            "username": response.css(".vcard-username::text").get().strip(),
            "email": response.xpath('//li[@itemprop="email"]/a//text()').get(),
            "url": response.xpath('//li[@itemprop="url"]/a//@href').get(),
        }
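One thing worth checking (a suggestion, not a confirmed diagnosis): verify in parse_after_login that the login actually succeeded before requesting the profile page, since a silent login failure would also produce email: None. The marker strings below are assumptions about GitHub's login page, not verified values:

    def parse_after_login(self, response):
        # hypothetical sanity check: if GitHub bounced us back to the login
        # page or shows an error flash, the credentials/token did not work
        if "/login" in response.url or "Incorrect username or password" in response.text:
            self.logger.error("Login appears to have failed: %s", response.url)
            return
        yield scrapy.Request(
            url="https://github.com/willingc",
            callback=self.parse_engineer,
        )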
I'm trying to scrape a web page with Scrapy, but I noticed it won't work. When I parsed the page through my IPython shell, it returned this:
'دانلود کتاب و کتاب صوتی با طاقچه\n // more info: http://angulartics.github.io/\n (function (i, s, o, g, r, a, m) {\n i[\'GoogleAnalyticsObject\'] = r; i[r] = i[r] || function () {\n (i[r].q = i[r].q || []).push(arguments)\n }, i[r].l = 1 * new Date(); a = s.createElement(o),\n m = s.getElementsByTagName(o)[0]; a.async = 1; a.src = g; m.parentNode.insertBefore(a, m)\n })(window, document, \'script\', \'//www.google-analytics.com/analytics.js\', \'ga\');\n ga(\'create\', \'UA-57199074-1\', { \'cookieDomain\': location.hostname == \'localhost\' ? \'none\' : \'auto\' });\n ga(\'require\', \'ec\');\n Taaghche works best with JavaScript enabled{ "#context": "http://schema.org", "#type": "WebSite", "url": "https://taaghche.ir/", "name": "طاقچه", "alternateName": "نزدیکترین کتاب فروشی شهر", "potentialAction": { "#type": "SearchAction", "target": "https://taaghche.ir/search?term={search_term_string}", "query-input": "required name=search_term_string" } }{ "#context": "http://schema.org", "#type": "Organization", "url": "https://taaghche.ir", "logo": "https://taaghche.ir/assets/images/taaghchebrand.png", "contactPoint": [{ "#type": "ContactPoint", "telephone": "+۹۸-۲۱-۸۸۱۴۹۸۱۶", "contacttype": "customer support", "areaServed": "IR" }] }'
It looks more like a JSON response. How can I scrape through it? By the way, my scraper looks like this:
import pandas as pd
import scrapy


class Taaghche(scrapy.Spider):
    name = 'taaghche'

    def start_requests(self):
        urls = []
        link = 'https://taaghche.ir/search?term='
        data = pd.read_csv('books.csv')
        titles = data.title
        for title in titles:
            key = title.replace(" ", "%20")
            urls.append(link + key)
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_front)

    def parse_front(self, response):
        booklinks = response.xpath('//a[@class="book-link"][1]/@href').extract_first()
        # print(booklinks)
        # for booklink in booklinks:
        yield response.follow(url=booklinks, callback=self.parse_page)

    def parse_page(self, response):
        ...
The website content is not rendered server side; it is rendered by JavaScript. In this case you can use either:
Selenium (integrate Selenium with Scrapy), or
check the request URLs in the browser's network tab. There might be an API URL, and you can get the data from that URL directly (see the sketch below).
There might be other possible solutions.
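As an illustration of the second option, here is a minimal sketch; the endpoint, query parameter, and JSON keys are purely hypothetical placeholders for whatever the network tab actually shows:

import scrapy

class TaaghcheApiSpider(scrapy.Spider):
    name = 'taaghche_api'
    # hypothetical XHR endpoint -- replace with the URL seen in the network tab
    start_urls = ['https://taaghche.ir/some/api/search?term=example']

    def parse(self, response):
        payload = response.json()  # Scrapy >= 2.2; otherwise use json.loads(response.text)
        for book in payload.get('books', []):  # key names are assumptions
            yield {
                'title': book.get('title'),
                'price': book.get('price'),
            }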
I am new to Python and Scrapy, and I am trying to create a JSON file from 3 levels of nested pages. I have the following structure:
Page 1 (start): contains links of second page (called Mangas)
Page 2: Contains nested Volumes and Chapters
Page 3: Each Chapter contains multiple images
My Code
import scrapy
import time
import items
import json


class GmangaSpider(scrapy.Spider):
    name = "gmanga"
    start_urls = [
        "http://gmanga.me/mangas"
    ]

    def parse(self, response):
        # mangas = []
        for manga in response.css('div.manga-item'):
            link = manga.css('a.manga-item-content').xpath('@href').extract_first()
            if link:
                page_link = "http://gmanga.me%s" % link
                mangas = items.Manga()
                mangas['cover'] = manga.css('a.manga-item-content .manga-cover-container img').xpath('@src').extract_first()
                mangas['title'] = manga.css('a.manga-item-content .manga-cover-container img').xpath('@alt').extract_first()
                mangas['link'] = page_link
                mangas['volumes'] = []
                yield scrapy.Request(page_link, callback=self.parse_volumes, meta={"mangas": mangas})

    def parse_volumes(self, response):
        mangas = response.meta['mangas']
        for manga in response.css('div.panel'):
            volume = items.Volume()
            volume['name'] = manga.css('div.panel-heading .panel-title a::text').extract_first()
            volume['chapters'] = []
            for tr in manga.css('div.panel-collapse .panel-body table tbody tr'):
                chapter = items.Chapter()
                chapter['name'] = tr.css('td:nth_child(1) div::text').extract_first()
                chapter_link = tr.css('td:nth_child(3) a::attr("href")').extract_first()
                chapter['link'] = chapter_link
                request = scrapy.Request("http://gmanga.me%s" % chapter_link, callback=self.parse_images, meta={"chapter": chapter})
                yield request
                volume['chapters'].append(chapter)
            mangas['volumes'].append(volume)
        yield mangas

    def parse_images(self, response):
        chapter = response.meta['chapter']
        data = response.xpath("//script").re("alphanumSort\((.*])")
        if data:
            images = json.loads(data[0])
            chapter['images'] = images
        return chapter
My Items.py
from scrapy import Item, Field


class Manga(Item):
    title = Field()
    cover = Field()
    link = Field()
    volumes = Field()


class Volume(Item):
    name = Field()
    chapters = Field()


class Chapter(Item):
    name = Field()
    images = Field()
    link = Field()
Now I am a bit confused about where to yield or return in the parse_volumes function to get the following structure in the JSON file.
Expected Result:
[{
"cover": "http://media.gmanga.me/uploads/manga/cover/151/medium_143061.jpg",
"link": "http://gmanga.me/mangas/gokko",
"volumes": [{
"name": "xyz",
"chapters": [{
"link": "/mangas/gokko/4/3asq",
"name": "4",
"images": ["img1.jpg", "img2.jpg"]
}, {
"link": "/mangas/gokko/3/3asq",
"name": "3",
"images": ["img1.jpg", "img2.jpg"]
}]
}],
"title": "Gokko"
}]
But I am getting the images node as a separate node; it must be inside the chapters node of the volume:
[{
"cover": "http://media.gmanga.me/uploads/manga/cover/10581/medium_I2.5HFzVh7e.png",
"link": "http://gmanga.me/mangas/godess-creation-system",
"volumes": [{
"name": "\u0627\u0644\u0645\u062c\u0644\u062f ",
"chapters": [{
"link": "/mangas/godess-creation-system/1/ayou-cahn",
"name": "1"
}]
}],
"title": "Godess Creation System"
},
{
"images": ["http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/01.jpg?ak=p0skml", "http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/02.jpg?ak=p0skml", "http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/03.jpg?ak=p0skml", "http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/04.jpg?ak=p0skml"],
"link": "/mangas/reversal/1/Lolly-Pop",
"name": "1"
}]
Each function is individually fetching data properly; the only issue is the JSON formation. It is not being written to the JSON file properly. Please point out where I am going wrong.
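One common pattern for nested items like this, sketched against the spider above but not tested on the site, is to stop yielding the half-built manga from parse_volumes and instead chain the chapter requests, yielding the manga only after the last chapter's images have been filled in. The helper name next_chapter_or_item is made up for this sketch; the methods below would replace parse_volumes and parse_images inside GmangaSpider and reuse its existing imports:

    def parse_volumes(self, response):
        mangas = response.meta['mangas']
        pending = []  # (chapter, url) pairs whose images still need fetching
        for panel in response.css('div.panel'):
            volume = items.Volume()
            volume['name'] = panel.css('div.panel-heading .panel-title a::text').extract_first()
            volume['chapters'] = []
            for tr in panel.css('div.panel-collapse .panel-body table tbody tr'):
                chapter = items.Chapter()
                chapter['name'] = tr.css('td:nth-child(1) div::text').extract_first()
                chapter['link'] = tr.css('td:nth-child(3) a::attr("href")').extract_first()
                volume['chapters'].append(chapter)
                pending.append((chapter, "http://gmanga.me%s" % chapter['link']))
            mangas['volumes'].append(volume)
        # fetch the first chapter; the manga itself is yielded only when none are left
        yield self.next_chapter_or_item(mangas, pending)

    def next_chapter_or_item(self, mangas, pending):
        if not pending:
            return mangas
        chapter, url = pending.pop()
        return scrapy.Request(url, callback=self.parse_images,
                              meta={'mangas': mangas, 'chapter': chapter, 'pending': pending})

    def parse_images(self, response):
        chapter = response.meta['chapter']
        data = response.xpath("//script").re("alphanumSort\((.*])")
        if data:
            chapter['images'] = json.loads(data[0])
        yield self.next_chapter_or_item(response.meta['mangas'], response.meta['pending'])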
I'm learning how to work with Scrapy while refreshing my Python and coding knowledge from school.
Currently, I'm playing around with the IMDb top 250 list but struggling with the JSON output file.
My current code is:
# -*- coding: utf-8 -*-
import scrapy
from top250imdb.items import Top250ImdbItem


class ActorsSpider(scrapy.Spider):
    name = "actors"
    allowed_domains = ["imdb.com"]
    start_urls = ['http://www.imdb.com/chart/top']

    # Parsing each movie and preparing the url for the actors list
    def parse(self, response):
        for film in response.css('.titleColumn'):
            url = film.css('a::attr(href)').extract_first()
            actors_url = 'http://imdb.com' + url[:17] + 'fullcredits?ref_=tt_cl_sm#cast'
            yield scrapy.Request(actors_url, self.parse_actor)

    # Finding all actors and storing them on item
    # Refer to items.py
    def parse_actor(self, response):
        final_list = []
        item = Top250ImdbItem()
        item['poster'] = response.css('#main img::attr(src)').extract_first()
        item['title'] = response.css('h3[itemprop~=name] a::text').extract()
        item['photo'] = response.css('#fullcredits_content .loadlate::attr(loadlate)').extract()
        item['actors'] = response.css('td[itemprop~=actor] span::text').extract()
        final_list.append(item)
        updated_list = []
        for item in final_list:
            for i in range(len(item['title'])):
                sub_item = {}
                sub_item['movie'] = {}
                sub_item['movie']['poster'] = [item['poster']]
                sub_item['movie']['title'] = [item['title'][i]]
                sub_item['movie']['photo'] = [item['photo']]
                sub_item['movie']['actors'] = [item['actors']]
                updated_list.append(sub_item)
        return updated_list
and my output file gives me this JSON structure:
[
{
"movie": {
"poster": ["https://images-na.ssl-images-amazon.com/poster..."],
"title": ["The Shawshank Redemption"],
"photo": [["https://images-na.ssl-images-amazon.com/photo..."]],
"actors": [["Tim Robbins","Morgan Freeman",...]]}
},{
"movie": {
"poster": ["https://images-na.ssl-images-amazon.com/poster..."],
"title": ["The Godfather"],
"photo": [["https://images-na.ssl-images-amazon.com/photo..."]],
"actors": [["Alexandre Rodrigues", "Leandro Firmino", "Phellipe Haagensen",...]]}
}
]
but I'm looking to achieve this:
{
"movies": [{
"poster": "https://images-na.ssl-images-amazon.com/poster...",
"title": "The Shawshank Redemption",
"actors": [
{"photo": "https://images-na.ssl-images-amazon.com/photo...",
"name": "Tim Robbins"},
{"photo": "https://images-na.ssl-images-amazon.com/photo...",
"name": "Morgan Freeman"},...
]
},{
"poster": "https://images-na.ssl-images-amazon.com/poster...",
"title": "The Godfather",
"actors": [
{"photo": "https://images-na.ssl-images-amazon.com/photo...",
"name": "Marlon Brando"},
{"photo": "https://images-na.ssl-images-amazon.com/photo...",
"name": "Al Pacino"},...
]
}]
}
in my items.py file I have the following:
import scrapy


class Top250ImdbItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Items from actors.py
    poster = scrapy.Field()
    title = scrapy.Field()
    photo = scrapy.Field()
    actors = scrapy.Field()
    movie = scrapy.Field()
    pass
I'm aware of the following things:
My results are not coming out in order; the first movie in the web page list is always the first movie in my output file, but the rest are not. I'm still working on that.
I could do the same thing working with Top250ImdbItem(); I'm still looking into how that is done in more detail.
This might not be the perfect layout for my JSON; suggestions are welcome, or if it is fine, let me know, even though I know there is no perfect or "only" way.
Some actors don't have a photo, and a different CSS selector is loaded in that case. For now I would like to avoid fetching the "no picture" thumbnail, so it's OK to leave those items empty.
example:
{"photo": "", "name": "Al Pacino"}
Question: ... struggling with a JSON output file
Note: I can't use your ActorsSpider; I get the error Pseudo-elements are not supported.
# Define a `dict` **once**
top250ImdbItem = {'movies': []}

def parse_actor(self, response):
    poster = response.css(...
    title = response.css(...
    photos = response.css(...
    actors = response.css(...

    # Assuming List of Actors are in sync with List of Photos
    actors_list = []
    for i, actor in enumerate(actors):
        actors_list.append({"name": actor, "photo": photos[i]})

    one_movie = {"poster": poster,
                 "title": title,
                 "actors": actors_list
                 }

    # Append One Movie to Top250 'movies' List
    top250ImdbItem['movies'].append(one_movie)
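To get that accumulated dict into a file, one possible completion of the sketch (assuming top250ImdbItem is defined as a class attribute of the spider) is to dump it when the spider closes, instead of relying on the -o feed export:

import json  # at the top of the spider module

    def closed(self, reason):
        # Scrapy calls closed() automatically when the crawl finishes
        with open('top250.json', 'w') as f:
            json.dump(self.top250ImdbItem, f, indent=2)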