flask app doesn't use scrapy custom settings - python

I have a perfectly functioning spider that I am integrating into a Flask web app. The spider runs when I want it to, but it doesn't use the settings provided in the project's settings.py file when run from the Flask app, so I get a 403 response. Note that when I run the spider from the command line, it uses the settings just fine. How can I get the Flask app to run the spider WITH the correct settings?
My spider code:
import re
from datetime import datetime

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import CloseSpider
from scrapy.utils.project import get_project_settings


class PostsSpider(scrapy.Spider):
    page_number = 0
    name = 'posts'
    allowed_domains = ['ra.co']
    # global artist
    # artist = input('Artist name:')
    start_urls = [
        f'https://ra.co/dj/ouladomar/past-events?'
    ]

    def parse(self, response):
        if len(response.css('li.Column-sc-18hsrnn-0.inVJeD')) == 0:
            raise CloseSpider('No more events')
        for link in response.css('li.Column-sc-18hsrnn-0.inVJeD div h3 a::attr(href)'):
            yield response.follow(link.get(), callback=self.parse_act)
        next_page = f'https://ra.co/dj/ouladomar/past-events?page={str(PostsSpider.page_number)}'
        PostsSpider.page_number += 1
        print(PostsSpider.page_number)
        yield response.follow(next_page, callback=self.parse)

    def parse_act(self, response):
        date = response.xpath('//*[@id="__next"]/div[2]/header/div/div[2]//div/ul/li[2]/div/div[2]/a/span/text()').get()
        event = response.xpath('//*[@id="__next"]/div[2]/header/div//div/div/div[2]/h1/span/text()').get()
        promotors = response.xpath('//*[@id="__next"]/div[2]/header/div/div[2]/div[2]/div/ul/li[3]/div/div[2]/a/span/text()').getall()
        location = response.xpath('//*[@id="__next"]/div[2]/header/div//div[1]/div/div/div[1]/nav/ul/li[1]/div/a/span/text()').get()
        country = response.xpath('//*[@id="__next"]/div[2]/header/div//div[1]/div/div/div[1]/nav/ul/li[1]/div/a').attrib['href']
        venue = response.xpath('//*[@id="__next"]/div[2]/header/div/div[2]//div/ul/li[1]/div//span/text()')[1].get()
        acts = response.xpath('//*[@id="__next"]/div[2]/section[1]/div/section[1]/div/div/div[2]/ul/li[1]/div/span/a/span/text()').getall()
        date = re.sub(r'^.*?, ', '', date)
        promotors = ', '.join(promotors)
        if len(date) == 4:
            date = f'99-99-{date}'
        elif len(date) >= 15:
            date = date[5:]
        elif date[-4:-2] == '20':
            date = datetime.strptime(date, '%b %d, %Y').strftime('%d-%m-%Y')
        else:
            date = datetime.strptime(date, '%d %b').strftime('%d-%m') + '-2023'
        country = country.split('/')[-2].upper()
        acts = ', '.join(acts)
        item = {
            'date': date,
            'Event': event,
            'promotors': promotors,
            'Location': location,
            'Country': country,
            'Venue': venue,
            'Acts': acts
        }
        yield item


if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl(PostsSpider)
    process.start()
My Flask app:
import subprocess
import time

import crochet
crochet.setup()

from flask import Flask, render_template, jsonify, request, redirect, url_for
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.signalmanager import dispatcher

from ScrapyProject.scraperra.scraperra.spiders.artistspider import PostsSpider

# Creating Flask app variable
app = Flask(__name__)

output_data = []
crawl_runner = CrawlerRunner()


# By default Flask comes here when we run the file
@app.route('/')
def index():
    return render_template("index.html")  # Returns index.html from the templates folder.


# After clicking the Submit button, Flask comes here
@app.route('/', methods=['POST'])
def submit():
    if request.method == 'POST':
        return redirect(url_for('scrape'))  # Pass control to the scrape view


@app.route("/scrape")
def scrape():
    process = subprocess.Popen('python ScrapyProject/scraperra/scraperra/spiders/artistspider.py', shell=True)
    process.wait()
    return jsonify(output_data)  # Returns the scraped data once the subprocess has finished.


if __name__ == "__main__":
    app.run(debug=True)
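
One likely explanation (an inference from the code above, not something stated in the question): get_project_settings() only finds your settings.py if Scrapy can locate scrapy.cfg starting from the current working directory, or if the SCRAPY_SETTINGS_MODULE environment variable is set. Running the spider from the command line inside the project folder satisfies the first condition; the subprocess spawned by Flask usually runs from the web app's directory, so neither condition holds, Scrapy falls back to its default settings (default user agent and all), and the site answers 403. Below is a minimal sketch of two ways to adjust the subprocess call, assuming the project root (the folder containing scrapy.cfg) is ScrapyProject/scraperra and the settings module is scraperra.settings; both paths are guesses based on the import above.

import os
import subprocess

PROJECT_ROOT = 'ScrapyProject/scraperra'      # assumed: folder that contains scrapy.cfg
SETTINGS_MODULE = 'scraperra.settings'        # assumed: dotted path to the project's settings.py

# Option 1: run the spider with the project root as the working directory so that
# get_project_settings() can find scrapy.cfg and load settings.py.
process = subprocess.Popen(
    'python scraperra/spiders/artistspider.py',
    shell=True,
    cwd=PROJECT_ROOT,
)
process.wait()

# Option 2: keep the original command but point Scrapy at the settings module
# explicitly; get_project_settings() honours this environment variable.
process = subprocess.Popen(
    'python ScrapyProject/scraperra/scraperra/spiders/artistspider.py',
    shell=True,
    env=dict(os.environ, SCRAPY_SETTINGS_MODULE=SETTINGS_MODULE),
)
process.wait()

The same applies if you later drop the subprocess and use the CrawlerRunner created above: CrawlerRunner() with no arguments ignores settings.py, so build it as CrawlerRunner(get_project_settings()) instead.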

Related

Upload a file from Flutter, process it in a REST API, and send it back to Flutter

I'm trying to create communication between a Flutter app and a Flask REST API. I want to send an EPUB file from Flutter to the API, process it into TXT, and finally send it back to Flutter. The problem is that I don't want to create an HTML upload box for the file, to keep the app's UX and UI simple for the user.
On the server side I currently use some example EPUB files stored locally, so a GET request from Chrome works fine, but I don't know how to upload a file from the Flutter app to Flask.
This is the Flask part:
from flask import Flask, send_file, send_from_directory, flash, request, redirect, url_for, abort
from werkzeug.utils import secure_filename
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import os


# //////////////////////////////----epub to txt------///////////////////////////////////
def epub2thtml(epub_path):
    book = epub.read_epub(epub_path)
    chapters = []
    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            chapters.append(item.get_content())
    print(chapters)
    return chapters


blacklist = ['[document]', 'noscript', 'header', 'html', 'meta', 'head', 'input', 'script', ]


def chap2text(chap):
    output = ''
    soup = BeautifulSoup(chap, 'html.parser')
    text = soup.find_all(text=True)
    for t in text:
        if t.parent.name not in blacklist:
            output += '{} '.format(t)
    return output


def thtml2ttext(thtml):
    Output = []
    for html in thtml:
        text = chap2text(html)
        Output.append(text)
    return Output


def epub2text(epub_path, nomefile):
    chapters = epub2thtml(epub_path)
    ttext = thtml2ttext(chapters)
    # print(ttext)
    a = 0
    s = ' '
    with open(nomefile, 'w') as f:
        b = s.join(ttext)
        testo = b.split()
        for i in testo:
            f.write(testo[a])
            f.write('\n')
            a = a + 1
        print(testo)
        f.close()


# //////////////////////////////----server------///////////////////////////////////
app = Flask(__name__)
app.config["CLIENT_EPUB"] = "...../libri"


@app.route('/epub/<string:epubfilename>', methods=['GET', 'POST'])
def get_txt(epubfilename):
    nometxt = str(os.path.splitext(epubfilename)) + ".txt"
    epub2text(epubfilename, nometxt)
    try:
        return send_from_directory(app.config["CLIENT_EPUB"], nometxt, as_attachment=True)
    except FileNotFoundError:
        abort(404)


if __name__ == '__main__':
    app.run(debug=True, port=2500)
For the Flutter part I wanted to use a multipart request for these calls, for example:
void sendRequest() {
  var request = MultipartRequest();
  final imagePath = ('assets/testo.txt');
  request.setUrl(...url);
  request.addFile("text", imagePath);
  Response response = request.send();
  response.onError = () {
    print("Error");
  };
  response.onComplete = (response) {
    print(response);
  };
  response.progress.listen((int progress) {
    print("progress from response object " + progress.toString());
  });
}
How can I upload the file from the app to the API without the form, and send it back? Thanks in advance; I think it isn't that difficult but I can't figure it out!
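
A minimal sketch of the Flask side, assuming the Flutter client posts the EPUB as a multipart field named "text" (matching the addFile call above); the /convert route name and the uploads folder are made up for illustration. No HTML form is needed on the server: Flask exposes any multipart upload through request.files, no matter how the client built the request.

import os

from flask import Flask, request, send_file, abort
from werkzeug.utils import secure_filename

app = Flask(__name__)
UPLOAD_DIR = "uploads"  # hypothetical folder for incoming EPUBs
os.makedirs(UPLOAD_DIR, exist_ok=True)


@app.route('/convert', methods=['POST'])   # hypothetical endpoint name
def convert():
    # The multipart field name must match what the Flutter client sends ("text" above).
    uploaded = request.files.get('text')
    if uploaded is None or uploaded.filename == '':
        abort(400, 'No file uploaded')

    epub_path = os.path.join(UPLOAD_DIR, secure_filename(uploaded.filename))
    uploaded.save(epub_path)

    txt_path = os.path.splitext(epub_path)[0] + '.txt'
    epub2text(epub_path, txt_path)          # reuse the conversion helper defined above

    # Send the converted file straight back in the response body.
    return send_file(txt_path, as_attachment=True)


if __name__ == '__main__':
    app.run(debug=True, port=2500)

On the Flutter side, any HTTP client that can build a multipart/form-data POST will do (for example MultipartRequest from the http package); the server cannot tell whether the request came from an HTML form or from app code.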

Duplicated data when scraping a JSON API

I have this script:
import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os

if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")


class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'
    # start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
    custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}

    def start_requests(self):
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
            callback=self.parse,
            method="GET"
        )

    def parse(self, response):
        resp = response.json()
        # print(resp)
        for item in range(0, 576, 32):
            resp['recordsFiltered'] = item
            for result in resp['data']['productSearch']['products']:
                yield {
                    'Casa': 'Just_For_Sports',
                    'Sku': result['productReference'],
                    'Name': result['productName'],
                    'precio': result['priceRange']['sellingPrice']['highPrice'],
                    'Link': 'https://www.justforsport.com.ar' + result['link'],
                    'Date': datetime.today().strftime('%Y-%m-%d')
                }


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(JfsSpider_hombre)
    process.start()
It works fine and gets 576 rows, but the problem is that they are duplicated. When I drop the duplicates I get only 32 unique values, so I think I'm getting values from only one page (32 products per page). How can I iterate through all the elements? I think it has something to do with this line:
for item in range(0,576,32):
Thanks in advance
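
For context: the inner loop above walks the same 32-product response 18 times, and overwriting resp['recordsFiltered'] never fetches new data. The paging window is encoded in the base64 "variables" blob at the end of the URL, which decodes to a JSON object with "from"/"to" paging fields, so one way to cover every page is to rebuild that blob for each request. A rough sketch, assuming the endpoint accepts the same query with only "from"/"to" changed (not verified against the live API):

import base64
import json
from urllib.parse import urlsplit, parse_qs, urlencode, urlunsplit

import scrapy


class JfsSpiderHombrePaged(scrapy.Spider):
    name = 'jfs_hombre_paged'
    # Paste the full request URL from the question here (elided for readability).
    api_url = 'https://www.justforsport.com.ar/_v/segment/graphql/v1?...'

    def build_page_url(self, start, page_size=32):
        """Rewrite the base64 "variables" blob so the URL asks for one 32-product window."""
        scheme, netloc, path, query, fragment = urlsplit(self.api_url)
        params = parse_qs(query)
        extensions = json.loads(params['extensions'][0])
        blob = extensions['variables']
        blob += '=' * (-len(blob) % 4)                      # re-add padding if it was stripped
        variables = json.loads(base64.b64decode(blob))
        variables['from'] = start
        variables['to'] = start + page_size - 1
        extensions['variables'] = base64.b64encode(json.dumps(variables).encode()).decode()
        params['extensions'] = [json.dumps(extensions)]
        return urlunsplit((scheme, netloc, path, urlencode(params, doseq=True), fragment))

    def start_requests(self):
        for start in range(0, 576, 32):                     # one request per window of 32
            yield scrapy.Request(self.build_page_url(start), callback=self.parse)

    def parse(self, response):
        for result in response.json()['data']['productSearch']['products']:
            yield {
                'Sku': result['productReference'],
                'Name': result['productName'],
                'precio': result['priceRange']['sellingPrice']['highPrice'],
                'Link': 'https://www.justforsport.com.ar' + result['link'],
            }

The hard-coded 576 comes from the question; in practice you would read the total from the first response (the answer below accesses it as resp['data']['productSearch']['recordsFiltered']) and stop once "from" passes it.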
You are using 'Casa': 'Just_For_Sports', which is not correct; it would be result['Just_For_Sports'], but the more important question is where you got "Just_For_Sports" from. I didn't find it in the product list, and you can't include a key that doesn't exist in the products. You also won't find 'Date': datetime.today().strftime('%Y-%m-%d') as a key in the product list. Now you can check whether duplicated values still exist.
import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os

if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")


class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'
    # start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
    custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}

    def start_requests(self):
        headers = {"content-type": "application/json"}
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
            callback=self.parse,
            method="GET",
            headers=headers,
            dont_filter=True
        )

    def parse(self, response):
        resp = response.json()
        # print(resp)
        for item in range(0, 576, 32):
            resp['data']['productSearch']['recordsFiltered'] = item
            for result in resp['data']['productSearch']['products']:
                yield {
                    # 'Casa': 'Just_For_Sports',
                    'Sku': result['productReference'],
                    'Name': result['productName'],
                    'precio': result['priceRange']['sellingPrice']['highPrice'],
                    'Link': 'https://www.justforsport.com.ar' + result['link'],
                    # 'Date': datetime.today().strftime('%Y-%m-%d')
                }


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(JfsSpider_hombre)
    process.start()
Proven with a set():
import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os

if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")


class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'
    unique_data = set()
    # start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
    custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}

    def start_requests(self):
        headers = {"content-type": "application/json"}
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
            callback=self.parse,
            method="GET",
            headers=headers,
            dont_filter=True
        )

    def parse(self, response):
        resp = response.json()
        # print(resp)
        for item in range(0, 576, 32):
            resp['data']['productSearch']['recordsFiltered'] = item
            for result in resp['data']['productSearch']['products']:
                s = result['productReference']
                self.unique_data.add(s)
                yield {
                    # 'Casa': 'Just_For_Sports',
                    'Sku': s,
                    'Name': result['productName'],
                    'precio': result['priceRange']['sellingPrice']['highPrice'],
                    'Link': 'https://www.justforsport.com.ar' + result['link'],
                    # 'Date': datetime.today().strftime('%Y-%m-%d')
                }


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(JfsSpider_hombre)
    process.start()
Output:
'item_scraped_count': 576,

Crawled (403) error while logging in to glassdoor.com using Scrapy in Python. Need a solution

Here is the complete code. There is a "Crawled (403)" error when I run it. If I bypass the HTTP error by adding HTTPERROR_ALLOWED_CODES = [403] in settings.py, the code starts working.
But I need a solution for logging in to the website.
import scrapy
from urllib.parse import urljoin
from scrapy.http import Request, FormRequest


class MoorSpider(scrapy.Spider):
    name = 'moor'
    allowed_domains = ['glassdoor.com']
    start_urls = ['https://www.glassdoor.com/profile/login_input.htm']
    page_number = 2

    def parse(self, response):
        token = response.xpath('.//*[@name="gdToken"]/@value').extract()
        # print(token)
        yield FormRequest('https://www.glassdoor.com/profile/ajax/loginSecureAjax.htm',
                          formdata={'username': 'likej41679@94jo.com', 'password': '1a2b3c4d', 'gdToken': token},
                          callback=self.startscraper)

    def startscraper(self, response):
        yield Request('https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=3.5&page=1&isHiringSurge=0&locId=1282&locType=S&locName=North%20Carolina,%20US,%20US', callback=self.startscraper1)

    def startscraper1(self, response):
        urls = response.css('.col-12.my-0.mt-sm.mt-sm-std.order-5 a::attr(href)').extract()
        # print(next_page)
        for url in urls:
            url1 = urljoin('https://www.glassdoor.com/', url)
            yield Request(url1, callback=self.DetailPage)
        # next_page = 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=3.5&page='+str(MoorSpider.page_number)+'&isHiringSurge=0&locId=1282&locType=S&locName=North%20Carolina,%20US,%20US'
        next_page = 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=3.5&page=' + str(
            MoorSpider.page_number) + '&isHiringSurge=0&locId=1282&locType=S&locName=North%20Carolina,%20US,%20US'
        if MoorSpider.page_number <= 2:
            MoorSpider.page_number += 1
            yield response.follow(next_page, callback=self.startscraper1)

    def DetailPage(self, response):
        Company_Website = response.css('[data-test="employer-website"]::text').get()
        Company_Revenue = response.css('[data-test="employer-revenue"]::text').get()
        Company_Description = response.css('span[data-test="employerDescription"]::text').get()
        Company_Mission = response.css('span[data-test="employerMission"]::text').get()
        yield {
            'Company_Website': Company_Website,
            'Company_Revenue': Company_Revenue,
            'Company_Description': Company_Description,
            'Company_Mission': Company_Mission,
        }
Replace your existing parse method with the following one in order for it to work. It turns out that your token variable holds nothing, as the field is generated dynamically. You can, however, parse the value of gdToken out of one of the script tags.
def parse(self, response):
    token = response.css('body').re(r"gdToken\":\"(.*?)\",")[0]
    yield FormRequest('https://www.glassdoor.com/profile/ajax/loginSecureAjax.htm',
                      formdata={'username': 'likej41679@94jo.com', 'password': '1a2b3c4d', 'gdToken': token},
                      callback=self.startscraper)

How to navigate through js/ajax based pagination while scraping a website?

My code works fine only for the first page of each category, but I want to scrape all the pages of each category. I'm not able to navigate to the next pages. The website uses AJAX to populate the data when I click the next button to move to the next page.
I have also looked into the AJAX request the website makes to populate the data dynamically (this is the URL that pops up on the network tab when I click the next-page button: https://www.couponcodesme.com/ae/category/searchfilter), but I didn't find any way to mock that request manually using Scrapy.
If it's possible to mock the AJAX request, please let me know how to do it for this particular problem.
You are welcome to suggest any solution other than Scrapy-Splash!
I have searched the whole Stack Overflow forum but didn't find a proper solution for this problem.
Please look into this and help me.
Thank You
import scrapy
from scrapy import Request
from ..items import CouponcollectItem
from scrapy_splash import SplashRequest


class Couponsite5SpiderSpider(scrapy.Spider):
    name = 'couponSite5_spider'
    allowed_domains = ['www.couponcodesme.com']

    script = '''
    function main(splash, args)
        local url = splash.args.url
        assert(splash:go(url))
        assert(splash:wait(5))
        assert(splash:runjs("$('a.category_pagination_btn.next_btn.top-page-button').click()"))
        assert(splash:wait(5))
        return {
            html = splash:html()
        }
    end
    '''

    def start_requests(self):
        yield Request(
            url="https://www.couponcodesme.com/ae/categories",
            callback=self.parse
        )

    def parse(self, response):
        urls = response.xpath('//ul[@class="flexboxesmain categorieslist"]/li/a/@href').extract()
        for url in urls:
            yield SplashRequest(
                url=url,
                callback=self.parse_params,
                endpoint="execute",
                args={
                    'wait': 1,
                    'lua_source': self.script
                }
            )

    def parse_params(self, response):
        items = CouponcollectItem()
        coupon_category = response.xpath('//div[@class="head_main"]/h1[@class="h2_title"]/text()').extract()
        coupon_lists = response.css('#temp1')
        for coupon in coupon_lists.xpath('div'):
            coupon_title = coupon.xpath('normalize-space(.//h3/a/text())').extract()
            coupon_store_name = coupon.xpath('normalize-space(.//div[@class="img-vert-center setheight brdrclr"]/a/@href)').extract()
            store_img_src = coupon.xpath('normalize-space(.//div[@class="img-vert-center setheight brdrclr"]/a/img/@data-src)').extract()
            coupon_code_txt = coupon.xpath('normalize-space(.//span[@class="offer_code"]/span/text())').extract()
            coupon_store_out = coupon.xpath('.//button/@data-alt').extract()
            items['coupon_title'] = [self.deEmojify(coupon_title[0]) if len(coupon_title) != 0 else '']
            items['coupon_code_txt'] = [coupon_code_txt[0] if len(coupon_code_txt) != 0 else '']
            items['coupon_store_out'] = [coupon_store_out[0] if len(coupon_store_out) != 0 else '']
            items['store_img_src'] = [store_img_src[0] if len(store_img_src) != 0 else '']
            items['website_link'] = [response.request.url]
            if len(coupon_category) != 0:
                if coupon_category[0].endswith(' Coupons'):
                    items['coupon_category'] = [self.deEmojify(coupon_category[0][:-8])]
                else:
                    items['coupon_category'] = [self.deEmojify(coupon_category[0])]
            else:
                items['coupon_category'] = ['']
            if len(coupon_store_name) != 0:
                if coupon_store_name[0].endswith(' Coupons'):
                    items['coupon_store_name'] = [self.deEmojify(coupon_store_name[0][:-8])]
                elif coupon_store_name[0].startswith('https://'):
                    items['coupon_store_name'] = [coupon_store_name[0].split('/')[-1]]
                else:
                    items['coupon_store_name'] = [self.deEmojify(coupon_store_name[0])]
            else:
                items['coupon_store_name'] = ['']
            yield items

    def deEmojify(self, inputString):
        return inputString.encode('ascii', 'ignore').decode('ascii')
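
For reference, one way to attack the problem described above without Splash is to replay the searchfilter endpoint directly with Scrapy. A rough sketch follows, with every field name, page range, and selector made up; the real payload and headers have to be copied from the request shown in the browser's network tab.

import json

import scrapy


class CouponFilterSpider(scrapy.Spider):
    name = 'coupon_filter'

    def start_requests(self):
        # Hypothetical payload: copy the real field names and values from the
        # "searchfilter" request shown in the browser's network tab.
        for page in range(1, 6):                      # assumed page range
            yield scrapy.FormRequest(
                url='https://www.couponcodesme.com/ae/category/searchfilter',
                formdata={
                    'category': 'electronics',        # made-up field name/value
                    'page': str(page),                # made-up field name
                },
                headers={'X-Requested-With': 'XMLHttpRequest'},  # many AJAX endpoints expect this
                callback=self.parse_page,
            )

    def parse_page(self, response):
        # The endpoint may return JSON or an HTML fragment; check the raw response
        # in the network tab and parse accordingly. Both branches are placeholders.
        content_type = response.headers.get('Content-Type', b'').decode().lower()
        if 'json' in content_type:
            for entry in response.json().get('coupons', []):    # made-up key
                yield entry
        else:
            for title in response.css('h3 a::text').getall():   # placeholder selector
                yield {'coupon_title': title.strip()}

The key is to reproduce exactly what the browser sends (method, body, and headers); once those match, the endpoint normally returns the next page's data without any JavaScript execution.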

Iterate over all links/sub-links with Scrapy run from script

I want to run a Scrapy spider from my script, but it works only for one request. I cannot get it to execute self.parse_product from scrapy.http.Request(product_url, callback=self.parse_product).
I guess this is due to the command crawler.signals.connect(callback, signal=signals.spider_closed). Please advise how to correctly go over all links and sub-links.
The whole script is shown below.
import json

import scrapy
from scrapy.crawler import Crawler
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst
from scrapy import log, signals, Spider, Item, Field
from scrapy.settings import Settings
from twisted.internet import reactor

# https://gist.github.com/alecxe/fc1527d6d9492b59c610


# define an item class
class WebStoreItem(Item):
    name = Field()
    price = Field()
    developer = Field()
    date_added = Field()
    date_modified = Field()
    votes = Field()
    views = Field()
    sales = Field()
    avg_rating = Field()
    comments = Field()


# define an item loader with input and output processors
class WebStoreItemLoader(ItemLoader):
    default_input_processor = MapCompose(unicode.strip)
    default_output_processor = TakeFirst()
    desc_out = Join()


# define a pipeline
class JsonWriterPipeline(object):
    def __init__(self):
        self.file = open('items.json', 'wb')

    def __del__(self):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item


# define a spider
class WebStoreSpider(Spider):
    name = "WebStore"
    allowed_domains = ["http://www.WebStore.com"]
    start_urls = [
        "http://www.WebStore.com/index.php"
    ]

    def parse(self, response):
        for meta in response.xpath('//div[@class="extension-grid"]'):
            for product_block in meta.xpath('//div[@class="image-holder image"]'):
                item = WebStoreItem()
                avg_rating = meta.xpath('//div[@class="rating"]/text()').extract()[0]
                item['avg_rating'] = avg_rating[avg_rating.find(': ') + 1:].strip()
                comment = meta.xpath('//div[@class="comment"]/text()').extract()[0]
                item['comments'] = comment[comment.find(': ') + 1:].strip()
                print 'product_block: ', product_block
                product_url = product_block.xpath('a[1]/@href').extract()[0]
                print 'product_url: ', product_url
                request = scrapy.http.Request(product_url, callback=self.parse_product)
                request.meta['item'] = item
                yield request

    def parse_product(self, response):
        item = response.meta['item']
        product_meta_block = response.xpath('//div[@class="name"]')
        print 'product_meta_block: ', product_meta_block
        product_rows = product_meta_block.xpath('//tr')
        print 'product_rows: ', product_rows
        i = 0
        for row in product_rows:
            if i == 1:
                item['name'] = row.select('td/text()').extract()
            elif i == 3:
                item['votes'] = row.select('td/text()').extract()
            i += 1
        return item


# callback fired when the spider is closed
def callback(spider, reason):
    stats = spider.crawler.stats.get_stats()  # collect/log stats?
    # stop the reactor
    reactor.stop()


def stop_reactor():
    reactor.stop()


if __name__ == '__main__':
    # instantiate settings and provide a custom configuration
    settings = Settings()
    settings.set('ITEM_PIPELINES', {
        '__main__.JsonWriterPipeline': 100
    })
    # instantiate a crawler passing in settings
    crawler = Crawler(settings)
    # instantiate a spider
    spider = WebStoreSpider()
    # configure signals
    crawler.signals.connect(callback, signal=signals.spider_closed)
    # configure and start the crawler
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    # start logging
    log.start()
    # start the reactor (blocks execution)
    reactor.run()
Your spider is being blocked from visiting pages after the start page by your allowed_domains specification. The value should include just the domain, not the protocol. Try
allowed_domains = ["www.WebStore.com"]
Also the line desc_out = Join() in your WebStoreItemLoader definition may give an error as you have no desc field.
