start_requests on a Scrapy spider does not seem to work - python

I hope you can give me some hints with my problem here.
I'm trying to obtain ordered data from a txt source. The code works fine up to the point where I print the data from the txt source, so it reads it. But once I start a loop, reading each line from the txt file and spidering it, I "print(origdato)" to check if it's working fine, and it does not.
Maybe it's the loop, maybe it's the request from the spider — I really don't know.
Could you please help me?
Here the code:
# packages
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
import json
import datetime
# scraper class
class myfile(scrapy.Spider):
    """Spider that reads seed lines from origdatos.txt and iterates over them."""
    # scraper name
    name = 'whatever'
    base_url = 'https://www.whatever.com/'
    headers = {'...'}
    custom_settings = {
        # NOTE: the original key 'CONCURRENT_REQUEST_PER_DOMAIN' is misspelled
        # and would be silently ignored; the real setting name is plural:
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
    }
    current_page = 2
    origdatos = []

    def __init__(self):
        # Read the seed file once and split it into lines. (The original
        # iterated f.read() character by character, rebuilding the same
        # string one char at a time before splitting.)
        with open('origdatos.txt', 'r') as f:
            self.origdatos = f.read().split('\n')

    # crawler
    def start_requests(self):
        """Scrapy entry point: must return an iterable of Requests."""
        self.current_page = 2
        # loop over datos
        for origdato in self.origdatos:
            print(origdato)
        # Written as a generator so Scrapy receives an (empty) iterable
        # instead of None, which would raise a TypeError at startup.
        return
        yield  # unreachable; only makes this function a generator
#driver
if __name__ == '__main__':
    # run scraper in-process (blocks until the crawl finishes)
    process = CrawlerProcess()
    process.crawl(myfile)
    process.start()

Maybe this is a formatting issue with your code — that is, if it is really indented as displayed in your question. Try unindenting the start_requests method in your code and see if that fixes the problem.
The following should work as well:
import scrapy
from scrapy.crawler import CrawlerProcess
class myfile(scrapy.Spider):
    """Minimal reproduction: reads origdatos.txt and prints each line."""
    name = 'whatever'

    def __init__(self):
        with open('origdatos.txt', 'r') as f:
            self.origdatos = f.readlines()

    def start_requests(self):
        for origdato in self.origdatos:
            print(origdato)
        # start_requests must return an iterable of Requests; ending a
        # generator with a bare return yields an empty iterable instead
        # of None, avoiding the error mentioned in the answer.
        return
        yield  # unreachable; turns this method into a generator
if __name__ == '__main__':
    # run the spider in-process
    process = CrawlerProcess()
    process.crawl(myfile)
    process.start()
However, this will still produce an error at the end of execution, because start_requests is supposed to return an iterable.

Related

Scrapy Splash cannot get the data of a React site

I need to scrape this site.
Is made in React so it looks. Then I tried to extract the data with scrapy-splash. I need for example the "a" element with class shelf-product-name. But the response is an empty array. I used the wait argument in about 5 seconds.
But I only get an empty array.
def start_requests(self):
    """Request the page through Splash, waiting 5s for client-side JS to render."""
    yield SplashRequest(
        url='https://www.jumbo.cl/lacteos-y-bebidas-vegetales/leches-blancas?page=6',
        callback=self.parse,
        args={'wait': 5}
    )
def parse(self, response):
    # NOTE(review): prints an empty SelectorList because the products are
    # rendered client-side from embedded JSON — confirm against the page.
    print(response.css("a.shelf-product-name"))
Actually there is no need to use Scrapy Splash because all required data stored inside <script> tag of raw html response as json formatted data:
import scrapy
from scrapy.crawler import CrawlerProcess
import json
class JumboCLSpider(scrapy.Spider):
    """Scrape product data from the JSON embedded in a <script> tag."""
    name = "JumboCl"
    start_urls = ["https://www.jumbo.cl/lacteos-y-bebidas-vegetales/leches-blancas?page=6"]

    def parse(self, response):
        # Extract each <script> body once, then keep only the one that
        # carries the render data (the original extracted twice).
        scripts = [s.extract() for s in response.css("script::text")]
        render_scripts = [s for s in scripts if "window.__renderData" in s]
        if render_scripts:
            data = render_scripts[0].split("window.__renderData = ")[-1]
            # data[:-1] drops the last character — presumably a trailing
            # ';' after the JS assignment; confirm against the live page.
            json_data = json.loads(data[:-1])
            for plp in json_data["plp"]["plp_products"]:
                for product in plp["data"]:
                    # product["productName"] is the data from css: a.shelf-product-name
                    yield product
if __name__ == "__main__":
c = CrawlerProcess({'USER_AGENT':'Mozilla/5.0'})
c.crawl(JumboCLSpider)
c.start()

Cannot count the successful requests of web scraped pages using Python

For a start, I want to log how many successful requests (status 200) I have after I complete the web scraping of a page. I use the following code:
import requests
import csv
import selenium
from selenium import webdriver
import time
from time import sleep
import datetime
mycount = 0  # module-level counter of successful (HTTP 200) requests
# NOTE(review): this snippet is truncated in the question — the `try` below
# has no matching except/finally, and the paste lost all indentation.
class Parser(object):
ses = requests.Session()  # shared session, reused across parse() calls
# parse a single item to get information
def parse(self, urls):
url = urls[1]  # presumably urls is an (index, url) pair — confirm against caller
try:
r = self.ses.get(url)
time.sleep(3)
if r.status_code == 200:
# BUG (the question's error): assigning to mycount makes it a local
# variable of parse(); without `global mycount` the read on the right
# side raises UnboundLocalError.
mycount=mycount+1
and later one when I have mycount to pass it to a list and a csv
if __name__ == "__main__":
with Pool(4) as p:
print('Just before parsing..Page')
records = p.map(parser.parse, web_links)
with open(my_log_path,'a',encoding='utf-8',newline='') as logf:
writer = csv.writer(logf,delimiter=';')
writer.writerow(logs)
But I get that my local variable is referenced before assignment
Why mycount is treated as local variable if it is on the top and outside a function? How can I fix this?
thank you
Your class does not have access to mycount because it's a global variable. You should use global inside your class before modifying it:
def parse(self, urls):
    # Declare mycount global so the assignment later in the full method
    # rebinds the module-level counter instead of creating a local.
    global mycount
    url = urls[1]

mess word when used scrapy to save json data to csv

I used scrapy to crawl comment data from the website http://club.jd.com/comment/productPageComments.action?callback=&productId=1892018&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0, which is in Chinese. But I just got output like this.
And the csv file's output is all messed up.
I don't know what happened. First i thought that was a json decode or json encode problem, then i tried the ways on the internet but i got the same result. Here's my code:
#!/usr/bin/env python
# encoding: utf-8
import scrapy
from scrapy import Request
from scrapy.selector import Selector
from jd_comment.items import JdCommentItem
import json
class JdSpider(scrapy.Spider):
    """Scrape JD.com product comments from the JSON comment endpoint."""
    name = 'comment'

    def start_requests(self):
        url = 'http://club.jd.com/comment/productPageComments.action?callback=&productId=1892018&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Decode the response body as unicode before parsing the JSON.
        jsonresponse = json.loads(response.body_as_unicode())
        # (The original also appended each item to an `items` list that was
        # never used; only the yield matters to Scrapy, so it was dropped.)
        for comment in jsonresponse['comments']:
            item = JdCommentItem()
            item['username'] = comment['nickname']
            item['user_ID'] = comment['id']
            item['time'] = comment['referenceTime']
            item['good_ID'] = comment['referenceId']
            item['good_name'] = comment['referenceName']
            item['content'] = comment['content']
            item['score'] = comment['score']
            yield item
Anyone give me a hint would be highly appreciated. Thanks.

Confused about running Scrapy from within a Python script

Following document, I can run scrapy from a Python script, but I can't get the scrapy result.
This is my spider:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from items import DmozItem
class DmozSpider(BaseSpider):
    """Collect title/link pairs from the discussion table rows."""
    name = "douban"
    allowed_domains = ["example.com"]
    start_urls = [
        "http://www.example.com/group/xxx/discussion"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # NOTE: the '#' characters in the question's XPaths are markup
        # mangling of '@' (XPath attribute axis); restored here.
        rows = hxs.select("//table[@class='olt']/tr/td[@class='title']/a")
        items = []
        # print sites
        for row in rows:
            item = DmozItem()
            item["title"] = row.select('text()').extract()[0]
            item["link"] = row.select('@href').extract()[0]
            items.append(item)
        return items
Notice the last line, I try to use the returned parse result, if I run:
scrapy crawl douban
the terminal could print the return result
But I can't get the return result from the Python script. Here is my Python script:
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy import log, signals
from spiders.dmoz_spider import DmozSpider
from scrapy.xlib.pydispatch import dispatcher
def stop_reactor():
    """Stop the Twisted reactor once the spider has closed."""
    reactor.stop()


# Shut the reactor down when the spider finishes, so the script exits.
dispatcher.connect(stop_reactor, signal=signals.spider_closed)
spider = DmozSpider(domain='www.douban.com')
crawler = Crawler(Settings())
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
log.msg("------------>Running reactor")
# reactor.run() blocks until stop_reactor fires and returns None — which
# is why printing its result shows nothing useful.
result = reactor.run()
print(result)  # py3 print function (the original used the py2 statement)
log.msg("------------>Running stoped")
I try to get the result at the reactor.run(), but it return nothing,
How can I get the result?
Terminal prints the result because the default log level is set to DEBUG.
When you are running your spider from the script and call log.start(), the default log level is set to INFO.
Just replace:
log.start()
with
log.start(loglevel=log.DEBUG)
UPD:
To get the result as string, you can log everything to a file and then read from it, e.g.:
log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
reactor.run()
with open("results.log", "r") as f:
result = f.read()
print result
Hope that helps.
I found your question while asking myself the same thing, namely: "How can I get the result?". Since this wasn't answered here I endeavoured to find the answer myself and now that I have I can share it:
items = []  # collected items, appended as the spider emits them


def add_item(item):
    """Signal handler: collect each scraped item into the module-level list."""
    items.append(item)


# Subscribe to scrapy's item_passed signal so add_item sees every item.
dispatcher.connect(add_item, signal=signals.item_passed)
Or for scrapy 0.22 (http://doc.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script) replace the last line of my solution by:
crawler.signals.connect(add_item, signals.item_passed)
My solution is freely adapted from http://www.tryolabs.com/Blog/2011/09/27/calling-scrapy-python-script/.
in my case, i placed the script file at scrapy project level e.g. if scrapyproject/scrapyproject/spiders then i placed it at scrapyproject/myscript.py

How to avoid duplication in a crawler

I wrote a crawler using the scrapy framework in python to select some links and meta tags. It then crawls the start urls and writes the data in a JSON encoded format onto a file. The problem is that when the crawler is run two or three times with the same start urls, the data in the file gets duplicated. To avoid this I used a downloader middleware in scrapy, which is this: http://snippets.scrapy.org/snippets/1/
What I did was copy and paste the above code in a file inside my scrapy project and I enabled it in the settings.py file by adding the following line:
SPIDER_MIDDLEWARES = {'a11ypi.removeDuplicates.IgnoreVisitedItems':560}
where "a11ypi.removeDuplicates.IgnoreVisitedItems" is the class path name and finally I went in and modified my items.py file and included the following fields
visit_id = Field()
visit_status = Field()
But this doesn't work and still the crawler produces the same result appending it to the file when run twice
I did the writing to the file in my pipelines.py file as follows:
import json
class AYpiPipeline(object):
    """Pipeline that dumps each scraped item into a11ypi_dict.json."""

    def __init__(self):
        # Text append mode: json.dump() writes str, which fails on a
        # binary ('ab+') handle under Python 3.
        self.file = open("a11ypi_dict.json", "a+")

    # this method is called to process an item after it has been scraped.
    def process_item(self, item, spider):
        # Build a dict of dicts: foruri -> rec -> foruri_id -> "thisurl:thisid".
        d = {}
        i = 0
        try:
            while i < len(item["foruri"]):
                d.setdefault(item["foruri"][i], {}).setdefault(item["rec"][i], {})[item["foruri_id"][i]] = item['thisurl'] + ":" + item["thisid"][i]
                i += 1
        except IndexError:
            # The parallel lists can differ in length; stop at the shortest.
            print("Index out of range")
        json.dump(d, self.file)
        return item
And my spider code is as follows:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from a11ypi.items import AYpiItem
class AYpiSpider(CrawlSpider):
    """Crawl a11y.in and collect foruri/rec attribute data from each page."""
    name = "a11y.in"
    allowed_domains = ["a11y.in"]
    # This is the list of seed URLs to begin crawling with.
    start_urls = ["http://www.a11y.in/a11ypi/idea/fire-hi.html"]

    # This is the callback method, which is used for scraping specific data
    def parse(self, response):
        # NOTE: '#' in the question's XPaths is markup-mangled '@'
        # (attribute axis); restored here.
        temp = []
        hxs = HtmlXPathSelector(response)
        item = AYpiItem()
        # Extract foruri, which contains both the URL and the id ("url:id").
        wholeforuri = hxs.select("//@foruri").extract()
        for i in wholeforuri:
            temp.append(i.rpartition(":"))
        item["foruri"] = [i[0] for i in temp]  # the URL part of foruri
        item["foruri_id"] = [i.split(":")[-1] for i in wholeforuri]  # the id part
        item['thisurl'] = response.url
        item["thisid"] = hxs.select("//@foruri/../@id").extract()
        item["rec"] = hxs.select("//@foruri/../@rec").extract()
        return item
Kindly suggest what to do.
try to understand why the snippet is written as it is:
if isinstance(x, Request):
if self.FILTER_VISITED in x.meta:
visit_id = self._visited_id(x)
if visit_id in visited_ids:
log.msg("Ignoring already visited: %s" % x.url,
level=log.INFO, spider=spider)
visited = True
Notice in line 2: you actually need a key in Request.meta called FILTER_VISITED in order for the middleware to drop the request. The reason is well-intended, because otherwise every single url you have visited would be skipped and you would have no urls to traverse at all. So FILTER_VISITED actually allows you to choose which url patterns you want to skip. If you want links extracted by a particular rule skipped, just do
Rule(SgmlLinkExtractor(allow=('url_regex1', 'url_regex2' )), callback='my_callback', process_request = setVisitFilter)
def setVisitFilter(request):
    """Mark a request so the IgnoreVisitedItems middleware may drop revisits."""
    request.meta['filter_visited'] = True
    return request
P.S I do not know if it works for 0.14 and above as some of the code has changed for storing spider context in the sqlite db.

Categories

Resources