To start with, I want to log how many successful requests (status 200) I made once I finish scraping a page. I use the following code:
import requests
import csv
import selenium
from selenium import webdriver
import time
from time import sleep
import datetime
mycount = 0
class Parser(object):
ses = requests.Session()
# parse a single item to get information
def parse(self, urls):
url = urls[1]
try:
r = self.ses.get(url)
time.sleep(3)
if r.status_code == 200:
mycount=mycount+1
Later on, once I have mycount, I want to add it to a list and write it to a CSV file:
if __name__ == "__main__":
with Pool(4) as p:
print('Just before parsing..Page')
records = p.map(parser.parse, web_links)
with open(my_log_path,'a',encoding='utf-8',newline='') as logf:
writer = csv.writer(logf,delimiter=';')
writer.writerow(logs)
But I get an error saying that the local variable mycount is referenced before assignment.
Why is mycount treated as a local variable if it is defined at the top, outside any function? How can I fix this?
Thank you.
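Because parse assigns to mycount (mycount = mycount + 1), Python treats that name as local to the function, so reading it on the right-hand side fails before it has a value. Declare it as global inside the method before modifying it. (Note that with Pool each worker process has its own copy of the global, so to get a total in the parent process, return a value from parse and count the results of p.map instead.)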
def parse(self, urls):
global mycount
url = urls[1]
Related
I hope you can give me some hints about my problem.
I'm trying to obtain ordered data from a txt source. The code works fine up to the point where I print the data read from the txt source, so the file is being read. But once I start the loop that reads each line from the txt file and passes it to the spider, I call print(origdato) to check that it is working, and nothing is printed.
Maybe it is the loop, maybe it is the request from the spider; I really don't know.
Could you please help me?
Here the code:
# packages
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
import json
import datetime
# scraper class
class myfile(scrapy.Spider):
# scraper name
name= 'whatever'
base_url = 'https://www.whatever.com/'
headers = {'...'
}
custom_settings = {
'CONCURRENT_REQUEST_PER_DOMAIN': 1,
'DOWNLOAD_DELAY': 1,
}
current_page = 2
origdatos= []
def __init__(self):
content = ''
with open('origdatos.txt', 'r') as f:
for line in f.read():
content += line
# parse content
self.origdatos= content.split('\n')
# print(self.origdatos) // Till heree works fine
# crawler
def start_requests(self):
self.current_page = 2
# loop over datos
for origdato in self.origdatos:
print(origdato) #In this print Python does not show me data, so it appears the loop does not work properly, maybe
#driver
if __name__ == '__main__':
# run scraper
process = CrawlerProcess()
process.crawl(myfile)
process.start()
Maybe this is a formatting issue with your code, if it is indented as displayed in your question. Try unindenting the start_requests method in your code and see if that fixes the problem.
The following should work as well:
import scrapy
from scrapy.crawler import CrawlerProcess
class myfile(scrapy.Spider):
    """Minimal spider that loads one entry per line from ``origdatos.txt``.

    The entries are read once, when the spider is constructed, and are
    printed from ``start_requests``.
    """

    name = 'whatever'

    def __init__(self, *args, **kwargs):
        # Run scrapy.Spider's own initialisation first and forward whatever
        # arguments the crawler passes, instead of silently replacing it.
        super(myfile, self).__init__(*args, **kwargs)
        with open('origdatos.txt', 'r') as f:
            # splitlines() drops the trailing newline that readlines() keeps,
            # so every entry prints on a single line.
            self.origdatos = f.read().splitlines()

    def start_requests(self):
        # NOTE: this demo only prints the entries. Scrapy expects
        # start_requests to return an iterable of Request objects, so a real
        # spider should yield a scrapy.Request per entry here.
        for origdato in self.origdatos:
            print(origdato)
if __name__ == '__main__':
process = CrawlerProcess()
process.crawl(myfile)
process.start()
However, this will still produce an error at the end of execution, because start_requests is supposed to return an iterable of requests; yield a scrapy.Request for each line (or return an empty list) to avoid it.
I need to scrape this site.
It looks like it is made in React, so I tried to extract the data with scrapy-splash. For example, I need the "a" element with class shelf-product-name, but the response is an empty array, even though I used a wait argument of about 5 seconds.
I only get an empty array.
def start_requests(self):
yield SplashRequest(
url='https://www.jumbo.cl/lacteos-y-bebidas-vegetales/leches-blancas?page=6',
callback=self.parse,
args={'wait':5}
)
def parse(self,response):
print(response.css("a.shelf-product-name"))
Actually, there is no need to use Scrapy Splash, because all the required data is stored inside a <script> tag of the raw HTML response as JSON-formatted data:
import scrapy
from scrapy.crawler import CrawlerProcess
import json
class JumboCLSpider(scrapy.Spider):
    """Spider that reads product data from the page's embedded JSON.

    The listing page carries its data in an inline ``<script>`` that assigns
    a JSON object to ``window.__renderData``, so the raw HTML response is
    enough and no JavaScript rendering is required.
    """

    name = "JumboCl"
    start_urls = ["https://www.jumbo.cl/lacteos-y-bebidas-vegetales/leches-blancas?page=6"]

    def parse(self, response):
        """Yield every product dict found in ``window.__renderData``."""
        marker = "window.__renderData = "
        for text in response.css("script::text").extract():
            if marker not in text:
                continue
            payload = text.split(marker)[-1]
            # raw_decode parses the first JSON value and ignores whatever
            # follows it, so a trailing ";" or whitespace cannot break the
            # parse the way slicing off exactly one character could.
            json_data, _ = json.JSONDecoder().raw_decode(payload.lstrip())
            for plp in json_data["plp"]["plp_products"]:
                for product in plp["data"]:
                    # To emit only the a.shelf-product-name text, use:
                    # yield {"productName": product["productName"]}
                    yield product
            # Only the first matching script is used.
            break
if __name__ == "__main__":
c = CrawlerProcess({'USER_AGENT':'Mozilla/5.0'})
c.crawl(JumboCLSpider)
c.start()
I am using scrapy to get the content inside some urls on a page, similar to this question here:
Use scrapy to get list of urls, and then scrape content inside those urls
I am able to get the subURLs from my start urls(first def), However, my second def doesn't seem to be passing through. And the result file is empty. I have tested the content inside the function in scrapy shell and it is getting the info I want, but not when I am running the spider.
import scrapy
from scrapy.selector import Selector
#from scrapy import Spider
from WheelsOnlineScrapper.items import Dealer
from WheelsOnlineScrapper.url_list import urls
import logging
from urlparse import urljoin
logger = logging.getLogger(__name__)
class WheelsonlinespiderSpider(scrapy.Spider):
logger.info('Spider starting')
name = 'wheelsonlinespider'
rotate_user_agent = True # lives in middleware.py and settings.py
allowed_domains = ["https://wheelsonline.ca"]
start_urls = urls # this list is created in url_list.py
logger.info('URLs retrieved')
def parse(self, response):
subURLs = []
partialURLs = response.css('.directory_name::attr(href)').extract()
for i in partialURLs:
subURLs = urljoin('https://wheelsonline.ca/', i)
yield scrapy.Request(subURLs, callback=self.parse_dealers)
logger.info('Dealer ' + subURLs + ' fetched')
def parse_dealers(self, response):
logger.info('Beginning of page')
dlr = Dealer()
#Extracting the content using css selectors
try:
dlr['DealerName'] = response.css(".dealer_head_main_name::text").extract_first() + ' ' + response.css(".dealer_head_aux_name::text").extract_first()
except TypeError:
dlr['DealerName'] = response.css(".dealer_head_main_name::text").extract_first()
dlr['MailingAddress'] = ','.join(response.css(".dealer_address_right::text").extract())
dlr['PhoneNumber'] = response.css(".dealer_head_phone::text").extract_first()
logger.info('Dealer fetched ' + dlr['DealerName'])
yield dlr
logger.info('End of page')
Your allowed_domains list contains the protocol (https). It should have only the domain name as per the documentation:
allowed_domains = ["wheelsonline.ca"]
Also, you should've received a message in your log:
URLWarning: allowed_domains accepts only domains, not URLs. Ignoring URL entry https://wheelsonline.ca in allowed_domains
This is a program that takes multiple URLs as input; it is called through the URL localhost:8888/api/v1/crawler.
The program takes more than an hour to run, which is OK, but it blocks the other APIs.
While it is running, no other API works until it finishes, so I want to run this program asynchronously. How can I achieve that with the same program?
#tornado.web.asynchronous
#gen.coroutine
#use_args(OrgTypeSchema)
def post(self, args):
print "Enter In Crawler Match Script POST"
print "Argsssss........"
print args
data = tornado.escape.json_decode(self.request.body)
print "Data................"
import json
print json.dumps(data.get('urls'))
from urllib import urlopen
from bs4 import BeautifulSoup
try:
urls = json.dumps(data.get('urls'));
urls = urls.split()
import sys
list = [];
# orig_stdout = sys.stdout
# f = open('out.txt', 'w')
# sys.stdout = f
for url in urls:
# print "FOFOFOFOFFOFO"
# print url
url = url.replace('"'," ")
url = url.replace('[', " ")
url = url.replace(']', " ")
url = url.replace(',', " ")
print "Final Url "
print url
try:
site = urlopen(url) ..............
Your post method is 100% synchronous. You should make the site = urlopen(url) call asynchronous. There is an async HTTP client in Tornado for that. There is also a good example here.
You are using urllib which is the reason for blocking.
Tornado provides a non-blocking client called AsyncHTTPClient, which is what you should be using.
Use it like this:
from tornado.httpclient import AsyncHTTPClient
@gen.coroutine
@use_args(OrgTypeSchema)
def post(self, args):
    """Handle POST without blocking the IOLoop while pages are fetched."""
    ...
    http_client = AsyncHTTPClient()
    # Yielding the Future suspends only this request; other handlers keep
    # running until the response arrives. This requires @gen.coroutine above.
    site = yield http_client.fetch(url)
    ...
Another thing I'd like to point out: don't import modules inside a function. Although it's not the reason for the blocking, it is still slower than putting all your imports at the top of the file. Read this question.
I'm trying to print an updated value and store it in a CSV file. I'm using threading, and the value should be printed every second; however, after each second that elapses, the same value is printed. Can someone help?
import urllib.request, urllib.parse, urllib.error
import json
import threading
import time
localtime = time.asctime( time.localtime(time.time()))
url = 'api'
uh = urllib.request.urlopen(url)
data = uh.read().decode()
js =json.loads(data)
def last_price():
threading.Timer(1.0, last_price).start()
print(js['last'])
print(localtime)
last_price()
The variable js is currently evaluated only once. If you want to query the API every second, move the query code inside the function being executed by the timer:
url = 'api'
def last_price():
    """Fetch the latest quote from ``url``, print it, and re-arm the timer.

    Each call performs a fresh request, so the printed value reflects the
    API's current response rather than one captured when the module loaded.
    """
    localtime = time.asctime(time.localtime(time.time()))
    # The context manager closes the HTTP response on every tick; without it
    # one open handle would be leaked per second.
    with urllib.request.urlopen(url) as uh:
        data = uh.read().decode()
    js = json.loads(data)
    print(js['last'])
    print(localtime)
    # Schedule the next poll one second from now.
    threading.Timer(1.0, last_price).start()
last_price()