Can't get an output from Scrapy - Python

When I run
scrapy runspider divar.py -o data.json
in the terminal, I get an empty file. Am I doing something wrong here? I want to get the categories and subcategories from the URL in start_urls, append them to result, print them, and also write them to a JSON file - mostly the JSON file.
import scrapy

class ws(scrapy.Spider):
    name = 'wsDivar'
    result = []
    start_urls = ["https://divar.ir/s/tehran"]

    def parse(self, response):
        for category in response.xpath("//*/ul[@class='kt-accordion-item__header']"):
            x = {'cats': category.xpath("//*/ul[@class='kt-accordion-item__header']/a").extract_first()}
            self.result.append(x)
            yield x
        print(self.result)
        next_L = response.xpath("//li[@class='next']/a/@href").extract_first()
        if next_L is not None:
            next_link = response.urljoin(next_L)
            yield scrapy.Request(url=next_link, callback=self.parse)

import scrapy

class ws(scrapy.Spider):
    name = 'wsDivar'
    start_urls = ["https://divar.ir/s/tehran"]

    def parse(self, response):
        for category in response.xpath("//*/ul[@class='kt-accordion-item__header']"):
            x = {'cats': category.xpath("//*/ul[@class='kt-accordion-item__header']/a").extract_first()}
            yield x
        next_L = response.xpath("//li[@class='next']/a/@href").extract_first()
        if next_L is not None:
            next_link = response.urljoin(next_L)
            yield scrapy.Request(url=next_link, callback=self.parse)
If your XPath works fine, yield will also output the result for you.
Instead of this:
scrapy runspider divar.py -o data.json
use this:
scrapy crawl wsDivar -o data.json
Also, run the command in the project directory, which is supposed to contain the scrapy.cfg file.
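For reference, a sketch of the project layout that scrapy crawl expects when run from the project root (the project name myproject is an assumption; divar.py and wsDivar come from the question):

myproject/
    scrapy.cfg              # run scrapy commands from this directory
    myproject/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            divar.py        # contains the wsDivar spider

From the directory that contains scrapy.cfg:
scrapy crawl wsDivar -o data.json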

Related

How to run multiple spiders through individual pipelines?

Total noob just getting started with scrapy.
My directory structure looks like this...
#FYI: running on Scrapy 2.4.1
WebScraper/
    Webscraper/
        spiders/
            spider.py    # (NOTE: contains spider1 and spider2 classes.)
        items.py
        middlewares.py
        pipelines.py     # (NOTE: contains spider1Pipeline and spider2Pipeline)
        settings.py      # (NOTE: I wrote here:
                         #  ITEM_PIPELINES = {
                         #      'WebScraper.pipelines.spider1_pipelines': 300,
                         #      'WebScraper.pipelines.spider2_pipelines': 300,
                         #  }
    scrapy.cfg
And spider2.py resembles...
class OneSpider(scrapy.Spider):
    name = "spider1"

    def start_requests(self):
        urls = ["url1.com",]
        yield scrapy.Request(
            url="http://url1.com",
            callback=self.parse
        )

    def parse(self, response):
        ## Scrape stuff, put it in a dict
        yield dictOfScrapedStuff

class TwoSpider(scrapy.Spider):
    name = "spider2"

    def start_requests(self):
        urls = ["url2.com",]
        yield scrapy.Request(
            url="http://url2.com",
            callback=self.parse
        )

    def parse(self, response):
        ## Scrape stuff, put it in a dict
        yield dictOfScrapedStuff
With pipelines.py looking like...
import csv

class spider1_pipelines(object):
    def __init__(self):
        self.csvwriter = csv.writer(open('spider1.csv', 'w', newline=''))
        self.csvwriter.writerow(['header1', 'header2'])

    def process_item(self, item, spider):
        row = []
        row.append(item['header1'])
        row.append(item['header2'])
        self.csvwriter.writerow(row)
        return item

class spider2_pipelines(object):
    def __init__(self):
        self.csvwriter = csv.writer(open('spider2.csv', 'w', newline=''))
        self.csvwriter.writerow(['header_a', 'header_b'])

    def process_item(self, item, spider):
        row = []
        row.append(item['header_a'])  # NOTE: this is not the same as header1
        row.append(item['header_b'])  # NOTE: this is not the same as header2
        self.csvwriter.writerow(row)
        return item
I have a question about running spider1 and spider2 on different urls with one terminal command:
nohup scrapy crawl spider1 -o spider1_output.csv --logfile spider1.log & scrapy crawl spider2 -o spider2_output.csv --logfile spider2.log
Note: this is an extension of a previous question, specific to this Stack Overflow post (2018).
Desired result: spider1.csv with data from spider1, spider2.csv with data from spider2.
Current result: spider1.csv with data from spider1; spider2.csv breaks, but the error log contains spider2's data along with a KeyError on 'header1' - even though spider2's item does not include header1, only header_a.
Does anyone know how to run one spider after the other on different urls, and plug the data fetched by spider1, spider2, etc. into pipelines specific to that spider, as in spider1 -> spider1Pipeline -> spider1.csv, spider2 -> spider2Pipelines -> spider2.csv?
Or perhaps this is a matter of specifying the spider1_item and spider2_item from items.py? I wonder if I can specify where to insert spider2's data that way.
Thank you!
You can implement this using the custom_settings spider attribute to set settings individually per spider:
#spider2.py
class OneSpider(scrapy.Spider):
    name = "spider1"
    custom_settings = {
        'ITEM_PIPELINES': {'WebScraper.pipelines.spider1_pipelines': 300}
    }
    ...

class TwoSpider(scrapy.Spider):
    name = "spider2"
    custom_settings = {
        'ITEM_PIPELINES': {'WebScraper.pipelines.spider2_pipelines': 300}
    }
    ...
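To see how those pieces fit together, here is a minimal self-contained sketch (the URLs, field values, and the plain-dict items are illustrative assumptions; the pipeline paths are the ones from the question):

# spiders/spider.py - hypothetical fuller version of the per-spider routing
import scrapy

class OneSpider(scrapy.Spider):
    name = "spider1"
    # Overrides the project-wide ITEM_PIPELINES, so only spider1_pipelines runs for this spider.
    custom_settings = {
        'ITEM_PIPELINES': {'WebScraper.pipelines.spider1_pipelines': 300},
    }

    def start_requests(self):
        yield scrapy.Request(url="http://url1.com", callback=self.parse)

    def parse(self, response):
        # Scrape stuff and key it the way spider1_pipelines expects.
        yield {'header1': 'value 1', 'header2': 'value 2'}

class TwoSpider(scrapy.Spider):
    name = "spider2"
    # Only spider2_pipelines runs for this spider.
    custom_settings = {
        'ITEM_PIPELINES': {'WebScraper.pipelines.spider2_pipelines': 300},
    }

    def start_requests(self):
        yield scrapy.Request(url="http://url2.com", callback=self.parse)

    def parse(self, response):
        yield {'header_a': 'value a', 'header_b': 'value b'}

With this in place, each of the two crawl commands from the question should write only its own CSV, because the other pipeline is never loaded for that spider.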

Having problems with a scrapy-splash script. I only get one result and my scraper does not parse other pages

I am trying to parse a list from a JavaScript website. When I run it, it only gives me back one entry for each column and then the spider shuts down. I have already set up my middleware settings. I am not sure what is going wrong. Thanks in advance!
import scrapy
from scrapy_splash import SplashRequest

class MalrusSpider(scrapy.Spider):
    name = 'malrus'
    allowed_domains = ['backgroundscreeninginrussia.com']
    start_urls = ['http://www.backgroundscreeninginrussia.com/publications/new-citizens-of-malta-since-january-2015-till-december-2017/']

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url,
                                callback=self.parse,
                                endpoint='render.html')

    def parse(self, response):
        russians = response.xpath('//table[@id="tablepress-8"]')
        for russian in russians:
            yield {'name': russian.xpath('//*[@class="column-1"]/text()').extract_first(),
                   'source': russian.xpath('//*[@class="column-2"]/text()').extract_first()}

        script = """function main(splash)
            assert(splash:go(splash.args.url))
            splash:wait(0.3)
            button = splash:select("a[class=paginate_button next] a")
            splash:set_viewport_full()
            splash:wait(0.1)
            button:mouse_click()
            splash:wait(1)
            return {url = splash:url(),
                    html = splash:html()}
        end"""

        yield SplashRequest(url=response.url,
                            callback=self.parse,
                            endpoint='execute',
                            args={'lua_source': script})
The .extract_first() (now .get()) you used will always return the first result. It's not an iterator, so there is no sense in calling it several times. You should try the .getall() method instead. That will be something like:
names = response.xpath('//table[#id="tablepress-8"]').xpath('//*[#class="column-1"]/text()').getall()
sources = response.xpath('//table[#id="tablepress-8"]').xpath('//*[#class="column-2"]/text()').getall()
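To turn those two parallel lists back into one item per table row, one option (not from the original answer; it assumes the two columns line up one-to-one) is to zip them:

names = response.xpath('//table[@id="tablepress-8"]').xpath('//*[@class="column-1"]/text()').getall()
sources = response.xpath('//table[@id="tablepress-8"]').xpath('//*[@class="column-2"]/text()').getall()
# Pair each name with its source and yield one dict per row.
for name, source in zip(names, sources):
    yield {'name': name, 'source': source}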

Python: Scrapy "response.xpath" is not working

I have Python 3 code using Scrapy. This is part of the code:
import scrapy
...

class MySpider(scrapy.Spider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = [
        'http://www.example.com/1.html',
        'http://www.example.com/2.html',
    ]

    def parse(self, response):
        ...
        image_link = self.get_image_link(response)
        try:
            item = response.xpath("//*[@id='theid1']").extract_first()
        except:
            item = response.xpath("//*[@id='theid2']").extract_first()
        ...
If I run the code normally, there is nothing in item, but if I put a breakpoint on this line:
image_link = self.get_image_link(response)
and run the code step by step, then I get values in item.
Is there a timing issue I should be aware of? Is this related to the asynchronous way Scrapy works? How can I resolve this issue?
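A side note on the try/except shown above (an observation about the API, not necessarily the cause of the problem): when an XPath matches nothing, extract_first() returns None instead of raising, so the except branch never runs for a missing element. An explicit fallback would look like this:

# extract_first() returns None (no exception) when nothing matches,
# so check for None instead of relying on except.
item = response.xpath("//*[@id='theid1']").extract_first()
if item is None:
    item = response.xpath("//*[@id='theid2']").extract_first()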

Scrapy CLI output - CSV_DELIMITER parameter not working

I am trying to run the Scrapy exporter with a custom delimiter via the CLI, like this:
scrapy runspider beneficiari_2016.py -o beneficiari_2016.csv -t csv -a CSV_DELIMITER="\n"
The export works perfectly, but the delimiter is still the default comma (",").
Please let me know if you have any idea how it can be fixed. Thank you!
The code:
import scrapy
from scrapy.item import Item, Field
import urllib.parse

class anmdm(Item):
    nume_beneficiar = Field()

class BlogSpider(scrapy.Spider):
    name = 'blogspider'
    start_urls = ['http://www.anm.ro/sponsorizari/afisare-2016/beneficiari?page=1']

    def parse(self, response):
        doctor = anmdm()
        doctors = []
        for item in response.xpath('//tbody/tr'):
            doctor['nume_beneficiar'] = item.xpath('td[5]//text()').extract_first()
            yield doctor
        next_page = response.xpath("//ul/li[@class='active']/following-sibling::li/a/@href").extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            print(next_page)
            yield response.follow(next_page, self.parse)
CSV_DELIMITER needs to be changed in the settings, not passed as a spider argument with -a.
To change settings on the command line, use -s:
scrapy runspider beneficiari_2016.py -o beneficiari_2016.csv -t csv -s CSV_DELIMITER="\n"
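If you prefer not to pass the flag on every run, the same setting name used in the answer can be declared on the spider itself via custom_settings (a sketch; it assumes CSV_DELIMITER is honoured there just as it is with -s):

class BlogSpider(scrapy.Spider):
    name = 'blogspider'
    # Equivalent to passing -s CSV_DELIMITER="\n" on the command line.
    custom_settings = {
        'CSV_DELIMITER': '\n',
    }
    ...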

Running Scrapy crawler from a main function

I have written a crawler in Scrapy, but I want to initiate the crawling by using a main method.
import sys, getopt
import scrapy
from scrapy.spiders import Spider
from scrapy.http import Request
import re

class TutsplusItem(scrapy.Item):
    title = scrapy.Field()

class MySpider(Spider):
    name = "tutsplus"
    allowed_domains = ["bbc.com"]
    start_urls = ["http://www.bbc.com/"]

    def __init__(self, *args):
        try:
            opts, args = getopt.getopt(args, "hi:o:", ["ifile=", "ofile="])
        except getopt.GetoptError:
            print('test.py -i <inputfile> -o <outputfile>')
            sys.exit(2)
        super(MySpider, self).__init__(*args)

    def parse(self, response):
        links = response.xpath('//a/@href').extract()
        # We stored already crawled links in this list
        crawledLinks = []
        # Pattern to check proper link
        # I only want to get the tutorial posts
        # linkPattern = re.compile("^\/tutorials\?page=\d+")
        for link in links:
            # If it is a proper link and is not checked yet, yield it to the Spider
            # if linkPattern.match(link) and not link in crawledLinks:
            if not link in crawledLinks:
                link = "http://www.bbc.com" + link
                crawledLinks.append(link)
                yield Request(link, self.parse)
        titles = response.xpath('//a[contains(@class, "media__link")]/text()').extract()
        count = 0
        for title in titles:
            item = TutsplusItem()
            item["title"] = title
            print("Title is : %s" % title)
            yield item
Instead of using scrapy runspider Crawler.py arg1 arg2
I would like to have a separate class with a main function and initiate Scrapy from there. How do I do this?
There are different ways to approach this, but I suggest the following:
Have a main.py file in the same directory that will open a new process and launch the spider with the parameters you need.
The main.py file would have something like the following:
import subprocess
scrapy_command = 'scrapy runspider {spider_name} -a param_1="{param_1}"'.format(spider_name='your_spider', param_1='your_value')
process = subprocess.Popen(scrapy_command, shell=True)
With this code, you just need to call your main file.
python main.py
Hope it helps.
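A slightly fuller, hedged version of that main.py (the spider filename and parameter are placeholders; process.wait() is added so the script blocks until the crawl finishes):

# main.py - launch the spider in a child process and wait for it
import subprocess

def main():
    scrapy_command = 'scrapy runspider your_spider.py -a param_1="your_value"'
    process = subprocess.Popen(scrapy_command, shell=True)
    process.wait()  # block until the crawl has finished
    return process.returncode

if __name__ == "__main__":
    raise SystemExit(main())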
