How to use the Scrapy package with Jupyter Notebook - python

I'm trying to learn web scraping/crawling and tried to run the code below in a Jupyter Notebook, but it doesn't show any output. Can anyone help and guide me on how to use the Scrapy package in a Jupyter notebook?
The code:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class BooksCrawlSpider(CrawlSpider):
    name = 'books_crawl'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['https://books.toscrape.com/catalogue/category/books/sequential-art_5/page-1.html']

    le_book_details = LinkExtractor(restrict_css='h3 > a')
    le_next = LinkExtractor(restrict_css='.next > a')  # next button
    le_cats = LinkExtractor(restrict_css='.side_categories > ul > li > ul > li a')  # categories

    rule_book_details = Rule(le_book_details, callback='parse_item', follow=False)
    rule_next = Rule(le_next, follow=True)
    rule_cats = Rule(le_cats, follow=True)

    rules = (
        rule_book_details,
        rule_next,
        rule_cats,
    )

    def parse_item(self, response):
        yield {
            'Title': response.css('h1 ::text').get(),
            'Category': response.xpath('//ul[@class="breadcrumb"]/li[last()-1]/a/text()').get(),
            'Link': response.url,
        }
The final run finishes without any output:

To run your spider you can add the following snippet in a new cell:
from scrapy.crawler import CrawlerProcess
process = CrawlerProcess()
process.crawl(BooksCrawlSpider)
process.start()
More details in the Scrapy docs.
Edit:
A solution to create a dataframe from the extracted items would be to first export the output to a file (e.g. CSV) by passing the settings parameter to CrawlerProcess:
process = CrawlerProcess(settings={
    "FEEDS": {
        "items.csv": {"format": "csv"},
    },
})
Then open it with pandas:
import pandas as pd

df = pd.read_csv("items.csv")
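Putting both pieces together, a single Jupyter cell might look like the sketch below. It assumes the BooksCrawlSpider cell above has already been run; also note that the underlying Twisted reactor can normally only be started once per kernel, so re-running the cell usually requires restarting the kernel.

import pandas as pd
from scrapy.crawler import CrawlerProcess

# Write scraped items to a CSV feed while crawling
process = CrawlerProcess(settings={
    "FEEDS": {
        "items.csv": {"format": "csv"},
    },
})
process.crawl(BooksCrawlSpider)  # the spider class defined in the earlier cell
process.start()                  # blocks until the crawl is finished

# Load the exported feed into a dataframe
df = pd.read_csv("items.csv")
df.head()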

Related

Scrapy one item with multiple parsing functions

I am using Scrapy with Python to scrape a website and I am facing some difficulties filling the item that I have created.
The products are properly scraped and everything is working well as long as the info is located within the response.xpath mentioned in the for loop.
'trend' and 'number' are properly added to the Item using ItemLoader.
However, the date of the product is not located within the response.xpath cited below, but in the page title, i.e. response.css('title').
import scrapy
import datetime
from trends.items import Trend_item
from scrapy.loader import ItemLoader

# Initiate the spider
class trendspiders(scrapy.Spider):
    name = 'milk'
    start_urls = ['https://thewebsiteforthebestmilk/ireland/2022-03-16/7/']

    def parse(self, response):
        for milk_unique in response.xpath('/html/body/main/div/div[2]/div[1]/section[1]/div/div[3]/table/tbody/tr'):
            l = ItemLoader(item=Milk_item(), selector=milk_unique, response=response)
            l.add_css('milk', 'a::text')
            l.add_css('number', 'span.small.text-muted::text')
            return l.load_item()
How can I add the 'date' to my item, please (it is found in response.css('title'))?
I have tried to add l.add_css('date', "response.css('title')") in the for loop but it returns an error.
Should I create a new parsing function? If yes then how to send the info to the same Item?
I hope I’ve made myself clear.
Thank you very much for your help,
Since the date is outside of the selector you are using for each row, what you should do is extract that first before your for loop, since it doesn't need to be updated on each iteration.
Then with your item loader you can just use l.add_value to load it with the rest of the fields.
For example:
class trendspiders(scrapy.Spider):
    name = 'trends'
    start_urls = ['https://getdaytrends.com/ireland/2022-03-16/7/']

    def parse(self, response):
        date_str = response.xpath("//title/text()").get()
        for trend_unique in response.xpath('/html/body/main/div/div[2]/div[1]/section[1]/div/div[3]/table/tbody/tr'):
            l = ItemLoader(item=Trend_item(), selector=trend_unique, response=response)
            l.add_css('trend', 'a::text')
            l.add_css('number', 'span.small.text-muted::text')
            l.add_value('date', date_str)
            yield l.load_item()
If response.css('title').get() gives you the answer you need, why not use the same CSS with add_css:
l.add_css('date', 'title')
Also, .add_css('date', "response.css('title')") is invalid because the second argument must be a valid CSS selector, not a Python expression.

Python & Scrapy output: "\r\n\t\t\t\t\t\t\t"

I'm learning scraping with Scrapy and having some issues with some code giving me a weird output that I don't understand. Can someone explain to me why I am getting a bunch of "\r\n\t\t\t\t\t\t\t"?
I found this solution on Stack overflow:
Remove an '\\n\\t\\t\\t'-element from list
But I want to learn what is causing it.
Here is the code that is causing my issue. The strip() method from the link above solves it, but as mentioned, I don't understand where the whitespace comes from.
import scrapy
import logging
import re

class CitySpider(scrapy.Spider):
    name = 'city'
    allowed_domains = ['www.a-tembo.nl']
    start_urls = ['https://www.a-tembo.nl/themas/category/city/']

    def parse(self, response):
        titles = response.xpath("//div[@class='hikashop_category_image']/a")
        for title in titles:
            series = title.xpath(".//@title").get()
            link = title.xpath(".//@href").get()
            # absolute_url = f"https://www.a-tembo.nl{link}"
            # absolute_url = response.urljoin(link)
            yield response.follow(link, callback=self.parse_title)

    def parse_title(self, response):
        rows = response.xpath("//table[@class='hikashop_products_table adminlist table']/tbody/tr")
        for row in rows:
            product_code = row.xpath(".//span[@class='hikashop_product_code']/text()").get()
            product_name = row.xpath(".//span[@class='hikashop_product_name']/a/text()").get()
            yield {
                "Product_code": product_code,
                "Product_name": product_name
            }
Character combinations like \n are called escape sequences.
For example:
\n indicates a new line and \t signifies a tab. Websites are full of them, although you never see them unless you inspect the page source. If you want to learn more about escape sequences in Python, you can read about them in the Python documentation. I hope that answers your question.
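If you also want to drop that whitespace in the spider above, a minimal sketch (based on the strip() approach from the linked answer, with XPath's normalize-space() shown as an alternative) could look like this:

for row in rows:
    product_code = row.xpath(".//span[@class='hikashop_product_code']/text()").get()
    if product_code:
        product_code = product_code.strip()  # removes the surrounding \r, \n and \t

    # Alternative: let XPath do the cleaning with normalize-space()
    product_name = row.xpath("normalize-space(.//span[@class='hikashop_product_name']/a/text())").get()

    yield {
        "Product_code": product_code,
        "Product_name": product_name
    }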

Scrapy only scrapes the first start url in a list of 15 start urls

I am new to Scrapy and am attempting to teach myself the basics. I have written code that goes to the Louisiana Department of Natural Resources website to retrieve the serial number for certain oil wells.
I have each well's link listed in start_urls, but Scrapy only downloads data from the first URL. What am I doing wrong?
import scrapy
from scrapy import Spider
from scrapy.selector import Selector
from mike.items import MikeItem

class SonrisSpider(Spider):
    name = "sspider"
    start_urls = [
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=207899",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=971683",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=214206",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=159420",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=243671",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=248942",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=156613",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=972498",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=215443",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=248463",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=195136",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=179181",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=199930",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=203419",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=220454",
    ]

    def parse(self, response):
        item = MikeItem()
        item['serial'] = response.xpath('/html/body/table[1]/tr[2]/td[1]/text()').extract()[0]
        yield item
Thank you for any help you might be able to provide. If I have not explained my problem thoroughly, please let me know and I will attempt to clarify.
I think this code might help.
By default Scrapy filters out duplicate requests. Since only the query parameters differ between your start URLs, Scrapy considers the rest of the URLs in start_urls to be duplicate requests of the first one. That's why your spider stops after fetching the first URL. In order to parse the rest of the URLs, enable the dont_filter flag on each request (check start_requests() below).
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from mike.items import MikeItem

class SonrisSpider(scrapy.Spider):
    name = "sspider"
    allowed_domains = ["sonlite.dnr.state.la.us"]
    start_urls = [
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=207899",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=971683",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=214206",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=159420",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=243671",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=248942",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=156613",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=972498",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=215443",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=248463",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=195136",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=179181",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=199930",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=203419",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=220454",
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse_data, dont_filter=True)

    def parse_data(self, response):
        item = MikeItem()
        serial = response.xpath(
            '/html/body/table[1]/tr[2]/td[1]/text()').extract()
        serial = serial[0] if serial else 'n/a'
        item['serial'] = serial
        yield item
Sample output returned by this spider is as follows:
{'serial': u'207899'}
{'serial': u'971683'}
{'serial': u'214206'}
{'serial': u'159420'}
{'serial': u'248942'}
{'serial': u'243671'}
Your code looks good; try adding this function:
class SonrisSpider(Spider):
    def start_requests(self):
        for url in self.start_urls:
            print(url)
            yield self.make_requests_from_url(url)
    # the rest of your code goes here
The URLs should now be printed. Test it, and if not, please say so.

Stop Scrapy crawling the same URLs

I've written a basic Scrapy spider to crawl a website. It seems to run fine, except that it doesn't want to stop: it keeps revisiting the same URLs and returning the same content, and I always end up having to kill it. I suspect it's going over the same URLs over and over again. Is there a rule that will stop this? Or is there something else I have to do? Maybe middleware?
The Spider is as below:
class LsbuSpider(CrawlSpider):
    name = "lsbu6"
    allowed_domains = ["lsbu.ac.uk"]
    start_urls = [
        "http://www.lsbu.ac.uk"
    ]

    rules = [
        Rule(SgmlLinkExtractor(allow=['lsbu.ac.uk/business-and-partners/.+']), callback='parse_item', follow=True),
    ]

    def parse_item(self, response):
        join = Join()
        sel = Selector(response)
        bits = sel.xpath('//*')
        scraped_bits = []
        for bit in bits:
            scraped_bit = LsbuItem()
            scraped_bit['title'] = bit.xpath('//title/text()').extract()
            scraped_bit['desc'] = join(bit.xpath('//*[@id="main_content_main_column"]//text()').extract()).strip()
            scraped_bits.append(scraped_bit)
        return scraped_bits
My settings.py file looks like this
BOT_NAME = 'lsbu6'
DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
DUPEFILTER_DEBUG = True
SPIDER_MODULES = ['lsbu.spiders']
NEWSPIDER_MODULE = 'lsbu.spiders'
Any help/ guidance/ instruction on stopping it running continuously would be greatly appreciated...
As I'm a newbie to this; any comments on tidying the code up would also be helpful (or links to good instruction).
Thanks...
The DupeFilter is enabled by default: http://doc.scrapy.org/en/latest/topics/settings.html#dupefilter-class and it's based on the request URL.
I tried a simplified version of your spider on a new vanilla Scrapy project without any custom configuration. The dupefilter worked and the crawl stopped after a few requests. I'd say you have something wrong in your settings or in your Scrapy version. I'd suggest you upgrade to Scrapy 1.0, just to be sure :)
$ pip install scrapy --pre
The simplified spider I tested:
from scrapy.spiders import CrawlSpider
from scrapy.linkextractors import LinkExtractor
from scrapy import Item, Field
from scrapy.spiders import Rule

class LsbuItem(Item):
    title = Field()
    url = Field()

class LsbuSpider(CrawlSpider):
    name = "lsbu6"
    allowed_domains = ["lsbu.ac.uk"]
    start_urls = [
        "http://www.lsbu.ac.uk"
    ]

    rules = [
        Rule(LinkExtractor(allow=['lsbu.ac.uk/business-and-partners/.+']), callback='parse_item', follow=True),
    ]

    def parse_item(self, response):
        scraped_bit = LsbuItem()
        scraped_bit['url'] = response.url
        yield scraped_bit
Your design makes the crawl go in circles. For example, there is a page http://www.lsbu.ac.uk/business-and-partners/business which, when opened, contains a link to http://www.lsbu.ac.uk/business-and-partners/partners, and that one in turn links back to the first one. Thus, you go in circles indefinitely.
In order to overcome this, you need to create better rules, eliminating the circular references.
Also, you have two identical rules defined, which is not needed. If you want to follow links you can always put follow=True on the same rule; you don't need a separate rule for it.
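As a rough sketch of what "better rules" could look like, the version below collapses everything into a single rule and adds a deny pattern plus a depth limit; the specific deny pattern and the DEPTH_LIMIT value are illustrative assumptions, not part of the original answer:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class LsbuSpider(CrawlSpider):
    name = "lsbu6"
    allowed_domains = ["lsbu.ac.uk"]
    start_urls = ["http://www.lsbu.ac.uk"]

    # One rule that both parses and follows; deny= keeps the crawler out of a
    # section that only links back to pages already visited (hypothetical pattern).
    rules = [
        Rule(
            LinkExtractor(
                allow=[r'lsbu\.ac\.uk/business-and-partners/.+'],
                deny=[r'/business-and-partners/partners'],
            ),
            callback='parse_item',
            follow=True,
        ),
    ]

    # Cap the crawl depth as an extra safety net against going in circles.
    custom_settings = {"DEPTH_LIMIT": 3}

    def parse_item(self, response):
        yield {"url": response.url, "title": response.xpath('//title/text()').get()}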

Scrapy successful but won't output any information?

I've made a lot of headway with this spider - I'm just growing accustomed to coding and am enjoying every minute of it. However, as I'm learning, the majority of my programming is problem solving. Here's my current problem:
My spider shows all of the data I want in the terminal window, but when I try to output it, nothing shows up. Here is my code.
import re
import json
from urlparse import urlparse
from scrapy.selector import Selector
try:
    from scrapy.spider import Spider
except:
    from scrapy.spider import BaseSpider as Spider
from scrapy.utils.response import get_base_url
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.selector import HtmlXPathSelector
from database.items import databaseItem
from scrapy.log import *

class CommonSpider(CrawlSpider):
    name = 'fenders.py'
    allowed_domains = ['usedprice.com']
    start_urls = ['http://www.usedprice.com/items/guitars-musical-instruments/fender/?ob=model_asc#results']

    rules = (
        Rule(LinkExtractor(allow=()), callback='parse_item'),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = []
        data = hxs.select('//tr[@class="oddItemColor baseText"]')
        tmpNextPage = hxs.select('//div[@class="baseText blue"]/span[@id="pnLink"]/a/@href').extract()
        for attr in data:
            # item = RowItem()
            instrInfo = attr.select('//td[@class="itemResult"]/text()').extract()
            print "Instrument Info: ", instrInfo
            yield instrInfo
As JoeLinux said, you're yielding a string instead of returning the item. If you're mostly working off the tutorial, you probably have an "items.py" file someplace (maybe some other name) where your item is defined - it would appear that it's called "RowItem()". Here you've got several fields, or maybe just one.
What you need to do is figure out how you want to store the data in the item. So, making a gross assumption, you probably want RowItem() to include a field called instrInfo. So your items.py file might include something like this:
class RowItem(scrapy.Item):
    instrInfo = scrapy.Field()
Then your spider should include something like:
item = RowItem()
item['instrInfo'] = []
data = hxs.select('//tr[@class="oddItemColor baseText"]')
for attr in data:
    instrInfo = attr.select('.//td[@class="itemResult"]/text()').extract()
    item['instrInfo'].append(instrInfo)
return item
This will send the item off to your pipeline for processing.
As I said, I've made some gross assumptions about what you're trying to do and the format of your information, but hopefully this gets you started.
Separately, the print function probably isn't necessary. When the item is returned, it's displayed in the terminal (or log) as the spider runs.
Good luck!
