I am using Scrapy to scrape news headlines, and I am new to both Scrapy and scraping in general. For a few days now I have been having serious trouble pipelining my scraped data into my SQL database.
I have two classes in my pipelines.py file: one for inserting items into the database, and another for backing up the scraped data to a JSON file for front-end web development purposes.
This is the code for my spider:
- It extracts news headlines from the start_urls.
- It picks up this data as strings using extract(), then loops through all of them and uses strip() to remove surrounding whitespace for better formatting.
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item
from Aljazeera.items import AljazeeraItem
from datetime import date, datetime
class AljazeeraSpider(Spider):
    """Spider that collects headline/summary pairs from the listed news pages."""

    name = "aljazeera"
    allowed_domains = ["aljazeera.com"]
    start_urls = [
        "http://www.aljazeera.com/news/europe/",
        "http://www.aljazeera.com/news/middleeast/",
        "http://www.aljazeera.com/news/asia/",
        "http://www.aljazeera.com/news/asia-pacific/",
        "http://www.aljazeera.com/news/americas/",
        "http://www.aljazeera.com/news/africa/",
        "http://blogs.aljazeera.com/"
    ]

    def parse(self, response):
        """Build one AljazeeraItem per matched table cell and return them as a list."""
        sel = Selector(response)
        # XPath attribute tests use '@'.
        sites = sel.xpath('//td[@valign="bottom"]')
        contents = sel.xpath('//div[@class="indexSummaryText"]')
        items = []
        # zip() stops at the shorter of the two selector lists.
        for site, content in zip(sites, contents):
            item = AljazeeraItem()
            item['headline'] = site.xpath('div[3]/text()').extract()
            item['content'] = site.xpath('div/a/text()').extract()
            item['date'] = str(date.today())
            # The link text (stored under 'content' above) becomes the final
            # headline and the div[3] text becomes the final content; both
            # are stripped of surrounding whitespace.
            for headline, content in zip(item['content'], item['headline']):
                item['headline'] = headline.strip()
                item['content'] = content.strip()
            items.append(item)
        return items
The code for my pipelines.py is as follows:
import sys
import MySQLdb
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request
import json
import os.path
# Pipeline that inserts each scraped item into the scraped_data MySQL table and
# passes the item on unchanged.
# NOTE(review): the indentation of this listing was lost in extraction. The reply
# that follows this question attributes the failing inserts to wrong indentation
# in this class, so the original layout cannot be reconstructed from here; confirm
# against the original post before re-indenting.
class SQLStore(object):
def __init__(self):
# Opens the connection once, when the pipeline object is created.
self.conn = MySQLdb.connect(user='root', passwd='', db='aj_db', host='localhost', charset="utf8", use_unicode=True)
self.cursor = self.conn.cursor()
#log data to json file
def process_item(self, item, spider):
try:
# Parameterized insert: the driver substitutes the three %s placeholders.
self.cursor.execute("""INSERT INTO scraped_data(headlines, contents, dates) VALUES (%s, %s, %s)""", (item['headline'].encode('utf-8'), item['content'].encode('utf-8'), item['date'].encode('utf-8')))
self.conn.commit()
except MySQLdb.Error, e:
# Database errors are printed, not re-raised.
print "Error %d: %s" % (e.args[0], e.args[1])
return item
#log runs into back file
class JsonWriterPipeline(object):
    """Pipeline that backs up every item as one JSON line in a local file."""

    def __init__(self):
        # Opened in "w" mode, so each run overwrites the previous backup.
        self.file = open('backDataOfScrapes.json', "w")

    def process_item(self, item, spider):
        """Append the item as a prefixed JSON line and pass it on unchanged."""
        line = json.dumps(dict(item)) + "\n"
        self.file.write("item === " + line)
        return item

    def close_spider(self, spider):
        """Close the backup file so buffered lines are flushed to disk."""
        self.file.close()
And settings.py is as follows:
# Scrapy project settings: bot name, spider module locations, item pipelines.
BOT_NAME = 'Aljazeera'

SPIDER_MODULES = ['Aljazeera.spiders']
NEWSPIDER_MODULE = 'Aljazeera.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Aljazeera (+http://www.yourdomain.com)'

# Both pipelines are registered with the same order value (300).
ITEM_PIPELINES = {
    'Aljazeera.pipelines.JsonWriterPipeline': 300,
    'Aljazeera.pipelines.SQLStore': 300,
}
My SQL settings are all OK. After running scrapy crawl aljazeera, the spider works and even outputs the items in JSON format as follows:
item === {"headline": "Turkey court says Twitter ban violates rights", "content": "Although ruling by Turkey's highest court is binding, it is unclear whether the government will overturn the ban.", "date": "2014-04-02"}
I really don't know, or can't see, what I am missing here. I would really appreciate it if you could help me out.
Thanks for your time,
Your indentation is wrong in the SQLStore pipeline. I've tested it with correct indentation and it works fine. Copy the code below and it should work.
class SQLStore(object):
    """Pipeline that inserts each scraped item into the scraped_data table."""

    def __init__(self):
        self.conn = MySQLdb.connect(user='root', passwd='', db='aj_db', host='localhost', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert the item's headline, content and date, then pass the item on.

        Database errors are printed rather than raised, so the item is
        returned to the next pipeline either way.
        """
        try:
            self.cursor.execute("""INSERT INTO scraped_data(headlines, contents, dates) VALUES (%s, %s, %s)""", (item['headline'].encode('utf-8'), item['content'].encode('utf-8'), item['date'].encode('utf-8')))
            self.conn.commit()
        except MySQLdb.Error as e:
            print("Error %d: %s" % (e.args[0], e.args[1]))
        return item
Related
I have been trying to scrape a news site and store each article in a MySQL database. My goal is to store the following data for each article on the news site: date, title, summary, link.
I have been trying different methods and, after a few weeks of trying, decided to come here to Stack Overflow for a solution to my problem. (Note: I have one version that comes close to solving my problem, but it only extracts all of the items at once and not one by one, so I tried a new approach, and this is where I hit the wall.)
SPIDER.PY
import scrapy
from ..items import WebspiderItem
class NewsSpider(scrapy.Spider):
    """Spider that reads the feed and yields items from its text nodes."""

    name = 'news'
    start_urls = [
        'https://www.coindesk.com/feed'
    ]

    def parse(self, response):
        # NOTE: each loop below yields a separate item carrying exactly one
        # field (date only, title only, ...); no item has all four fields.
        for date in response.xpath('//pubDate/text()').extract():
            yield WebspiderItem(date=date)
        for title in response.xpath('//title/text()').extract():
            yield WebspiderItem(title=title)
        for summary in response.xpath('//description/text()').extract():
            yield WebspiderItem(summary=summary)
        for link in response.xpath('//link/text()').extract():
            yield WebspiderItem(link=link)
ITEMS.PY
import scrapy
class WebspiderItem(scrapy.Item):
    """Item with the four article fields: date, title, summary, link."""

    date = scrapy.Field()
    title = scrapy.Field()
    summary = scrapy.Field()
    link = scrapy.Field()
PIPELINES.PY
import mysql.connector
class WebspiderPipeline(object):
    """Pipeline that stores each item as one row of the news_tb table."""

    def __init__(self):
        self.create_connection()
        self.create_table()

    def create_connection(self):
        """Open the MySQL connection and keep a cursor on the instance."""
        self.conn = mysql.connector.connect(
            host='localhost',
            user='root',
            passwd='HIDDENPASSWORD',
            database='news_db'
        )
        self.curr = self.conn.cursor()

    def create_table(self):
        """Recreate news_tb from scratch; rows from earlier runs are dropped."""
        self.curr.execute("""DROP TABLE IF EXISTS news_tb""")
        self.curr.execute("""create table news_tb(
            date text,
            title text,
            summary text,
            link text
        )""")

    def process_item(self, item, spider):
        """Store the item and pass it on unchanged."""
        self.store_db(item)
        return item

    def store_db(self, item):
        """Insert one row; reads all four fields, so each must be set on the item."""
        self.curr.execute("""insert into news_tb values (%s, %s, %s, %s)""", (
            item['date'],
            item['title'],
            item['summary'],
            item['link']
        ))
        self.conn.commit()
Response
Multiple of these:
2020-03-17 07:54:32 [scrapy.core.scraper] ERROR: Error processing {'link': 'https://www.coindesk.com/makerdaos-problems-are-a-textbook-case-of-governance-failure'}
Traceback (most recent call last):
File "c:\users\r\pycharmprojects\project\venv\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "C:\Users\r\PycharmProjects\Project\webspider v3 RSS\webspider\pipelines.py", line 36, in process_item
self.store_db(item)
File "C:\Users\r\PycharmProjects\Project\webspider v3 RSS\webspider\pipelines.py", line 41, in store_db
item['date'],
File "c:\users\r\pycharmprojects\_project\venv\lib\site-packages\scrapy\item.py", line 91, in __getitem__
return self._values[key]
KeyError:
You should yield all the data for an article at once, in a single item, rather than yielding from separate loops. Python runs the code from top to bottom: you yield an item containing only the date first, the pipeline receives it and tries to read the title, summary and link values, and because they are missing it raises a KeyError.
class NewsSpider(scrapy.Spider):
    """Spider that follows every feed link and yields one complete item per article."""

    name = 'news'

    def start_requests(self):
        page = 'https://www.coindesk.com/feed'
        yield scrapy.Request(url=page, callback=self.parse)

    def parse(self, response):
        """Request every link listed in the feed."""
        links = response.xpath('//link/text()').extract()
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_contents)

    def parse_contents(self, response):
        """Fill all four item fields from a single article page."""
        url = response.url
        # XPath attribute tests and attribute selections use '@'.
        article_title = response.xpath('//h1/text()').extract()[0]
        pub_date = response.xpath('//div[@class="article-hero-datetime"]/time/@datetime').extract()[0]
        description = response.xpath('//meta[@name="description"]/@content').extract()[0]
        item = WebspiderItem()
        item['date'] = pub_date
        item['title'] = article_title
        item['summary'] = description
        item['link'] = url
        yield item
I'm trying to scrape a website with Scrapy and store the information in a MySQL database.
My spider works on its own. When I use it alongside a pipeline, I get two errors:
[twisted] CRITICAL: Unhandled error in Deferred:
TypeError: connect() argument 1 must be string, not None
I'm new to python and scrapy.
Here is my Spider:
from scrapy import Spider
#from scrapy.selector import Selector
from scraper.items import ExpertScraperItem
class expertSpider(Spider):
    """Spider that walks the listing pages and yields one item per product page."""

    name = 'expert'
    start_urls = [
        'https://www.expert.ie/products/home-appliances/laundry',
    ]

    def parse(self, response):
        # follow links to product pages
        for href in response.css('a.product-list-link::attr(href)'):
            yield response.follow(href, self.parse_product)
        # follow pagination links
        for href in response.css('a.UnselectedPage::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_product(self, response):
        """Extract the model and price text from a product page."""
        def extract_with_css(query):
            # First match of the CSS query, with surrounding whitespace removed.
            return response.css(query).extract_first().strip()

        item = ExpertScraperItem()
        # The slices drop a fixed-length prefix from the extracted text.
        item['model'] = extract_with_css('header strong::text')[12:100].strip()
        item['price'] = extract_with_css('span.TotalPrice::text')[1:100].strip()
        yield item
Here is my pipeline (this has been updated to show errors corrected 21/01/2018)
import scrapy
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
class ScraperPipeline(object):
    """Pipeline that inserts items into the pricing table via a Twisted connection pool."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the MYSQL_* values in the project settings."""
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            port=settings['MYSQL_PORT'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    # pipeline the default call
    def process_item(self, item, spider):
        """Hand the insert to the pool and pass the item on unchanged."""
        self.dbpool.runInteraction(self._do_upinsert, item, spider)
        return item

    # Each row is updated or written to the database
    def _do_upinsert(self, conn, item, spider):
        """Insert the item's model and price unless any field value is empty."""
        # Iterating an item yields field names, so look up each value;
        # testing the names themselves would never flag an empty field.
        valid = all(item[field] for field in item)
        if not valid:
            return
        result = conn.execute("""
            insert into pricing(model, price)
            values(%s, %s)
        """, (item['model'], item['price']))
        if result:
            print("added a model into db")
        else:
            print("failed insert into pricing")
OK, I'm going to keep this short, as I need to rush off to a meeting.
I am trying to get the start URLs in Scrapy and, no matter how I try, I can't seem to accomplish it. Here is my code (spider).
import scrapy
import csv
from scrapycrawler.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
class DmozSpider(CrawlSpider):
    """Spider that requests snipplr.com/view/1..229 and records each page's title and URL."""

    name = "dmoz"
    allowed_domains = ["snipplr.com"]

    def start_requests(self):
        for i in range(1, 230):
            yield self.make_requests_from_url("http://www.snipplr.com/view/%d" % i)

    def make_requests_from_url(self, url):
        item = DmozItem()
        # assign url
        item['link'] = url
        request = Request(url, dont_filter=True)
        # set the meta['item'] to use the item in the next call back
        request.meta['item'] = item
        return request

    #Rules only apply before
    # NOTE(review): Scrapy's documentation warns against using `parse` as a
    # CrawlSpider rule callback or overriding it - confirm this rule still fires.
    rules = (
        Rule (LxmlLinkExtractor(deny_domains=('http:\/\/www.snipplr.com\/snippet-not-found\/',)),callback="parse", follow= True),
    )

    def parse(self, response):
        sel = Selector(response)
        item = response.meta['item']
        # XPath attribute tests use '@'. extract() returns a list.
        item['title'] = sel.xpath('//div[@class="post"]/h1/text()').extract()
        #start_url
        # response.url is a plain string.
        item['link'] = response.url
        # Hand the populated item to the pipelines; without this it is discarded.
        return item
I have tried everything I can think of, and so far all I get is an "h" in the url column of my database.
This is my database pipeline:
import csv
from scrapy.exceptions import DropItem
from scrapy import log
import sys
import mysql.connector
class CsvWriterPipeline(object):
    """Pipeline that stores an item's title and URL unless the title is already in the table."""

    def __init__(self):
        self.connection = mysql.connector.connect(host='localhost', user='ws', passwd='ps', db='ws')
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        """Insert the item if no row with the same title exists, then pass it on."""
        self.cursor.execute("SELECT title,url FROM items WHERE title= %s", item['title'])
        result = self.cursor.fetchone()
        if result:
            log.msg("Item already in database: %s" % item, level=log.DEBUG)
        else:
            # NOTE: [0] takes the first element of each value. For a plain
            # string such as a URL that is its first character only.
            self.cursor.execute(
                "INSERT INTO items (title, url) VALUES (%s, %s)",
                (item['title'][0], item['link'][0]))
            self.connection.commit()
            # The message needs a %s placeholder for the item to appear in the log.
            log.msg("Item stored : %s" % item, level=log.DEBUG)
        return item

    def handle_error(self, e):
        log.err(e)
As you can see from here,
it is clearly working.
How would I get the start URL, or rather, how would I parse it? I believe the "h" means that the field is empty. The database is MySQL.
Thanks for your reading and for your help
Regards,
Charlie
item['link'], as opposed to item['title'], is just a string, not a list, so item['link'][0] is only its first character (the "h" of "http"). Pass the whole string instead:
# title is a list (take its first element); link is already a string.
self.cursor.execute(
    "INSERT INTO items (title, url) VALUES (%s, %s)",
    (item['title'][0], item['link']),
)
I have a spider and a pipeline,
and I have written code to extract data from the web and insert it into MySQL.
This is working:
class AmazonAllDepartmentSpider(scrapy.Spider):
    """Spider that yields one item per entry of the site-directory category lists."""

    name = "amazon"
    allowed_domains = ["amazon.com"]
    start_urls = [
        "http://www.amazon.com/gp/site-directory/ref=nav_sad/187-3757581-3331414"
    ]

    def parse(self, response):
        # XPath attribute tests and attribute selections use '@'.
        for sel in response.xpath('//ul[@class="nav_cat_links"]/li'):
            item = AmazoncrawlerItem()
            # pop() takes the last string out of the extracted list, so the
            # field holds a plain string rather than a one-element list.
            item['title'] = sel.xpath('a/text()').extract().pop()
            item['link'] = sel.xpath('a/@href').extract().pop()
            item['desc'] = sel.xpath('text()').extract()
            yield item
and
class AmazoncrawlerPipeline(object):
    """Pipeline that stores each item's title and link in ProductDepartment."""

    host = 'qwerty.com'
    user = 'qwerty'
    password = 'qwerty123'
    db = 'amazon_project'

    def __init__(self):
        self.connection = MySQLdb.connect(self.host, self.user, self.password, self.db)
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        """Insert the title and the 'amazon.com'-prefixed link, then pass the item on.

        Database errors are printed rather than raised.
        """
        try:
            self.cursor.execute("""INSERT INTO amazon_project.ProductDepartment (ProductTitle,ProductDepartmentLilnk)
                            VALUES (%s,%s)""",
                                (item['title'], 'amazon.com' + str(item.get('link'))))
            self.connection.commit()
        except MySQLdb.Error as e:
            print("Error %d: %s" % (e.args[0], e.args[1]))
        return item
Now I want to read that data back (each row is a URL link)
and call a spider again to extract data from those pages.
Please help me with how to do this. Thanks.
It should be solved on the spider level.
To follow the links, you can yield a Request after yielding an item instance:
def parse(self, response):
    """Yield each category item, then a request that follows its link."""
    for sel in response.xpath('//ul[@class="nav_cat_links"]/li'):
        item = AmazoncrawlerItem()
        item['title'] = sel.xpath('a/text()').extract().pop()
        item['link'] = sel.xpath('a/@href').extract().pop()
        item['desc'] = sel.xpath('text()').extract()
        yield item
        # Follow the link just stored on the item; parse_link handles the response.
        yield Request(item['link'], callback=self.parse_link)
Alternatively, you can change the tactics and switch to Link Extractors.
UPD (after discussion in comments):
If you have the links in the database already, you would need to start another spider, read the links from the database in start_requests() and yield requests:
from scrapy.http import Request
class AmazonSpider(scrapy.Spider):
name = "amazon"
allowed_domains = ["amazon.com"]
def start_requests(self):
connection = MySQLdb.connect(<connection params here>)
cursor = connection.cursor()
cursor.execute("SELECT ProductDepartmentLilnk FROM amazon_project.ProductDepartment")
links = cursor.fetchall()
for link in links:
yield Request(link, callback=self.parse)
cursor.close()
...
If you look here, I could not get two different spiders to automatically add their results to a MySQL database. I have now added an if and an elif statement and they work, but they miss some results: previously there were 52 rows in the Bath table, and now there are only 41. Bristol used to have 154 and now has only 141. I cannot think why the results are not the same.
Pipelines.py
import sys
import MySQLdb
import MySQLdb.cursors
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request
class TestPipeline(object):
    """Pipeline that stores Bristol or Bath qualification items in their own tables."""

    def __init__(self):
        self.conn = MySQLdb.connect(
            user='user',
            passwd='pwd',
            db='db',
            host='host',
            charset='utf8',
            use_unicode=True
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert the item into the table matching its fields and pass it on.

        The values are passed as query parameters so the driver escapes
        them. Formatting them into the SQL text made every row containing a
        single quote (e.g. "d'Etudes") fail.
        """
        try:
            if 'BristolQualification' in item:
                self.cursor.execute(
                    "INSERT INTO Bristol(BristolCountry, BristolQualification) VALUES (%s, %s)",
                    (item['BristolCountry'], "".join(item['BristolQualification'])))
            elif 'BathQualification' in item:
                self.cursor.execute(
                    "INSERT INTO Bath(BathCountry, BathQualification) VALUES (%s, %s)",
                    (item['BathCountry'], "".join(item['BathQualification'])))
            self.conn.commit()
        except MySQLdb.Error as e:
            print("Error %d: %s" % (e.args[0], e.args[1]))
        # Returned outside the try block so a database error does not turn
        # the item into None for later pipelines.
        return item
Items.py
from scrapy.item import Item, Field
class QualificationItem(Item):
    """Item with a country/qualification field pair for each of Bristol and Bath."""

    BristolQualification = Field()
    BristolCountry = Field()
    BathQualification = Field()
    BathCountry = Field()
Bristol.py
from scrapy.spider import BaseSpider
from project.items import QualificationItem
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from urlparse import urljoin
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'
class recursiveSpider(BaseSpider):
    """Spider that follows every country link and extracts its entry requirements."""

    name = 'bristol'
    # allowed_domains entries are bare domain names, without a trailing slash.
    allowed_domains = ['bristol.ac.uk']
    start_urls = ['http://www.bristol.ac.uk/international/countries/']

    def parse(self, response):
        """Request each country page, carrying the link text along in meta."""
        hxs = HtmlXPathSelector(response)
        # XPath attribute tests and attribute selections use '@'.
        xpath = '//*[@id="all-countries"]/li/ul/li/a/@href'
        a_of_the_link = '//*[@id="all-countries"]/li/ul/li/a/text()'
        for text, link in zip(hxs.select(a_of_the_link).extract(), hxs.select(xpath).extract()):
            yield Request(urljoin(response.url, link),
                          meta={'a_of_the_link': text},
                          headers={'User-Agent': USER_AGENT},
                          callback=self.parse_linkpage,
                          dont_filter=True)

    def parse_linkpage(self, response):
        """Build the item from the paragraphs after the undergraduate heading."""
        hxs = HtmlXPathSelector(response)
        item = QualificationItem()
        xpath = """
                //h2[normalize-space(.)="Entry requirements for undergraduate courses"]
                /following-sibling::p[not(preceding-sibling::h2[normalize-space(.)!="Entry requirements for undergraduate courses"])]
                """
        # [1:] drops the first matched paragraph.
        item['BristolQualification'] = hxs.select(xpath).extract()[1:]
        item['BristolCountry'] = response.meta['a_of_the_link']
        return item
If you look here, a user did try to fix the problem but was unsuccessful, and I haven't heard from him since.
'These errors were caused by unescaped single quotes in the BristolQualification item field (and presumably the Bath spider suffers from the same problem) causing havoc (such as d'Etudes in the snippet below):'
This is what he thought the problem was.
Can anyone see where the problem is at?