I have been trying to scrape a news site and store each article in a MySQL database. My goal is to store the following data for each article on the news site: date, title, summary, link
I have been trying different methods and decided, after a few weeks of attempts, to come here to Stack Overflow for a solution to my problem. (Note: I have one piece of code that comes close to solving my problem, but it pulls out all of the items at once rather than one by one, so I tried a new approach, and here is where I hit the wall.)
SPIDER.PY
import scrapy
from ..items import WebspiderItem
class NewsSpider(scrapy.Spider):
name = 'news'
start_urls = [
'https://www.coindesk.com/feed'
]
def parse(self, response):
for date in response.xpath('//pubDate/text()').extract():
yield WebspiderItem(date = date)
for title in response.xpath('//title/text()').extract():
yield WebspiderItem(title = title)
for summary in response.xpath('//description/text()').extract():
yield WebspiderItem(summary = summary)
for link in response.xpath('//link/text()').extract():
yield WebspiderItem(link = link)
ITEMS.PY
import scrapy
class WebspiderItem(scrapy.Item):
date = scrapy.Field()
title = scrapy.Field()
summary = scrapy.Field()
link = scrapy.Field()
PIPELINES.PY
import mysql.connector
class WebspiderPipeline(object):
def __init__(self):
self.create_connection()
self.create_table()
def create_connection(self):
self.conn = mysql.connector.connect(
host='localhost',
user='root',
passwd='HIDDENPASSWORD',
database='news_db'
)
self.curr = self.conn.cursor()
def create_table(self):
self.curr.execute("""DROP TABLE IF EXISTS news_tb""")
self.curr.execute("""create table news_tb(
date text,
title text,
summary text,
link text
)""")
def process_item(self, item, spider):
self.store_db(item)
return item
def store_db(self, item):
self.curr.execute("""insert into news_tb values (%s, %s, %s, %s)""", (
item['date'],
item['title'],
item['summary'],
item['link']
))
self.conn.commit()
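(For completeness: the pipeline is enabled in settings.py roughly like this. The webspider module name matches the paths in the traceback below; the priority value 300 is arbitrary.)
ITEM_PIPELINES = {
    'webspider.pipelines.WebspiderPipeline': 300,
}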
Response
Multiple errors like this one:
2020-03-17 07:54:32 [scrapy.core.scraper] ERROR: Error processing {'link': 'https://www.coindesk.com/makerdaos-problems-are-a-textbook-case-of-governance-failure'}
Traceback (most recent call last):
File "c:\users\r\pycharmprojects\project\venv\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "C:\Users\r\PycharmProjects\Project\webspider v3 RSS\webspider\pipelines.py", line 36, in process_item
self.store_db(item)
File "C:\Users\r\PycharmProjects\Project\webspider v3 RSS\webspider\pipelines.py", line 41, in store_db
item['date'],
File "c:\users\r\pycharmprojects\_project\venv\lib\site-packages\scrapy\item.py", line 91, in __getitem__
return self._values[key]
KeyError:
You should yield all the data at once instead of yielding each field in its own loop. Python reads code from top to bottom: you yield the date first, the pipeline receives it and tries to find the title, summary and link values, and since they are missing it raises a KeyError.
class NewsSpider(scrapy.Spider):
name = 'news'
def start_requests(self):
page = 'https://www.coindesk.com/feed'
yield scrapy.Request(url=page, callback=self.parse)
def parse(self, response):
links = response.xpath('//link/text()').extract()
for link in links:
yield scrapy.Request(url=link, callback=self.parse_contents)
def parse_contents(self, response):
url = response.url
article_title = response.xpath('//h1/text()').extract()[0]
pub_date = response.xpath('//div[@class="article-hero-datetime"]/time/@datetime').extract()[0]
description = response.xpath('//meta[@name="description"]/@content').extract()[0]
item = WebspiderItem()
item['date'] = pub_date
item['title'] = article_title
item['summary'] = description
item['link'] = url
yield item
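If you would rather parse the RSS feed itself instead of following each article link, the same principle applies: build one complete item per <item> node and yield it once. A minimal, untested sketch (the XPaths assume standard RSS fields and that the feed is parsed as XML):
def parse(self, response):
    # one complete WebspiderItem per RSS <item> node
    for node in response.xpath('//item'):
        yield WebspiderItem(
            date=node.xpath('./pubDate/text()').extract_first(),
            title=node.xpath('./title/text()').extract_first(),
            summary=node.xpath('./description/text()').extract_first(),
            link=node.xpath('./link/text()').extract_first(),
        )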
Related
I am working on Google search crawling using Scrapy. This is the code, and it works well to get search results.
GoogleBot.py:
class GoogleBotsSpider(scrapy.Spider):
name = 'GoogleScrapyBot'
allowed_domains = ['google.com']
start_urls = ['https://www.google.com/search?q=apple&hl=en&rlz=&start=0']
def parse(self, response):
item = {}
all_page = response.xpath('//*[@id="main"]')
for page in all_page:
title = page.xpath('//*[@id="main"]/div/div/div/a/h3/div/text()').extract()
link = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
for title, link in zip(title, link):
print(title)
print(link.lstrip("/url?q="))
My next step is to use a pipeline in Scrapy to save the results to a CSV file.
Here is the code that I have written so far.
setting.py:
ITEM_PIPELINES = {'GoogleScrapy.pipelines.GooglePipeline': 300,}
pipelines.py:
from scrapy.exporters import CsvItemExporter
class GooglePipeline(object):
def __init__(self):
self.file = open("GoogleSearchResult.csv", 'wb')
self.exporter = CsvItemExporter(self.file, encoding='utf-8')
self.exporter.start_exporting()
def close_spider(self, spider):
self.exporter.finish_exporting()
self.file.close()
def process_item(self, item, spider):
self.exporter.export_item(item)
return item
This is my modified spider code.
GoogleBot.py:
def parse(self, response):
item = {}
all_page = response.xpath('//*[@id="main"]')
for page in all_page:
item['title'] = page.xpath('//*[@id="main"]/div/div/div/a/h3/div/text()').extract()
item['link'] = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
for title, link in zip(title, link):
print(title)
print(link.lstrip("/url?q="))
yield item
It throws an error in:
for title, link in zip(title, link):
print(title)
print(link.lstrip("/url?q="))
I get this error:
for title, link in zip(title, link):
UnboundLocalError: local variable 'title' referenced before assignment
Here is the working code, following your comment.
import scrapy
class GoogleBotsSpider(scrapy.Spider):
name = 'GoogleScrapyBot'
allowed_domains = ['google.com']
start_urls = ['https://www.google.com/search?q=apple&hl=en&rlz=&start=0']
def parse(self, response):
all_page = response.xpath('//*[@id="main"]')
for page in all_page:
titles = page.xpath('//*[@id="main"]/div/div/div/a/h3/div//text()').extract()
for title in titles:
links = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
for link in links:
item={
'Title': title,
'Link':link
}
yield item
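Note that the nested loops above pair every title with every link (a cross product). If you want exactly one item per search result, a small untested variant using zip() on the same selectors would be:
def parse(self, response):
    titles = response.xpath('//*[@id="main"]/div/div/div/a/h3/div//text()').extract()
    links = response.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
    # pair each title with its corresponding link instead of crossing them
    for title, link in zip(titles, links):
        yield {'Title': title, 'Link': link}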
I'm having trouble with my spider; the way I have set it up doesn't seem to work. The spider should be able to scrape multiple pages (1, 2, 3), all on the same website. I'm not sure if I should use a for loop or an if/else statement to extract all the data.
I'm getting this message after I run it: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min).
Any help would be greatly appreciated!
Shown below are the code for the spider, items.py, and pipelines.py:
class abcSpider(scrapy.Spider):
name = 'abc'
page_number = 2
allowed_domains = ['']
def parse(self, response):
items = folder1Item()
deal_number_var = response.css(".mclbEl a::text").extract()
deal_type_var = response.css('.#ContentContainer1_ctl00_Content_ListCtrl1_LB1_VDTBL .mclbEl:nth-child(9)').css('::text').extract()
items['deal_number_var'] = deal_number_var
items['deal_type_var'] = deal_type_var
yield items
next_page = '' + str(abcSpider.page_number) + '/'
if abcSpider.page_number < 8:
abcSpider.page_number += 1
yield response.follow(next_page, callback=self.parse)
This is my items.py page:
import scrapy
class folder1Item(scrapy.Item):
deal_number_var = scrapy.Field()
deal_type_var = scrapy.Field()
I would like to save the data as a .db file to import into sqlite3. It looks like this in my pipelines.py:
import sqlite3
class folder1Pipeline(object):
def __init__(self):
self.create_connection()
self.create_table()
def create_connection(self):
self.conn = sqlite3.connect("abc.db")
self.curr = self.conn.cursor()
def create_table(self):
self.curr.execute("""DROP TABLE IF EXISTS abc_tb""")
self.curr.execute("""create table abc_tb(deal_number_var text, deal_type_var text)""")
def process_item(self, items, spider):
self.store_db(items)
return items
def store_db(self,items):
self.curr.execute("""insert into abc_tb values (?,?,?)""" , (items['deal_number_var'][0], items['deal_type_var'][0]))
self.conn.commit()
Middleware.py code:
from scrapy.http import HtmlResponse
from selenium import webdriver
class JSMiddleware(object):
def process_request(self, request, spider):
driver = webdriver.PhantomJS()
driver.get(request.url)
body = driver.page_source
return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
I assume this is your entire code? If so, you did not define any start_urls. Furthermore, you either have to set allowed_domains correctly or remove the variable completely, because right now you are defining that no URL is allowed.
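For illustration, the top of the spider would need to look something like this (the domain and URL below are placeholders for whatever site you are actually crawling):
class abcSpider(scrapy.Spider):
    name = 'abc'
    page_number = 2
    # either list the real domain(s) or remove allowed_domains entirely
    allowed_domains = ['example.com']
    start_urls = ['https://www.example.com/page/1/']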
I'm trying to scrape a website with Scrapy and store the information to a MySQL database.
My spider works on its own. When I use it alongside a pipeline I get two errors:
[twisted] CRITICAL: Unhandled error in Deferred:
TypeError: connect() argument 1 must be string, not None
I'm new to python and scrapy.
Here is my Spider:
from scrapy import Spider
#from scrapy.selector import Selector
from scraper.items import ExpertScraperItem
class expertSpider(Spider):
name = 'expert'
start_urls = [
'https://www.expert.ie/products/home-appliances/laundry',
]
def parse(self, response):
# follow links to product pages
for href in response.css('a.product-list-link::attr(href)'):
yield response.follow(href, self.parse_product)
# follow pagination links
for href in response.css('a.UnselectedPage::attr(href)'):
yield response.follow(href, self.parse)
def parse_product(self, response):
def extract_with_css(query):
return response.css(query).extract_first().strip()
item = ExpertScraperItem()
item['model'] = extract_with_css('header strong::text')[12:100].strip()
item['price'] = extract_with_css('span.TotalPrice::text')[1:100].strip()
yield item
Here is my pipeline (this has been updated to show errors corrected 21/01/2018)
import scrapy
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
class ScraperPipeline(object):
def __init__(self, dbpool):
self.dbpool = dbpool
@classmethod
def from_settings(cls, settings):
dbargs = dict(
host = settings['MYSQL_HOST'],
db = settings['MYSQL_DBNAME'],
port = settings['MYSQL_PORT'],
user = settings['MYSQL_USER'],
passwd = settings['MYSQL_PASSWD'],
charset = 'utf8',
cursorclass = MySQLdb.cursors.DictCursor,
use_unicode = True,
)
dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
return cls(dbpool)
#pipeline the default call
def process_item(self, item, spider):
d = self.dbpool.runInteraction(self._do_upinsert, item, spider)
return item
#Each row is updated or written to the database
def _do_upinsert(self, conn, item, spider):
valid = True
for data in item:
if not data:
valid = False
# raise DropItem("Missing {0}!".format(data))
# print "Missing data"
if valid:
result = conn.execute("""
insert into pricing(model, price)
values(%s, %s)
""", (item['model'], item['price']))
if result:
print "added a model into db"
else:
print "failed insert into pricing"
OK, I am going to keep this short; I need to rush off for a meeting.
I am trying to get the start URLs in Scrapy and no matter how I try, I can't seem to accomplish it. Here is my code (spider).
import scrapy
import csv
from scrapycrawler.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
class DmozSpider(CrawlSpider):
name = "dmoz"
allowed_domains = ["snipplr.com"]
def start_requests(self):
for i in range(1, 230):
yield self.make_requests_from_url("http://www.snipplr.com/view/%d" % i)
def make_requests_from_url(self, url):
item = DmozItem()
# assign url
item['link'] = url
request = Request(url, dont_filter=True)
# set the meta['item'] to use the item in the next call back
request.meta['item'] = item
return request
#Rules only apply before
rules = (
Rule (LxmlLinkExtractor(deny_domains=('http:\/\/www.snipplr.com\/snippet-not-found\/',)),callback="parse", follow= True),
)
def parse(self, response):
sel = Selector(response)
item = response.meta['item']
item['title'] = sel.xpath('//div[@class="post"]/h1/text()').extract()
#start_url
item['link'] = response.url
I have tried every way I can think of, and so far I get an "h" in my database, in the url column.
This is my database pipeline:
import csv
from scrapy.exceptions import DropItem
from scrapy import log
import sys
import mysql.connector
class CsvWriterPipeline(object):
def __init__(self):
self.connection = mysql.connector.connect(host='localhost', user='ws', passwd='ps', db='ws')
self.cursor = self.connection.cursor()
def process_item(self, item, spider):
self.cursor.execute("SELECT title,url FROM items WHERE title= %s", item['title'])
result = self.cursor.fetchone()
if result:
log.msg("Item already in database: %s" % item, level=log.DEBUG)
else:
self.cursor.execute(
"INSERT INTO items (title, url) VALUES (%s, %s)",
(item['title'][0], item['link'][0]))
self.connection.commit()
log.msg("Item stored : " % item, level=log.DEBUG)
return item
def handle_error(self, e):
log.err(e)
As you can see from here,
it is clearly working.
How would I get the start URL, or rather, how would I parse it? I believe "h" means that the field is empty. The database is MySQL.
Thanks for reading and for your help.
Regards,
Charlie
item['link'], as opposed to item['title'], is just a string, not a list:
self.cursor.execute("INSERT INTO items (title, url) VALUES (%s, %s)",
(item['title'][0], item['link']))
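Alternatively, you could normalize this in the spider by using extract_first(), which returns a single string (or None) instead of a list, so neither field needs indexing (untested sketch):
# in parse(): extract_first() gives one string, not a list
item['title'] = sel.xpath('//div[@class="post"]/h1/text()').extract_first()
# then the pipeline insert becomes simply:
self.cursor.execute("INSERT INTO items (title, url) VALUES (%s, %s)",
                    (item['title'], item['link']))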
I have a spider and a pipeline,
and I wrote code to extract data from the web and insert it into MySQL.
It is running.
class AmazonAllDepartmentSpider(scrapy.Spider):
name = "amazon"
allowed_domains = ["amazon.com"]
start_urls = [
"http://www.amazon.com/gp/site-directory/ref=nav_sad/187-3757581-3331414"
]
def parse(self, response):
for sel in response.xpath('//ul[@class="nav_cat_links"]/li'):
item = AmazoncrawlerItem()
# pop() removes [u''] tag from
item['title'] = sel.xpath('a/text()').extract().pop()
item['link'] = sel.xpath('a/@href').extract().pop()
item['desc'] = sel.xpath('text()').extract()
yield item
and
class AmazoncrawlerPipeline(object):
host = 'qwerty.com'
user = 'qwerty'
password = 'qwerty123'
db = 'amazon_project'
def __init__(self):
self.connection = MySQLdb.connect(self.host, self.user, self.password, self.db)
self.cursor = self.connection.cursor()
def process_item(self, item, spider):
try:
self.cursor.execute("""INSERT INTO amazon_project.ProductDepartment (ProductTitle,ProductDepartmentLilnk)
VALUES (%s,%s)""",
(item['title'],'amazon.com' + str(item.get('link'))))
self.connection.commit()
except MySQLdb.Error, e:
print "Error %d: %s" % (e.args[0], e.args[1])
return item
Now I want to take that data (which is a URL link)
and call the spider again to extract data from the web.
Please help me figure out how to do this. Thanks.
It should be solved on the spider level.
To follow the links, you can yield a Request after yielding an item instance:
def parse(self, response):
for sel in response.xpath('//ul[@class="nav_cat_links"]/li'):
item = AmazoncrawlerItem()
item['title'] = sel.xpath('a/text()').extract().pop()
item['link'] = sel.xpath('a/@href').extract().pop()
item['desc'] = sel.xpath('text()').extract()
yield item
yield Request(item['link'], callback=self.parse_link)
Alternatively, you can change the tactics and switch to Link Extractors.
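For the link-extractor route, a rough sketch using the current import paths (the spider name and the parse_link callback here are placeholders, not part of your project):
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class AmazonCategorySpider(CrawlSpider):
    name = 'amazon_categories'
    allowed_domains = ['amazon.com']
    start_urls = ['http://www.amazon.com/gp/site-directory/ref=nav_sad/187-3757581-3331414']
    # follow only links inside the category list and hand each page to parse_link
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//ul[@class="nav_cat_links"]'),
             callback='parse_link', follow=False),
    )

    def parse_link(self, response):
        # extract whatever you need from the linked department page here
        pass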
UPD (after discussion in comments):
If you have the links in the database already, you would need to start another spider, read the links from the database in start_requests() and yield requests:
from scrapy.http import Request
class AmazonSpider(scrapy.Spider):
name = "amazon"
allowed_domains = ["amazon.com"]
def start_requests(self):
connection = MySQLdb.connect(<connection params here>)
cursor = connection.cursor()
cursor.execute("SELECT ProductDepartmentLilnk FROM amazon_project.ProductDepartment")
links = cursor.fetchall()
for link in links:
# each row returned by fetchall() is a tuple, so take the first column
yield Request(link[0], callback=self.parse)
cursor.close()
...