Scrapy & MySQL database: - python

I'm trying to scrape a website with Scrapy and store the information to a MySQL database.
My spider works on its own. When I use it alongside a pipeline I get two errors:
[twisted] CRITICAL: Unhandled error in Deferred:
TypeError: connect() argument 1 must be string, not None
I'm new to python and scrapy.
Here is my Spider:
from scrapy import Spider
#from scrapy.selector import Selector
from scraper.items import ExpertScraperItem
class expertSpider(Spider):
    """Crawl expert.ie laundry listings and yield model/price items."""
    name = 'expert'
    start_urls = [
        'https://www.expert.ie/products/home-appliances/laundry',
    ]

    def parse(self, response):
        """Follow product links and pagination links from a listing page."""
        # follow links to product pages
        for href in response.css('a.product-list-link::attr(href)'):
            yield response.follow(href, self.parse_product)
        # follow pagination links
        for href in response.css('a.UnselectedPage::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_product(self, response):
        """Extract one ExpertScraperItem from a product page."""
        def extract_with_css(query):
            # BUG FIX: default='' guards against a missing node --
            # extract_first() would return None and .strip() would
            # raise AttributeError.
            return response.css(query).extract_first(default='').strip()

        item = ExpertScraperItem()
        # slice drops the fixed prefix before the model code
        item['model'] = extract_with_css('header strong::text')[12:100].strip()
        # slice drops the currency symbol at position 0
        item['price'] = extract_with_css('span.TotalPrice::text')[1:100].strip()
        yield item
Here is my pipeline (this has been updated to show errors corrected 21/01/2018)
import scrapy
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
class ScraperPipeline(object):
    """Store scraped items in MySQL through a Twisted adbapi connection pool."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    # BUG FIX: the post showed '#classmethod' -- without the real
    # decorator, Scrapy cannot build the pipeline from settings.
    @classmethod
    def from_settings(cls, settings):
        # NOTE(review): every MYSQL_* key must be present in settings.py;
        # a missing MYSQL_HOST is exactly what produces
        # "connect() argument 1 must be string, not None".
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            port=settings['MYSQL_PORT'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    # pipeline the default call
    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self._do_upinsert, item, spider)
        # Surface DB failures instead of losing them inside the Deferred.
        d.addErrback(lambda failure: spider.logger.error(failure))
        return item

    # Each row is updated or written to the database
    def _do_upinsert(self, conn, item, spider):
        """Insert one row; runs in adbapi's worker thread with cursor `conn`."""
        # BUG FIX: the original looped `for data in item`, which iterates
        # field NAMES (always truthy) -- check the field VALUES instead.
        valid = all(item.get(field) for field in item)
        if valid:
            result = conn.execute("""
                insert into pricing(model, price)
                values(%s, %s)
                """, (item['model'], item['price']))
            if result:
                print("added a model into db")
            else:
                print("failed insert into pricing")

Related

Scrapy append duplicate keys to mongodb list

I have made a scraper with scrapy to extract data from google patents based on chemicals search. I search chemicals like this: O1C(=CCCC1C)C and I extract the publication number from the search results and store them in a mongodb database. The structure of the collection is this:
{ "_id" : ObjectId("6123733f10bd1504a29a9c75"),
"chemical" : "O=C(NCC1N(CC)CCC1)C2=C(O)C(=CC(Cl)=C2OC)CC",
"publication_number" : ["EP3185946B1", "US10751533B2"]
}
The problem is this: if a chemical's search returns more than one page, MongoDB will store the same chemical twice, each time with different publication numbers. What I want to do is check whether the chemical already exists in MongoDB: if it does, I want to append the new publication numbers to the existing record; otherwise, I want to insert the chemical as a new document.
scraper.py
from pathlib import Path
import scrapy
from scrapy_splash import SplashRequest
from pattents.items import PattentsItem
# Project root (two directory levels above this module); used to locate
# the spiders/urls.txt input file in start_requests().
BASE_DIR = Path(__file__).resolve().parent.parent
class PattentLinksSpider(scrapy.Spider):
    """Search Google Patents for each chemical and collect publication numbers."""
    name = 'pattent_links'
    allowed_domains = ['patents.google.com']

    # Lua script for Splash: wait until the results container has rendered.
    script = '''
    function main(splash, args)
        splash.private_mode_enabled = false
        assert(splash:go(args.url))
        while not splash:select('#resultsContainer') do
            splash:wait(3)
        end
        splash:wait(4)
        return splash:html()
    end
    '''

    def start_requests(self):
        """Issue one Splash search request per chemical in spiders/urls.txt."""
        # '=' must be percent-encoded ('%3d') to survive inside the q= parameter.
        with open(BASE_DIR.joinpath('spiders/urls.txt'), "rt") as f:
            start_urls = [url.strip().replace('=', '%3d') for url in f.readlines()]
        for url in start_urls:
            yield SplashRequest(
                url='https://patents.google.com/?q=CL%3d' + url + '&page=0&num=100',
                callback=self.parse,
                endpoint='execute',
                args={'lua_source': self.script},
                meta={'page_number': 0, 'chemical': url},
            )

    def parse(self, response):
        """Yield an item per results page, then request the next page."""
        items = response.xpath('//search-result-item')
        if len(items) > 0:
            item = PattentsItem()
            # BUG FIX: XPath attribute tests use '@class' -- the posted
            # '[#class=...]' is a formatting artifact and matches nothing.
            pn = response.xpath('//span[@class="style-scope search-result-item"]/text()').getall()
            item['chemical'] = response.meta['chemical'].replace('%3d', '=')
            item['publication_number'] = pn
            yield item
            # Paginate: bump the page= query parameter for the same chemical.
            page_number = int(response.meta['page_number']) + 1
            yield SplashRequest(
                url=response.url.replace(f'&page={page_number - 1}', f'&page={page_number}'),
                callback=self.parse,
                endpoint='execute',
                args={'lua_source': self.script},
                meta={'chemical': response.meta['chemical'], 'page_number': page_number},
            )
pipelines.py
import pymongo
from itemadapter import ItemAdapter
class PattentsPipeline(object):
    """Persist scraped patent items to MongoDB, merging duplicate chemicals."""
    collection_name = 'items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    # BUG FIX: the post showed '#classmethod'; the decorator is required
    # so Scrapy can call from_crawler on the class.
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        """Upsert the item keyed on its chemical.

        $addToSet/$each appends only publication numbers not already
        stored, so a search that spans several pages produces ONE
        document per chemical instead of duplicates (the behavior the
        asker requested).
        """
        data = ItemAdapter(item).asdict()
        self.db[self.collection_name].update_one(
            {'chemical': data['chemical']},
            {'$addToSet': {'publication_number': {'$each': data['publication_number']}}},
            upsert=True,
        )
        return item

Yield items with scrapy

I'm having trouble with my spider, the way I have set it up doesn't seem to work. The spider should be able to scrape multiple pages (1,2,3), all on the same website. I'm not sure if I should do a for loop or an if/else statement so extract all the data?
I'm getting this message after I run it: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min).
Any help would be greatly appreciated!
Shown below are the code for the spider, items.py, and pipelines.py:
class abcSpider(scrapy.Spider):
    # NOTE(review): no start_urls (and no start_requests) are defined, so
    # the crawler has nothing to fetch -- hence "Crawled 0 pages ...
    # scraped 0 items". Also allowed_domains = [''] filters out every URL;
    # set the real domain or remove the attribute entirely.
    name = 'abc'
    # class-level page counter, mutated across requests
    page_number = 2
    allowed_domains = ['']

    def parse(self, response):
        """Extract deal fields from a listing page and follow pages 2..7."""
        items = folder1Item()
        deal_number_var = response.css(".mclbEl a::text").extract()
        # NOTE(review): '.#Content...' mixes a class dot with an id hash --
        # presumably '#ContentContainer1_...' was intended; verify selector.
        deal_type_var = response.css('.#ContentContainer1_ctl00_Content_ListCtrl1_LB1_VDTBL .mclbEl:nth-child(9)').css('::text').extract()
        items['deal_number_var'] = deal_number_var
        items['deal_type_var'] = deal_type_var
        yield items
        # next_page starts from '' because the site URL was redacted in the
        # post; the real base URL must be filled in for follow() to work.
        next_page = '' + str(abcSpider.page_number) + '/'
        if abcSpider.page_number < 8:
            abcSpider.page_number += 1
            yield response.follow(next_page, callback=self.parse)
This is my items.py page:
import scrapy
class folder1Item(scrapy.Item):
    """Container for one scraped deal record."""
    # deal identifier text scraped from the listing table
    deal_number_var = scrapy.Field()
    # deal type text (9th cell of the listing row)
    deal_type_var = scrapy.Field()
I would like to save the data as a .db file to import into sqlite3. It looks like this in my pipelines.py:
import sqlite3
class folder1Pipeline(object):
def __init__(self):
self.create_connection()
self.create_table()
def create_connection(self):
self.conn = sqlite3.connect("abc.db")
self.curr = self.conn.cursor()
def create_table(self):
self.curr.execute("""DROP TABLE IF EXISTS abc_tb""")
self.curr.execute("""create table abc_tb(deal_number_var text, deal_type_var text)""")
def process_item(self, items, spider):
self.store_db(items)
return items
def store_db(self,items):
self.curr.execute("""insert into abc_tb values (?,?,?)""" , (items['deal_number_var'][0], items['deal_type_var'][0]))
self.conn.commit()
Middleware.py code:
from scrapy.http import HtmlResponse
from selenium import webdriver
class JSMiddleware(object):
    """Downloader middleware that renders pages with PhantomJS so that
    JavaScript-generated content is present in the response body."""

    def process_request(self, request, spider):
        driver = webdriver.PhantomJS()
        try:
            driver.get(request.url)
            body = driver.page_source
            current_url = driver.current_url
        finally:
            # BUG FIX: the original never shut the driver down, leaking one
            # PhantomJS process per request.
            driver.quit()
        return HtmlResponse(current_url, body=body, encoding='utf-8', request=request)
I assume this is your entire code? If so: you did not define any start_urls. Furthermore you either have to set the allowed_domains correctly or remove the variable completely because right now you define that no url is allowed.

Scrapy Getting Start_Urls

Ok, going to keep this short, need to rush off for a meeting
I am trying to get the start urls in scrapy and no matter how i try, i can't seem to accomplish it. Here is my code(spider).
import scrapy
import csv
from scrapycrawler.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
class DmozSpider(CrawlSpider):
    """Crawl snipplr.com snippet pages and capture title plus source URL."""
    name = "dmoz"
    allowed_domains = ["snipplr.com"]

    def start_requests(self):
        # Snippet ids are sequential; request each view page directly.
        for i in range(1, 230):
            yield self.make_requests_from_url("http://www.snipplr.com/view/%d" % i)

    def make_requests_from_url(self, url):
        item = DmozItem()
        # assign url
        item['link'] = url
        request = Request(url, dont_filter=True)
        # set the meta['item'] to use the item in the next call back
        request.meta['item'] = item
        return request

    # Rules only apply before
    rules = (
        Rule(LxmlLinkExtractor(deny_domains=('http:\/\/www.snipplr.com\/snippet-not-found\/',)),
             callback="parse", follow=True),
    )

    def parse(self, response):
        sel = Selector(response)
        item = response.meta['item']
        # BUG FIX: XPath attribute tests use '@class' -- the posted
        # '[#class=...]' is a formatting artifact and matches nothing.
        item['title'] = sel.xpath('//div[@class="post"]/h1/text()').extract()
        # start_url
        item['link'] = response.url
        # BUG FIX: without returning the item, nothing reaches the pipeline.
        return item
I have tried every approach, and so far I get an "h" in the url column of my database.
This is my database :
import csv
from scrapy.exceptions import DropItem
from scrapy import log
import sys
import mysql.connector
class CsvWriterPipeline(object):
    """Insert scraped items into MySQL, skipping titles already stored."""

    def __init__(self):
        self.connection = mysql.connector.connect(host='localhost', user='ws', passwd='ps', db='ws')
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        # item['title'] is a list (xpath().extract()); compare on its first
        # element, passed as a one-element parameter tuple.
        self.cursor.execute("SELECT title,url FROM items WHERE title= %s",
                            (item['title'][0],))
        result = self.cursor.fetchone()
        if result:
            log.msg("Item already in database: %s" % item, level=log.DEBUG)
        else:
            # BUG FIX: item['link'] is a plain string, not a list --
            # item['link'][0] stored only its first character ('h').
            self.cursor.execute(
                "INSERT INTO items (title, url) VALUES (%s, %s)",
                (item['title'][0], item['link']))
            self.connection.commit()
            # BUG FIX: the original format string had no %s placeholder.
            log.msg("Item stored : %s" % item, level=log.DEBUG)
        return item

    def handle_error(self, e):
        log.err(e)
As u can see from here,
it is clearly working.
How would I get the start URL, or rather, how would I parse it? I believe the "h" means that the field is empty. The database is MySQL.
Thanks for your reading and for your help
Regards,
Charlie
item['link'], as opposed to item['title'], is just a string, not a list:
self.cursor.execute("INSERT INTO items (title, url) VALUES (%s, %s)",
(item['title'][0], item['link']))

How to get data from mysql and extract data from web using spider in scrapy

I have a spider and pipeline
and write a code to get extract data from web and insert to MySQL
Which is running
class AmazonAllDepartmentSpider(scrapy.Spider):
    """Scrape title/link/description for every Amazon department link."""
    name = "amazon"
    allowed_domains = ["amazon.com"]
    start_urls = [
        "http://www.amazon.com/gp/site-directory/ref=nav_sad/187-3757581-3331414"
    ]

    def parse(self, response):
        # BUG FIX: XPath attribute tests use '@' -- the posted '#class'
        # and 'a/#href' are formatting artifacts and match nothing.
        for sel in response.xpath('//ul[@class="nav_cat_links"]/li'):
            item = AmazoncrawlerItem()
            # pop() removes [u''] tag from
            item['title'] = sel.xpath('a/text()').extract().pop()
            item['link'] = sel.xpath('a/@href').extract().pop()
            item['desc'] = sel.xpath('text()').extract()
            yield item
and
class AmazoncrawlerPipeline(object):
    """Insert scraped department titles/links into the amazon_project MySQL DB."""
    # NOTE(review): credentials hard-coded in source; move them to settings.
    host = 'qwerty.com'
    user = 'qwerty'
    password = 'qwerty123'
    db = 'amazon_project'

    def __init__(self):
        self.connection = MySQLdb.connect(self.host, self.user, self.password, self.db)
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""INSERT INTO amazon_project.ProductDepartment (ProductTitle,ProductDepartmentLilnk)
            VALUES (%s,%s)""",
                                (item['title'], 'amazon.com' + str(item.get('link'))))
            self.connection.commit()
        # BUG FIX: Python 3 exception syntax ("except Error, e" and the
        # print statement are Python-2-only and fail to compile on Py3).
        except MySQLdb.Error as e:
            print("Error %d: %s" % (e.args[0], e.args[1]))
        return item
Now I want to get those data (which is a link of URL)
And again call spider to extract data from web
please help me how to do it Thanks
It should be solved on the spider level.
To follow the links, you can yield a Request after yielding an item instance:
def parse(self, response):
    """Yield one item per department link, then crawl the link itself."""
    # BUG FIX: XPath attribute tests use '@' -- the posted '#class' and
    # 'a/#href' are formatting artifacts and match nothing.
    for sel in response.xpath('//ul[@class="nav_cat_links"]/li'):
        item = AmazoncrawlerItem()
        item['title'] = sel.xpath('a/text()').extract().pop()
        item['link'] = sel.xpath('a/@href').extract().pop()
        item['desc'] = sel.xpath('text()').extract()
        yield item
        # Follow the extracted link to scrape the department page.
        yield Request(item['link'], callback=self.parse_link)
Alternatively, you can change the tactics and switch to Link Extractors.
UPD (after discussion in comments):
If you have the links in the database already, you would need to start another spider, read the links from the database in start_requests() and yield requests:
from scrapy.http import Request
class AmazonSpider(scrapy.Spider):
    """Answer's sketch: a spider that seeds its requests from links
    previously stored in MySQL."""
    name = "amazon"
    allowed_domains = ["amazon.com"]

    def start_requests(self):
        # Read the stored department links and crawl each one.
        connection = MySQLdb.connect(<connection params here>)  # placeholder: substitute real connection parameters
        cursor = connection.cursor()
        cursor.execute("SELECT ProductDepartmentLilnk FROM amazon_project.ProductDepartment")
        links = cursor.fetchall()
        # NOTE(review): fetchall() returns row tuples; presumably the URL
        # string Request() needs is link[0] -- confirm before running.
        for link in links:
            yield Request(link, callback=self.parse)
        cursor.close()
        ...

Scrapy pipeline.py not inserting items to MYSQL from spider

I am using scrapy for scraping news headlines and I am a rookie for scrapy and scraping as a whole. I am having huge issues for a few days now pipelining my scraped data into my SQL db.
I have 2 classes in my pipelines.py file one for inserting items to Database and another for backing up scraped data into json file for front end web development reasons.
This is the code for my spider
- its extracting news headlines from the start_urls
- it picks up this data as strings using extract() and later on looping through all of them and using strip() to remove white spaces for better formatting
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item
from Aljazeera.items import AljazeeraItem
from datetime import date, datetime
class AljazeeraSpider(Spider):
    """Scrape news headlines and summaries from Al Jazeera section pages."""
    name = "aljazeera"
    allowed_domains = ["aljazeera.com"]
    start_urls = [
        "http://www.aljazeera.com/news/europe/",
        "http://www.aljazeera.com/news/middleeast/",
        "http://www.aljazeera.com/news/asia/",
        "http://www.aljazeera.com/news/asia-pacific/",
        "http://www.aljazeera.com/news/americas/",
        "http://www.aljazeera.com/news/africa/",
        "http://blogs.aljazeera.com/"
    ]

    def parse(self, response):
        sel = Selector(response)
        # BUG FIX: XPath attribute tests use '@' -- the posted '#valign'
        # and '#class' are formatting artifacts and match nothing.
        sites = sel.xpath('//td[@valign="bottom"]')
        contents = sel.xpath('//div[@class="indexSummaryText"]')
        items = []
        for site, content in zip(sites, contents):
            item = AljazeeraItem()
            item['headline'] = site.xpath('div[3]/text()').extract()
            item['content'] = site.xpath('div/a/text()').extract()
            item['date'] = str(date.today())
            # NOTE(review): this zip reads item['content'] into 'headline'
            # and vice versa, so the two fields end up swapped and
            # stripped -- confirm the mapping against the page layout.
            for headline, content in zip(item['content'], item['headline']):
                item['headline'] = headline.strip()
                item['content'] = content.strip()
            items.append(item)
        return items
The Code for my pipeline.py is as follows :
import sys
import MySQLdb
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request
import json
import os.path
class SQLStore(object):
    """Insert each scraped headline into the aj_db.scraped_data table."""

    def __init__(self):
        # Local MySQL connection; utf8 + use_unicode so text round-trips.
        self.conn = MySQLdb.connect(user='root', passwd='', db='aj_db', host='localhost', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    # log data to json file
    def process_item(self, item, spider):
        """Scrapy entry point: persist one item, printing MySQL errors."""
        try:
            self.cursor.execute("""INSERT INTO scraped_data(headlines, contents, dates) VALUES (%s, %s, %s)""", (item['headline'].encode('utf-8'), item['content'].encode('utf-8'), item['date'].encode('utf-8')))
            self.conn.commit()
        # BUG FIX: Python 3 exception syntax ("except Error, e" and the
        # print statement are Python-2-only and fail to compile on Py3).
        except MySQLdb.Error as e:
            print("Error %d: %s" % (e.args[0], e.args[1]))
        return item
#log runs into back file
class JsonWriterPipeline(object):
    """Back up every scraped item as a JSON line for front-end use."""

    def __init__(self):
        self.file = open('backDataOfScrapes.json', "w")

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write("item === " + line)
        return item

    def close_spider(self, spider):
        # BUG FIX: the file was never closed, so buffered lines could be
        # lost when the crawl ends.
        self.file.close()
And the settings.py is as follows :
# Scrapy project settings for the Aljazeera bot.
BOT_NAME = 'Aljazeera'
SPIDER_MODULES = ['Aljazeera.spiders']
NEWSPIDER_MODULE = 'Aljazeera.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Aljazeera (+http://www.yourdomain.com)'
# NOTE(review): both pipelines share order value 300; give them distinct
# values (e.g. 300 and 800) so the execution order is explicit.
ITEM_PIPELINES = {
    'Aljazeera.pipelines.JsonWriterPipeline': 300,
    'Aljazeera.pipelines.SQLStore': 300,
}
My sql setting are all ok. and after running scrapy crawl aljazeera it works and even outputs the items in json format as follows :
item === {"headline": "Turkey court says Twitter ban violates rights", "content": "Although ruling by Turkey's highest court is binding, it is unclear whether the government will overturn the ban.", "date": "2014-04-02"}
I really don't know and can't see what I am missing here. I would really appreciate it if you could help me out.
Thanks for your time,
Your indentation is wrong in the SQLStore pipeline. I've tested with correct indentation and its working fine. Copy the below and it should be perfect.
class SQLStore(object):
    """Answer's correctly-indented version of the asker's SQLStore pipeline:
    inserts each scraped headline row into MySQL."""
    def __init__(self):
        # Local MySQL connection; utf8 + use_unicode so text round-trips.
        self.conn = MySQLdb.connect(user='root', passwd='', db='aj_db', host='localhost', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()
    #log data to json file
    def process_item(self, item, spider):
        # NOTE(review): Python 2 syntax below ("except Error, e" and the
        # print statement) -- this block will not compile on Python 3.
        try:
            self.cursor.execute("""INSERT INTO scraped_data(headlines, contents, dates) VALUES (%s, %s, %s)""", (item['headline'].encode('utf-8'), item['content'].encode('utf-8'), item['date'].encode('utf-8')))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item

Categories

Resources