OK, I'll keep this short as I need to rush off for a meeting.
I am trying to get the start URLs in Scrapy and, no matter what I try, I can't seem to accomplish it. Here is my code (spider):
import scrapy
import csv
from scrapycrawler.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
class DmozSpider(CrawlSpider):
name = "dmoz"
allowed_domains = ["snipplr.com"]
def start_requests(self):
for i in range(1, 230):
yield self.make_requests_from_url("http://www.snipplr.com/view/%d" % i)
def make_requests_from_url(self, url):
item = DmozItem()
# assign url
item['link'] = url
request = Request(url, dont_filter=True)
# set the meta['item'] to use the item in the next call back
request.meta['item'] = item
return request
#Rules only apply before
rules = (
Rule(LxmlLinkExtractor(deny_domains=('http:\/\/www.snipplr.com\/snippet-not-found\/',)), callback="parse", follow=True),
)
def parse(self, response):
sel = Selector(response)
item = response.meta['item']
item['title'] = sel.xpath('//div[@class="post"]/h1/text()').extract()
#start_url
item['link'] = response.url
I have tried everything and, so far, all I get is an "h" in the url column of my database.
This is my database pipeline:
import csv
from scrapy.exceptions import DropItem
from scrapy import log
import sys
import mysql.connector
class CsvWriterPipeline(object):
def __init__(self):
self.connection = mysql.connector.connect(host='localhost', user='ws', passwd='ps', db='ws')
self.cursor = self.connection.cursor()
def process_item(self, item, spider):
self.cursor.execute("SELECT title,url FROM items WHERE title= %s", item['title'])
result = self.cursor.fetchone()
if result:
log.msg("Item already in database: %s" % item, level=log.DEBUG)
else:
self.cursor.execute(
"INSERT INTO items (title, url) VALUES (%s, %s)",
(item['title'][0], item['link'][0]))
self.connection.commit()
log.msg("Item stored : " % item, level=log.DEBUG)
return item
def handle_error(self, e):
log.err(e)
As you can see from here,
it is clearly working.
How would I get the start URL, or rather, how would I parse it? I believe the "h" means that the field is empty. The database is MySQL.
Thanks for reading and for your help.
Regards,
Charlie
item['link'], as opposed to item['title'], is just a string, not a list, so item['link'][0] evaluates to its first character - that is where the "h" in your url column comes from:
self.cursor.execute("INSERT INTO items (title, url) VALUES (%s, %s)",
(item['title'][0], item['link']))
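A tiny illustration of the difference, with made-up values:

# what sel.xpath(...).extract() returns - a list of strings
title = [u'My snippet title']
# what make_requests_from_url() assigns to item['link'] - a plain string
link = 'http://www.snipplr.com/view/1'

print title[0]  # u'My snippet title'
print link[0]   # 'h' - the stray character that ended up in the url column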
Related
I'm trying to scrape a website with Scrapy and store the information to a MySQL database.
My spider works on its own. When I use it alongside a pipeline I get two errors:
[twisted] CRITICAL: Unhandled error in Deferred:
TypeError: connect() argument 1 must be string, not None
I'm new to python and scrapy.
Here is my Spider:
from scrapy import Spider
#from scrapy.selector import Selector
from scraper.items import ExpertScraperItem
class expertSpider(Spider):
name = 'expert'
start_urls = [
'https://www.expert.ie/products/home-appliances/laundry',
]
def parse(self, response):
# follow links to product pages
for href in response.css('a.product-list-link::attr(href)'):
yield response.follow(href, self.parse_product)
# follow pagination links
for href in response.css('a.UnselectedPage::attr(href)'):
yield response.follow(href, self.parse)
def parse_product(self, response):
def extract_with_css(query):
return response.css(query).extract_first().strip()
item = ExpertScraperItem()
item['model'] = extract_with_css('header strong::text')[12:100].strip()
item['price'] = extract_with_css('span.TotalPrice::text')[1:100].strip()
yield item
Here is my pipeline (this has been updated to show errors corrected 21/01/2018)
import scrapy
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
class ScraperPipeline(object):
def __init__(self, dbpool):
self.dbpool = dbpool
@classmethod
def from_settings(cls, settings):
dbargs = dict(
host = settings['MYSQL_HOST'],
db = settings['MYSQL_DBNAME'],
port = settings['MYSQL_PORT'],
user = settings['MYSQL_USER'],
passwd = settings['MYSQL_PASSWD'],
charset = 'utf8',
cursorclass = MySQLdb.cursors.DictCursor,
use_unicode = True,
)
dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
return cls(dbpool)
# the pipeline's default call for each item
def process_item(self, item, spider):
d = self.dbpool.runInteraction(self._do_upinsert, item, spider)
return item
#Each row is updated or written to the database
def _do_upinsert(self, conn, item, spider):
valid = True
for data in item:
if not data:
valid = False
# raise DropItem("Missing {0}!".format(data))
# print "Missing data"
if valid:
result = conn.execute("""
insert into pricing(model, price)
values(%s, %s)
""", (item['model'], item['price']))
if result:
print "added a model into db"
else:
print "failed insert into pricing"
I have a spider and a pipeline,
and wrote code to extract data from the web and insert it into MySQL.
It is all running:
class AmazonAllDepartmentSpider(scrapy.Spider):
name = "amazon"
allowed_domains = ["amazon.com"]
start_urls = [
"http://www.amazon.com/gp/site-directory/ref=nav_sad/187-3757581-3331414"
]
def parse(self, response):
for sel in response.xpath('//ul[@class="nav_cat_links"]/li'):
item = AmazoncrawlerItem()
# pop() takes the single extracted string out of the list
item['title'] = sel.xpath('a/text()').extract().pop()
item['link'] = sel.xpath('a/@href').extract().pop()
item['desc'] = sel.xpath('text()').extract()
yield item
and
class AmazoncrawlerPipeline(object):
host = 'qwerty.com'
user = 'qwerty'
password = 'qwerty123'
db = 'amazon_project'
def __init__(self):
self.connection = MySQLdb.connect(self.host, self.user, self.password, self.db)
self.cursor = self.connection.cursor()
def process_item(self, item, spider):
try:
self.cursor.execute("""INSERT INTO amazon_project.ProductDepartment (ProductTitle,ProductDepartmentLilnk)
VALUES (%s,%s)""",
(item['title'],'amazon.com' + str(item.get('link'))))
self.connection.commit()
except MySQLdb.Error, e:
print "Error %d: %s" % (e.args[0], e.args[1])
return item
Now I want to take that data (the URL links)
and call a spider again to extract data from those pages.
Please help me figure out how to do it. Thanks.
It should be solved on the spider level.
To follow the links, you can yield a Request after yielding an item instance:
def parse(self, response):
for sel in response.xpath('//ul[@class="nav_cat_links"]/li'):
item = AmazoncrawlerItem()
item['title'] = sel.xpath('a/text()').extract().pop()
item['link'] = sel.xpath('a/@href').extract().pop()
item['desc'] = sel.xpath('text()').extract()
yield item
yield Request(item['link'], callback=self.parse_link)
Alternatively, you can change the tactics and switch to Link Extractors.
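A hedged sketch of that alternative, reusing the contrib imports seen earlier in this document (spider name, item import path and callback are illustrative, not taken from the question):

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from amazoncrawler.items import AmazoncrawlerItem  # hypothetical project path

class AmazonCrawlSpider(CrawlSpider):
    name = "amazon_crawl"
    allowed_domains = ["amazon.com"]
    start_urls = ["http://www.amazon.com/gp/site-directory"]

    # follow every link inside the department list and parse each linked page
    rules = (
        Rule(LxmlLinkExtractor(restrict_xpaths='//ul[@class="nav_cat_links"]'),
             callback='parse_department', follow=True),
    )

    def parse_department(self, response):
        item = AmazoncrawlerItem()
        item['link'] = response.url
        # ... fill title/desc from the response here
        yield item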
UPD (after discussion in comments):
If you have the links in the database already, you would need to start another spider, read the links from the database in start_requests() and yield requests:
from scrapy.http import Request
class AmazonSpider(scrapy.Spider):
name = "amazon"
allowed_domains = ["amazon.com"]
def start_requests(self):
connection = MySQLdb.connect(<connection params here>)
cursor = connection.cursor()
cursor.execute("SELECT ProductDepartmentLilnk FROM amazon_project.ProductDepartment")
links = cursor.fetchall()
cursor.close()
for link in links:
    # each fetched row is a one-column tuple, so unpack the url
    yield Request(link[0], callback=self.parse)
...
I am using Scrapy to scrape news headlines, and I am a rookie at Scrapy and scraping as a whole. For a few days now I have been having huge issues pipelining my scraped data into my SQL db.
I have 2 classes in my pipelines.py file: one for inserting items into the database, and another for backing up the scraped data into a JSON file for front-end web development reasons.
This is the code for my spider:
- it extracts news headlines from the start_urls
- it picks up this data as strings using extract(), then loops through all of them and uses strip() to remove whitespace for better formatting
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item
from Aljazeera.items import AljazeeraItem
from datetime import date, datetime
class AljazeeraSpider(Spider):
name = "aljazeera"
allowed_domains = ["aljazeera.com"]
start_urls = [
"http://www.aljazeera.com/news/europe/",
"http://www.aljazeera.com/news/middleeast/",
"http://www.aljazeera.com/news/asia/",
"http://www.aljazeera.com/news/asia-pacific/",
"http://www.aljazeera.com/news/americas/",
"http://www.aljazeera.com/news/africa/",
"http://blogs.aljazeera.com/"
]
def parse(self, response):
sel = Selector(response)
sites = sel.xpath('//td[@valign="bottom"]')
contents = sel.xpath('//div[@class="indexSummaryText"]')
items = []
for site,content in zip(sites, contents):
item = AljazeeraItem()
item['headline'] = site.xpath('div[3]/text()').extract()
item['content'] = site.xpath('div/a/text()').extract()
item['date'] = str(date.today())
for headline, content in zip(item['content'], item['headline']):
item['headline'] = headline.strip()
item['content'] = content.strip()
items.append(item)
return items
The code for my pipelines.py is as follows:
import sys
import MySQLdb
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request
import json
import os.path
class SQLStore(object):
def __init__(self):
self.conn = MySQLdb.connect(user='root', passwd='', db='aj_db', host='localhost', charset="utf8", use_unicode=True)
self.cursor = self.conn.cursor()
# write scraped data to the database
def process_item(self, item, spider):
try:
self.cursor.execute("""INSERT INTO scraped_data(headlines, contents, dates) VALUES (%s, %s, %s)""", (item['headline'].encode('utf-8'), item['content'].encode('utf-8'), item['date'].encode('utf-8')))
self.conn.commit()
except MySQLdb.Error, e:
print "Error %d: %s" % (e.args[0], e.args[1])
return item
# log runs into a backup file
class JsonWriterPipeline(object):
def __init__(self):
self.file = open('backDataOfScrapes.json', "w")
def process_item(self, item, spider):
line = json.dumps(dict(item)) + "\n"
self.file.write("item === " + line)
return item
And the settings.py is as follows:
BOT_NAME = 'Aljazeera'
SPIDER_MODULES = ['Aljazeera.spiders']
NEWSPIDER_MODULE = 'Aljazeera.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Aljazeera (+http://www.yourdomain.com)'
ITEM_PIPELINES = {
'Aljazeera.pipelines.JsonWriterPipeline': 300,
'Aljazeera.pipelines.SQLStore': 300,
}
My SQL settings are all OK, and after running scrapy crawl aljazeera it works and even outputs the items in JSON format as follows:
item === {"headline": "Turkey court says Twitter ban violates rights", "content": "Although ruling by Turkey's highest court is binding, it is unclear whether the government will overturn the ban.", "date": "2014-04-02"}
I really don't know, or can't see, what I am missing here. I would really appreciate it if you guys could help me out.
Thanks for your time,
Your indentation is wrong in the SQLStore pipeline. I've tested with the correct indentation and it's working fine. Copy the below and it should be perfect.
class SQLStore(object):
def __init__(self):
self.conn = MySQLdb.connect(user='root', passwd='', db='aj_db', host='localhost', charset="utf8", use_unicode=True)
self.cursor = self.conn.cursor()
# write scraped data to the database
def process_item(self, item, spider):
try:
self.cursor.execute("""INSERT INTO scraped_data(headlines, contents, dates) VALUES (%s, %s, %s)""", (item['headline'].encode('utf-8'), item['content'].encode('utf-8'), item['date'].encode('utf-8')))
self.conn.commit()
except MySQLdb.Error, e:
print "Error %d: %s" % (e.args[0], e.args[1])
return item
I am on this page: http://www.metacritic.com/browse/games/title/ps4/a?view=condensed
And I want to go into each item and get the Developer and Genre, but my code doesn't seem to work.
For example, I want to go into this page: http://www.metacritic.com/game/playstation-4/angry-birds-star-wars
Then leave it and continue through the rest, doing the same and adding to a database. What can I change in my code to make it work? Right now the dev and genre columns in the database are null, but it gets the rest of the data, so it's as if it never enters parseGame.
I also added print statements inside parseGame and none of them print.
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from metacritic.items import MetacriticItem
import MySQLdb
import re
from string import lowercase
class MetacriticSpider(BaseSpider):
def start_requests(self):
#iterate through ps4 pages
for c in lowercase:
for i in range(self.max_id):
yield Request('http://www.metacritic.com/browse/games/title/ps4/{0}?page={1}'.format(c, i), callback = self.parseps4)
#gets the developer and genre of a game
def parseGame(self, response):
print("Here")
item = response.meta['item']
db1 = MySQLdb.connect("localhost", "root", "andy", "metacritic")
cursor = db1.cursor()
hxs = HtmlXPathSelector(response)
sites = hxs.select('//div[@class="product_wrap"]')
items = []
item['dev'] = site.xpath('.//span[contains(@class, "summary_detail developer")]/span[1]/text()').extract()
item['genre'] = site.xpath('.//span[contains(@class, "summary_detail product_genre")]/span[1]/text()').extract()
cursor.execute("INSERT INTO ps4 (dev, genre) VALUES (%s,%s)",[item['dev'][0],item['genre'][0]])
items.append(item)
print item['dev']
print item['genre']
def parseps4(self, response):
#some local variables
db1 = MySQLdb.connect("localhost", "root", "andy", "metacritic")
cursor = db1.cursor()
hxs = HtmlXPathSelector(response)
sites = hxs.select('//div[@class="product_wrap"]')
items = []
#iterates through each site
for site in sites:
with db1:
item = MetacriticItem()
#sets the item
item['title'] = site.xpath('.//div[contains(@class, "basic_stat product_title")]/a/text()').extract()
item['cscore'] = site.xpath('.//div[contains(@class, "basic_stat product_score brief_metascore")]/div[1]/text()').extract()
item['uscore'] = site.xpath('.//div/ul/li/span[contains(@class, "data textscore")]/text()').extract()
item['release'] = site.xpath('.//li[contains(@class, "stat release_date full_release_date")]/span[2]/text()').extract()
#some processing to check if there is a score attached, if there is, it adds it to the database
if ("tbd" in item['cscore'][0] and "tbd" not in item['uscore'][0]) or ("tbd" not in item['cscore'][0] and "tbd" in item['uscore'][0]) or ("tbd" not in item['cscore'][0] and "tbd" not in item['uscore'][0]):
cursor.execute("INSERT INTO ps4 (title, criticalscore, userscore, releasedate) VALUES (%s,%s,%s, %s)",[(' '.join(item['title'][0].split())).replace("(PS4)","",1),item['cscore'][0],item['uscore'][0],item['release'][0]])
items.append(item)
itemLink = site.xpath('.//div[contains(@class, "basic_stat product_title")]/a/@href').extract()
req = Request('http://www.metacritic.com' + itemLink[0], callback = self.parseGame)
req.meta['item'] = item
Several problems in the code:
- the meta argument should contain a dictionary: {'item': item}
- the Request you build at the end of parseps4 is never yielded, which is why parseGame is never reached and your print statements never run
- HtmlXPathSelector is deprecated - use Selector instead
- I think you shouldn't do MySQL inserts inside the spider - use a database pipeline instead: Writing items to a MySQL database in Scrapy
- you need to get the first item of the extract() call and do strip() on it (this will help to have strings in the Fields, not lists, and without leading and trailing spaces and newlines)
Here's the code without the MySQL-related calls:
from string import lowercase
from scrapy.item import Field, Item
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector, Selector
from metacritic.items import MetacriticItem
class MetacriticSpider(BaseSpider):
name = 'metacritic'
allowed_domains = ['metacritic.com']
max_id = 1 # your max_id value goes here!!!
def start_requests(self):
for c in lowercase:
for i in range(self.max_id):
yield Request('http://www.metacritic.com/browse/games/title/ps4/{0}?page={1}'.format(c, i), callback=self.parseps4)
def parseGame(self, response):
item = response.meta['item']
hxs = HtmlXPathSelector(response)
site = hxs.select('//div[@class="product_wrap"]')
# get additional data!!!
yield item
def parseps4(self, response):
hxs = Selector(response)
sites = hxs.select('//div[@class="product_wrap"]')
for site in sites:
item = MetacriticItem()
item['title'] = site.xpath('.//div[contains(@class, "basic_stat product_title")]/a/text()').extract()[0].strip()
item['cscore'] = site.xpath('.//div[contains(@class, "basic_stat product_score brief_metascore")]/div[1]/text()').extract()[0].strip()
item['uscore'] = site.xpath('.//div/ul/li/span[contains(@class, "data textscore")]/text()').extract()[0].strip()
item['release'] = site.xpath('.//li[contains(@class, "stat release_date full_release_date")]/span[2]/text()').extract()[0].strip()
link = site.xpath('.//div[contains(@class, "basic_stat product_title")]/a/@href').extract()[0]
yield Request('http://www.metacritic.com' + link, meta={'item': item}, callback=self.parseGame)
It works for me - I see the items yielded from parseGame() on the console.
Make sure it yields items first, then see the !!! comments and fill in those lines accordingly.
After that, if you see items on the console, try creating a database pipeline to write the items to MySQL.
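A minimal sketch of such a pipeline, assuming the metacritic database and ps4 table from the question (credentials are placeholders; enable it via ITEM_PIPELINES in settings.py):

import MySQLdb

class MetacriticPipeline(object):  # hypothetical name

    def __init__(self):
        self.conn = MySQLdb.connect("localhost", "root", "password", "metacritic")
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # parameterized query - the driver handles quoting and escaping
        self.cursor.execute(
            "INSERT INTO ps4 (title, criticalscore, userscore, releasedate, dev, genre) "
            "VALUES (%s, %s, %s, %s, %s, %s)",
            (item['title'], item['cscore'], item['uscore'],
             item['release'], item.get('dev'), item.get('genre')))
        self.conn.commit()
        return item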
If you look here, I could not get two different spiders to automatically add the results to a MySQL database. Now I've added an if and an elif statement and they work, but they miss out some results: previously there were 52 rows in the Bath table, now there are only 41; Bristol used to have 154, now only 141. I cannot think why the results are not the same.
Pipelines.py
import sys
import MySQLdb
import MySQLdb.cursors
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request
class TestPipeline(object):
def __init__(self):
self.conn = MySQLdb.connect(
user='user',
passwd='pwd',
db='db',
host='host',
charset='utf8',
use_unicode=True
)
self.cursor = self.conn.cursor()
def process_item(self, item, spider):
try:
if 'BristolQualification' in item:
self.cursor.execute("""INSERT INTO Bristol(BristolCountry, BristolQualification) VALUES ('{0}', '{1}')""".format(item['BristolCountry'], "".join([s.encode('utf8') for s in item['BristolQualification']])))
elif 'BathQualification' in item:
self.cursor.execute("""INSERT INTO Bath(BathCountry, BathQualification) VALUES ('{0}', '{1}')""".format(item['BathCountry'], "".join([s.encode('utf8') for s in item['BathQualification']])))
self.conn.commit()
return item
except MySQLdb.Error as e:
print "Error %d: %s" % (e.args[0], e.args[1])
Items.py
from scrapy.item import Item, Field
class QualificationItem(Item):
BristolQualification = Field()
BristolCountry = Field()
BathQualification = Field()
BathCountry = Field()
Bristol.py
from scrapy.spider import BaseSpider
from project.items import QualificationItem
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from urlparse import urljoin
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'
class recursiveSpider(BaseSpider):
name = 'bristol'
allowed_domains = ['bristol.ac.uk/']
start_urls = ['http://www.bristol.ac.uk/international/countries/']
def parse(self, response):
hxs = HtmlXPathSelector(response)
xpath = '//*[@id="all-countries"]/li/ul/li/a/@href'
a_of_the_link = '//*[@id="all-countries"]/li/ul/li/a/text()'
for text, link in zip(hxs.select(a_of_the_link).extract(), hxs.select(xpath).extract()):
yield Request(urljoin(response.url, link),
meta={'a_of_the_link': text},
headers={'User-Agent': USER_AGENT},
callback=self.parse_linkpage,
dont_filter=True)
def parse_linkpage(self, response):
hxs = HtmlXPathSelector(response)
item = QualificationItem()
xpath = """
//h2[normalize-space(.)="Entry requirements for undergraduate courses"]
/following-sibling::p[not(preceding-sibling::h2[normalize-space(.)!="Entry requirements for undergraduate courses"])]
"""
item['BristolQualification'] = hxs.select(xpath).extract()[1:]
item['BristolCountry'] = response.meta['a_of_the_link']
return item
If you look here, a user did try to fix the problem but was unsuccessful, and I haven't heard from him since.
'These errors were caused by unescaped single quotes in the BristolQualification item field (and presumably the Bath spider suffers from the same problem) causing havoc (such as d'Etudes in the snippet below):'
This is what he thought the problem was.
Can anyone see where the problem is?
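If unescaped quotes are indeed the cause, the usual fix is a parameterized query, which lets the driver handle the quoting instead of building the SQL with str.format(). A hedged sketch of how process_item could be rewritten, keeping the table and field names from the question:

def process_item(self, item, spider):
    try:
        if 'BristolQualification' in item:
            # the driver escapes quotes such as the one in d'Etudes
            self.cursor.execute(
                "INSERT INTO Bristol (BristolCountry, BristolQualification) VALUES (%s, %s)",
                (item['BristolCountry'], "".join(item['BristolQualification'])))
        elif 'BathQualification' in item:
            self.cursor.execute(
                "INSERT INTO Bath (BathCountry, BathQualification) VALUES (%s, %s)",
                (item['BathCountry'], "".join(item['BathQualification'])))
        self.conn.commit()
        return item
    except MySQLdb.Error as e:
        print "Error %d: %s" % (e.args[0], e.args[1])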