Web spider not returning all results - python

If you look here, I could not get two different spiders to add the results to a MySQL database automatically. Now I've added an if and elif statement and they work, but they miss out some results: previously there were 52 rows in the Bath table, now there are only 41, and Bristol used to have 154 but now has only 141. I cannot think why the results are not the same.
Pipelines.py
import sys
import MySQLdb
import MySQLdb.cursors
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request
class TestPipeline(object):

    def __init__(self):
        self.conn = MySQLdb.connect(
            user='user',
            passwd='pwd',
            db='db',
            host='host',
            charset='utf8',
            use_unicode=True
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            if 'BristolQualification' in item:
                self.cursor.execute("""INSERT INTO Bristol(BristolCountry, BristolQualification) VALUES ('{0}', '{1}')""".format(item['BristolCountry'], "".join([s.encode('utf8') for s in item['BristolQualification']])))
            elif 'BathQualification' in item:
                self.cursor.execute("""INSERT INTO Bath(BathCountry, BathQualification) VALUES ('{0}', '{1}')""".format(item['BathCountry'], "".join([s.encode('utf8') for s in item['BathQualification']])))
            self.conn.commit()
            return item
        except MySQLdb.Error as e:
            print "Error %d: %s" % (e.args[0], e.args[1])
Items.py
from scrapy.item import Item, Field
class QualificationItem(Item):
    BristolQualification = Field()
    BristolCountry = Field()
    BathQualification = Field()
    BathCountry = Field()
Bristol.py
from scrapy.spider import BaseSpider
from project.items import QualificationItem
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from urlparse import urljoin
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'
class recursiveSpider(BaseSpider):
    name = 'bristol'
    allowed_domains = ['bristol.ac.uk/']
    start_urls = ['http://www.bristol.ac.uk/international/countries/']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        xpath = '//*[@id="all-countries"]/li/ul/li/a/@href'
        a_of_the_link = '//*[@id="all-countries"]/li/ul/li/a/text()'
        for text, link in zip(hxs.select(a_of_the_link).extract(), hxs.select(xpath).extract()):
            yield Request(urljoin(response.url, link),
                          meta={'a_of_the_link': text},
                          headers={'User-Agent': USER_AGENT},
                          callback=self.parse_linkpage,
                          dont_filter=True)

    def parse_linkpage(self, response):
        hxs = HtmlXPathSelector(response)
        item = QualificationItem()
        xpath = """
            //h2[normalize-space(.)="Entry requirements for undergraduate courses"]
            /following-sibling::p[not(preceding-sibling::h2[normalize-space(.)!="Entry requirements for undergraduate courses"])]
            """
        item['BristolQualification'] = hxs.select(xpath).extract()[1:]
        item['BristolCountry'] = response.meta['a_of_the_link']
        return item
If you look here, a user did try to fix the problem but was unsuccessful, and I haven't heard from him since.
'These errors were caused by unescaped single quotes in the BristolQualification item field (and presumably the Bath spider suffers from the same problem) causing havoc (such as d'Etudes in the snippet below):'
This is what he thought the problem was.
Can anyone see where the problem is?
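For what it's worth, here is a minimal sketch of process_item using query parameters instead of string formatting, so MySQLdb escapes apostrophes such as d'Etudes itself. It assumes the same tables and item fields as above and is only an illustration, not the original code:

    # Sketch only: parameterised INSERTs let MySQLdb handle the quoting,
    # so values containing single quotes no longer break the statement.
    def process_item(self, item, spider):
        try:
            if 'BristolQualification' in item:
                self.cursor.execute(
                    "INSERT INTO Bristol(BristolCountry, BristolQualification) VALUES (%s, %s)",
                    (item['BristolCountry'],
                     "".join(item['BristolQualification'])))
            elif 'BathQualification' in item:
                self.cursor.execute(
                    "INSERT INTO Bath(BathCountry, BathQualification) VALUES (%s, %s)",
                    (item['BathCountry'],
                     "".join(item['BathQualification'])))
            self.conn.commit()
        except MySQLdb.Error as e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        # return the item outside the try so it still reaches any other pipeline
        # even when an INSERT fails
        return item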

Related

Scrapy response uniform blank rows making it impossible to format response output

I want to remove the [ ] brackets Scrapy adds to all its output. To do this you simply add [0] at the end of an XPath statement as follows:
'a[@class="question-hyperlink"]/text()').extract()[0]
This solves the [ ] problem in some cases, but in other cases Scrapy returns every second row of output as blank, so the moment it gets to the second row when using [0] I'm given the error:
IndexError: list index out of range
How can I prevent Scrapy from creating blank rows? It seems like this is a common problem, but everyone else hits it when exporting to CSV, while for me it happens with the Scrapy response before exporting to CSV.
Items.py:
import scrapy
from scrapy.item import Item, Field
class QuestionItem(Item):
    title = Field()
    url = Field()

class PopularityItem(Item):
    votes = Field()
    answers = Field()
    views = Field()

class ModifiedItem(Item):
    lastModified = Field()
    modName = Field()
The spider that doesn't output every second row as blank and thus works with [0]:
from scrapy import Spider
from scrapy.selector import Selector
from stack.items import QuestionItem
class QuestionSpider(Spider):
    name = "questions"
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "http://stackoverflow.com/questions?pagesize=50&sort=newest",
    ]

    def parse(self, response):
        questions = Selector(response).xpath('//div[@class="summary"]/h3')
        for question in questions:
            item = QuestionItem()
            item['title'] = question.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            item['url'] = question.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]
            yield item
The spider that gives every second row of output as blank:
from scrapy import Spider
from scrapy.selector import Selector
from stack.items import PopularityItem
class PopularitySpider(Spider):
    name = "popularity"
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "https://stackoverflow.com/",
    ]

    def parse(self, response):
        popularity = response.xpath('//div[contains(@class, "question-summary narrow")]/div')
        for poppart in popularity:
            item = PopularityItem()
            item['votes'] = poppart.xpath(
                'div[contains(@class, "votes")]//span/text()').extract()#[0]
            item['answers'] = poppart.xpath(
                'div[contains(@class, "answered")]//span/text()').extract()#[0]
            item['views'] = poppart.xpath(
                'div[contains(@class, "views")]//span/text()').extract()#[0]
            yield item
Pipelines.py
import pymongo
import logging
class StackPipeline(object):
    def process_item(self, item, spider):
        return item

from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log

class MongoDBPipeline(object):
    def __init__(self):
        connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        self.db = connection[settings['MONGODB_DB']]

    def process_item(self, item, spider):
        collection = self.db[type(item).__name__.lower()]
        logging.info(collection.insert(dict(item)))
        return item
The easiest way to handle an error like this is to catch it and deal with it on the spot (in this case, by just moving on past the blank rows).
class PopularitySpider(Spider):
    name = "popularity"
    allowed_domains = ["stackoverflow.com"]
    start_urls = ["https://stackoverflow.com/"]

    def parse(self, response):
        popularity = response.xpath('//div[contains(@class, "question-summary narrow")]/div')
        for poppart in popularity:
            try:
                item = PopularityItem()
                item['votes'] = poppart.xpath('div[contains(@class, "votes")]//span/text()').extract()[0]
                item['answers'] = poppart.xpath('div[contains(@class, "answered")]//span/text()').extract()[0]
                item['views'] = poppart.xpath('div[contains(@class, "views")]//span/text()').extract()[0]
            except IndexError:
                continue
            yield item
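Alternatively, if you would rather keep those rows with an empty value than skip them, recent Scrapy versions offer extract_first(), which takes a default instead of raising IndexError. A sketch of that variant, replacing only the three assignments inside the loop (no try/except needed):

            # Sketch: extract_first() returns the default when the XPath matches nothing,
            # so blank rows are kept instead of raising IndexError.
            item['votes'] = poppart.xpath(
                'div[contains(@class, "votes")]//span/text()').extract_first(default='')
            item['answers'] = poppart.xpath(
                'div[contains(@class, "answered")]//span/text()').extract_first(default='')
            item['views'] = poppart.xpath(
                'div[contains(@class, "views")]//span/text()').extract_first(default='')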

Scrapy & MySQL database

I'm trying to scrape a website with Scrapy and store the information in a MySQL database.
My spider works on its own. When I use it alongside a pipeline I get two errors:
[twisted] CRITICAL: Unhandled error in Deferred:
TypeError: connect() argument 1 must be string, not None
I'm new to python and scrapy.
Here is my Spider:
from scrapy import Spider
#from scrapy.selector import Selector
from scraper.items import ExpertScraperItem
class expertSpider(Spider):
    name = 'expert'
    start_urls = [
        'https://www.expert.ie/products/home-appliances/laundry',
    ]

    def parse(self, response):
        # follow links to product pages
        for href in response.css('a.product-list-link::attr(href)'):
            yield response.follow(href, self.parse_product)
        # follow pagination links
        for href in response.css('a.UnselectedPage::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_product(self, response):
        def extract_with_css(query):
            return response.css(query).extract_first().strip()

        item = ExpertScraperItem()
        item['model'] = extract_with_css('header strong::text')[12:100].strip()
        item['price'] = extract_with_css('span.TotalPrice::text')[1:100].strip()
        yield item
Here is my pipeline (updated 21/01/2018 to correct the errors)
import scrapy
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
class ScraperPipeline(object):

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host = settings['MYSQL_HOST'],
            db = settings['MYSQL_DBNAME'],
            port = settings['MYSQL_PORT'],
            user = settings['MYSQL_USER'],
            passwd = settings['MYSQL_PASSWD'],
            charset = 'utf8',
            cursorclass = MySQLdb.cursors.DictCursor,
            use_unicode = True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    # pipeline the default call
    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self._do_upinsert, item, spider)
        return item

    # Each row is updated or written to the database
    def _do_upinsert(self, conn, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                # raise DropItem("Missing {0}!".format(data))
                # print "Missing data"
        if valid:
            result = conn.execute("""
                insert into pricing(model, price)
                values(%s, %s)
            """, (item['model'], item['price']))
            if result:
                print "added a model into db"
            else:
                print "failed insert into pricing"

Trouble downloading images with Scrapy - works sometimes

My spider code has been working well so far, but now that I am trying to run a batch of these spiders, everything works except that for some spiders Scrapy downloads the images, and for the rest nothing. All the spiders are the same except for the start_urls. Any help is appreciated!
Here's my pipelines.py
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
class DmozPipeline(object):
    def process_item(self, item, spider):
        return item

class MyImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield Request(image_url)
        for nlabel in item['nlabel']:
            yield Request(nlabel)
        print item['image_urls']

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
settings.py:
BOT_NAME = 'dmoz2'
BOT_VERSION = '1.0'
SPIDER_MODULES = ['dmoz2.spiders']
NEWSPIDER_MODULE = 'dmoz2.spiders'
DEFAULT_ITEM_CLASS = 'dmoz2.items.DmozItem'
ITEM_PIPELINES = ['dmoz2.pipelines.MyImagesPipeline']
IMAGES_STORE = '/ps/dmoz2/images'
IMAGES_THUMBS = {
    # letting height be variable
    #'small': ('', 120),
    'small': (120, ''),
    #'big': ('', 240),
    'big': (300, ''),
}
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
items.py:
from scrapy.item import Item, Field
from scrapy.utils.python import unicode_to_str
def u_to_str(text):
    unicode_to_str(text, 'latin-1', 'ignore')

class DmozItem(Item):
    category_ids = Field()
    ....
    image_urls = Field()
    image_paths = Field()
    pass
myspider.py:
from scrapy.spider import BaseSpider
from scrapy.spider import Spider
from scrapy.selector import HtmlXPathSelector
from scrapy import Selector
from scrapy.utils.url import urljoin_rfc
from scrapy.utils.response import get_base_url
from dmoz2.items import DmozItem
class DmozSpider(Spider):
    name = "fritos_jun2015"
    allowed_domains = ["walmart.com"]
    start_urls = [
        "http://www.walmart.com/ip/Fritos-Bar-B-Q-Flavored-Corn-Chips-9.75-oz/36915853",
        "http://www.walmart.com/ip/Fritos-Corn-Chips-1-oz-6-count/10900088",
    ]

    def parse(self, response):
        hxs = Selector(response)
        sites = hxs.xpath('/html/body/div[1]/section/section[4]/div[2]')
        items = []
        for site in sites:
            item = DmozItem()
            item['category_ids'] = ''
            .....
            item['image_urls'] = site.xpath('div[1]/div[3]/div[1]/div/div/div[2]/div/div/div[1]/div/div/img[2]/@src').extract()
            items.append(item)
        return items
Would really like to know why this same spider fetches images sometimes and at other times not. All the spiders are the same, except for the start_urls from the same allowed_domain. Also, the image URLs are all absolute paths, and the paths are correct.
Thanks in advance.
-TM
When screen scraping, one common problem is that the server will cut the connection because you are accessing it too often (to prevent screen scrapers from inadvertently DDoSing the website, and to keep costs from climbing too high when someone pings the site every millisecond, etc.).
Try adding a
sleep()
call between every request to the Walmart page. That way you won't get blocked from accessing the server.
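Note that a plain time.sleep() inside a Scrapy callback would block the whole reactor, so the more usual way to space requests out is Scrapy's own download-delay settings. A sketch for settings.py, with the delay value chosen arbitrarily:

# settings.py -- throttle requests so the server doesn't cut the connection
DOWNLOAD_DELAY = 2                # seconds to wait between requests to the same site
RANDOMIZE_DOWNLOAD_DELAY = True   # vary the delay (0.5x-1.5x) so it looks less mechanical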

Scrapy Getting Start_Urls

Ok, going to keep this short, need to rush off for a meeting.
I am trying to get the start URLs in Scrapy and no matter how I try, I can't seem to accomplish it. Here is my code (spider).
import scrapy
import csv
from scrapycrawler.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
class DmozSpider(CrawlSpider):
    name = "dmoz"
    allowed_domains = ["snipplr.com"]

    def start_requests(self):
        for i in range(1, 230):
            yield self.make_requests_from_url("http://www.snipplr.com/view/%d" % i)

    def make_requests_from_url(self, url):
        item = DmozItem()
        # assign url
        item['link'] = url
        request = Request(url, dont_filter=True)
        # set the meta['item'] to use the item in the next call back
        request.meta['item'] = item
        return request

    # Rules only apply before
    rules = (
        Rule(LxmlLinkExtractor(deny_domains=('http:\/\/www.snipplr.com\/snippet-not-found\/',)), callback="parse", follow=True),
    )

    def parse(self, response):
        sel = Selector(response)
        item = response.meta['item']
        item['title'] = sel.xpath('//div[@class="post"]/h1/text()').extract()
        # start_url
        item['link'] = response.url
I have tried every approach, and so far I get an "h" in the url column of my database.
This is my database pipeline:
import csv
from scrapy.exceptions import DropItem
from scrapy import log
import sys
import mysql.connector
class CsvWriterPipeline(object):

    def __init__(self):
        self.connection = mysql.connector.connect(host='localhost', user='ws', passwd='ps', db='ws')
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        self.cursor.execute("SELECT title,url FROM items WHERE title= %s", item['title'])
        result = self.cursor.fetchone()
        if result:
            log.msg("Item already in database: %s" % item, level=log.DEBUG)
        else:
            self.cursor.execute(
                "INSERT INTO items (title, url) VALUES (%s, %s)",
                (item['title'][0], item['link'][0]))
            self.connection.commit()
            log.msg("Item stored : " % item, level=log.DEBUG)
        return item

    def handle_error(self, e):
        log.err(e)
As you can see from here,
it is clearly working.
How would I get the start URL, or rather, how would I parse it? I believe the "h" means the field is empty. The database is MySQL.
Thanks for reading and for your help.
Regards,
Charlie
item['link'], as opposed to item['title'], is just a string, not a list:
self.cursor.execute("INSERT INTO items (title, url) VALUES (%s, %s)",
(item['title'][0], item['link']))

Scrapy pipeline.py not inserting items to MYSQL from spider

I am using Scrapy to scrape news headlines and I am a rookie at Scrapy and scraping as a whole. I have been having huge issues for a few days now pipelining my scraped data into my SQL database.
I have 2 classes in my pipelines.py file: one for inserting items into the database and another for backing up the scraped data into a JSON file, for front-end web development reasons.
This is the code for my spider:
- it extracts the news headlines from the start_urls
- it picks up the data as strings using extract(), then loops through all of them and uses strip() to remove whitespace for better formatting
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item
from Aljazeera.items import AljazeeraItem
from datetime import date, datetime
class AljazeeraSpider(Spider):
    name = "aljazeera"
    allowed_domains = ["aljazeera.com"]
    start_urls = [
        "http://www.aljazeera.com/news/europe/",
        "http://www.aljazeera.com/news/middleeast/",
        "http://www.aljazeera.com/news/asia/",
        "http://www.aljazeera.com/news/asia-pacific/",
        "http://www.aljazeera.com/news/americas/",
        "http://www.aljazeera.com/news/africa/",
        "http://blogs.aljazeera.com/"
    ]

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//td[@valign="bottom"]')
        contents = sel.xpath('//div[@class="indexSummaryText"]')
        items = []
        for site, content in zip(sites, contents):
            item = AljazeeraItem()
            item['headline'] = site.xpath('div[3]/text()').extract()
            item['content'] = site.xpath('div/a/text()').extract()
            item['date'] = str(date.today())
            for headline, content in zip(item['content'], item['headline']):
                item['headline'] = headline.strip()
                item['content'] = content.strip()
            items.append(item)
        return items
The code for my pipeline.py is as follows:
import sys
import MySQLdb
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request
import json
import os.path
class SQLStore(object):
    def __init__(self):
        self.conn = MySQLdb.connect(user='root', passwd='', db='aj_db', host='localhost', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    #log data to json file
    def process_item(self, item, spider):
        try:
            self.cursor.execute("""INSERT INTO scraped_data(headlines, contents, dates) VALUES (%s, %s, %s)""", (item['headline'].encode('utf-8'), item['content'].encode('utf-8'), item['date'].encode('utf-8')))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item

#log runs into back file
class JsonWriterPipeline(object):
    def __init__(self):
        self.file = open('backDataOfScrapes.json', "w")

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write("item === " + line)
        return item
And the settings.py is as follows:
BOT_NAME = 'Aljazeera'
SPIDER_MODULES = ['Aljazeera.spiders']
NEWSPIDER_MODULE = 'Aljazeera.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Aljazeera (+http://www.yourdomain.com)'
ITEM_PIPELINES = {
    'Aljazeera.pipelines.JsonWriterPipeline': 300,
    'Aljazeera.pipelines.SQLStore': 300,
}
My SQL settings are all OK, and after running scrapy crawl aljazeera it works and even outputs the items in JSON format as follows:
item === {"headline": "Turkey court says Twitter ban violates rights", "content": "Although ruling by Turkey's highest court is binding, it is unclear whether the government will overturn the ban.", "date": "2014-04-02"}
I really don't know or can't see what I am missing here. I would really appreciate it if you guys could help me out.
Thanks for your time,
Your indentation is wrong in the SQLStore pipeline. I've tested it with the correct indentation and it's working fine. Copy the below and it should be perfect.
class SQLStore(object):
    def __init__(self):
        self.conn = MySQLdb.connect(user='root', passwd='', db='aj_db', host='localhost', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    #log data to json file
    def process_item(self, item, spider):
        try:
            self.cursor.execute("""INSERT INTO scraped_data(headlines, contents, dates) VALUES (%s, %s, %s)""", (item['headline'].encode('utf-8'), item['content'].encode('utf-8'), item['date'].encode('utf-8')))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item
