Garbled text when using Scrapy to save JSON data to CSV - Python

I used Scrapy to crawl comment data from http://club.jd.com/comment/productPageComments.action?callback=&productId=1892018&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0, which is in Chinese, but I just got output like this.
And the CSV file's output is all messed up.
I don't know what happened. First I thought it was a JSON decode or encode problem, then I tried the approaches suggested on the internet, but I got the same result. Here's my code:
#!/usr/bin/env python
# encoding: utf-8
import scrapy
from scrapy import Request
from scrapy.selector import Selector
from jd_comment.items import JdCommentItem
import json


class JdSpider(scrapy.Spider):
    name = 'comment'

    def start_requests(self):
        url = 'http://club.jd.com/comment/productPageComments.action?callback=&productId=1892018&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        jsonresponse = json.loads(response.body_as_unicode())
        items = []
        for comment in jsonresponse['comments']:
            item = JdCommentItem()
            item['username'] = comment['nickname']
            item['user_ID'] = comment['id']
            item['time'] = comment['referenceTime']
            item['good_ID'] = comment['referenceId']
            item['good_name'] = comment['referenceName']
            item['content'] = comment['content']
            item['score'] = comment['score']
            items.append(item)
            yield item
Any hint would be highly appreciated. Thanks.
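One hedged guess, assuming the CSV bytes are actually valid UTF-8 and only look garbled in whatever program opens the file (typically Excel): Scrapy's FEED_EXPORT_ENCODING setting controls the encoding used by the feed exports, and a BOM-prefixed or GB encoding is a common choice for Chinese text. A minimal sketch, not a confirmed fix for this exact case:

# settings.py -- illustrative only; the right value depends on how the CSV is consumed
FEED_EXPORT_ENCODING = 'utf-8-sig'   # UTF-8 with BOM so Excel detects the encoding
# FEED_EXPORT_ENCODING = 'gb18030'   # alternative if the consumer expects a GB encoding

Then export through the feed exporter, e.g. scrapy crawl comment -o comments.csv.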

Related

Scrapy not identifying key from json

I am trying to scrape the information pertaining to the biblical commentaries off of a website. Below is the code I have made to do so. start_urls is the link to the json file I am trying to scrape. I chose ['0']['father']['_id'] to get the name of the commenter, however, the following error occurs. What should I do?
Error: TypeError: list indices must be integers or slices, not str
Code:
import scrapy
import json


class catenaspider(scrapy.Spider):  # spider to crawl the url
    name = 'commentary'  # name to be called in command terminal
    start_urls = ['https://api.catenabible.com:8080/anc_com/c/mt/1/1?tags=[%22ALL%22]&sort=def']

    def parse(self, response):
        data = json.loads(response.body)
        yield from data['0']['father']['_id']
Read the documentation again. The response is a JSON array, so it must be indexed with integers (data[0]), not with the string key '0':
import scrapy


class catenaspider(scrapy.Spider):  # spider to crawl the url
    name = 'commentary'  # name to be called in command terminal
    start_urls = ['https://api.catenabible.com:8080/anc_com/c/mt/1/1?tags=[%22ALL%22]&sort=def']

    def parse(self, response):
        data = response.json()
        yield {'id_father': data[0]['father']['_id']}
        # if you want to get all the id's:
        # for d in data:
        #     yield {'id_father': d['father']['_id']}

Getting this error when scraping JSON with scrapy: Spider must return request, item, or None, got 'str'

I am trying to get a json field with key "longName" with scrapy but I am receiving the error: "Spider must return request, item, or None, got 'str'".
The JSON I'm trying to scrape looks something like this:
{
    "id": 5355,
    "code": 9594
}
This is my code:
import scrapy
import json


class NotesSpider(scrapy.Spider):
    name = 'notes'
    allowed_domains = ['blahblahblah.com']
    start_urls = ['https://blahblahblah.com/api/123']

    def parse(self, response):
        data = json.loads(response.body)
        yield from data['longName']
I get the above error when I run scrapy crawl notes at the prompt. Can anyone point me in the right direction?
If you only want longName, modifying your parse method like this should do the trick (yield from data['longName'] iterates over the string and yields it one character at a time, which is why Scrapy complains about getting 'str'):

    def parse(self, response):
        data = json.loads(response.body)
        yield {"longName": data["longName"]}

Order a json by field using scrapy

I have created a spider to scrape problems from projecteuler.net. In my answer to a related question I concluded with:
I launch this with the command scrapy crawl euler -o euler.json and it outputs an array of unordered JSON objects, each corresponding to a single problem. This is fine for me because I'm going to process it with JavaScript, even though I thought resolving the ordering problem via Scrapy would be very simple.
But unfortunately, ordering the items that Scrapy writes to JSON (I need ascending order by the id field) does not seem to be so simple. I've studied every single component (middlewares, pipelines, exporters, signals, etc.), but none of them seems useful for this purpose. I've arrived at the conclusion that a solution to this problem doesn't exist in Scrapy at all (except, maybe, as a very elaborate trick), and that you are forced to order things in a second phase. Do you agree, or do you have some ideas? I copy the code of my scraper here.
Spider:
# -*- coding: utf-8 -*-
import scrapy
from eulerscraper.items import Problem
from scrapy.loader import ItemLoader


class EulerSpider(scrapy.Spider):
    name = 'euler'
    allowed_domains = ['projecteuler.net']
    start_urls = ["https://projecteuler.net/archives"]

    def parse(self, response):
        numpag = response.css("div.pagination a[href]::text").extract()
        maxpag = int(numpag[len(numpag) - 1])

        for href in response.css("table#problems_table a::attr(href)").extract():
            next_page = "https://projecteuler.net/" + href
            yield response.follow(next_page, self.parse_problems)

        for i in range(2, maxpag + 1):
            next_page = "https://projecteuler.net/archives;page=" + str(i)
            yield response.follow(next_page, self.parse_next)

        return [scrapy.Request("https://projecteuler.net/archives", self.parse)]

    def parse_next(self, response):
        for href in response.css("table#problems_table a::attr(href)").extract():
            next_page = "https://projecteuler.net/" + href
            yield response.follow(next_page, self.parse_problems)

    def parse_problems(self, response):
        l = ItemLoader(item=Problem(), response=response)
        l.add_css("title", "h2")
        l.add_css("id", "#problem_info")
        l.add_css("content", ".problem_content")
        yield l.load_item()
Item:
import re

import scrapy
from scrapy.loader.processors import MapCompose, Compose
from w3lib.html import remove_tags


def extract_first_number(text):
    i = re.search(r'\d+', text)
    return int(text[i.start():i.end()])


def array_to_value(element):
    return element[0]


class Problem(scrapy.Item):
    id = scrapy.Field(
        input_processor=MapCompose(remove_tags, extract_first_number),
        output_processor=Compose(array_to_value)
    )
    title = scrapy.Field(input_processor=MapCompose(remove_tags))
    content = scrapy.Field()
If I needed my output file to be sorted (I will assume you have a valid reason to want this), I'd probably write a custom exporter.
This is how Scrapy's built-in JsonItemExporter is implemented.
With a few simple changes, you can modify it to add the items to a list in export_item(), and then sort the items and write out the file in finish_exporting().
Since you're only scraping a few hundred items, the downsides of storing them in a list and not writing the file until the crawl is done shouldn't be a problem for you.
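A minimal sketch of such an exporter, assuming the items carry a numeric id field; the module path, class name and sort key below are placeholders, not part of Scrapy:

# exporters.py (placeholder module) -- buffer items and write them sorted at the end
from scrapy.exporters import JsonItemExporter
from scrapy.utils.python import to_bytes


class SortedJsonItemExporter(JsonItemExporter):

    def start_exporting(self):
        # don't write the opening bracket yet; items are buffered instead
        self._buffer = []

    def export_item(self, item):
        self._buffer.append(dict(self._get_serialized_fields(item)))

    def finish_exporting(self):
        # sort by the 'id' field and write the whole list in one go
        self._buffer.sort(key=lambda d: d['id'])
        self.file.write(to_bytes(self.encoder.encode(self._buffer), self.encoding))

It can then be registered for the json format in settings.py (assuming the class lives in eulerscraper/exporters.py):

FEED_EXPORTERS = {'json': 'eulerscraper.exporters.SortedJsonItemExporter'}

After that, scrapy crawl euler -o euler.json would use it.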
For now I've found a working solution using a pipeline (the entries are joined with commas so the output is valid JSON):

import json


class JsonWriterPipeline(object):

    def open_spider(self, spider):
        self.list_items = []
        self.file = open('euler.json', 'w')

    def close_spider(self, spider):
        ordered_list = [None for i in range(len(self.list_items))]
        for i in self.list_items:
            ordered_list[int(i['id']) - 1] = json.dumps(dict(i))
        self.file.write("[\n")
        self.file.write(",\n".join(ordered_list))
        self.file.write("\n]\n")
        self.file.close()

    def process_item(self, item, spider):
        self.list_items.append(item)
        return item
Though it may not be optimal, because the documentation suggests in another example:
The purpose of JsonWriterPipeline is just to introduce how to write item pipelines. If you really want to store all scraped items into a JSON file you should use the Feed exports.

Scrapy generate csv file (UTF-8)

I am trying to generate a CSV file with the results of the crawler. Because the content is German, I need it to be UTF-8 encoded (ä, ö, etc.). This is what I have so far:
spider.py
import scrapy
from scrapy.spiders import BaseSpider
from scrapy.selector import Selector
from Polizeimeldungen.items import PolizeimeldungenItem


class PoliceSpider(scrapy.Spider):
    name = "pm"
    allowed_domains = ["berlin.de"]
    start_urls = ["https://www.berlin.de/polizei/polizeimeldungen/archiv/2014/?page_at_1_0=1"]

    def parse(self, response):
        for sel in response.css('.row-fluid'):
            item = PolizeimeldungenItem()
            item['title'] = sel.css('a ::text').extract_first().encode('utf-8')
            item['link'] = sel.css('a ::text').extract_first().encode('utf-8')  # this is wrong, but it is easy to fix
            yield item
items.py
import scrapy


class PolizeimeldungenItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
pipelines.py
import csv


class PolizeimeldungenPipeline(object):
    def __init__(self):
        self.myCsv = csv.writer(open('Item.csv', 'wb'))
        self.myCsv.writerow(['title', 'link'])

    def process_item(self, item, spider):
        self.myCsv.writerow([item['title'], item['link']])
        return item
settings.py

BOT_NAME = 'Polizeimeldungen'
SPIDER_MODULES = ['Polizeimeldungen.spiders']
NEWSPIDER_MODULE = 'Polizeimeldungen.spiders'
ITEM_PIPELINES = {'Polizeimeldungen.pipelines.PolizeimeldungenPipeline': 100}
As a result, after running:
scrapy crawl pm
I get this error message:
TypeError: a bytes-like object is required, not 'str'
Thanks for your help!!
UPDATE: Python 3.6.0 :: Anaconda 4.3.1
I assume that you are using Python 3 (this solution won't work with Python 2). You need to change two things:

1. Open the output file in text mode, with the desired output encoding. In the PolizeimeldungenPipeline constructor, write:

   self.myCsv = csv.writer(open('Item.csv', 'w', encoding='utf-8'))

2. Don't encode the cells (in PoliceSpider.parse):

   item['title'] = sel.css('a ::text').extract_first()

etc.
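Putting both changes together, the pipeline would look roughly like this (a sketch assuming Python 3; newline='' is the csv module's usual recommendation to avoid extra blank rows on Windows):

import csv


class PolizeimeldungenPipeline(object):
    def __init__(self):
        # text mode with an explicit encoding instead of 'wb'
        self.myCsv = csv.writer(open('Item.csv', 'w', encoding='utf-8', newline=''))
        self.myCsv.writerow(['title', 'link'])

    def process_item(self, item, spider):
        # the spider now yields plain strings, so no .encode() here
        self.myCsv.writerow([item['title'], item['link']])
        return item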

Scrapy only scrapes the first start url in a list of 15 start urls

I am new to Scrapy and am attempting to teach myself the basics. I have put together code that goes to the Louisiana Department of Natural Resources website to retrieve the serial numbers of certain oil wells.
I have each well's link listed in start_urls, but Scrapy only downloads data from the first URL. What am I doing wrong?
import scrapy
from scrapy import Spider
from scrapy.selector import Selector
from mike.items import MikeItem


class SonrisSpider(Spider):
    name = "sspider"
    start_urls = [
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=207899",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=971683",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=214206",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=159420",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=243671",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=248942",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=156613",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=972498",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=215443",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=248463",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=195136",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=179181",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=199930",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=203419",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=220454",
    ]

    def parse(self, response):
        item = MikeItem()
        item['serial'] = response.xpath('/html/body/table[1]/tr[2]/td[1]/text()').extract()[0]
        yield item
Thank you for any help you might be able to provide. If I have not explained my problem thoroughly, please let me know and I will attempt to clarify.
I think this code might help.
By default, Scrapy filters out duplicate requests. Since only the query parameters differ between your start URLs, Scrapy treats the rest of them as duplicates of the first one; that's why your spider stops after fetching the first URL. To parse the rest of the URLs, enable the dont_filter flag on the request (check start_requests() below):
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from mike.items import MikeItem


class SonrisSpider(scrapy.Spider):
    name = "sspider"
    allowed_domains = ["sonlite.dnr.state.la.us"]
    start_urls = [
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=207899",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=971683",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=214206",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=159420",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=243671",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=248942",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=156613",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=972498",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=215443",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=248463",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=195136",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=179181",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=199930",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=203419",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=220454",
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse_data, dont_filter=True)

    def parse_data(self, response):
        item = MikeItem()
        serial = response.xpath(
            '/html/body/table[1]/tr[2]/td[1]/text()').extract()
        serial = serial[0] if serial else 'n/a'
        item['serial'] = serial
        yield item
Sample output returned by this spider is as follows:
{'serial': u'207899'}
{'serial': u'971683'}
{'serial': u'214206'}
{'serial': u'159420'}
{'serial': u'248942'}
{'serial': u'243671'}
Your code looks fine; try adding this function:
class SonrisSpider(Spider):

    def start_requests(self):
        for url in self.start_urls:
            print(url)
            yield self.make_requests_from_url(url)

    # the rest of your code goes here
The URLs should be printed now. Test it; if not, please say so.
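Note that make_requests_from_url was deprecated in later Scrapy releases and eventually removed; if it is not available, yielding the request directly is an equivalent sketch:

import scrapy


class SonrisSpider(scrapy.Spider):

    def start_requests(self):
        for url in self.start_urls:
            print(url)
            yield scrapy.Request(url, callback=self.parse)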
