How can I call writeXML after my parser finishes crawling the data? Currently I can see the data being crawled, but I don't see the output file. I also tried adding a print inside writeXML and got no output either.
Below is my code:
class FriendSpider(BaseSpider):
    # identifier of the Spider
    name = "friend"
    count = 0
    allowed_domains = ["example.com.us"]
    start_urls = [
        "http://example.com.us/biz/friendlist/"
    ]

    def start_requests(self):
        for i in range(0, 1722, 40):
            yield self.make_requests_from_url("http://example.com.us/biz/friendlist/?start=%d" % i)

    def parse(self, response):
        response = response.replace(body=response.body.replace('<br />', '\n'))
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul/li')
        items = []
        for site in sites:
            item = Item()
            self.count += 1
            item['id'] = str(self.count)
            item['name'] = site.select('.//div/div/h4/text()').extract()
            item['address'] = site.select('h4/span/text()').extract()
            item['review'] = ''.join(site.select('.//div[@class="review"]/p/text()').extract())
            item['birthdate'] = site.select('.//div/div/h5/text()').extract()
            items.append(item)
        return items
    def writeXML(self, items):
        root = ET.Element("Test")
        for item in items:
            entry = ET.SubElement(root, 'item')
            entry.set('id', item['id'])
            address = ET.SubElement(entry, 'address')
            address.text = item['address']
            user = ET.SubElement(entry, 'user')
            user.text = item['user']
            birthdate = ET.SubElement(entry, 'birthdate')
            birthdate.text = item['birthdate']
            review = ET.SubElement(entry, 'review')
            review.text = item['review']
        # wrap it in an ElementTree instance, and save as XML
        file = open("out.xml", 'wb')
        tree = ET.ElementTree(root)
        tree.write(file, xml_declaration=True, encoding='utf-8', method="xml")
To output using the built-in XML exporter, try the following command:
scrapy crawl friend -o items.xml -t xml
If the output isn't to your liking, you can create your own exporter, using the built-in XmlItemExporter class as a basis.
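For instance, here is a minimal sketch of such an exporter (the TestXmlExporter name and the myproject.exporters module path are illustrative assumptions, and the import path may differ on very old Scrapy versions):
from scrapy.exporters import XmlItemExporter

class TestXmlExporter(XmlItemExporter):
    def __init__(self, file, **kwargs):
        # reuse the built-in XML serialisation, only changing the element names
        XmlItemExporter.__init__(self, file, root_element='Test', item_element='item', **kwargs)
Then point the xml feed format at it in settings.py:
FEED_EXPORTERS = {
    'xml': 'myproject.exporters.TestXmlExporter',
}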
I'm trying to use Scrapy to scrape a site, and also a link found within the content of the site. However, when I do this I get an error on the line above the yield statement in parse:
TypeError: 'NoneType' object does not support item assignment
Here is my code:
class PostsSpider(scrapy.Spider):
    name = "posts"
    start_urls = ['https://www.nba.com/teams/bucks']
    allowed_domains = ['nba.com']

    def parse(self, response):
        for post in response.css('.nba-player-index section section'):
            playerPage = response.urljoin(post.css('a').attrib['href'])
            item = yield scrapy.Request(playerPage, callback=self.helper)
            item['number'] = post.css('span.nba-player-trending-item__number::text').get(),
            yield item

    def helper(self, response):
        print("--->" + response.css("title").get())
        item = Item()
        item['title'] = response.css("title::text").get()
        yield item

class Item(scrapy.Item):
    # define the fields for your item here like:
    number = scrapy.Field()
    title = scrapy.Field()
    ppg = scrapy.Field()
What you can do instead is pass the number data to helper through the request's meta, rather than assigning to the result of yield.
Something like this:
def parse(self, response):
    for post in response.css('.nba-player-index section section'):
        playerPage = response.urljoin(post.css('a').attrib['href'])
        meta = response.meta.copy()
        meta['number'] = post.css('span.nba-player-trending-item__number::text').get()
        yield scrapy.Request(playerPage, callback=self.helper, meta=meta)

def helper(self, response):
    # here you will get `number` in response.meta['number'], which you can yield further
    item = Item()
    item['number'] = response.meta.get('number')
    yield item
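As a side note (not part of the original answer): on Scrapy 1.7 or newer, cb_kwargs can carry the value instead of meta. A minimal sketch of the same idea:
def parse(self, response):
    for post in response.css('.nba-player-index section section'):
        playerPage = response.urljoin(post.css('a').attrib['href'])
        number = post.css('span.nba-player-trending-item__number::text').get()
        # entries in cb_kwargs become keyword arguments of the callback
        yield scrapy.Request(playerPage, callback=self.helper, cb_kwargs={'number': number})

def helper(self, response, number):
    item = Item()
    item['number'] = number
    item['title'] = response.css("title::text").get()
    yield item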
I am trying to save scraped items in separate JSON files, but I don't see any output files. The pipeline and the item are defined in the pipelines.py and items.py files in the Scrapy project folder. Do I have to call process_item() explicitly, or will it be called automatically when I return an item in scrape()? I enabled the pipeline in CrawlerProcess(settings={'ITEM_PIPELINES'}). Thanks.
The pipeline
import json, datetime

class JsonWriterPipeline(object):
    def process_item(self, item, spider):
        # return item
        fileName = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + '.json'
        try:
            with open(fileName, 'w') as fp:
                json.dump(dict(item), fp)
            return item
        except:
            return item
class ProjectItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()

class mySpider(CrawlSpider):
    name = 'mySPider'
    allowed_domains = ['allowedDOmain.org']
    start_urls = ['https://url.org']

    def parse(self, response):
        monthSelector = '//div[@class="archives-column"]/ul/li/a[contains(text(),"November 2019")]/@href'
        monthLink = response.xpath(monthSelector).extract_first()
        yield response.follow(monthLink, callback=self.scrape)

    def scrape(self, response):
        # get the links to all individual articles
        linkSelector = '.entry-title a::attr(href)'
        allLinks = response.css(linkSelector).extract()
        for link in allLinks:
            # item = articleItem()
            item = ProjectItem()
            item['url'] = link
            request = response.follow(link, callback=self.getContent)
            request.meta['item'] = item
            item = request.meta['item']
            yield item
        nextPageSelector = 'span.page-link a::attr(href)'
        nextPageLink = response.css(nextPageSelector).extract_first()
        yield response.follow(nextPageLink, callback=self.scrape)

    def getContent(self, response):
        item = response.meta['item']
        TITLE_SELECTOR = '.entry-title ::text'
        item['title'] = response.css(TITLE_SELECTOR).extract_first()
        yield item
To settings.py, add:
ITEM_PIPELINES = {
    'myproject.pipelines.JsonWriterPipeline': 100,
}
where myproject is the name of your project/folder.
See the very last heading on this page : https://docs.scrapy.org/en/latest/topics/item-pipeline.html
When running a spider inside a script, the settings need to be loaded using the method described in Running scrapy from script not including pipeline.
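A minimal sketch of that approach (assuming the script lives inside the Scrapy project, so that settings.py with the ITEM_PIPELINES entry can be found):
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# get_project_settings() loads settings.py, so the ITEM_PIPELINES entry
# registered there is picked up without passing it inline
process = CrawlerProcess(get_project_settings())
process.crawl(mySpider)
process.start()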
I have two Scrapy spiders: the first one crawls a sitemap, extracts URLs, and puts them in a txt file, and the second one reads that file and crawls the URLs line by line.
My code is like below:
class sitemapSpider(SitemapSpider):
    name = "filmnetmapSpider"
    sitemap_urls = ['http://filmnet.ir/sitemap.xml']
    sitemap_rules = [
        ('/series/', 'parse_item')
    ]
    storage_file = 'urls.txt'

    def parse_item(self, response):
        videoid = response.url
        with open(self.storage_file, 'a') as handle:
            yield handle.writelines(videoid + '\n')
The second spider:
class filmnetSpider(scrapy.Spider):
    name = 'filmnetSpider'

    def start_requests(self):
        with open('urls.txt') as fp:
            for line in fp:
                yield Request(line.strip(), callback=self.parse_website)

    def parse_website(self, response):
        hxs = HtmlXPathSelector(response)
        url = hxs.xpath('//script[@type="application/ld+json"]/text()').extract()
        url = ast.literal_eval(json.dumps(url))
        url = url[1]
        obj = json.loads(url)
        poster = obj['image']
        name = obj['name']
        description = obj['description']
How can I change the code to get rid of the read/write to the file?
How can I use a callback for this instead?
Note: this does not work as a single Scrapy process; the code I tried is the two spiders above plus the following, as suggested as an example in the docs:
process = CrawlerProcess()
process.crawl(filmnetSpider)
process.crawl(sitemapSpider)
process.start()
This should work:
class sitemapSpider(SitemapSpider):
    name = "filmnetmapSpider"
    sitemap_urls = ['http://filmnet.ir/sitemap.xml']
    sitemap_rules = [
        ('/series/', 'parse_item')
    ]

    def parse_item(self, response):
        videoid = response.url
        yield Request(videoid, callback=self.parse_website)

    def parse_website(self, response):
        hxs = HtmlXPathSelector(response)
        url = hxs.xpath('//script[@type="application/ld+json"]/text()').extract()
        url = ast.literal_eval(json.dumps(url))
        url = url[1]
        obj = json.loads(url)
        poster = obj['image']
        name = obj['name']
        description = obj['description']
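You can then run just this one spider from your script, reusing the CrawlerProcess call from your question without the second spider:
process = CrawlerProcess()
process.crawl(sitemapSpider)
process.start()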
I'm new to Scrapy and I can't get my spider to enter parse_votes in the code below, even though I set it as the callback. The other parse methods are working fine, I don't get any ERROR, and I checked the 'link' variable, which has the correct info. HELP?
EDIT - Full code
class DeputadosSpider(scrapy.Spider):
    name = "deputies"
    allowed_domains = ["camara.leg.br"]
    start_urls = ["http://www2.camara.leg.br/deputados/pesquisa"]

    def parse(self, response):
        sel = Selector(response)
        sel_options = sel.xpath('//*[@id="deputado"]/option[position()>1]')
        iteration = 1
        # get deputies pages
        for sel_option in sel_options:
            item = DeputiesInfo()
            item["war_name"] = sel_option.xpath("text()").extract()
            item["link_id"] = sel_option.extract().partition('?')[-1].rpartition('"')[0]
            item["page_link"] = 'http://www.camara.leg.br/internet/Deputado/dep_Detalhe.asp?id=' + item["link_id"]
            item["id"] = iteration
            iteration += 1
            # go scrape their page
            yield scrapy.Request(item["page_link"], callback=self.parse_deputy, meta={'item': item})

    def parse_deputy(self, response):
        item = response.meta['item']
        sel = Selector(response)
        info = sel.xpath('//div[@id="content"]/div/div[1]/ul/li')
        # continue to fill the data
        item["full_name"] = info.xpath("text()").extract_first()
        item["party"] = info.xpath("text()").extract()[2].partition('/')[0]
        item["uf"] = info.xpath("text()").extract()[2].partition('/')[-1].rpartition('/')[0]
        item["legislatures"] = info.xpath("text()").extract()[5]
        item["picture"] = sel.xpath('//div[@id="content"]/div/div[1]//img[1]/@src').extract()
        # save data to json file
        file = open('deputies_info.json', 'a')
        line = json.dumps(dict(item)) + ",\n"
        file.write(line)
        # collect votes info
        get_years = sel.xpath('//*[@id="my-informations"]/div[3]/div/ul/li[1]/a[position()<4]')
        for get_year in get_years:
            vote = VotesInfo()
            vote["deputy_id"] = item["id"]
            vote["year"] = get_year.xpath("text()").extract_first()
            link = get_year.xpath("@href").extract_first()
            print(vote["year"])
            print(link)
            # go to voting pages
            yield scrapy.Request(link, callback=self.parse_votes, meta={'vote': vote})

    def parse_votes(self, response):
        # vote = response.meta['vote']
        print('YYYYYYYYYYYYYUHUL IM IN!!')
Your problem is allowed_domains, because the link you are trying to request in parse_deputy is for example: http://www.camara.gov.br/internet/deputado/RelVotacoes.asp?nuLegislatura=55&nuMatricula=410&dtInicio=01/01/2016&dtFim=30/12/2016
and its domain is camara.gov.br so add it to allowed_domains.
allowed_domains = ["camara.leg.br", "camara.gov.br"]
PS: I ran your code commenting out allowed_domains, and parse_votes works perfectly.
I ran your spider and found why it never enters parse_votes.
I checked the link in yield scrapy.Request(link, callback=self.parse_votes, meta={'vote': vote}) and found out that it is not in the same domain.
The link belongs to the camara.gov.br domain, which is not in allowed_domains = ["camara.leg.br"].
So you need to add this domain to the allowed_domains list.
allowed_domains = ["camara.leg.br", "camara.gov.br"]
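As an aside (not from the original answers): if you prefer to leave allowed_domains untouched, the OffsiteMiddleware also lets an individual request through when it is created with dont_filter=True, for example:
yield scrapy.Request(link, callback=self.parse_votes, meta={'vote': vote}, dont_filter=True)
Keep in mind that dont_filter=True also bypasses the duplicate filter for that request.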
This is my code; it seems correct to me, but it doesn't work. Please help.
HEADER_XPATH = ['//h1[@class="story-body__h1"]//text()']
AUTHOR_XPATH = ['//span[@class="byline__name"]//text()']
PUBDATE_XPATH = ['//div/@data-datetime']
WTAGS_XPATH = ['']
CATEGORY_XPATH = ['//span[@rev="news|source"]//text()']
TEXT = ['//div[@property="articleBody"]//p//text()']
INTERLINKS = ['//div[@class="story-body__link"]//p//a/@href']
DATE_FORMAT_STRING = '%Y-%m-%d'
class BBCSpider(Spider):
    name = "bbc"
    allowed_domains = ["bbc.com"]
    sitemap_urls = [
        'http://Www.bbc.com/news/sitemap/',
        'http://www.bbc.com/news/technology/',
        'http://www.bbc.com/news/science_and_environment/']

    def parse_page(self, response):
        items = []
        item = ContentItems()
        item['title'] = process_singular_item(self, response, HEADER_XPATH, single=True)
        item['resource'] = urlparse(response.url).hostname
        item['author'] = process_array_item(self, response, AUTHOR_XPATH, single=False)
        item['pubdate'] = process_date_item(self, response, PUBDATE_XPATH, DATE_FORMAT_STRING, single=True)
        item['tags'] = process_array_item(self, response, TAGS_XPATH, single=False)
        item['category'] = process_array_item(self, response, CATEGORY_XPATH, single=False)
        item['article_text'] = process_article_text(self, response, TEXT)
        item['external_links'] = process_external_links(self, response, INTERLINKS, single=False)
        item['link'] = response.url
        items.append(item)
        return items
Your spider is just badly structured, and because of that it does nothing.
The scrapy.Spider spider requires a start_urls class attribute containing the list of URLs the spider will use to start the crawl; all of these URLs are handed to the parse class method, which means it is required as well.
Your spider has a sitemap_urls class attribute that is not used anywhere, and a parse_page class method that is never called either.
So in short your spider should look something like this:
class BBCSpider(Spider):
    name = "bbc"
    allowed_domains = ["bbc.com"]
    start_urls = [
        'http://Www.bbc.com/news/sitemap/',
        'http://www.bbc.com/news/technology/',
        'http://www.bbc.com/news/science_and_environment/']

    def parse(self, response):
        # This is a page with all of the articles
        article_urls = []  # placeholder: find the article urls in the page
        for url in article_urls:
            yield Request(url, self.parse_page)

    def parse_page(self, response):
        # This is an article page
        item = ContentItems()
        # populate item
        return item