RegEx to get URLs from body string

RegEx to get URLs from body string - python

So I was just wondering what my getURLs function's issue might be. I'm trying to get all urls from within the containing body's string.
My crawler isn't crawling anything because my input urls are invalid.
# Get all URLs contained within the body string
def getURLs(body):
urls = []
tempArr = body.split("a href=")
index = 1
for part in tempArr:
if part[0] == '"':
while (part[index] != '"' and index < len(part)):
index += 1
if index < len(part):
urls.append(part[1:index-1])
index = 1
return urls
# Open file which contains input urls
with open("test_urls.txt","rU") as infile:
urls = [row.strip("\n") for row in infile]
class BackpageSpider(CrawlSpider):
name = 'backpage'
allowed_domains = ['backpage.com']
start_urls = urls
def parse(self,response):
#print response.url
if response.status < 600:
# all_links = response.xpath("//div[contains(#class,'cat')]/a/#href").extract()
#all the links FOR THE ESCORTS on whatever page we're on
todays_links = []
#all the links for today's date
backpage_date = backpage_date_today()
yesterday_date = backpage_date_yesterday()
if backpage_date in response.body:
todays_section = response.body.split(backpage_date)[1].split(yesterday_date)[0].decode('utf-8')
# todays_links = todays_section.xpath("//div[contains(#class,'cat')]/a/#href").extract
todays_links = getURLs(todays_section)
# for url in todays_links:
# todays_links.append(url)
# for url in all_links:
# if url in todays_section:
# todays_links.append(url)
for url in todays_links:
yield scrapy.Request(url,callback=self.parse_ad_into_content)####HERE
for url in set(response.xpath('//a[#class="pagination next"]/#href').extract()):
yield scrapy.Request(url,callback=self.parse)
else:
time.sleep(600)
yield scrapy.Request(response.url,callback=self.parse)
def parse_ad_into_content(self,response):
#ipdb.set_trace()
item = items.BackpageScrapeItem(
url=response.url,
backpage_id=response.url.split('.')[0].split('/')[2].encode('utf-8'),
text = response.body,
posting_body= response.xpath("//div[#class='postingBody']").extract()[0].encode('utf-8'),
date = datetime.utcnow()-timedelta(hours=5),
posted_date = response.xpath("//div[#class='adInfo']/text()").extract()[0].encode('utf-8'),
posted_age = response.xpath("//p[#class='metaInfoDisplay']/text()").extract()[0].encode('utf-8'),
posted_title = response.xpath("//div[#id='postingTitle']//h1/text()").extract()[0].encode('utf-8')
)
return item
The web page is: http://grandisland.backpage.com/FemaleEscorts/?layout=date

Related

Another Scrapy Question: Output to Console but not to .json

This is another newbie scrapy question:
When I first started with the scrapy tutorial linked here:
https://docs.scrapy.org/en/latest/intro/tutorial.html
I can crawl a webpage and then output the scraped content to a json file. But when I modify the tutorial to add a few rules like:
traversal depth
and memory so it doesn't traverse already visited pages again.
The output to the json stops although I can still see the output on the console. Can someone give me pointers on what I am doing wrong? The modifications can be seen below:
class QuotesSpider(scrapy.Spider):
name = "quotes"
#allowed_domains = allowed_domain_list
start_urls = input_domain_list
max_depth = 1
invalid_url = []
def parse(self, response):
from_url = ''
from_text = ''
depth = 0
# Extract the meta information from the response, if any
if 'text' in response.meta:
from_text = response.meta['text']
if 'depth' in response.meta:
depth = response.meta['depth']
if 'visited' in response.meta:
visited_dict = response.meta['visited']
else:
visited_dict = {}
if response.status == 404:
self.invalid_url.append(response.url)
print('*'*80)
print('INVALID LINK')
print('*'*80)
else:
page = response.url.split("/")[-2]
web_page = response.request.url
ext_text = ' '.join([item.strip() for item in
response.xpath('//body//text()').extract() if item.strip()])
visited = visited_dict.get('{0}'.format(web_page))
print('-'*80)
print('VALID LINK; Depth: {0}; Visited: {1}'.format(depth, visited))
print('-'*80)
yield {'text': ext_text,
'source': web_page}
if not visited and depth <= self.max_depth:
for selector in response.xpath('//a/#href'):
if selector is not None:
link = selector.get()
request = response.follow(link, callback=self.parse)
request.meta['visited'] = visited_dict
request.meta['visited'].update({'{0}'.format(web_page): 1})
request.meta['depth'] = depth + 1
print('*'*80)
print(link, request.meta['visited'])
print('*' * 80)
yield request

Scrapy - ValueError: Missing scheme in request url: #mw-head

I'm getting the following traceback but unsure how to refactor.
ValueError: Missing scheme in request url: #mw-head
Full code:
class MissleSpiderBio(scrapy.Spider):
name = 'missle_spider_bio'
allowed_domains = ['en.wikipedia.org']
start_urls = ['https://en.wikipedia.org/wiki/...']
this is the part giving me issues (I believe)
def parse(self, response):
filename = response.url.split('/')[-1]
table = response.xpath('///div/table[2]/tbody')
rows = table.xpath('//tr')
row = rows[2]
row.xpath('td//text()')[0].extract()
wdata = {}
for row in response.xpath('//* \
[#class="wikitable"]//tbody//tr'):
for link in response.xpath('//a/#href'):
link = link.extract()
if((link.strip() != '')):
yield Request(link, callback=self.parse)
#wdata.append(link)
else:
yield None
#wdata = {}
#wdata['link'] = BASE_URL +
#row.xpath('a/#href').extract() #[0]
wdata['link'] = BASE_URL + link
request = scrapy.Request(wdata['link'],\
callback=self.get_mini_bio, dont_filter=True)
request.meta['item'] = MissleItem(**wdata)
yield request
here is the second part of the code:
def get_mini_bio(self, response):
BASE_URL_ESCAPED = 'http:\/\/en.wikipedia.org'
item = response.meta['item']
item['image_urls'] = []
img_src = response.xpath('//table[contains(#class, \
"infobox")]//img/#src')
if img_src:
item['image_urls'] = ['http:' + img_src[0].extract()]
mini_bio = ''
paras = response.xpath('//*[#id="mw-content-text"]/p[text()\
or normalize-space(.)=""]').extract()
for p in paras:
if p =='<p></p>':
break
mini_bio += p
mini_bio = mini_bio.replace('href="/wiki', 'href="' + \
BASE_URL + '/wiki')
mini_bio = mini_bio.replace('href="#', item['link'] + '#')
item['mini_bio'] = mini_bio
yield item
I tried refactoring but am now getting a:
ValueError: Missing scheme in request url: #mw-head
any help would be immensely appreciated

Looks like you were on the right track with the commented out [0].
xpath().extract() #returns a list of strings
You need to select the string with [0]

row.xpath('a/#href').extract()
That expression evaluates to a list NOT a string. When you pass the URL to the request object, scrapy expects a string, not a list
To fix this, you have a few options:
You can use LinkExtractors which will allow you to search a page for links and automatically create scrapy request objects for those links:
https://doc.scrapy.org/en/latest/topics/link-extractors.html
OR
You could run a for loop and go through each of the links:
from scrapy.spiders import Request
for link in response.xpath('//a/#href'):
link = link.extract()
if((link.strip() != '')):
yield Request(link, callback=self.parse)
else:
yield None
You can add whatever string filters you want to that code
OR
If you just want the first link, you can use .extract_first() instead of .extract()

Scrapy - unable to make additional request in XMLFeedSpider

I have a scrapy spider that uses XMLFeedSpider. As well as the data returned for each node in parse_node(), I also need to make an additional request to get more data. The only issue, is if I yield an additional request from parse_node() nothing gets returned at all:
class MySpidersSpider(XMLFeedSpider):
name = "myspiders"
namespaces = [('g', 'http://base.google.com/ns/1.0')]
allowed_domains = {"www.myspiders.com"}
start_urls = [
"https://www.myspiders.com/productMap.xml"
]
iterator = 'iternodes'
itertag = 'item'
def parse_node(self, response, node):
if(self.settings['CLOSESPIDER_ITEMCOUNT'] and int(self.settings['CLOSESPIDER_ITEMCOUNT']) == self.item_count):
raise CloseSpider('CLOSESPIDER_ITEMCOUNT limit reached - ' + str(self.settings['CLOSESPIDER_ITEMCOUNT']))
else:
self.item_count += 1
id = node.xpath('id/text()').extract()
title = node.xpath('title/text()').extract()
link = node.xpath('link/text()').extract()
image_link = node.xpath('g:image_link/text()').extract()
gtin = node.xpath('g:gtin/text()').extract()
product_type = node.xpath('g:product_type/text()').extract()
price = node.xpath('g:price/text()').extract()
sale_price = node.xpath('g:sale_price/text()').extract()
availability = node.xpath('g:availability/text()').extract()
item = MySpidersItem()
item['id'] = id[0]
item['title'] = title[0]
item['link'] = link[0]
item['image_link'] = image_link[0]
item['gtin'] = gtin[0]
item['product_type'] = product_type[0]
item['price'] = price[0]
item['sale_price'] = '' if len(sale_price) == 0 else sale_price[0]
item['availability'] = availability[0]
yield Request(item['link'], callback=self.parse_details, meta={'item': item})
def parse_details(self, response):
item = response.meta['item']
item['price_per'] = 'test'
return item
If I change the last line of parse_node() to return item it works fine (without setting price_per in the item, naturally).
Any idea what I'm doing wrong?

Have you tried checking the contents of item['link']? If it is a relative link (example: /products?id=5), the URL won't return anything and the request will fail. You need to make sure it's a resolvable link (example: https://www.myspiders.com/products?id=5).

I discovered the issue - I was limiting the number of items processed in my parse_node() function. However, because of the limit, my spider was terminating prior to the request being made. Moving the code to limit the item processed to my parse_details() function resolves the issue:
def parse_details(self, response):
if(self.settings['CLOSESPIDER_ITEMCOUNT'] and int(self.settings['CLOSESPIDER_ITEMCOUNT']) == self.item_count):
raise CloseSpider('CLOSESPIDER_ITEMCOUNT limit reached - ' + str(self.settings['CLOSESPIDER_ITEMCOUNT']))
else:
self.item_count += 1
item = response.meta['item']
item['price_per'] = 'test'
return item

Stuck with Data Crawling on Scrapy

One of my friend was developing a scrapy script to scrap data from a page.
After sometime, I needed to add another field into. And I added the field successfully. But the problem is the field is not getting the data of the links inside the td. The field name is "Last Batsman"
Data URL:
http://digicricket.marssil.com/match/MatchData.aspx?op=1&match=1385
XPath of the Data:
//*[#id="ctl00_ContentPlaceHolder1_divData"]/table[6]/tr/td
import scrapy
from bs4 import BeautifulSoup
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector
from digicricket.items import ODIorTestItem
class DigicricketMarsilOp1Spider(scrapy.Spider):
name = "digicricket.marssil.op1"
allowed_domains = ["digicricket.marssil.com"]
def __init__(self, match_id=None):
if match_id:
match_id_list = match_id.split(',')
for i in match_id_list:
if not i.isdigit():
raise CloseSpider('Match ID = {0} is not a number'.format(i))
else:
self.start_urls = ['http://digicricket.marssil.com/match/MatchData.aspx?op=1&match={0}'.format(i)
for i in match_id_list]
else:
raise CloseSpider('You forgot input Match ID/IDs')
def parse(self, response):
item = ODIorTestItem()
item['Batsman_op1'] = []
item['Bowler_op1'] = []
item['other_op1'] = []
sel = Selector(response)
tables = sel.xpath('//div[#id="ctl00_ContentPlaceHolder1_divData"]/table').extract()
row_for_other = dict()
for i in xrange(len(tables)):
html_text = BeautifulSoup(tables[i])
if i == 1:
sl = 0
for tr in html_text.find_all('tr'):
td = tr.find_all('td')
if td:
sl += 1
row = dict()
row['sl'] = sl
row['match_id'] = response.url[response.url.rfind('=')+1:]
row["Batsman"] = td[0].get_text()
row["R"] = td[1].get_text()
row["B"] = td[2].get_text()
row["4s"] = td[3].get_text()
row["6s"] = td[4].get_text()
row["SR"] = td[5].get_text()
item['Batsman_op1'].append(row)
elif i == 2:
sl = 0
for tr in html_text.find_all('tr'):
td = tr.find_all('td')
if td:
sl += 1
row = dict()
row['sl'] = sl
row['match_id'] = response.url[response.url.rfind('=')+1:]
row["Bowler"] = td[0].get_text()
row["O"] = td[1].get_text()
row["M"] = td[2].get_text()
row["R"] = td[3].get_text()
row["W"] = td[4].get_text()
row["Econ"] = td[5].get_text()
item['Bowler_op1'].append(row)
else:
for tr in html_text.find_all('tr'):
td = tr.find_all('td')
if i == 0:
try:
row_for_other["InningsMatchDetails"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[1]/tr/td/b/text()[1]').extract()[0]
except:
row_for_other["InningsMatchDetails"] = None
try:
row_for_other["CurrentScore"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[1]/tr/td/b/span/text()').extract()[0]
except:
row_for_other["CurrentScore"] = None
try:
row_for_other["OversRunRate"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[1]/tr/td/b/text()[2]').extract()[0]
except:
row_for_other["OversRunRate"] = None
try:
row_for_other["Extras"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/table[1]/'
'tr/td/b/text()[3]').extract()[0]
except:
row_for_other["Extras"] = None
try:
row_for_other["MatchResult"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[1]/tr/td/b/text()[4]').extract()[0]
except:
row_for_other["MatchResult"] = None
try:
row_for_other["RecentOvers"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[4]/tr/td[2]/text()').extract()[0]
except:
row_for_other["RecentOvers"] = None
try:
row_for_other["LastBatsman"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[6]/tr/td/text()').extract()[0]
except:
row_for_other["LastBatsman"] = None
row_for_other['match_id'] = response.url[response.url.rfind('=')+1:]
item['other_op1'].append(row_for_other)
return item

Your XPath seems to miss some tags. On the web page there are two div levels before the second table. Replacing / with // takes care of these. (Because my browser added some <tbody> tags there is also a double slash in front of the tr.
.//*[#id="ctl00_ContentPlaceHolder1_divData"]//table[6]//tr/td/a[1]/text()

Scrapy (Python): Iterating over 'next' page without multiple functions

I am using Scrapy to grab stock data from Yahoo! Finance.
Sometimes, I need to loop over several pages, 19 in this example , in order to get all of the stock data.
Previously (when I knew there would only be two pages), I would use one function for each page, like so:
def stocks_page_1(self, response):
returns_page1 = []
#Grabs data here...
current_page = response.url
next_page = current_page + "&z=66&y=66"
yield Request(next_page, self.stocks_page_2, meta={'returns_page1': returns_page1})
def stocks_page_2(self, response):
# Grab data again...
Now, instead of writing 19 or more functions, I was wondering if there was a way I could loop through an iteration using one function to grab all data from all pages available for a given stock.
Something like this:
for x in range(30): # 30 was randomly selected
current_page = response.url
# Grabs Data
# Check if there is a 'next' page:
if response.xpath('//td[#align="right"]/a[#rel="next"]').extract() != ' ':
u = x * 66
next_page = current_page + "&z=66&y={0}".format(u)
# Go to the next page somehow within the function???
Updated Code:
Works, but only returns one page of data.
class DmozSpider(CrawlSpider):
name = "dnot"
allowed_domains = ["finance.yahoo.com", "http://eoddata.com/"]
start_urls = ['http://finance.yahoo.com/q?s=CAT']
rules = [
Rule(LinkExtractor(restrict_xpaths='//td[#align="right"]/a[#rel="next"]'),
callback='stocks1',
follow=True),
]
def stocks1(self, response):
returns = []
rows = response.xpath('//table[#class="yfnc_datamodoutline1"]//table/tr')[1:]
for row in rows:
cells = row.xpath('.//td/text()').extract()
try:
values = cells[-1]
try:
float(values)
returns.append(values)
except ValueError:
continue
except ValueError:
continue
unformatted_returns = response.meta.get('returns_pages')
returns = [float(i) for i in returns]
global required_amount_of_returns, counter
if counter == 1 and "CAT" in response.url:
required_amount_of_returns = len(returns)
elif required_amount_of_returns == 0:
raise CloseSpider("'Error with initiating required amount of returns'")
counter += 1
print counter
# Iterator to calculate Rate of return
# ====================================
if data_intervals == "m":
k = 12
elif data_intervals == "w":
k = 4
else:
k = 30
sub_returns_amount = required_amount_of_returns - k
sub_returns = returns[:sub_returns_amount]
rate_of_return = []
if len(returns) == required_amount_of_returns or "CAT" in response.url:
for number in sub_returns:
numerator = number - returns[k]
rate = numerator/returns[k]
if rate == '':
rate = 0
rate_of_return.append(rate)
k += 1
item = Website()
items = []
item['url'] = response.url
item['name'] = response.xpath('//div[#class="title"]/h2/text()').extract()
item['avg_returns'] = numpy.average(rate_of_return)
item['var_returns'] = numpy.cov(rate_of_return)
item['sd_returns'] = numpy.std(rate_of_return)
item['returns'] = returns
item['rate_of_returns'] = rate_of_return
item['exchange'] = response.xpath('//span[#class="rtq_exch"]/text()').extract()
item['ind_sharpe'] = ((numpy.average(rate_of_return) - RFR) / numpy.std(rate_of_return))
items.append(item)
yield item

You see, a parse callback is just a function that takes the response and returns or yields either Items or Requests or both. There is no issue at all with reusing these callbacks, so you can just pass the same callback for every request.
Now, you could pass the current page info using the Request meta but instead, I'd leverage the CrawlSpider to crawl across every page. It's really easy, start generating the Spider with the command line:
scrapy genspider --template crawl finance finance.yahoo.com
Then write it like this:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
Scrapy 1.0 has deprecated the scrapy.contrib namespace for the modules above, but if you're stuck with 0.24, use scrapy.contrib.linkextractors and scrapy.contrib.spiders.
from yfinance.items import YfinanceItem
class FinanceSpider(CrawlSpider):
name = 'finance'
allowed_domains = ['finance.yahoo.com']
start_urls = ['http://finance.yahoo.com/q/hp?s=PWF.TO&a=04&b=19&c=2005&d=04&e=19&f=2010&g=d&z=66&y=132']
rules = (
Rule(LinkExtractor(restrict_css='[rel="next"]'),
callback='parse_items',
follow=True),
)
LinkExtractor will pick up the links in the response to follow, but it can be limited with XPath (or CSS) and regular expressions. See documentation for more.
Rules will follow the links and call the callback on every response. follow=True will keep extracting links on every new response, but it can be limited by depth. See documentation again.
def parse_items(self, response):
for line in response.css('.yfnc_datamodoutline1 table tr')[1:-1]:
yield YfinanceItem(date=line.css('td:first-child::text').extract()[0])
Just yield the Items, since Requests for the next pages will be handled by the CrawlSpider Rules.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

RegEx to get URLs from body string - python

Related

Another Scrapy Question: Output to Console but not to .json

Scrapy - ValueError: Missing scheme in request url: #mw-head

Scrapy - unable to make additional request in XMLFeedSpider

Stuck with Data Crawling on Scrapy

Scrapy (Python): Iterating over 'next' page without multiple functions

Categories

Resources