One of my friends was developing a Scrapy script to scrape data from a page.
After some time, I needed to add another field to it, and I added the field successfully. The problem is that the field is not getting the data from the links inside the td. The field name is "Last Batsman".
Data URL:
http://digicricket.marssil.com/match/MatchData.aspx?op=1&match=1385
XPath of the Data:
//*[@id="ctl00_ContentPlaceHolder1_divData"]/table[6]/tr/td
import scrapy
from bs4 import BeautifulSoup
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector

from digicricket.items import ODIorTestItem


class DigicricketMarsilOp1Spider(scrapy.Spider):
    name = "digicricket.marssil.op1"
    allowed_domains = ["digicricket.marssil.com"]

    def __init__(self, match_id=None):
        if match_id:
            match_id_list = match_id.split(',')
            for i in match_id_list:
                if not i.isdigit():
                    raise CloseSpider('Match ID = {0} is not a number'.format(i))
            else:
                self.start_urls = ['http://digicricket.marssil.com/match/MatchData.aspx?op=1&match={0}'.format(i)
                                   for i in match_id_list]
        else:
            raise CloseSpider('You forgot input Match ID/IDs')

    def parse(self, response):
        item = ODIorTestItem()
        item['Batsman_op1'] = []
        item['Bowler_op1'] = []
        item['other_op1'] = []
        sel = Selector(response)
        tables = sel.xpath('//div[@id="ctl00_ContentPlaceHolder1_divData"]/table').extract()
        row_for_other = dict()
        for i in xrange(len(tables)):
            html_text = BeautifulSoup(tables[i])
            if i == 1:
                sl = 0
                for tr in html_text.find_all('tr'):
                    td = tr.find_all('td')
                    if td:
                        sl += 1
                        row = dict()
                        row['sl'] = sl
                        row['match_id'] = response.url[response.url.rfind('=')+1:]
                        row["Batsman"] = td[0].get_text()
                        row["R"] = td[1].get_text()
                        row["B"] = td[2].get_text()
                        row["4s"] = td[3].get_text()
                        row["6s"] = td[4].get_text()
                        row["SR"] = td[5].get_text()
                        item['Batsman_op1'].append(row)
            elif i == 2:
                sl = 0
                for tr in html_text.find_all('tr'):
                    td = tr.find_all('td')
                    if td:
                        sl += 1
                        row = dict()
                        row['sl'] = sl
                        row['match_id'] = response.url[response.url.rfind('=')+1:]
                        row["Bowler"] = td[0].get_text()
                        row["O"] = td[1].get_text()
                        row["M"] = td[2].get_text()
                        row["R"] = td[3].get_text()
                        row["W"] = td[4].get_text()
                        row["Econ"] = td[5].get_text()
                        item['Bowler_op1'].append(row)
            else:
                for tr in html_text.find_all('tr'):
                    td = tr.find_all('td')
                    if i == 0:
                        try:
                            row_for_other["InningsMatchDetails"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                             'table[1]/tr/td/b/text()[1]').extract()[0]
                        except:
                            row_for_other["InningsMatchDetails"] = None
                        try:
                            row_for_other["CurrentScore"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                      'table[1]/tr/td/b/span/text()').extract()[0]
                        except:
                            row_for_other["CurrentScore"] = None
                        try:
                            row_for_other["OversRunRate"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                      'table[1]/tr/td/b/text()[2]').extract()[0]
                        except:
                            row_for_other["OversRunRate"] = None
                        try:
                            row_for_other["Extras"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/table[1]/'
                                                                'tr/td/b/text()[3]').extract()[0]
                        except:
                            row_for_other["Extras"] = None
                        try:
                            row_for_other["MatchResult"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                     'table[1]/tr/td/b/text()[4]').extract()[0]
                        except:
                            row_for_other["MatchResult"] = None
                        try:
                            row_for_other["RecentOvers"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                     'table[4]/tr/td[2]/text()').extract()[0]
                        except:
                            row_for_other["RecentOvers"] = None
                        try:
                            row_for_other["LastBatsman"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                     'table[6]/tr/td/text()').extract()[0]
                        except:
                            row_for_other["LastBatsman"] = None
        row_for_other['match_id'] = response.url[response.url.rfind('=')+1:]
        item['other_op1'].append(row_for_other)
        return item
Your XPath seems to miss some tags. On the web page there are two div levels before the second table; replacing / with // takes care of these. (Because my browser added some <tbody> tags, there is also a double slash in front of the tr.)
.//*[@id="ctl00_ContentPlaceHolder1_divData"]//table[6]//tr/td/a[1]/text()
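For example, plugging that expression into the spider, the LastBatsman lookup in parse() would look like this (a minimal sketch; only the XPath changes, the surrounding try/except stays exactly as in the original code):

try:
    row_for_other["LastBatsman"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]'
                                             '//table[6]//tr/td/a[1]/text()').extract()[0]
except:
    row_for_other["LastBatsman"] = None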
Please, can anyone help me? I'm trying to navigate through the pages, but my code is not working: I get the product details for the first page, but I want to scrape the details from all the pages on the website. Below is my code for reference; thanks in advance.
Below is the link to the website:
https://www.kotsovolos.gr/mobile-phones-gps/mobile-phones/smartphones?pageSize=60
import xlwt
from selenium import webdriver
import re
import time
from datetime import date


class kotsovolosmobiles:
    def __init__(self):
        self.url='https://www.kotsovolos.gr/mobile-phones-gps/mobile-phones/smartphones?pageSize=60'
        self.country='GR'
        self.currency='euro'
        self.VAT= 'Included'
        self.shipping = 'Available for shipment'
        self.Pre_PromotionPrice ='N/A'

    def kotsovolos(self):
        wb = xlwt.Workbook()
        ws = wb.add_sheet('Sheet1',cell_overwrite_ok=True)
        ws.write(0,0,"Product_Url")
        ws.write(0,0,"Product_Manufacturer")
        ws.write(0,1,"Product_Url")
        ws.write(0,2,"Product_Price")
        ws.write(0,3,"Product_Model")
        ws.write(0,4,"Memory")
        ws.write(0,5,"Currency")
        ws.write(0,6,"Color")
        ws.write(0,7,"VAT")
        ws.write(0,8,"Shipping Cost")
        ws.write(0,9,"Pre-PromotionPrice")
        ws.write(0,10,"Country")
        ws.write(0,11,"Date")
        ws.write(0,12,"Raw_Model")
        wb.save(r"C:\Users\Karthick R\Desktop\VS code\kotsovolos.xls")
        driver=webdriver.Chrome()
        driver.get(self.url)
        today = date.today()
        time.sleep(5)
        cookies = driver.find_element_by_css_selector('a[id="CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"]')
        cookies.click()
        print("cookies accepted")
        driver.maximize_window()
        time.sleep(5)
        titles = []
        models = []
        memorys = []
        prod_prices = []
        p_links =[]
        p_colors = []
        while True:
            storage_box = []
            storage_box = driver.find_elements_by_css_selector('div[class="product"]')
            for storage_boxes in storage_box:
                product_url = storage_boxes.find_element_by_css_selector('div[class="title"] a').get_attribute('href')
                print(product_url)
                p_links.append(product_url)
                p_model = storage_boxes.find_element_by_css_selector('div[class="title"] a').text
                print(p_model)
                models.append(p_model)
                manufacturer1 = p_model.split(" ")
                print(manufacturer1[0])
                titles.append(manufacturer1[0])
                memory = []
                memory = re.findall('\d+ ?[gG][bB]',p_model)
                print(memory)
                memory1 = str(memory).replace("['",'').replace("']",'').replace("[]",'').strip()
                if "," in memory1:
                    arr=memory1.split(",")
                    for str1 in arr:
                        str2=str1.replace("GB", "").replace("gb", "").replace("'", "").strip()
                        if len(str2)!=1:
                            memory_str=str1
                            break
                elif (memory1 == ""):
                    memory_str ='N/A'
                else:
                    memory_str=memory1
                memory_str = memory_str.replace("'", "").strip()
                print(memory_str)
                memorys.append(memory_str)
                colors= []
                prod_color = p_model.split(" ")
                length = len(prod_color)
                indexcolor = length-3
                colors.append(prod_color[indexcolor])
                color1 = str(colors).replace("['",'').replace("']",'').strip()
                print(color1)
                p_colors.append(color1)
                p_price = storage_boxes.find_element_by_css_selector('.priceWithVat > .price').text
                print(p_price)
                prod_prices.append(p_price)
            next = driver.find_element_by_css_selector('.pagination_next a')
            time.sleep(3)
            next.click()
            print("next page")
            time.sleep(3)


kotsovolos_gr = kotsovolosmobiles()
kotsovolos_gr.kotsovolos()
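One likely culprit is that the while True: loop has no exit condition, so once the last page is reached the next click either fails or keeps hitting the same page. A minimal sketch of the end of the loop, assuming the .pagination_next a element from the code above is absent or not clickable on the last page (the JS click and the exception choices are my own, not from the original code):

from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException

while True:
    # ... collect the product boxes on the current page as above ...
    try:
        next_link = driver.find_element_by_css_selector('.pagination_next a')
        driver.execute_script("arguments[0].click();", next_link)  # JS click avoids overlays intercepting the click
        print("next page")
        time.sleep(3)
    except (NoSuchElementException, ElementClickInterceptedException):
        break  # no next-page arrow any more, so stop paginating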
Python code gets stuck in the try block
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
#import urllib2


def url1_to_string(url1):
    html=""
    proxyDict = {
        'http': 'http://username:pwd@proxyurl:8080',
        'https': 'https://username:pwd@proxyurl:8080'
    }
    try:
        print('Before res in try')
        res = requests.get(url1,proxies=proxyDict)
        print('After res in try')
    except:
        pass
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))


df=pd.read_csv(r'C:\filepath\abc.csv',encoding='latin-1')
anchor_count = []
account_count = []
aggregate_page_count=[]
agg_url_count=[]
for index, row in df.iterrows():
    agg_url_list = []
    ini_url="http://www.google.com/search?q="+row['ANCHOR_NAME']+" AND "+row['ACCOUNT_NAME']
    r = requests.get(ini_url,proxies={"http":"http://one.proxy.att.com:8080"})
    ny_bb1 = url1_to_string(ini_url)
    anchor_count.append(ny_bb1.lower().count(row['ANCHOR_NAME'].lower()))
    account_count.append(ny_bb1.lower().count(row['ACCOUNT_NAME'].lower()))
    print(anchor_count)
    soup = BeautifulSoup(r.text,"html.parser")
    get_details1 = soup.find_all("div", attrs={"class": "g"})
    sublist1 = []
    for details1 in get_details1:
        link1 = details1.find_all("h3")
        for mdetails1 in link1[:]:
            links1 = mdetails1.find_all("a")
            lmk1 = ""
            for lnk1 in links1[:]:
                lmk1 = lnk1.get("href")[7:].split("&")
                sublist1.append(lmk1[0])
    aggregate_count1=0
    for x1 in sublist1[:3]:
        anchorcount1=0
        accountcount1=0
        print("aagg url",x1)
        try:
            print('In try block')
            ny_bb1 = url1_to_string(x1)
        except KeyboardInterrupt:
            print('You cancelled the operation.')
        finally:
            pass
        ny_bb1=ny_bb1.upper()
        print(ny_bb1)
        row['ANCHOR_NAME']=row['ANCHOR_NAME'].upper()
        row['ACCOUNT_NAME']=row['ACCOUNT_NAME'].upper()
        anchor_name=re.match(r'\W*(\w[^,. !?"]*)', row['ANCHOR_NAME']).groups()[0]
        account_name=re.match(r'\W*(\w[^,. !?"]*)', row['ACCOUNT_NAME']).groups()[0]
        if(anchor_name==account_name):
            if(row['ANCHOR_NAME'] in ny_bb1.upper()):
                anchorcount1 = anchorcount1 + 1
            if(row['ACCOUNT_NAME'] in ny_bb1.upper()):
                accountcount1 = accountcount1 + 1
        else:
            if (anchor_name in ny_bb1.upper()):
                anchorcount1 = anchorcount1 + 1
            if(account_name in ny_bb1.upper()):
                accountcount1 = accountcount1 + 1
        if(anchorcount1 > 0 and accountcount1 > 0):
            aggregate_count1=aggregate_count1+1
            agg_url_list.append(x1[:])
            print("existance of both",aggregate_count1)
    aggregate_page_count.append(aggregate_count1)
    agg_url_count.append(agg_url_list)
df['anc_cnt']=pd.Series(anchor_count)
df['acc_cnt']=pd.Series(account_count)
df['agg_cnt']=pd.Series(aggregate_page_count)
df['agg_url_list']=pd.Series(agg_url_count)
The contents of the abc.csv file are as follows:
ANCHOR_NAME,ACCOUNT_NAME
ABC,ABC
XYZ,ZYZ
and so on
For particular URLs the code gets stuck in the try block and control never reaches the except block, where I want to ignore the exception and continue with the normal program flow, i.e. move on to the next URLs and so on.
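One thing worth checking: requests.get() has no timeout by default, so a proxy or server that never answers will block inside the try forever and the except clause is never reached. A minimal sketch of url1_to_string with an explicit timeout (the 30-second value and the empty-string return on failure are my own choices, not from the original code):

def url1_to_string(url1):
    proxyDict = {
        'http': 'http://username:pwd@proxyurl:8080',
        'https': 'https://username:pwd@proxyurl:8080'
    }
    try:
        # timeout makes a hung connection raise instead of blocking forever
        res = requests.get(url1, proxies=proxyDict, timeout=30)
        res.raise_for_status()
    except requests.exceptions.RequestException as e:
        print('Skipping {0}: {1}'.format(url1, e))
        return ""
    soup = BeautifulSoup(res.text, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))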
How do you iterate through each sub-link (fighter) to get the data I need, then leave the sub-link to go back to the page with all the fighters' names on it, then move on to the next fighter (link) and get all the data on that fighter, and keep doing that until it reaches the end of the list on that specific page?
records=[]
r = requests.get('http://www.espn.com/mma/fighters')
soup = BeautifulSoup(r.text,'html.parser')
data = soup.find_all('tr',attrs={'class':['oddrow','evenrow']})
for d in data:
    try:
        name = d.find('a').text
    except AttributeError: name = ""
    try:
        country = d.find('td').findNext('td').text
    except AttributeError: county = ""
    records.append([name,country])
The above code is where all the fighters' names are located. I am able to iterate over each one to collect the fighter's name and country.
links = [f"http://www.espn.com{i['href']}" for i in data.find_all('a') if re.findall('^/mma/', i['href'])][1]
r1 = requests.get(links)
data1 = BeautifulSoup(test.text,'html.parser')
bio = data1.find('div', attrs={'class':'mod-content'})
weightClass = data1.find('li',attrs={'class':'first'}).text
trainingCenter = data1.find('li',attrs={'class':'last'}).text
wins = data1.find('table',attrs={'class':'header-stats'})('td')[0].text
loses = data1.find('table',attrs={'class':'header-stats'})('td')[1].text
draws = data1.find('table',attrs={'class':'header-stats'})('td')[2].text
tkos = data1.find_all('table',attrs={'class':'header-stats'})[1]('td')[0].text
subs = data1.find_all('table',attrs={'class':'header-stats'})[1]('td')[1].text
The above code currently enters the second fighter's page and collects all the data for that specific fighter (link).
records=[]
r = requests.get('http://www.espn.com/mma/fighters')
soup = BeautifulSoup(r.text,'html.parser')
data = soup.find_all('tr',attrs={'class':['oddrow','evenrow']})
links = [f"http://www.espn.com{i['href']}" for i in data.find_all('a') if re.findall('^/mma/', i['href'])]
for d in data:
    try:
        name = d.find('a').text
    except AttributeError: name = ""
    try:
        country = d.find('td').findNext('td').text
    except AttributeError: county = ""
    for l in links:
        r1 = requests.get(links)
        data1 = BeautifulSoup(test.text,'html.parser')
        bio = data1.find('div', attrs={'class':'mod-content'})
        for b in bio:
            try:
                weightClass = data1.find('li',attrs={'class':'first'}).text
            except AttributeError: name = ""
            try:
                trainingCenter = data1.find('li',attrs={'class':'last'}).text
            except AttributeError: name = ""
            try:
                wins = data1.find('table',attrs={'class':'header-stats'})('td')[0].text
            except AttributeError: name = ""
            try:
                loses = data1.find('table',attrs={'class':'header-stats'})('td')[1].text
            except AttributeError: name = ""
            try:
                draws = data1.find('table',attrs={'class':'header-stats'})('td')[2].text
            except AttributeError: name = ""
            try:
                tkos = data1.find_all('table',attrs={'class':'header-stats'})[1]('td')[0].text
            except AttributeError: name = ""
            try:
                subs = data1.find_all('table',attrs={'class':'header-stats'})[1]('td')[1].text
            except AttributeError: name = ""
            records.append([name,country,weightClass])
The above code is what I am trying, but I am getting an error message:
"ResultSet object has no attribute 'find_all'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?"
How do I add this to the initial code I have, so that I can collect the fighter's name and country on the original page, then follow each fighter's link and gather the data you see above, and have it do this for all fighters on that page?
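For what it's worth, the error comes from calling find_all() on the whole ResultSet (data) rather than on each row; calling it on the individual tr element inside the existing loop avoids it. A minimal sketch of that idea, carrying only weightClass for brevity (the row_links name and the empty-string fallbacks are mine, not from the original code):

records = []
r = requests.get('http://www.espn.com/mma/fighters')
soup = BeautifulSoup(r.text, 'html.parser')
for d in soup.find_all('tr', attrs={'class': ['oddrow', 'evenrow']}):
    try:
        name = d.find('a').text
    except AttributeError:
        name = ""
    try:
        country = d.find('td').findNext('td').text
    except AttributeError:
        country = ""
    # d is a single Tag, so find_all() works here
    row_links = [f"http://www.espn.com{a['href']}"
                 for a in d.find_all('a') if re.findall('^/mma/', a.get('href', ''))]
    for link in row_links:
        r1 = requests.get(link)
        fighter = BeautifulSoup(r1.text, 'html.parser')
        try:
            weightClass = fighter.find('li', attrs={'class': 'first'}).text
        except AttributeError:
            weightClass = ""
        records.append([name, country, weightClass])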
Check out this solution. I don't have much time at this moment, but I will check around as soon as I'm free. You can do the main operation using the following code; the only thing you need to do is get the data you want from the target page. The script below fetches all the links from each page, going through the pagination (a to z), and then collects the names from the target pages.
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "http://www.espn.com/mma/fighters?search={}"

for linknum in [chr(i) for i in range(ord('a'),ord('z')+1)]:
    r = requests.get(url.format(linknum))
    soup = BeautifulSoup(r.text,'html.parser')
    for links in soup.select(".tablehead a[href*='id']"):
        res = requests.get(urljoin(url,links.get("href")))
        sauce = BeautifulSoup(res.text,"lxml")
        title = sauce.select_one(".player-bio h1").text
        print(title)
import requests, re
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import pandas as pd

url = "http://www.espn.com/mma/fighters?search={}"

titleList = []
countryList = []
stanceList = []
reachList = []
ageList = []
weightClassList = []
trainingCenterList = []
winsList = []
losesList =[]
drawsList = []
tkosList = []
subsList = []

#i believe this is what takes us from one page to another, but not 100% sure yet
for linknum in [chr(i) for i in range(ord('a'),ord('z')+1)]:
    r = requests.get(url.format(linknum))
    soup = BeautifulSoup(r.text,'html.parser')
    #a[href*=] gets all anchors a that contain whatever the href*=''
    for links in soup.select(".tablehead a[href*='id']"):
        #urljoin just takes a url and another string and combines them to create a new url
        res = requests.get(urljoin(url,links.get("href")))
        sauce = BeautifulSoup(res.text,"lxml")
        try:
            title = sauce.select_one(".player-bio h1").text
        except AttributeError: title = ""
        try:
            country = sauce.find('span',text='Country').next_sibling
        except AttributeError: country = ""
        try:
            stance = sauce.find('span',text='Stance').next_sibling
        except AttributeError: stance = ""
        try:
            reach = sauce.find('span',text='Reach').next_sibling
        except AttributeError: reach = ""
        try:
            age = sauce.find('span',text='Birth Date').next_sibling[-3:-1]
        except AttributeError: age = ""
        try:
            weightClass = sauce.find('li',attrs={'class':'first'}).text
        except AttributeError: weightClass = ""
        try:
            trainingCenter = sauce.find('li',attrs={'class':'last'}).text
        except AttributeError: trainingCenter = ""
        try:
            wins = sauce.find('table',attrs={'class':'header-stats'})('td')[0].text
        except AttributeError: wins = ""
        try:
            loses = sauce.find('table',attrs={'class':'header-stats'})('td')[1].text
        except AttributeError: loses = ""
        try:
            draws = sauce.find('table',attrs={'class':'header-stats'})('td')[2].text
        except AttributeError: draws = ""
        try:
            tkos = sauce.find_all('table',attrs={'class':'header-stats'})[1]('td')[0].text
        except AttributeError: tkos = ""
        try:
            subs = sauce.find_all('table',attrs={'class':'header-stats'})[1]('td')[1].text
        except AttributeError: subs = ""
        titleList.append(title)
        countryList.append(country)
        stanceList.append(stance)
        reachList.append(reach)
        ageList.append(age)
        weightClassList.append(weightClass)
        trainingCenterList.append(trainingCenter)
        winsList.append(wins)
        losesList.append(loses)
        drawsList.append(draws)
        tkosList.append(tkos)
        subsList.append(subs)

df = pd.DataFrame()
df['title'] = titleList
df['country'] = countryList
df['stance'] = stanceList
df['reach'] = reachList
df['age'] = ageList
df['weightClass'] = weightClassList
df['trainingCenter']= trainingCenterList
df['wins'] = winsList
df['loses'] = losesList
df['draws'] = drawsList
df['tkos'] = tkosList
df['subs'] = subsList
df.to_csv('MMA Fighters', encoding='utf-8')
It seems like this Scrapy spider locates the links it is supposed to follow in order to collect additional information, but either it doesn't go to the next page or it is unable to collect the information on the other page. I checked the XPath expressions; they all appear to be correct.
Terminal output:
2017-01-10 10:31:16 [scrapy.extensions.logstats] INFO: Crawled 213 pages (at 23 pages/min), scraped 0 items (at 0 items/min)
Code:
#!/usr/bin/env python

import types
import time
from datetime import date, datetime, timedelta

import requests
import msgpack
from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector, Selector

from resume_data.items import ResumeDataItem, ResultListItem, WorkItem, SchoolItem, ItemList

from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from bs4.element import NavigableString


class ResumeIndeedSpider(CrawlSpider):
    name = "indeed_resume"
    allowed_domains = ["indeed.com"]
    start_urls = ['http://www.indeed.com/resumes/mechanical-engineer',
                  'http://www.indeed.com/resumes/mechanical-engineering',
                  'http://www.indeed.com/resumes/piping-engineer',
                  'http://www.indeed.com/resumes/design-engineer',
                  'http://www.indeed.com/resumes/project-engineer']

    #def __init__(self, filename=None):
    #    self.unis = list()

    rules = (Rule (SgmlLinkExtractor(restrict_xpaths = ('//a[contains(@class,"app_link")]')), callback = "parse_item", follow = True),)

    def parse_item(self, response):
        hxs = Selector(response)
        digest = hxs.xpath('//ol[@class="resultsList"]')
        records = ResumeDataItem()

        url_prefix = 'http://www.indeed.com'
        resume_links = digest.xpath('//li[@class="sre"]//div[@class="sre-entry"]')
        names = digest.xpath('//a[@target="_blank"]/text()').extract()
        links = digest.xpath('//a[@target="_blank"]/@href').extract()

        for name, link in zip(names,links):
            if name not in 'Feedback':
                records['name'] = name
                records['link'] = url_prefix+link
                yield Request(records['link'], meta={'item': records}, callback= self.parse_node)

    def parse_node(self, response):
        hxs = Selector(response)
        records = ResumeDataItem()
        # name = hxs.xpath('/text()').extract()
        name = hxs.xpath('//h1[@id="resume-contact"]/text()').extract()
        headline = hxs.xpath('//h2[@id="headline"]/text()').extract()
        # locale = hxs.xpath('//div[@class="addr" and @itemprop="address"]//p//text()').extract()
        rlocale = hxs.xpath('//p[@id="headline_location" and @class="locality"]//text()').extract()
        summary = hxs.xpath('//p[@id="res_summary" and @class="summary"]/text()').extract()

        skills = list()
        skill = hxs.xpath('//div[@id="skills-items" and @class="items-container"]//p//text()').extract()
        if len(skill) != 0:
            skills.append(''.join(skill).encode('utf-8'))
        skill = hxs.xpath('//div[@id="additionalinfo-section" and @class="last"]//div[@class="data_display"]//p//text()').extract()
        if len(skill) != 0:
            skills.append(''.join(skill).encode('utf-8'))

        resume_links = list()
        links = hxs.xpath('//div[@id="link-items" and @class="items-container"]//p//text()').extract()
        for link in links:
            resume_links.append(''.join(link).encode('utf-8'))

        workHistory = ItemList()
        experience = hxs.xpath('//div[@id="work-experience-items"]/div')
        for elem in experience:
            item = elem.xpath('div')
            for entry in item:
                workEntry = WorkItem()

                title = entry.xpath('p[@class="work_title title"]//text()').extract()
                workEntry['title'] = ''.join(title).encode('utf-8')

                company = entry.xpath('div[@class="work_company"]/span/text()').extract()
                workEntry['company']= ''.join(company).encode('utf-8')

                location = entry.xpath('div[@class="work_company"]/div[@class="inline-block"]/span/text()').extract()
                workEntry['work_location'] = ''.join(company).encode('utf-8')

                dates = entry.xpath('p[@class="work_dates"]//text()').extract()
                dates_str = ''.join(dates).encode('utf-8').split(' to ')
                if len(dates) > 0:
                    if dates_str[0]:
                        workEntry['start_date'] = dates_str[0]
                    if dates_str[1]:
                        workEntry['end_date'] = dates_str[1]
                else:
                    workEntry['start_date'] = 'NULL'
                    workEntry['end_date'] = 'NULL'

                description = entry.xpath('p[@class="work_description"]//text()').extract()
                workEntry['description'] = ''.join(description).encode('utf-8')

                workHistory.container.append(workEntry)

        eduHistory = ItemList()
        education = hxs.xpath('//div[@id="education-items" and @class="items-container"]/div')
        for elem in education:
            item = elem.xpath('div')
            for entry in item:
                eduEntry = SchoolItem()

                degree = entry.xpath('p[@class="edu_title"]/text()').extract()
                degree = ''.join(degree).encode('utf-8')
                eduEntry['degree'] = degree

                school = entry.xpath('div[@class="edu_school"]/span//text()').extract()
                school = ''.join(school).encode('utf-8')
                eduEntry['school'] = school

                locale = entry.xpath('span[@itemprop="addressLocality"]/text()').extract()
                locale = ''.join(locale).encode('utf-8')
                eduEntry['locale'] = locale

                grad_date = entry.xpath('p[@class="edu_dates"]/text()').extract()
                dates_str = ''.join(grad_date).encode('utf-8').split(' to ')
                if len(grad_date) > 0:
                    if len(dates_str) == 2:
                        if dates_str[0]:
                            eduEntry['admit_date'] = dates_str[0]
                        try:
                            if dates_str[1]:
                                eduEntry['grad_date'] = dates_str[1]
                        except:
                            pass
                    elif len(dates_str) == 1:
                        if dates_str[0]:
                            eduEntry['grad_date'] = dates_str[0]
                            eduEntry['admit_date'] = 'NULL'
                else:
                    eduEntry['admit_date'] = 'NULL'
                    eduEntry['grad_date'] = 'NULL'

                eduHistory.container.append(eduEntry)

        records['url'] = response.url
        records['name'] = ''.join(name).encode('utf-8')
        records['headline'] = msgpack.packb(''.join(headline).encode('utf-8'))
        records['locale'] = ''.join(rlocale).encode('utf-8')
        records['summary'] = msgpack.packb(''.join(summary).encode('utf-8'))
        records['skills'] = msgpack.packb(skills)
        records['links'] = resume_links
        #records['experience'] = msgpack.packb(workHistory, default=workHistory.encode)
        records['experience'] = workHistory
        records['education'] = msgpack.packb(eduHistory, default=eduHistory.encode)
        #records['experience'] = workHistory
        #records['education'] = eduHistory
        return records
Obviously this part of the code
for name, link in zip(names,links):
    if name not in 'Feedback':
        records['name'] = name
        records['link'] = url_prefix+link
        yield Request(records['link'], meta={'item': records}, callback= self.parse_node)
doesn't emit any links. Perhaps you meant if 'Feedback' not in name.
Also note that the XPath here, digest.xpath('//a[@target="_blank"]/text()'), is applied to the DOM overall, not only to the part previously extracted for digest. If you'd like to apply the XPath to the digest selector, you should use a leading dot in the XPath, like this: digest.xpath('.//a[@target="_blank"]/text()')
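Putting both fixes together, the link-emitting part of parse_item would look roughly like this (a sketch; only the condition and the leading dots change, the rest stays as in your code):

names = digest.xpath('.//a[@target="_blank"]/text()').extract()
links = digest.xpath('.//a[@target="_blank"]/@href').extract()
for name, link in zip(names, links):
    if 'Feedback' not in name:  # skip the Feedback anchor, keep real resume links
        records['name'] = name
        records['link'] = url_prefix + link
        yield Request(records['link'], meta={'item': records}, callback=self.parse_node)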
So I was just wondering what my getURLs function's issue might be. I'm trying to get all URLs from within the containing body's string.
My crawler isn't crawling anything because my input URLs are invalid.
# Get all URLs contained within the body string
def getURLs(body):
    urls = []
    tempArr = body.split("a href=")
    index = 1
    for part in tempArr:
        if part[0] == '"':
            while (part[index] != '"' and index < len(part)):
                index += 1
            if index < len(part):
                urls.append(part[1:index-1])
            index = 1
    return urls

# Open file which contains input urls
with open("test_urls.txt","rU") as infile:
    urls = [row.strip("\n") for row in infile]


class BackpageSpider(CrawlSpider):
    name = 'backpage'
    allowed_domains = ['backpage.com']
    start_urls = urls

    def parse(self,response):
        #print response.url
        if response.status < 600:
            # all_links = response.xpath("//div[contains(@class,'cat')]/a/@href").extract()
            #all the links FOR THE ESCORTS on whatever page we're on
            todays_links = []
            #all the links for today's date
            backpage_date = backpage_date_today()
            yesterday_date = backpage_date_yesterday()
            if backpage_date in response.body:
                todays_section = response.body.split(backpage_date)[1].split(yesterday_date)[0].decode('utf-8')
                # todays_links = todays_section.xpath("//div[contains(@class,'cat')]/a/@href").extract
                todays_links = getURLs(todays_section)
                # for url in todays_links:
                #     todays_links.append(url)
                # for url in all_links:
                #     if url in todays_section:
                #         todays_links.append(url)
            for url in todays_links:
                yield scrapy.Request(url,callback=self.parse_ad_into_content)####HERE
            for url in set(response.xpath('//a[@class="pagination next"]/@href').extract()):
                yield scrapy.Request(url,callback=self.parse)
        else:
            time.sleep(600)
            yield scrapy.Request(response.url,callback=self.parse)

    def parse_ad_into_content(self,response):
        #ipdb.set_trace()
        item = items.BackpageScrapeItem(
            url=response.url,
            backpage_id=response.url.split('.')[0].split('/')[2].encode('utf-8'),
            text = response.body,
            posting_body= response.xpath("//div[@class='postingBody']").extract()[0].encode('utf-8'),
            date = datetime.utcnow()-timedelta(hours=5),
            posted_date = response.xpath("//div[@class='adInfo']/text()").extract()[0].encode('utf-8'),
            posted_age = response.xpath("//p[@class='metaInfoDisplay']/text()").extract()[0].encode('utf-8'),
            posted_title = response.xpath("//div[@id='postingTitle']//h1/text()").extract()[0].encode('utf-8')
        )
        return item
The web page is: http://grandisland.backpage.com/FemaleEscorts/?layout=date
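As an aside, the hand-rolled string walk in getURLs is fragile: part[1:index-1] drops the last character of every URL, and the while condition reads part[index] before checking the bound, so it can raise IndexError when a closing quote is missing. A minimal regex-based sketch that pulls every double-quoted href value out of the body instead (my own replacement, not part of the original spider):

import re

def getURLs(body):
    # capture whatever sits between the quotes of a href="..."
    return re.findall(r'a href="([^"]+)"', body)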