How do I iterate through each sub-link (fighter) to get the data I need, return to the page that lists all the fighters' names, move on to the next fighter (link), collect all the data on that fighter, and keep doing that until I reach the end of the list on that page?
import requests
from bs4 import BeautifulSoup

records = []
r = requests.get('http://www.espn.com/mma/fighters')
soup = BeautifulSoup(r.text, 'html.parser')
data = soup.find_all('tr', attrs={'class': ['oddrow', 'evenrow']})
for d in data:
    try:
        name = d.find('a').text
    except AttributeError:
        name = ""
    try:
        country = d.find('td').findNext('td').text
    except AttributeError:
        country = ""
    records.append([name, country])
The above code covers the page where all the fighters' names are located. I am able to iterate over each row and collect the fighter's name and country.
import re

# [1] picks out the second fighter's link as a test case
links = [f"http://www.espn.com{i['href']}" for i in soup.find_all('a', href=True) if re.findall('^/mma/', i['href'])][1]
r1 = requests.get(links)
data1 = BeautifulSoup(r1.text, 'html.parser')
bio = data1.find('div', attrs={'class': 'mod-content'})
weightClass = data1.find('li', attrs={'class': 'first'}).text
trainingCenter = data1.find('li', attrs={'class': 'last'}).text
wins = data1.find('table', attrs={'class': 'header-stats'})('td')[0].text
loses = data1.find('table', attrs={'class': 'header-stats'})('td')[1].text
draws = data1.find('table', attrs={'class': 'header-stats'})('td')[2].text
tkos = data1.find_all('table', attrs={'class': 'header-stats'})[1]('td')[0].text
subs = data1.find_all('table', attrs={'class': 'header-stats'})[1]('td')[1].text
The above code currently enters the second fighter's page and collects all the data for that specific fighter (link).
records = []
r = requests.get('http://www.espn.com/mma/fighters')
soup = BeautifulSoup(r.text, 'html.parser')
data = soup.find_all('tr', attrs={'class': ['oddrow', 'evenrow']})
links = [f"http://www.espn.com{i['href']}" for i in data.find_all('a') if re.findall('^/mma/', i['href'])]
for d in data:
    try:
        name = d.find('a').text
    except AttributeError:
        name = ""
    try:
        country = d.find('td').findNext('td').text
    except AttributeError:
        country = ""
    for l in links:
        r1 = requests.get(l)
        data1 = BeautifulSoup(r1.text, 'html.parser')
        bio = data1.find('div', attrs={'class': 'mod-content'})
        for b in bio:
            try:
                weightClass = data1.find('li', attrs={'class': 'first'}).text
            except AttributeError:
                weightClass = ""
            try:
                trainingCenter = data1.find('li', attrs={'class': 'last'}).text
            except AttributeError:
                trainingCenter = ""
            try:
                wins = data1.find('table', attrs={'class': 'header-stats'})('td')[0].text
            except AttributeError:
                wins = ""
            try:
                loses = data1.find('table', attrs={'class': 'header-stats'})('td')[1].text
            except AttributeError:
                loses = ""
            try:
                draws = data1.find('table', attrs={'class': 'header-stats'})('td')[2].text
            except AttributeError:
                draws = ""
            try:
                tkos = data1.find_all('table', attrs={'class': 'header-stats'})[1]('td')[0].text
            except AttributeError:
                tkos = ""
            try:
                subs = data1.find_all('table', attrs={'class': 'header-stats'})[1]('td')[1].text
            except AttributeError:
                subs = ""
    records.append([name, country, weightClass])
The above code is what I am trying, but I am getting an error message:
"ResultSet object has no attribute 'find_all'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?"
How do I combine this with my initial code, so that I can collect each fighter's name and country on the index page, then follow the fighter's link, gather the data shown above, and do that for every fighter on the page?
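For reference, the quoted error comes from links = [... for i in data.find_all('a') ...]: soup.find_all returns a ResultSet, which is a plain list of tags, and the ResultSet itself has no find_all method; only the individual Tag objects inside it do. A minimal sketch of the distinction, collecting the hrefs row by row instead:

data = soup.find_all('tr', attrs={'class': ['oddrow', 'evenrow']})  # ResultSet: a list of <tr> Tags
links = []
for row in data:                                  # iterate the ResultSet...
    for a in row.find_all('a', href=True):        # ...and call find_all on each Tag
        if a['href'].startswith('/mma/'):
            links.append(f"http://www.espn.com{a['href']}")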
Check out this solution. I don't have much time at the moment, but I will check back as soon as I'm free. You can do the main operation using the following code; the only thing you need to do is pull the data you want from each target page. The script below fetches all the links from each page, going through the pagination (a to z), and then collects the names from each target page.
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "http://www.espn.com/mma/fighters?search={}"

for linknum in [chr(i) for i in range(ord('a'), ord('z')+1)]:
    r = requests.get(url.format(linknum))
    soup = BeautifulSoup(r.text, 'html.parser')
    for links in soup.select(".tablehead a[href*='id']"):
        res = requests.get(urljoin(url, links.get("href")))
        sauce = BeautifulSoup(res.text, "lxml")
        title = sauce.select_one(".player-bio h1").text
        print(title)
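For what it's worth, urljoin resolves the relative href against the base URL, so the ?search={} query string in the template does not get in the way; a quick illustration (the fighter path here is made up):

from urllib.parse import urljoin

base = "http://www.espn.com/mma/fighters?search=a"
print(urljoin(base, "/mma/fighter/_/id/12345/some-fighter"))
# -> http://www.espn.com/mma/fighter/_/id/12345/some-fighter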
import requests, re
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import pandas as pd
url = "http://www.espn.com/mma/fighters?search={}"
titleList = []
countryList = []
stanceList = []
reachList = []
ageList = []
weightClassList = []
trainingCenterList = []
winsList = []
losesList =[]
drawsList = []
tkosList = []
subsList = []
# this is what takes us from one page to another: one results page per letter, a to z
for linknum in [chr(i) for i in range(ord('a'), ord('z')+1)]:
    r = requests.get(url.format(linknum))
    soup = BeautifulSoup(r.text, 'html.parser')
    # a[href*='id'] selects all anchors whose href contains the substring 'id'
    for links in soup.select(".tablehead a[href*='id']"):
        # urljoin combines the base url with the relative href to create an absolute url
        res = requests.get(urljoin(url, links.get("href")))
        sauce = BeautifulSoup(res.text, "lxml")
        try:
            title = sauce.select_one(".player-bio h1").text
        except AttributeError:
            title = ""
        try:
            country = sauce.find('span', text='Country').next_sibling
        except AttributeError:
            country = ""
        try:
            stance = sauce.find('span', text='Stance').next_sibling
        except AttributeError:
            stance = ""
        try:
            reach = sauce.find('span', text='Reach').next_sibling
        except AttributeError:
            reach = ""
        try:
            age = sauce.find('span', text='Birth Date').next_sibling[-3:-1]
        except AttributeError:
            age = ""
        try:
            weightClass = sauce.find('li', attrs={'class': 'first'}).text
        except AttributeError:
            weightClass = ""
        try:
            trainingCenter = sauce.find('li', attrs={'class': 'last'}).text
        except AttributeError:
            trainingCenter = ""
        try:
            wins = sauce.find('table', attrs={'class': 'header-stats'})('td')[0].text
        except AttributeError:
            wins = ""
        try:
            loses = sauce.find('table', attrs={'class': 'header-stats'})('td')[1].text
        except AttributeError:
            loses = ""
        try:
            draws = sauce.find('table', attrs={'class': 'header-stats'})('td')[2].text
        except AttributeError:
            draws = ""
        try:
            tkos = sauce.find_all('table', attrs={'class': 'header-stats'})[1]('td')[0].text
        except (AttributeError, IndexError):
            tkos = ""
        try:
            subs = sauce.find_all('table', attrs={'class': 'header-stats'})[1]('td')[1].text
        except (AttributeError, IndexError):
            subs = ""
        titleList.append(title)
        countryList.append(country)
        stanceList.append(stance)
        reachList.append(reach)
        ageList.append(age)
        weightClassList.append(weightClass)
        trainingCenterList.append(trainingCenter)
        winsList.append(wins)
        losesList.append(loses)
        drawsList.append(draws)
        tkosList.append(tkos)
        subsList.append(subs)
df = pd.DataFrame()
df['title'] = titleList
df['country'] = countryList
df['stance'] = stanceList
df['reach'] = reachList
df['age'] = ageList
df['weightClass'] = weightClassList
df['trainingCenter']= trainingCenterList
df['wins'] = winsList
df['loses'] = losesList
df['draws'] = drawsList
df['tkos'] = tkosList
df['subs'] = subsList
df.to_csv('mma_fighters.csv', index=False, encoding='utf-8')
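As an aside, the twelve parallel lists can be collapsed into a single list of dicts, which keeps each fighter's fields together and feeds the DataFrame constructor directly; a sketch of the pattern, using the same field variables as the loop above:

rows = []
# inside the per-fighter loop, after the try/except blocks:
rows.append({'title': title, 'country': country, 'stance': stance, 'reach': reach,
             'age': age, 'weightClass': weightClass, 'trainingCenter': trainingCenter,
             'wins': wins, 'loses': loses, 'draws': draws, 'tkos': tkos, 'subs': subs})
# after the loops:
df = pd.DataFrame(rows)
df.to_csv('mma_fighters.csv', index=False, encoding='utf-8')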
Related
As of right now I have working code: a web scraper that logs into the Indeed job-search site. My issue now is that I need to create a CSV file that lists every single job position that was found; currently it gives me the number of positions available and the description of only one of them. I hope I can get some help; I would greatly appreciate it.
import re
import csv
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint
from datetime import datetime  # used by get_record below
jk_pattern = re.compile(r"jk:\'([a-zA-Z0-9]+)'")
params = {"q": "mechanical+engineer", "l": "united+states", "start": 0}
url = "https://www.indeed.com/jobs"
job_keys = set()
for x in range(10):
    response = requests.get(url, params=params)
    if not response.status_code == 200:
        break
    else:
        keys = jk_pattern.findall(response.text)
        if len(keys) > 0:
            for key in keys:
                job_keys.add(key)
    params['start'] += 20
    sleep(randint(0, 3))
print(len(job_keys))

template = "https://www.indeed.com/viewjob?jk={}"
jk = job_keys.pop()
job_url = template.format(jk)
response = requests.get(job_url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.find("div", id="jobDescriptionText").text)
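Since the goal is a CSV of every position rather than a single description, the same viewjob template can be looped over the whole job_keys set; a minimal sketch using only names already defined above (job_descriptions.csv is a hypothetical output name):

with open('job_descriptions.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['jk', 'description'])
    for jk in job_keys:
        response = requests.get(template.format(jk))
        soup = BeautifulSoup(response.text, 'html.parser')
        desc_tag = soup.find("div", id="jobDescriptionText")
        writer.writerow([jk, desc_tag.text.strip() if desc_tag else ''])
        sleep(randint(0, 3))  # stay polite between requests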
def get_record(card):
    """Extract job data from a single record"""
    job_title = card.h2.a.get('title')
    company = card.find('span', 'company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    post_date = card.find('span', 'date').text
    today = datetime.today().strftime('%Y-%m-%d')
    summary = card.find('div', 'summary').text.strip().replace('\n', ' ')
    job_url = 'https://www.indeed.com' + card.h2.a.get('href')
    # this does not exist for all jobs, so handle the exceptions
    salary_tag = card.find('span', 'salaryText')
    if salary_tag:
        salary = salary_tag.text.strip()
    else:
        salary = ''
    record = (job_title, company, job_location, post_date, today, summary, salary, job_url)
    return record
def main(position, location):
    """Run the main program routine"""
    records = []
    url = get_url(position, location)
    # extract the job data
    while True:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'jobsearch-SerpJobCard')
        for card in cards:
            record = get_record(card)
            records.append(record)
        try:
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except AttributeError:
            break
    # save the job data
    with open('results.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary', 'JobUrl'])
        writer.writerows(records)
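One gap: main() calls get_url, which is not defined in the snippet. A plausible sketch, assuming the same q/l query parameters the earlier request used, followed by a call to kick the whole thing off:

def get_url(position, location):
    """Hypothetical helper: build the search URL from position and location."""
    template = 'https://www.indeed.com/jobs?q={}&l={}'
    return template.format(position.replace(' ', '+'), location.replace(' ', '+'))

main('mechanical engineer', 'united states')  # writes results.csv when it finishes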
When I run my code I get this error:
'NoneType' object has no attribute 'head'
I don't know why. I want to get a result like this picture. Could you help me?
import requests
from bs4 import BeautifulSoup
import pandas as pd

def getReviews(page_url):
    page = requests.get(page_url)
    reviews = None
    if page.status_code == 200:
        soup = BeautifulSoup(page.content, 'html.parser')
        for d in soup.findAll('div', attrs={'class': 'tabs__link js-tab-link js-tab-tracker'}):
            reviewer = d.find('a', attrs={'class': 'unstyled bold articleLink'})
            source = d.find('em', attrs={'class': 'subtle critic-publication'})
            content = d.find('div', attrs={'class': 'the_review'})
            date = d.find('div', attrs={'class': 'review-date subtle small'})
            score = d.find('div', attrs={'class': 'small subtle review-link'})
            all1 = []
            # reviewer
            if reviewer is not None:
                all1.append(reviewer.text)
            else:
                all1.append(None)
            # source
            if source is not None:
                all1.append(source.text)
            else:
                all1.append(None)
            # content
            if content is not None:
                all1.append(content.text)
            else:
                all1.append(None)
            # date
            if date is not None:
                all1.append(date.text)
            else:
                all1.append(None)
            # score
            if score is not None:
                all1.append(score.text)
            else:
                all1.append(None)
            reviews.append(all1)
    return reviews

url = 'https://www.rottentomatoes.com/m/soul_2020/reviews?type=top_critics'
reviews = getReviews(url)
print(reviews.head(5))
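For what it's worth, the traceback is consistent with getReviews returning None: reviews is only ever set to None, nothing is appended to it, and .head(5) is a DataFrame method rather than a list method. A minimal sketch of the likely fix, keeping the question's selectors (which may themselves match nothing, since tabs__link js-tab-link js-tab-tracker targets the page's tab bar rather than the review rows):

def getReviews(page_url):
    page = requests.get(page_url)
    rows = []                                   # a list, not None, so append works
    if page.status_code == 200:
        soup = BeautifulSoup(page.content, 'html.parser')
        for d in soup.findAll('div', attrs={'class': 'tabs__link js-tab-link js-tab-tracker'}):
            fields = (d.find('a', attrs={'class': 'unstyled bold articleLink'}),
                      d.find('em', attrs={'class': 'subtle critic-publication'}),
                      d.find('div', attrs={'class': 'the_review'}),
                      d.find('div', attrs={'class': 'review-date subtle small'}),
                      d.find('div', attrs={'class': 'small subtle review-link'}))
            rows.append([tag.text if tag is not None else None for tag in fields])
    # return a DataFrame so that .head(5) works at the call site
    return pd.DataFrame(rows, columns=['reviewer', 'source', 'content', 'date', 'score'])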
I received an AttributeError when web scraping, but I am unsure what I am doing wrong. What does AttributeError mean?
import requests
from bs4 import BeautifulSoup

response_obj = requests.get('https://en.wikipedia.org/wiki/Demographics_of_New_York_City').text
soup = BeautifulSoup(response_obj, 'lxml')
Population_Census_Table = soup.find('table', {'class': 'wikitable sortable'})
# preparation of the table
rows = Population_Census_Table.select("tbody > tr")[3:8]
jurisdiction = []
for row in rows:
    jurisdiction = {}
    tds = row.select('td')
    jurisdiction["jurisdiction"] = tds[0].text.strip()
    jurisdiction["population_census"] = tds[1].text.strip()
    jurisdiction["%_white"] = float(tds[2].text.strip().replace(",", ""))
    jurisdiction["%_black_or_african_amercian"] = float(tds[3].text.strip().replace(",", ""))
    jurisdiction["%_Asian"] = float(tds[4].text.strip().replace(",", ""))
    jurisdiction["%_other"] = float(tds[5].text.strip().replace(",", ""))
    jurisdiction["%_mixed_race"] = float(tds[6].text.strip().replace(",", ""))
    jurisdiction["%_hispanic_latino_of_other_race"] = float(tds[7].text.strip().replace(",", ""))
    jurisdiction["%_catholic"] = float(tds[7].text.strip().replace(",", ""))
    jurisdiction["%_jewish"] = float(tds[8].text.strip().replace(",", ""))
    jurisdiction.append(jurisdiction)
print(jurisdiction)
AttributeError
---> 18 jurisdiction.append(jurisdiction)
AttributeError: 'dict' object has no attribute 'append'
You start with jurisdiction as a list and immediately rebind the same name to a dict. You then treat it as a dict until the error line, where you try to treat it as a list again. You need another name for the list at the start; presumably you meant jurisdictions (plural) for the list. However, IMO there are two other areas that also definitely need fixing:
find returns the first matching table only, while the labels/keys in your dict indicate you want a later table (not the first match).
Your td indexing is incorrect for the target table.
You want something like:
import requests, re
from bs4 import BeautifulSoup

response_obj = requests.get('https://en.wikipedia.org/wiki/Demographics_of_New_York_City').text
soup = BeautifulSoup(response_obj, 'lxml')
Population_Census_Table = soup.select_one('.wikitable:nth-of-type(5)')  # use css selector to target correct table

jurisdictions = []
rows = Population_Census_Table.select("tbody > tr")[3:8]
for row in rows:
    jurisdiction = {}
    tds = row.select('td')
    jurisdiction["jurisdiction"] = tds[0].text.strip()
    jurisdiction["population_census"] = tds[1].text.strip()
    jurisdiction["%_white"] = float(tds[2].text.strip().replace(",", ""))
    jurisdiction["%_black_or_african_amercian"] = float(tds[3].text.strip().replace(",", ""))
    jurisdiction["%_Asian"] = float(tds[4].text.strip().replace(",", ""))
    jurisdiction["%_other"] = float(tds[5].text.strip().replace(",", ""))
    jurisdiction["%_mixed_race"] = float(tds[6].text.strip().replace(",", ""))
    jurisdiction["%_hispanic_latino_of_other_race"] = float(tds[7].text.strip().replace(",", ""))
    jurisdiction["%_catholic"] = float(tds[10].text.strip().replace(",", ""))
    jurisdiction["%_jewish"] = float(tds[12].text.strip().replace(",", ""))
    jurisdictions.append(jurisdiction)
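To sanity-check the parse, you can print a few of the collected fields:

for j in jurisdictions:
    print(j["jurisdiction"], j["population_census"], j["%_white"])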
Python code gets stuck in the try block
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
#import urllib2

def url1_to_string(url1):
    html = ""
    proxyDict = {
        'http': 'http://username:pwd@proxyurl:8080',
        'https': 'https://username:pwd@proxyurl:8080'
    }
    try:
        print('Before res in try')
        res = requests.get(url1, proxies=proxyDict)
        print('After res in try')
    except:
        pass
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
df = pd.read_csv(r'C:\filepath\abc.csv', encoding='latin-1')
anchor_count = []
account_count = []
aggregate_page_count = []
agg_url_count = []
for index, row in df.iterrows():
    agg_url_list = []
    ini_url = "http://www.google.com/search?q=" + row['ANCHOR_NAME'] + " AND " + row['ACCOUNT_NAME']
    r = requests.get(ini_url, proxies={"http": "http://one.proxy.att.com:8080"})
    ny_bb1 = url1_to_string(ini_url)
    anchor_count.append(ny_bb1.lower().count(row['ANCHOR_NAME'].lower()))
    account_count.append(ny_bb1.lower().count(row['ACCOUNT_NAME'].lower()))
    print(anchor_count)
    soup = BeautifulSoup(r.text, "html.parser")
    get_details1 = soup.find_all("div", attrs={"class": "g"})
    sublist1 = []
    for details1 in get_details1:
        link1 = details1.find_all("h3")
        for mdetails1 in link1[:]:
            links1 = mdetails1.find_all("a")
            lmk1 = ""
            for lnk1 in links1[:]:
                lmk1 = lnk1.get("href")[7:].split("&")
                sublist1.append(lmk1[0])
    aggregate_count1 = 0
    for x1 in sublist1[:3]:
        anchorcount1 = 0
        accountcount1 = 0
        print("agg url", x1)
        try:
            print('In try block')
            ny_bb1 = url1_to_string(x1)
        except KeyboardInterrupt:
            print('You cancelled the operation.')
        finally:
            pass
        ny_bb1 = ny_bb1.upper()
        print(ny_bb1)
        row['ANCHOR_NAME'] = row['ANCHOR_NAME'].upper()
        row['ACCOUNT_NAME'] = row['ACCOUNT_NAME'].upper()
        anchor_name = re.match(r'\W*(\w[^,. !?"]*)', row['ANCHOR_NAME']).groups()[0]
        account_name = re.match(r'\W*(\w[^,. !?"]*)', row['ACCOUNT_NAME']).groups()[0]
        if anchor_name == account_name:
            if row['ANCHOR_NAME'] in ny_bb1.upper():
                anchorcount1 = anchorcount1 + 1
            if row['ACCOUNT_NAME'] in ny_bb1.upper():
                accountcount1 = accountcount1 + 1
        else:
            if anchor_name in ny_bb1.upper():
                anchorcount1 = anchorcount1 + 1
            if account_name in ny_bb1.upper():
                accountcount1 = accountcount1 + 1
        if anchorcount1 > 0 and accountcount1 > 0:
            aggregate_count1 = aggregate_count1 + 1
            agg_url_list.append(x1[:])
            print("existence of both", aggregate_count1)
    aggregate_page_count.append(aggregate_count1)
    agg_url_count.append(agg_url_list)
df['anc_cnt'] = pd.Series(anchor_count)
df['acc_cnt'] = pd.Series(account_count)
df['agg_cnt'] = pd.Series(aggregate_page_count)
df['agg_url_list'] = pd.Series(agg_url_count)
The contents of the abc.csv file are as follows:
ANCHOR_NAME,ACCOUNT_NAME
ABC,ABC
XYZ,ZYZ
and so on
For particular URLs the code gets stuck in the try block, and control never reaches the except block, where I want to ignore the exception and continue with the normal program flow, i.e. move on to the next URL, and so on.
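A likely explanation: requests.get without a timeout can block indefinitely on an unresponsive host, so no exception is ever raised and the code simply waits inside the try. requests accepts a timeout argument and raises requests.exceptions.Timeout when it expires; a minimal sketch of the fetch wrapped that way (fetch_text and the 10-second value are illustrative choices):

import requests

def fetch_text(url, proxies=None, timeout=10):
    # without timeout=, a dead or filtered host can hang the call forever
    try:
        res = requests.get(url, proxies=proxies, timeout=timeout)
        return res.text
    except requests.exceptions.RequestException as e:  # covers Timeout, ConnectionError, etc.
        print('skipping {}: {}'.format(url, e))
        return ""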
One of my friends was developing a Scrapy script to scrape data from a page.
After some time, I needed to add another field to it. I added the field successfully, but the problem is that the field is not getting the data of the links inside the td. The field name is "Last Batsman".
Data URL:
http://digicricket.marssil.com/match/MatchData.aspx?op=1&match=1385
XPath of the data:
//*[@id="ctl00_ContentPlaceHolder1_divData"]/table[6]/tr/td
import scrapy
from bs4 import BeautifulSoup
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector
from digicricket.items import ODIorTestItem

class DigicricketMarsilOp1Spider(scrapy.Spider):
    name = "digicricket.marssil.op1"
    allowed_domains = ["digicricket.marssil.com"]

    def __init__(self, match_id=None):
        if match_id:
            match_id_list = match_id.split(',')
            for i in match_id_list:
                if not i.isdigit():
                    raise CloseSpider('Match ID = {0} is not a number'.format(i))
            else:
                self.start_urls = ['http://digicricket.marssil.com/match/MatchData.aspx?op=1&match={0}'.format(i)
                                   for i in match_id_list]
        else:
            raise CloseSpider('You forgot input Match ID/IDs')

    def parse(self, response):
        item = ODIorTestItem()
        item['Batsman_op1'] = []
        item['Bowler_op1'] = []
        item['other_op1'] = []
        sel = Selector(response)
        tables = sel.xpath('//div[@id="ctl00_ContentPlaceHolder1_divData"]/table').extract()
        row_for_other = dict()
        for i in xrange(len(tables)):
            html_text = BeautifulSoup(tables[i])
            if i == 1:
                sl = 0
                for tr in html_text.find_all('tr'):
                    td = tr.find_all('td')
                    if td:
                        sl += 1
                        row = dict()
                        row['sl'] = sl
                        row['match_id'] = response.url[response.url.rfind('=')+1:]
                        row["Batsman"] = td[0].get_text()
                        row["R"] = td[1].get_text()
                        row["B"] = td[2].get_text()
                        row["4s"] = td[3].get_text()
                        row["6s"] = td[4].get_text()
                        row["SR"] = td[5].get_text()
                        item['Batsman_op1'].append(row)
            elif i == 2:
                sl = 0
                for tr in html_text.find_all('tr'):
                    td = tr.find_all('td')
                    if td:
                        sl += 1
                        row = dict()
                        row['sl'] = sl
                        row['match_id'] = response.url[response.url.rfind('=')+1:]
                        row["Bowler"] = td[0].get_text()
                        row["O"] = td[1].get_text()
                        row["M"] = td[2].get_text()
                        row["R"] = td[3].get_text()
                        row["W"] = td[4].get_text()
                        row["Econ"] = td[5].get_text()
                        item['Bowler_op1'].append(row)
            else:
                for tr in html_text.find_all('tr'):
                    td = tr.find_all('td')
                    if i == 0:
                        try:
                            row_for_other["InningsMatchDetails"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                             'table[1]/tr/td/b/text()[1]').extract()[0]
                        except:
                            row_for_other["InningsMatchDetails"] = None
                        try:
                            row_for_other["CurrentScore"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                      'table[1]/tr/td/b/span/text()').extract()[0]
                        except:
                            row_for_other["CurrentScore"] = None
                        try:
                            row_for_other["OversRunRate"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                      'table[1]/tr/td/b/text()[2]').extract()[0]
                        except:
                            row_for_other["OversRunRate"] = None
                        try:
                            row_for_other["Extras"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/table[1]/'
                                                                'tr/td/b/text()[3]').extract()[0]
                        except:
                            row_for_other["Extras"] = None
                        try:
                            row_for_other["MatchResult"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                     'table[1]/tr/td/b/text()[4]').extract()[0]
                        except:
                            row_for_other["MatchResult"] = None
                        try:
                            row_for_other["RecentOvers"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                     'table[4]/tr/td[2]/text()').extract()[0]
                        except:
                            row_for_other["RecentOvers"] = None
                        try:
                            row_for_other["LastBatsman"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                     'table[6]/tr/td/text()').extract()[0]
                        except:
                            row_for_other["LastBatsman"] = None
        row_for_other['match_id'] = response.url[response.url.rfind('=')+1:]
        item['other_op1'].append(row_for_other)
        return item
Your XPath seems to miss some tags. On the web page there are two div levels before the second table; replacing / with // takes care of these. (Because my browser added some <tbody> tags, there is also a double slash in front of the tr.)
.//*[@id="ctl00_ContentPlaceHolder1_divData"]//table[6]//tr/td/a[1]/text()
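Dropped into the spider's existing try/except pattern, the LastBatsman assignment would then look something like:

try:
    row_for_other["LastBatsman"] = sel.xpath('.//*[@id="ctl00_ContentPlaceHolder1_divData"]'
                                             '//table[6]//tr/td/a[1]/text()').extract()[0]
except:
    row_for_other["LastBatsman"] = None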