How to exclude stocks based on technicals in Python

I have code that gives me the technicals of stocks from Yahoo, no problem with that, but I am trying to get the program to not print stocks if they do not meet a requirement, for example if revenue is not greater than 100B.
I have tried an if statement at various parts of this code; none seem to work.
def scrape_yahoo(stock):
    technicals = {}
    try:
        url = ('http://finance.yahoo.com/q/ks?s=' + stock)
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page, 'html.parser')
        tables = soup.findAll('table', {"class": 'table-qsp-stats'})  # Found using page inspection
        for table in tables:
            table_body = table.find('tbody')
            rows = table_body.find_all('tr')
            for row in rows:
                col_name = row.find_all('span')  # Use span to avoid subscripts
                col_name = [cell.text.strip() for cell in col_name]
                col_val = row.find_all('td')
                col_val = [cell.text.strip() for cell in col_val]
                technicals[col_name[0]] = col_val[1]  # col_val[0] is the name cell (with subscript)
        return technicals
    except Exception as e:
        print('Failed, exception: ', str(e))

def scrape(stock_list, interested, technicals):
    for each_stock in stock_list:
        technicals = scrape_yahoo(each_stock)
        if int('Revenue') > 100000000000:
            print(each_stock)
            for ind in interested:
                print(ind + ": " + technicals[ind])
            print("------")
        time.sleep(1)  # Use delay to avoid getting flagged as bot
    return technicals

def main():
    stock_list = ['aapl', 'tsla', 'ge']
    interested = ['Market Cap (intraday)', 'Return on Equity', 'Revenue', 'Quarterly Revenue Growth']
    technicals = {}
    tech = scrape(stock_list, interested, technicals)
    print(tech)

main()
ValueError: invalid literal for int() with base 10: 'Revenue'

I assume that technicals is a dict and that it has a 'Revenue' key. You should change

if int('Revenue')

to

if int(technicals.get('Revenue', 0))

Keep in mind that Yahoo returns revenue as a string with a suffix (for example '100B'), so the suffix still has to be stripped before converting; the full code below does exactly that.

import time
import urllib.request
from bs4 import BeautifulSoup

def scrape_yahoo(stock):
    technicals = {}
    try:
        url = ('http://finance.yahoo.com/q/ks?s=' + stock)
        page = urllib.request.urlopen(url)
        soup = BeautifulSoup(page, 'html.parser')
        tables = soup.findAll('table', {"class": 'table-qsp-stats'})  # Found using page inspection
        for table in tables:
            table_body = table.find('tbody')
            rows = table_body.find_all('tr')
            for row in rows:
                col_name = row.find_all('span')  # Use span to avoid subscripts
                col_name = [cell.text.strip() for cell in col_name]
                col_val = row.find_all('td')
                col_val = [cell.text.strip() for cell in col_val]
                technicals[col_name[0]] = col_val[1]  # col_val[0] is the name cell (with subscript)
        return technicals
    except Exception as e:
        print('Failed, exception: ', str(e))

def scrape(stock_list, interested, technicals):
    for each_stock in stock_list:
        technicals = scrape_yahoo(each_stock)
        # Default to the string '0' so .replace() works when 'Revenue' is missing
        if float(technicals.get('Revenue', '0').replace("B", "")) * 1000000000 > 100000000000:
            print(each_stock)
            for ind in interested:
                print(ind + ": " + technicals[ind])
            print("------")
        time.sleep(1)  # Use delay to avoid getting flagged as bot
    return technicals

def main():
    stock_list = ['aapl', 'tsla', 'ge']
    interested = ['Market Cap (intraday)', 'Return on Equity', 'Revenue', 'Quarterly Revenue Growth']
    technicals = {}
    tech = scrape(stock_list, interested, technicals)
    print(tech)

main()
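If you want to filter on other fields as well, the suffixed figures Yahoo returns (for example '1.2M' or '100B') can be normalised with a small helper before comparing. A minimal sketch, assuming the values follow that number-plus-suffix pattern (the helper name is my own):

def to_number(value, default=0.0):
    """Convert a Yahoo-style abbreviated figure like '100B' or '1.2M' to a float.
    Returns `default` for missing or non-numeric values such as 'N/A'."""
    multipliers = {'K': 1e3, 'M': 1e6, 'B': 1e9, 'T': 1e12}
    value = (value or '').strip().replace(',', '')
    if not value:
        return default
    suffix = value[-1].upper()
    try:
        if suffix in multipliers:
            return float(value[:-1]) * multipliers[suffix]
        return float(value.rstrip('%'))
    except ValueError:
        return default

# Example filter, reusing the dict returned by scrape_yahoo():
# if to_number(technicals.get('Revenue')) > 100e9:
#     print(each_stock)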

Related

Issues Scraping multiple webpages with BeautifulSoup

I am scraping a URL (example: https://bitinfocharts.com/top-100-richest-dogecoin-addresses-4.html) and the number on the end of the URL is the page number. I am trying to scrape multiple pages, so I used the following code to loop through the multiple pages:
for page in range(4, 7):  # Range designates the page numbers for the URL
    r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html')  # Format the page number into url
    print(page)
When I run that snippet and print the page number, it returns 4, 5 and 6, so the loop itself appears to be working. However, when I run the full code, it only gives me the results for the 6th page.
What I think may be happening is that the code ends up using only the last number, formatting just that into the URL, when it should be formatting each number into the URL in turn.
I have looked at other questions with similar issues but haven't been able to find a solution. I believe this may be an error in how my code is laid out, but I am not exactly sure. Any advice is greatly appreciated. Thank you.
Here is the remainder of my code:
import csv
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime
import os
import pandas as pd
import openpyxl

# define 1-1-2020 as a datetime object
after_date = datetime(2021, 1, 1)

with requests.Session() as s:
    s.headers = {"User-Agent": "Safari/537.36"}
    for page in range(4, 7):  # Range designates the page numbers for the URL
        r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html')  # Format the page number into url
        print(page)
        soup = bs(r.content, 'lxml')
        # select all tr elements (minus the first one, which is the header)
        table_elements = soup.select('tr')[1:]
        address_links = []
        for element in table_elements:
            children = element.contents  # get children of table element
            url = children[1].a['href']
            last_out_str = children[8].text
            if last_out_str != "":  # check to make sure the date field isn't empty
                last_out = datetime.strptime(last_out_str, "%Y-%m-%d %H:%M:%S %Z")  # load date into datetime object for comparison
                if last_out > after_date:  # if check to see if the date is after last_out
                    address_links.append(url + '-full')  # add address_links to the list, -full makes the link show all data
        print(address_links)
    for url in address_links:  # loop through the urls in address_links list
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        ad2 = (soup.title.string)  # grab the web title which is used for the filename
        ad2 = ad2.replace('Dogecoin', '')
        ad2 = ad2.replace('Address', '')
        ad2 = ad2.replace('-', '')
        filename = ad2.replace(' ', '')
        sections = soup.find_all(class_='table-striped')
        for section in sections:  # This contains the data which is imported into the 'gf' dataframe or the 'info' xlsx sheet
            oldprofit = section.find_all('td')[11].text  # Get the profit
            removetext = oldprofit.replace('USD', '')
            removetext = removetext.replace(' ', '')
            removetext = removetext.replace(',', '')
            profit = float(removetext)
            balance = section.find_all('td')[0].text  # Get the wallet balance
            amount_recieved = section.find_all('td')[3].text  # Get amount recieved
            ins = amount_recieved[amount_recieved.find('(') + 1:amount_recieved.find(')')]  # Filter out text from amount recieved
            ins = ins.replace('ins', '')
            ins = ins.replace(' ', '')
            ins = float(ins)
            first_recieved = section.find_all('td')[4].text  # Get the data of the first incoming transaction
            fr = first_recieved.replace('first', '')
            fr = fr.replace(':', '')
            fr = fr.replace(' ', '')
            last_recieved = section.find_all('td')[5].text  # Get the date of the last incoming transaction
            lr = last_recieved.replace('last', '')
            lr = lr.replace(':', '')
            lr = lr.replace(' ', '')
            amount_sent = section.find_all('td')[7].text  # Get the amount sent
            outs = amount_sent[amount_sent.find('(') + 1:amount_sent.find(')')]  # Filter out the text
            outs = outs.replace('outs', '')
            outs = outs.replace(' ', '')
            outs = float(outs)
            first_sent = section.find_all('td')[8].text  # Get the first outgoing transaction date
            fs = first_sent.replace('first', '')  # clean up first outgoing transaction date
            fs = fs.replace(':', '')
            fs = fs.replace(' ', '')
            last_sent = section.find_all('td')[9].text  # Get the last outgoing transaction date
            ls = last_sent.replace('last', '')  # Clean up last outgoing transaction date
            ls = ls.replace(':', '')
            ls = ls.replace(' ', '')
            dbalance = section.find_all('td')[0].select('b')  # get the balance of doge
            dusd = section.find_all('td')[0].select('span')[1]  # get balance of USD
            for data in dbalance:  # used to clean the text up
                balance = data.text
            for data1 in dusd:  # used to clean the text up
                usd = data1.text
            # Compare profit to goal, if profit doesn't meet the goal, the URL is not scraped
            goal = float(30000)
            if profit < goal:
                continue
            # Select wallets with under 2000 transactions
            trans = float(ins + outs)  # adds the amount of incoming and outgoing transactions
            trans_limit = float(2000)
            if trans > trans_limit:
                continue
            # Create Info Dataframe using the data from above
            info = {
                'Balance': [balance],
                'USD Value': [usd],
                'Wallet Profit': [profit],
                'Amount Recieved': [amount_recieved],
                'First Recieved': [fr],
                'Last Recieved': [lr],
                'Amount Sent': [amount_sent],
                'First Sent': [fs],
                'Last Sent': [ls],
            }
            gf = pd.DataFrame(info)
            a = 'a'
            if a:
                df = \
                    pd.read_html(requests.get(url, headers={'User-agent': 'Mozilla/5.0'}).text, attrs={"id": "table_maina"},
                                 index_col=None, header=[0])[0]  # uses pandas to read the dataframe and save it
            directory = '/Users/chris/Desktop/Files'  # directory for the file to go to
            file = f'{filename}.xlsx'
            writer = pd.ExcelWriter(os.path.join(directory, file), engine='xlsxwriter')
            with pd.ExcelWriter(writer) as writer:
                df.to_excel(writer, sheet_name='transactions')
                gf.to_excel(writer, sheet_name='info')
Check your indentation - in your question the two loops are on the same level, so the loop that makes the requests iterates over all the pages, but the results are never processed until that iteration is done. That is why you only get results for the last page.
Move the loops that handle the response and extract elements inside your first loop:
...
for page in range(4, 7):  # Range designates the page numbers for the URL
    r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html')  # Format the page number into url
    print(page)
    soup = bs(r.content, 'lxml')
    table_elements = soup.select('tr')[1:]
    address_links = []
    for element in table_elements:
        ...
    for url in address_links:
        ...
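If you would rather keep the page-fetching and the address-processing steps separate, an alternative (a sketch of the same idea, not the poster's code) is to accumulate the links from every page into one list and only then walk through them:

import requests
from bs4 import BeautifulSoup as bs

all_address_links = []  # accumulated across every page, never reset inside the loop

with requests.Session() as s:
    s.headers = {"User-Agent": "Safari/537.36"}
    for page in range(4, 7):
        r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html')
        soup = bs(r.content, 'lxml')
        for element in soup.select('tr')[1:]:
            link = element.contents[1].a
            if link is not None:
                all_address_links.append(link['href'] + '-full')

    # Runs once, after every page has been fetched, so links from all pages get processed.
    for url in all_address_links:
        r = s.get(url)
        # ... parse the address page here, exactly as in the original code ...

Either structure works; what matters is that address_links is not rebuilt for each page and then only consumed after the page loop has finished.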

I have been trying to create a CSV file from data received from a web scraper

As of right now I have working code for a web scraper that logs into the Indeed job search site. My issue now is that I need to create a CSV file that shows every single job position that was found; at the moment it only gives me the number of positions available and the description of one of them. I hope I can get some help, I would greatly appreciate it.
import re
import csv
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint

jk_pattern = re.compile(r"jk:\'([a-zA-Z0-9]+)'")
params = {"q": "mechanical+engineer", "l": "united+states", "start": 0}
url = "https://www.indeed.com/jobs"
job_keys = set()

for x in range(10):
    response = requests.get(url, params=params)
    if not response.status_code == 200:
        break
    else:
        keys = jk_pattern.findall(response.text)
        if len(keys) > 0:
            for key in keys:
                job_keys.add(key)
    params['start'] += 20
    sleep(randint(0, 3))

len(job_keys)

template = "https://www.indeed.com/viewjob?jk={}"
jk = job_keys.pop()
job_url = template.format(jk)
response = requests.get(job_url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.find("div", id="jobDescriptionText").text)
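To get a row in the CSV for every collected job key, rather than printing a single description, one option is to loop over job_keys and write each page's title and description out with csv.writer. This is a minimal sketch that reuses the job_keys set and the viewjob template from the code above; the jobs.csv filename and the column choices are my own:

import csv
import requests
from bs4 import BeautifulSoup
from time import sleep

template = "https://www.indeed.com/viewjob?jk={}"

with open('jobs.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['JobKey', 'PageTitle', 'Description'])
    for jk in job_keys:  # the set built by the loop above
        response = requests.get(template.format(jk))
        if response.status_code != 200:
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        description = soup.find("div", id="jobDescriptionText")
        writer.writerow([
            jk,
            soup.title.string if soup.title else '',
            description.text.strip() if description else '',
        ])
        sleep(1)  # be polite between requests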
def get_record(card):
    """Extract job data from a single record"""
    job_title = card.h2.a.get('title')
    company = card.find('span', 'company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    post_date = card.find('span', 'date').text
    today = datetime.today().strftime('%Y-%m-%d')
    summary = card.find('div', 'summary').text.strip().replace('\n', ' ')
    job_url = 'https://www.indeed.com' + card.h2.a.get('href')
    # this does not exist for all jobs, so handle the exceptions
    salary_tag = card.find('span', 'salaryText')
    if salary_tag:
        salary = salary_tag.text.strip()
    else:
        salary = ''
    record = (job_title, company, job_location, post_date, today, summary, salary, job_url)
    return record

def main(position, location):
    """Run the main program routine"""
    records = []
    url = get_url(position, location)
    # extract the job data
    while True:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'jobsearch-SerpJobCard')
        for card in cards:
            record = get_record(card)
            records.append(record)
        try:
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except AttributeError:
            break
    # save the job data
    with open('results.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary', 'JobUrl'])
        writer.writerows(records)
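One note on the get_record/main approach: it relies on a get_url helper and a datetime import that are not shown. A minimal sketch of what they presumably look like (the search-URL format here is an assumption, not taken from the question):

from datetime import datetime  # used by get_record for the extract date


def get_url(position, location):
    """Build an Indeed search URL from a position and location (assumed format)."""
    template = 'https://www.indeed.com/jobs?q={}&l={}'
    return template.format(position.replace(' ', '+'), location.replace(' ', '+'))


# Usage: main('mechanical engineer', 'united states') would then crawl the result
# pages and write results.csv with one row per job card.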

Python KeyError in web scraper

I am getting a KeyError: 'title' error in my web scraping program and am not sure what the issue is. When I use inspect element on the webpage I can see the element that I am trying to find:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

url = 'https://www.ncaagamesim.com/college-basketball-predictions.asp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')
# Get column names
headers = table.find_all('th')
cols = [x.text for x in headers]
# Get all rows in table body
table_rows = table.find_all('tr')
rows = []
# Grab the text of each td, and put into a rows list
for each in table_rows[1:]:
    odd_avail = True
    data = each.find_all('td')
    time = data[0].text.strip()
    try:
        matchup, odds = data[1].text.strip().split('\xa0')
        odd_margin = float(odds.split('by')[-1].strip())
    except:
        matchup = data[1].text.strip()
        odd_margin = '-'
        odd_avail = False
    odd_team_win = data[1].find_all('img')[-1]['title']
    sim_team_win = data[2].find('img')['title']
    sim_margin = float(re.findall("\d+\.\d+", data[2].text)[-1])
    if odd_avail == True:
        if odd_team_win == sim_team_win:
            diff = sim_margin - odd_margin
        else:
            diff = -1 * odd_margin - sim_margin
    else:
        diff = '-'
    row = {cols[0]: time, 'Matchup': matchup, 'Odds Winner': odd_team_win, 'Odds': odd_margin,
           'Simulation Winner': sim_team_win, 'Simulation Margin': sim_margin, 'Diff': diff}
    rows.append(row)

df = pd.DataFrame(rows)
print(df.to_string())
# df.to_csv('odds.csv', index=False)
I am getting the error on the line that sets sim_team_win. It takes data[2], which is the third column on the website, and reads the img title to get the team name. Is it because the img title is within another div? Also, when running this code it does not print out the "Odds" column, which is stored in the odd_margin variable. Is there something wrong with how that variable is set? Thanks in advance for the help!
As far as not finding the img title: if you look at the row with New Mexico # Dixie State, there is no image in the third column, and no img title in the source either.
For the Odds column, after wrapping the sim_team_win assignment in a try/except, I get all the Odds values in the table.
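One way to apply that guard without scattering try/except blocks is a small helper that falls back to a placeholder when a cell has no image or the image has no title; the helper name and the '-' default below are my own choices:

def img_title(cell, default='-'):
    """Return the title of the last <img> in a table cell, or `default` when the
    cell has no image or the image has no title attribute."""
    imgs = cell.find_all('img')
    if not imgs:
        return default
    return imgs[-1].get('title', default)


# Usage inside the row loop:
# odd_team_win = img_title(data[1])
# sim_team_win = img_title(data[2])

Rows without an image in the simulation column will usually be missing the simulation margin as well, so the sim_margin line may need a similar fallback.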

Python HTML Parser (Unnamed Level)

I am working on a screen scraper to pull football statistics down from www.pro-football-reference.com. I'm currently scraping from the main player's stat page and then diving into their individual page with their statistics by year.
I was able to implement this process successfully with my first set of players (quarterbacks, using the Passing table). However, when I attempted to re-create the process to get running back data, I am receiving an additional column in my data frame with values like "Unnamed: x_level_0". This is my first experience with HTML data, so I'm not sure what piece I missed; I just assumed it would be the same code as for the quarterbacks.
Below is the QB Code sample and the correct dataframe:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame
import lxml
import re
import csv

p = 1
url = 'https://www.pro-football-reference.com'
year = 2020
maxp = 300

# Passing Data
r = requests.get(url + '/years/' + str(year) + '/passing.htm')
soup = BeautifulSoup(r.content, 'html.parser')
parsed_table = soup.find_all('table')[0]
results = soup.find(id='div_passing')
job_elems = results.find_all('tr')

df = []
LastNameList = []
FirstNameList = []
for i, row in enumerate(parsed_table.find_all('tr')[2:]):
    dat = row.find('td', attrs={'data-stat': 'player'})
    if dat != None:
        name = dat.a.get_text()
        print(name)
        stub = dat.a.get('href')
        #pos = row.find('td', attrs={'data-stat': 'fantasy_pos'}).get_text()
        #print(pos)
        # grab this players stats
        tdf = pd.read_html(url + stub)[1]
        for k, v in tdf.iterrows():
            # Scrape 2020 stats, if no 2020 stats move on
            try:
                FindYear = re.search(".*2020.*", v['Year'])
                if FindYear:
                    # If Year for stats is current year append data to dataframe
                    # get Name data
                    fullName = row.find('td', {'class': 'left'})['csk']
                    findComma = fullName.find(',', 0, len(fullName))
                    lName = fullName[0:findComma]
                    fName = fullName[findComma + 1:len(fullName)]
                    LastNameList.append(lName)
                    FirstNameList.append(fName)
                    # get basic stats
                    df.append(v)
            except:
                pass
This output looks like the following:
Philip Rivers
Year 2020
Age 39
Tm IND
Pos qb
No. 17
G 1
GS 1
Below is the RB Code sample and the incorrect dataframe:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame
import lxml
import re
import csv

p = 1
url = 'https://www.pro-football-reference.com'
year = 2020
maxp = 300

# Rushing Data
r = requests.get(url + '/years/' + str(year) + '/rushing.htm')
soup = BeautifulSoup(r.content, 'html.parser')
parsed_table = soup.find_all('table')[0]
results = soup.find(id='div_rushing')
job_elems = results.find_all('tr')

df = []
LastNameList = []
FirstNameList = []
for i, row in enumerate(parsed_table.find_all('tr')[2:]):
    dat = row.find('td', attrs={'data-stat': 'player'})
    if dat != None:
        name = dat.a.get_text()
        print(name)
        stub = dat.a.get('href')
        print(stub)
        #pos = row.find('td', attrs={'data-stat': 'fantasy_pos'}).get_text()
        #print(pos)
        # grab this players stats
        tdf = pd.read_html(url + stub)[1]
        for k, v in tdf.iterrows():
            print(v)
            # Scrape 2020 stats, if no 2020 stats move on
            try:
                FindYear = re.search(".*2020.*", v['Year'])
                print('found 2020')
                if FindYear:
                    # If Year for stats is current year append data to dataframe
                    # get Name data
                    fullName = row.find('td', {'class': 'left'})['csk']
                    findComma = fullName.find(',', 0, len(fullName))
                    lName = fullName[0:findComma]
                    fName = fullName[findComma + 1:len(fullName)]
                    LastNameList.append(lName)
                    FirstNameList.append(fName)
                    # get basic stats
                    df.append(v)
            except:
                pass
This output looks like the following:
Unnamed: 0_level_0 Year 2020
Unnamed: 1_level_0 Age 26
Unnamed: 2_level_0 Tm TEN
Unnamed: 3_level_0 Pos rb
Unnamed: 4_level_0 No. 22
Games G 1
GS 1
Rushing Rush 31
Yds 116
TD 0
An example URL where this data is pulled from is: https://www.pro-football-reference.com/players/J/JacoJo01.htm
And it is pulling the Rushing & Receiving. Is there something additional I need to be on the lookout for when it comes to parsing HTML?
I attempted to add index_col = 1 into my tdf = pd.read_html(url + stub)[1]. However, that just kind of grouped the two values into one column.
Any input on this would be greatly appreciated. If I can provide any further information, please let me know.
Thank you
You can try this code to parse the passing table for each player (here I get the players from https://www.pro-football-reference.com/years/2020/passing.htm, but you can pass any player URL to it):
import requests
import pandas as pd
from bs4 import BeautifulSoup


def scrape_player(player_name, player_url, year="2020"):
    out = []
    soup = BeautifulSoup(requests.get(player_url).content, 'html.parser')
    row = soup.select_one('table#passing tr:has(th:contains("{}"))'.format(year))
    if row:
        tds = [player_name] + [t.text for t in row.select('th, td')]
        headers = ['Name'] + [th.text for th in row.find_previous('thead').select('th')]
        out.append(dict(zip(headers, tds)))
    return out


url = 'https://www.pro-football-reference.com/years/2020/passing.htm'

all_data = []
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
for player in soup.select('table#passing [data-stat="player"] a'):
    print(player.text)
    for data in scrape_player(player.text, 'https://www.pro-football-reference.com' + player['href']):
        all_data.append(data)

df = pd.DataFrame(all_data)
df.to_csv('data.csv')
print(df)
This creates a CSV file (data.csv) with one row per player.
EDIT: To parse the Rushing & Receiving table, you can use this script:
import requests
import pandas as pd
from bs4 import BeautifulSoup, Comment


def scrape_player(player_name, player_url, year="2020"):
    out = []
    soup = BeautifulSoup(requests.get(player_url).content, 'html.parser')
    soup = BeautifulSoup(soup.select_one('#rushing_and_receiving_link').find_next(text=lambda t: isinstance(t, Comment)), 'html.parser')
    row = soup.select_one('table#rushing_and_receiving tr:has(th:contains("{}"))'.format(year))
    if row:
        tds = [player_name] + [t.text for t in row.select('th, td')]
        headers = ['Name'] + [th.text for th in row.find_previous('thead').select('tr')[-1].select('th')]
        out.append(dict(zip(headers, tds)))
    return out


url = 'https://www.pro-football-reference.com/years/2020/passing.htm'

all_data = []
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
for player in soup.select('table#passing [data-stat="player"] a'):
    print(player.text)
    for data in scrape_player(player.text, 'https://www.pro-football-reference.com' + player['href']):
        all_data.append(data)

df = pd.DataFrame(all_data)
df.to_csv('data.csv')
print(df)
This likewise creates a CSV (data.csv), now built from each player's Rushing & Receiving line.
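For context on the original symptom: pd.read_html parses the two-row header of the Rushing & Receiving table as a MultiIndex, and columns that have no top-level group label come back as "Unnamed: x_level_0". If you prefer to stay with the read_html approach from the question, a minimal sketch (using the example player page from the question) that flattens the header:

import pandas as pd

# Example player page from the question; table [1] is Rushing & Receiving there.
url = 'https://www.pro-football-reference.com/players/J/JacoJo01.htm'

tdf = pd.read_html(url)[1]
if isinstance(tdf.columns, pd.MultiIndex):
    # Keep only the bottom header row ('Year', 'Age', 'Tm', ...), dropping the
    # 'Unnamed: x_level_0' / 'Games' / 'Rushing' group labels.
    tdf.columns = tdf.columns.get_level_values(-1)

print(tdf.columns.tolist())

With the header flattened, v['Year'] behaves the same way it does for the single-header passing table.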

How to iterate through each sub link to gather data

How do you iterate through each sub-link (fighter) to get the data I need, then leave the sub-link to go back to the page with all the fighters' names on it, move on to the next fighter (link), get all the data on that fighter, and keep doing that until it reaches the end of the list on that specific page?
records = []
r = requests.get('http://www.espn.com/mma/fighters')
soup = BeautifulSoup(r.text, 'html.parser')
data = soup.find_all('tr', attrs={'class': ['oddrow', 'evenrow']})
for d in data:
    try:
        name = d.find('a').text
    except AttributeError: name = ""
    try:
        country = d.find('td').findNext('td').text
    except AttributeError: county = ""
    records.append([name, country])
The above code is for the page where all the fighters' names are located. I am able to iterate over each one to collect the fighter's name and country.
links = [f"http://www.espn.com{i['href']}" for i in data.find_all('a') if re.findall('^/mma/', i['href'])][1]
r1 = requests.get(links)
data1 = BeautifulSoup(test.text,'html.parser')
bio = data1.find('div', attrs={'class':'mod-content'})
weightClass = data1.find('li',attrs={'class':'first'}).text
trainingCenter = data1.find('li',attrs={'class':'last'}).text
wins = data1.find('table',attrs={'class':'header-stats'})('td')[0].text
loses = data1.find('table',attrs={'class':'header-stats'})('td')[1].text
draws = data1.find('table',attrs={'class':'header-stats'})('td')[2].text
tkos = data1.find_all('table',attrs={'class':'header-stats'})[1]('td')[0].text
subs = data1.find_all('table',attrs={'class':'header-stats'})[1]('td')[1].text
The above code currently goes into the second fighter's page and collects all the data for that specific fighter (link).
records = []
r = requests.get('http://www.espn.com/mma/fighters')
soup = BeautifulSoup(r.text, 'html.parser')
data = soup.find_all('tr', attrs={'class': ['oddrow', 'evenrow']})
links = [f"http://www.espn.com{i['href']}" for i in data.find_all('a') if re.findall('^/mma/', i['href'])]
for d in data:
    try:
        name = d.find('a').text
    except AttributeError: name = ""
    try:
        country = d.find('td').findNext('td').text
    except AttributeError: county = ""
    for l in links:
        r1 = requests.get(links)
        data1 = BeautifulSoup(test.text, 'html.parser')
        bio = data1.find('div', attrs={'class': 'mod-content'})
        for b in bio:
            try:
                weightClass = data1.find('li', attrs={'class': 'first'}).text
            except AttributeError: name = ""
            try:
                trainingCenter = data1.find('li', attrs={'class': 'last'}).text
            except AttributeError: name = ""
            try:
                wins = data1.find('table', attrs={'class': 'header-stats'})('td')[0].text
            except AttributeError: name = ""
            try:
                loses = data1.find('table', attrs={'class': 'header-stats'})('td')[1].text
            except AttributeError: name = ""
            try:
                draws = data1.find('table', attrs={'class': 'header-stats'})('td')[2].text
            except AttributeError: name = ""
            try:
                tkos = data1.find_all('table', attrs={'class': 'header-stats'})[1]('td')[0].text
            except AttributeError: name = ""
            try:
                subs = data1.find_all('table', attrs={'class': 'header-stats'})[1]('td')[1].text
            except AttributeError: name = ""
            records.append([name, country, weightClass])
The above code is what I am trying, but I am getting an error message:
"ResultSet object has no attribute 'find_all'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?"
How do I add this to the initial code I have, so that I can collect each fighter's name and country on the original page, then go into the fighter's link and gather the data you see above, and have it do that for every fighter on the page?
Check out this solution. I don't have much time at this moment, but I will check back as soon as I'm free. You can do the main operation using the following code; the only thing you need to do is extract the data from each target page. The script below fetches all the links from every page, going through the pagination (a to z), and then collects the name from each target page.
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "http://www.espn.com/mma/fighters?search={}"

for linknum in [chr(i) for i in range(ord('a'), ord('z') + 1)]:
    r = requests.get(url.format(linknum))
    soup = BeautifulSoup(r.text, 'html.parser')
    for links in soup.select(".tablehead a[href*='id']"):
        res = requests.get(urljoin(url, links.get("href")))
        sauce = BeautifulSoup(res.text, "lxml")
        title = sauce.select_one(".player-bio h1").text
        print(title)
The following extended version of the same approach collects the bio fields for each fighter into lists and writes everything to a CSV with pandas:

import requests, re
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import pandas as pd

url = "http://www.espn.com/mma/fighters?search={}"

titleList = []
countryList = []
stanceList = []
reachList = []
ageList = []
weightClassList = []
trainingCenterList = []
winsList = []
losesList = []
drawsList = []
tkosList = []
subsList = []

# i believe this is what takes us from one page to another, but not 100% sure yet
for linknum in [chr(i) for i in range(ord('a'), ord('z') + 1)]:
    r = requests.get(url.format(linknum))
    soup = BeautifulSoup(r.text, 'html.parser')
    # a[href*=] gets all anchors a that contain whatever the href*=''
    for links in soup.select(".tablehead a[href*='id']"):
        # urljoin just takes a url and another string and combines them to create a new url
        res = requests.get(urljoin(url, links.get("href")))
        sauce = BeautifulSoup(res.text, "lxml")
        try:
            title = sauce.select_one(".player-bio h1").text
        except AttributeError: title = ""
        try:
            country = sauce.find('span', text='Country').next_sibling
        except AttributeError: country = ""
        try:
            stance = sauce.find('span', text='Stance').next_sibling
        except AttributeError: stance = ""
        try:
            reach = sauce.find('span', text='Reach').next_sibling
        except AttributeError: reach = ""
        try:
            age = sauce.find('span', text='Birth Date').next_sibling[-3:-1]
        except AttributeError: age = ""
        try:
            weightClass = sauce.find('li', attrs={'class': 'first'}).text
        except AttributeError: weightClass = ""
        try:
            trainingCenter = sauce.find('li', attrs={'class': 'last'}).text
        except AttributeError: trainingCenter = ""
        try:
            wins = sauce.find('table', attrs={'class': 'header-stats'})('td')[0].text
        except AttributeError: wins = ""
        try:
            loses = sauce.find('table', attrs={'class': 'header-stats'})('td')[1].text
        except AttributeError: loses = ""
        try:
            draws = sauce.find('table', attrs={'class': 'header-stats'})('td')[2].text
        except AttributeError: draws = ""
        try:
            tkos = sauce.find_all('table', attrs={'class': 'header-stats'})[1]('td')[0].text
        except AttributeError: tkos = ""
        try:
            subs = sauce.find_all('table', attrs={'class': 'header-stats'})[1]('td')[1].text
        except AttributeError: subs = ""
        titleList.append(title)
        countryList.append(country)
        stanceList.append(stance)
        reachList.append(reach)
        ageList.append(age)
        weightClassList.append(weightClass)
        trainingCenterList.append(trainingCenter)
        winsList.append(wins)
        losesList.append(loses)
        drawsList.append(draws)
        tkosList.append(tkos)
        subsList.append(subs)

df = pd.DataFrame()
df['title'] = titleList
df['country'] = countryList
df['stance'] = stanceList
df['reach'] = reachList
df['age'] = ageList
df['weightClass'] = weightClassList
df['trainingCenter'] = trainingCenterList
df['wins'] = winsList
df['loses'] = losesList
df['draws'] = drawsList
df['tkos'] = tkosList
df['subs'] = subsList
df.to_csv('MMA Fighters', encoding='utf-8')
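A small design note, separate from the code above: rather than maintaining twelve parallel lists, you can collect one dict per fighter and build the DataFrame in a single call. A sketch (the dict keys simply mirror the column names used above):

import pandas as pd

fighters = []  # one dict per fighter, appended inside the scraping loop, e.g.:
# fighters.append({'title': title, 'country': country, 'stance': stance,
#                  'reach': reach, 'age': age, 'weightClass': weightClass,
#                  'trainingCenter': trainingCenter, 'wins': wins, 'loses': loses,
#                  'draws': draws, 'tkos': tkos, 'subs': subs})

df = pd.DataFrame(fighters)  # columns are taken from the dict keys
df.to_csv('MMA Fighters.csv', index=False, encoding='utf-8')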
