I'm trying to extract all WC 2019 players batting stats, query got stuck with an error "list index out of range" at the player: http://www.espncricinfo.com/india/content/player/398438.html
How can I handle exception or PASS to get complete team player stats?
url2 = 'http://stats.espncricinfo.com/ci/engine/player/' + \
str(player_id) + \
'.htmlclass=2;template=results;type=batting;view=innings'
html = urllib.request.urlopen(url2, context=ctx).read()
temp_data = OrderedDict()
list_of_dict = []
bs = BeautifulSoup(html, 'lxml')
table_body = bs.find_all('tbody')
rows = table_body[1].find_all('tr')
for row in rows:
cols = row.find_all('td')
cols = [x.text.strip() for x in cols]
temp_data = OrderedDict()
for i in range(len(cols)):
temp_data["Runs"] = cols[0]
temp_data["Mins"] = cols[1]
temp_data["BF"] = cols[2]
temp_data["fours"] = cols[3]
temp_data["sixs"] = cols[4]
temp_data["SR"] = cols[5]
temp_data["POS"] = cols[6]
temp_data["Dismissal"] = cols[7]
temp_data["Inns"] = cols[8]
temp_data["Opposition"] = cols[10]
temp_data["Ground"] = cols[11]
temp_data["Date"] = cols[12]
temp_data["player"] = player
temp_data["playerid"] = player_id
list_of_dict.append(temp_data)
df = pd.DataFrame(list_of_dict)
df
df.to_sql("dummy", con, if_exists="append")
I'd like to extract all WC squad wise player stats.
Related
I have Dataset like this:
ORDER_CODE
ITEM_ID
ITEM_NAME
TOTALPRICE
123
id1
name1
345
321
id2
name2
678
and Function for calculation which items was sold together. Which ones was most popular or more expensive
out:
ITEM_ID
sold together
id1
[ id33, id23, id12 ]
id2
[ id56, id663 ]
I using this Func:
def freq(df):
hit_list = [list of ID's]
result = pd.DataFrame(columns = ['ITEM_ID', 'sold together'])
unic_arc = df['ITEM_ID'].unique()
unic_num = df['ORDER_CODE'].unique()
data_arc ={}
data_num={}
for i in unic_arc:
data_arc[i] = {}
tturns = response_ur[['ITEM_ID', 'TOTALPRICE']].groupby(by = 'ITEM_ID', as_index = False).sum()
tturns = tturns.rename(columns = {'ITEM_ID' : 'inum', 'TOTALPRICE' : 'turn'})
for i in tqdm(unic_arc):
b = df[df['ITEM_ID'] == i]['ORDER_CODE'].values
for t in b:
a = df[df['ORDER_CODE'] == t]['ID'].values
if i in a:
for arc in a:
if int(arc) not in hit_list:
if arc != i:
if arc in data_arc[i]:
data_arc[i][arc]+=1
else:
data_arc[i][arc] = 1
dd = data_arc[i]
tmp = pd.DataFrame(columns = ['inum', 'freq'])
tmp['inum'] = data_arc[i].keys()
tmp['freq'] = data_arc[i].values()
tmp['inum'] = tmp['inum'].astype(str)
tturns['inum'] = tturns['inum'].astype(str)
tmp = pd.merge(tmp, tturns, on = 'inum', how = 'inner')
tmp = tmp.sort_values(by = ['freq', 'turn'], ascending = False)
if len(tmp['inum'].values) > 14:
inums = str(tmp['inum'].values[0:15]).replace("\n", "").replace(' ', ',').replace('\'', '')
else:
inums = str(tmp['inum'].values).replace("\n", "").replace(' ', ',').replace('\'', '')
result = res.append({'inum' : i, 'recs' : inums}, ignore_index = True)
return(result)
I try to add merge 1for addint ITEM_NAME in Func on any iteration, but it so long. My dataset have about 10kk rows
I need add to my output one more column with list of 'ITEM_NAME' of 'sold together' list items. And calc it fast?
UPD:
Here's what is needed:
item_id
list_of items
list_of_names
sum
id_01
[id, id, id, id]
[name, name....]
num
Where list_of items - 'list of most common' items, which were purchased with 'item_id'
This might do it:
import pandas as pd
df = pd.DataFrame( {
'ORDER_CODE':['123','321','123','123','321','555'],
'ITEM_ID':[1,2,5,5,4,6],
'ITEM_NAME':['name1','name2','name3','name4','name5','name6'],
'TOTALPRICE':[10,20,50,50,40,60]}
)
result = df.groupby("ORDER_CODE").agg({"ITEM_ID":list, "ITEM_NAME":list, "TOTALPRICE":"sum"})
Further good answer how to create a list in a group by aggregation:
I keep getting the below keyerror and can't figure out what it means or what I should be doing different.
KeyError: "None of [Index(['team totals', 'mp_max', 'fg_max', 'fga_max', 'fg%_max', '3p_max',\n '3pa_max', '3p%_max', 'ft_max', 'fta_max', 'ft%_max', 'orb_max',\n 'drb_max', 'trb_max', 'ast_max', 'stl_max', 'blk_max', 'tov_max',\n 'pf_max', 'pts_max', '+/-_max', 'ts%_max', 'efg%_max', '3par_max',\n 'ftr_max', 'orb%_max', 'drb%_max', 'trb%_max', 'ast%_max', 'stl%_max',\n 'blk%_max', 'tov%_max', 'usg%_max', 'ortg_max', 'drtg_max'],\n dtype='object')] are in the [columns]"
my code is
from bs4 import BeautifulSoup
import pandas
import os
SEASONS = list(range(2016, 2017))
DATA_DIR = "data"
STANDINGS_DIR = os.path.join(DATA_DIR, "standings")
SCORES_DIR = os.path.join(DATA_DIR, "scores")
box_scores = os.listdir(SCORES_DIR)
box_scores = [os.path.join(SCORES_DIR, f) for f in box_scores if f.endswith(".html")]
def parse_html(box_score):
with open(box_score) as f:
html = f.read()
soup = BeautifulSoup(html)
[s.decompose() for s in soup.select("tr.over_header")] # this removes the tr tag with class over_header from the html
[s.decompose() for s in soup.select("tr.thead")]
return soup
def read_line_score(soup):
line_score = pandas.read_html(str(soup), attrs = {"id": "line_score"})[0]
cols = list(line_score.columns)
cols[0] = "team"
cols[-1] = "total"
line_score.columns = cols
line_score = line_score[["team", "total"]]
return line_score
def read_stats(soup, team, stat):
df = pandas.read_html(str(soup), attrs={"id": f"box-{team}-game-{stat}"}, index_col=0)[0]
df = df.apply(pandas.to_numeric, errors="coerce")
return df
def read_season_info(soup):
nav = soup.select("#bottom_nav_container")[0]
hrefs = [a["href"] for a in nav.find_all("a")]
season = os.path.basename(hrefs[1]).split("_")[0]
return season
base_cols = None
games = []
for box_score in box_scores:
soup = parse_html(box_score)
line_score = read_line_score(soup)
teams = list(line_score["team"]) #grabs just the teams who played each other
summaries = []
for team in teams:
basic = read_stats(soup, team, "basic")
advanced = read_stats(soup, team, "advanced")
totals = pandas.concat([basic.iloc[-1:], advanced.iloc[-1:]])
totals.index = totals.index.str.lower() # to lower case
maxes = pandas.concat([basic.iloc[:-1,:].max(), advanced.iloc[:-1,:].max()])
maxes.index = maxes.index.str.lower() + "_max"
summary = pandas.concat([totals, maxes])
if base_cols is None:
base_cols = list(summary.index.drop_duplicates(keep="first"))
base_cols = [b for b in base_cols if "bpm" not in b]
summary - summary[base_cols]
summaries.append(summary)
summary = pandas.concat(summaries, asix=1).T
game = pandas.concat([summary, line_score], axis=1)
game["home"] = [0, 1]
game_opp = game.iloc[::-1].reset_index()
game_opp.columns += "_opp"
full_game = pandas.concat([game, game_opp], axis=1)
full_game["season"] = read_season_info("soup")
full_game["date"] = os.path.basename(box_score)[:8]
full_game["date"] = pandas.to_datetime(full_game["date"], format="%Y%m%d")
full_game["won"] = full_game["total"] > full_game["total_opp"]
games.append(full_game)
if len(games) % 100 == 0:
print(f"{len(games)} / {len(box_scores)}")
I'm trying to fit 10 rows (and three columns) of a table on one page, howver I'm running into a limitation where I can't get any more than 8 rows to fit. I've tried the following code:
table = document.add_table(rows=0, cols=3)
for row in table.rows:
row.height = Cm(1)
However, at some point when reducing the size,there is no difference in the output. Is it possible to fit 10 rows on one page?
An adapted version of my code, which is iterating through a dataframe and writing columns of my dataframe to cells of a table.
document = Document()
sections = document.sections
for section in sections:
section.top_margin = Inches(0.00)
section.bottom_margin = Inches(0.00)
section.left_margin = Inches(0.00)
section.right_margin = Inches(0.00)
style = document.styles['Normal']
font = style.font
font.size = Pt(8)
table = document.add_table(rows=0, cols=3)
index = 0
full_count = 1
for item_one, item_two,description,max_portion,quantity_adjusted, mods in zip(line_items['title'].tolist(), line_items['quantity'],line_items['description'], line_items['max_portion'],line_items['quantity_adjusted'], line_items['modifications']):
count = 0
if index % 3 == 0:
cell_row = table.add_row()
cell_row.height = Cm(0.1)
row_cells = cell_row.cells
part_one_cell = row_cells[index % 3]
part_one_cell.height = Cm(0.1)
#para = doc.add_paragraph().add_run('GeeksforGeeks is a Computer Science portal for geeks.')
#para.font.size = Pt(12)
p = part_one_cell.add_paragraph()
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
#p1 = part_one_cell.paragraphs[0].add_run(item_one.upper()+ ' ' + description.upper())
#p1.alignment = WD_ALIGN_PARAGRAPH.CENTER
if len(item_one + description) < 40:
p.add_run(item_one.upper()+ ' ' + description.upper()).font.size = Pt(12)
elif len(item_one + description) < 60:
p.add_run(item_one.upper()+ ' ' + description.upper()).font.size = Pt(10)
else:
p.add_run(item_one.upper()+ ' ' + description.upper()).font.size = Pt(8)
row1 = row_cells[index % 3]
row2= row1.add_paragraph(mods)
row2.alignment = WD_ALIGN_PARAGRAPH.CENTER
row = row_cells[index % 3]
p1 = row.add_paragraph(f'{x[str(quantity_adjusted)]}')
p1.alignment=WD_ALIGN_PARAGRAPH.RIGHT
#part_one_cell.paragraphs[0].add_run(f'{x[str(item_two)]}')
#part_one_cell.paragraphs[0].add_run(f' {str(x)}').bold= True
index = index + 1
full_count = full_count + 1
if full_count % 30 == 0:
document.add_page_break()
table = document.add_table(rows=0, cols=3)
I have no problem getting 10 1cm rows in a single page. I declare the number of rows when adding the table:
from docx import Document
from docx.shared import Cm
document = Document()
table = document.add_table(rows=10, cols=3)
table.style = 'Table Grid'
for row in table.rows:
row.height = Cm(1)
document.save('demo.docx')
To add rows in a for loop:
table = document.add_table(rows=0, cols=3)
table.style = 'Table Grid'
for i in range(10):
row = table.add_row()
row.height = Cm(1)
document.save('demo.docx')
I'm having trouble with my web scraper not getting the "Odds" values and not sure what is wrong. For each piece of information, I am using a try/except to see if the element is available. I'm not sure what is wrong with getting the Odds values though. Thanks for the help
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
url = 'https://www.ncaagamesim.com/college-basketball-predictions.asp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')
# Get column names
headers = table.find_all('th')
cols = [x.text for x in headers]
# Get all rows in table body
table_rows = table.find_all('tr')
rows = []
# Grab the text of each td, and put into a rows list
for each in table_rows[1:]:
odd_avail = True
data = each.find_all('td')
time = data[0].text.strip()
# Get matchup and odds
try:
matchup, odds = data[1].text.strip().split('\xa0')
odd_margin = float(odds.split('by')[-1].strip())
except:
matchup = data[1].text.strip()
odd_margin = '-'
odd_avail = False
# Get favored team
try:
odd_team_win = data[1].find_all('img')[-1]['title']
except:
odd_team_win = '-'
odd_avail = False
# Get simulation winner
try:
sim_team_win = data[2].find('img')['title']
except:
sim_team_win = '-'
odd_avail = False
awayTeam = matchup.split('#')[0].strip()
homeTeam = matchup.split('#')[1].strip()
# Get simulation margin
try:
sim_margin = float(re.findall("\d+\.\d+", data[2].text)[-1])
except:
sim_margin = '-'
odd_avail = False
# If all variables available, determine odds, simulation margin points, and optimal bet
if odd_avail == True:
if odd_team_win == sim_team_win:
diff = abs(sim_margin - odd_margin)
if sim_margin > odd_margin:
bet = odd_team_win
else:
if odd_team_win == homeTeam:
bet = awayTeam
else:
bet = homeTeam
else:
diff = odd_margin + sim_margin
bet = sim_team_win
else:
diff = -1
bet = '-'
# Create table
row = {cols[0]: time, 'Matchup': matchup, 'Odds Winner': odd_team_win, 'Odds': odd_margin,
'Simulation Winner': sim_team_win, 'Simulation Margin': sim_margin, 'Diff': diff, 'Bet' : bet}
rows.append(row)
df = pd.DataFrame(rows)
df = df.sort_values(by = ['Diff'], ascending = False)
print (df.to_string())
# df.to_csv('odds.csv', index=False)
When I run this code everything works perfectly and gets all other values but all the odds values in the table are '-'.
I added a few things into the code, to account for
If the odds are Even (versus if there are no odds
If a team doesn't have a logo, to still but the team name
As far as the odds not showing. Check the csv file to see if it's there. If it is, might just be a preference you need to change in pycharm (might be just cutting off some of the string)
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
url = 'https://www.ncaagamesim.com/college-basketball-predictions.asp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')
# Get column names
headers = table.find_all('th')
cols = [x.text for x in headers]
# Get all rows in table body
table_rows = table.find_all('tr')
rows = []
# Grab the text of each td, and put into a rows list
for each in table_rows[1:]:
odd_avail = True
data = each.find_all('td')
time = data[0].text.strip()
# Get matchup and odds
try:
matchup, odds = data[1].text.strip().split('\xa0')
odd_margin = float(odds.split('by')[-1].strip())
except:
matchup = data[1].text.strip()
if 'Even' in matchup:
matchup, odds = data[1].text.strip().split('\xa0')
odd_margin = 0
else:
odd_margin = '-'
odd_avail = False
awayTeam = matchup.split('#')[0].strip()
homeTeam = matchup.split('#')[1].strip()
# Get favored team
try:
odd_team_win = data[1].find_all('img')[-1]['title']
except:
odd_team_win = '-'
odd_avail = False
# Get simulation winner
try:
sim_team_win = data[2].find('img')['title']
except:
if 'wins' in data[2].text:
sim_team_win = data[2].text.split('wins')[0].strip()
else:
sim_team_win = '-'
odd_avail = False
# Get simulation margin
try:
sim_margin = float(re.findall("\d+\.\d+", data[2].text)[-1])
except:
sim_margin = '-'
odd_avail = False
# If all variables available, determine odds and simulation margin points
if odd_avail == True:
if odd_team_win == sim_team_win:
diff = abs(sim_margin - odd_margin)
else:
diff = odd_margin + sim_margin
else:
diff = '-'
# Create table
row = {cols[0]: time, 'Away Team': awayTeam, 'Home Team':homeTeam, 'Odds Winner': odd_team_win, 'Odds': odd_margin,
'Simulation Winner': sim_team_win, 'Simulation Margin': sim_margin, 'Diff': diff}
rows.append(row)
df = pd.DataFrame(rows)
print (df.to_string())
# df.to_csv('odds.csv', index=False)
When trying to scrape multiple pages of this website, I get no content in return. I usually check to make sure all the lists I'm creating are of equal length, but all are coming back as len = 0.
I've used similar code to scrape other websites, so why does this code not work correctly?
Some solutions I've tried, but haven't worked for my purposes: requests.Session() solutions as suggested in this answer, .json as suggested here.
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from random import randint
from googletrans import Translator
translator = Translator()
rg = []
ctr_n = []
ctr = []
yr = []
mn = []
sub = []
cst_n = []
cst = []
mag = []
pty_n = []
pty = []
can = []
pev1 = []
vot1 = []
vv1 = []
ivv1 = []
to1 = []
cv1 = []
cvs1 = []
pv1 = []
pvs1 = []
pev2 = []
vot2 = []
vv2 = []
ivv2 = []
to2 = []
cv2 = []
cvs2 =[]
pv2 = []
pvs2 = []
seat = []
no_info = []
manual = []
START_PAGE = 1
END_PAGE = 42
for page in range(START_PAGE, END_PAGE + 1):
page = requests.get("https://sejmsenat2019.pkw.gov.pl/sejmsenat2019/en/wyniki/sejm/okr/" + str(page))
page.encoding = page.apparent_encoding
if not page:
pass
else:
soup = BeautifulSoup(page.text, 'html.parser')
tbody = soup.find_all('table', class_='table table-borderd table-striped table-hover dataTable no-footer clickable right2 right4')
sleep(randint(2,10))
for container in tbody:
col1 = container.find_all('tr', {'data-id':'26079'})
for info in col1:
col_1 = info.find_all('td')
for data in col_1:
party = data[0]
party_trans = translator.translate(party)
pty_n.append(party_trans)
pvotes = data[1]
pv1.append(pvotes)
pshare = data[2]
pvs1.append(pshare)
mandates = data[3]
seat.append(mandates)
col2 = container.find_all('tr', {'data-id':'26075'})
for info in col2:
col_2 = info.find_all('td')
for data in col_2:
party2 = data[0]
party_trans2 = translator.translate(party2)
pty_n.append(party_trans2)
pvotes2 = data[1]
pv1.append(pvotes2)
pshare2 = data[2]
pvs1.append(pshare2)
mandates2 = data[3]
seat.append(mandates2)
col3 = container.find_all('tr', {'data-id':'26063'})
for info in col3:
col_3 = info.find_all('td')
for data in col_3:
party3 = data[0].text
party_trans3 = translator.translate(party3)
pty_n.extend(party_trans3)
pvotes3 = data[1].text
pv1.extend(pvotes3)
pshare3 = data[2].text
pvs1.extend(pshare3)
mandates3 = data[3].text
seat.extend(mandates3)
col4 = container.find_all('tr', {'data-id':'26091'})
for info in col4:
col_4 = info.find_all('td',recursive=True)
for data in col_4:
party4 = data[0]
party_trans4 = translator.translate(party4)
pty_n.extend(party_trans4)
pvotes4 = data[1]
pv1.extend(pvotes4)
pshare4 = data[2]
pvs1.extend(pshare4)
mandates4 = data[3]
seat.extend(mandates4)
col5 = container.find_all('tr', {'data-id':'26073'})
for info in col5:
col_5 = info.find_all('td')
for data in col_5:
party5 = data[0]
party_trans5 = translator.translate(party5)
pty_n.extend(party_trans5)
pvotes5 = data[1]
pv1.extend(pvotes5)
pshare5 = data[2]
pvs1.extend(pshare5)
mandates5 = data[3]
seat.extend(mandates5)
col6 = container.find_all('tr', {'data-id':'26080'})
for info in col6:
col_6 = info.find_all('td')
for data in col_6:
party6 = data[0]
party_trans6 = translator.translate(party6)
pty_n.extend(party_trans6)
pvotes6 = data[1]
pv1.extend(pvotes6)
pshare6 = data[2]
pvs1.extend(pshare6)
mandates6 = data[3]
seat.extend(mandates6)
#### TOTAL VOTES ####
tfoot = soup.find_all('tfoot')
for data in tfoot:
fvote = data.find_all('td')
for info in fvote:
votefinal = info.find(text=True).get_text()
fvoteindiv = [votefinal]
fvotelist = fvoteindiv * (len(pty_n) - len(vot1))
vot1.extend(fvotelist)
#### CONSTITUENCY NAMES ####
constit = soup.find_all('a', class_='btn btn-link last')
for data in constit:
names = data.get_text()
names_clean = names.replace("Sejum Constituency no.","")
names_clean2 = names_clean.replace("[","")
names_clean3 = names_clean2.replace("]","")
namesfinal = names_clean3.split()[1]
constitindiv = [namesfinal]
constitlist = constitindiv * (len(pty_n) - len(cst_n))
cst_n.extend(constitlist)
#### UNSCRAPABLE INFO ####
region = 'Europe'
reg2 = [region]
reglist = reg2 * (len(pty_n) - len(rg))
rg.extend(reglist)
country = 'Poland'
ctr2 = [country]
ctrlist = ctr2 * (len(pty_n) - len(ctr_n))
ctr_n.extend(ctrlist)
year = '2019'
yr2 = [year]
yrlist = yr2 * (len(pty_n) - len(yr))
yr.extend(yrlist)
month = '10'
mo2 = [month]
molist = mo2 * (len(pty_n) - len(mn))
mn.extend(molist)
codes = ''
codes2 = [codes]
codeslist = codes2 * (len(pty_n) - len(manual))
manual.extend(codeslist)
noinfo = '-990'
noinfo2 = [noinfo]
noinfolist = noinfo2 * (len(pty_n) - len(no_info))
no_info.extend(noinfolist)
print(len(rg), len(pty_n), len(pv1), len(pvs1), len(no_info), len(vot1), len(cst_n))
poland19 = pd.DataFrame({
'rg' : rg,
'ctr_n' : ctr_n,
'ctr': manual,
'yr' : yr,
'mn' : mn,
'sub' : manual,
'cst_n': cst_n,
'cst' : manual,
'mag': manual,
'pty_n': pty_n,
'pty': manual,
'can': can,
'pev1': no_info,
'vot1': vot1,
'vv1': vot1,
'ivv1': no_info,
'to1': no_info,
'cv1': no_info,
'cvs1': no_info,
'pv1': cv1,
'pvs1': cvs1,
'pev2': no_info,
'vot2': no_info,
'vv2': no_info,
'ivv2': no_info,
'to2': no_info,
'cv2': no_info,
'cvs2': no_info,
'pv2' : no_info,
'pvs2' : no_info,
'seat' : manual
})
print(poland19)
poland19.to_csv('poland_19.csv')
As commented you probably need to use Selenium. You could replace the requests lib and replace the request statements with sth like this:
from selenium import webdriver
wd = webdriver.Chrome('pathToChromeDriver') # or any other Browser driver
wd.get(url) # instead of requests.get()
soup = BeautifulSoup(wd.page_source, 'html.parser')
You need to follow the instructions to install and implement the selenium lib at this link: https://selenium-python.readthedocs.io/
Note: I tested your code with selenium and I was able to get the table that you were looking for, but with the class_=... does not work for some reason.
Instead browsing at the scraped data I found that it has an attribute id. So maybe try also this instead:
tbody = soup.find_all('table', id="DataTables_Table_0")
And again, by doing the get requests with the selenium lib.
Hope that was helpful :)
Cheers