I want to scrape a table from an SEC filing, specifically the one titled SUMMARY CONSOLIDATED FINANCIAL AND OTHER DATA. My code is attached below; can someone tell me where it went wrong?
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

url = r'https://www.sec.gov/Archives/edgar/data/1181232/000104746903038553/a2123752z424b4.htm'
filing_url = requests.get(url)
content = filing_url.text
soup = BeautifulSoup(content, 'lxml')
tables = soup.find_all(text=re.compile('SUMMARY CONSOLIDATED FINANCIAL AND OTHER DATA'))

n_columns = 0
n_rows = 0
column_names = []
for table in tables:
    for row in table.find_next('table').find_all('tr'):
        # Determine the number of rows in the table
        td_tags = row.find_all('td')
        if len(td_tags) > 0:
            n_rows += 1
            if n_columns == 0:
                # Set the number of columns for the table
                n_columns = len(td_tags)
        # Handle column names if we find them
        th_tags = row.find_all('th')
        if len(th_tags) > 0 and len(column_names) == 0:
            for th in th_tags:
                column_names.append(th.get_text())
    # Safeguard on column titles
    if len(column_names) > 0 and len(column_names) != n_columns:
        raise Exception("Column titles do not match the number of columns")
    columns = column_names if len(column_names) > 0 else range(0, n_columns)
    df = pd.DataFrame(columns=columns, index=range(0, n_rows))
    row_marker = 0
    for row in table.find_all('tr'):
        column_marker = 0
        columns = row.find_all('td')
        for column in columns:
            df.iat[row_marker, column_marker] = column.get_text()
            column_marker += 1
        if len(columns) > 0:
            row_marker += 1
    print(df)
In this particular case, you could simplify this significantly using pandas:

import pandas as pd

url = 'https://www.sec.gov/Archives/edgar/data/1181232/000104746903038553/a2123752z424b4.htm'
tables = pd.read_html(url)

# There are more than 100 tables on that page, so you have to narrow it down
targets = []
for t in tables:
    if 'Unaudited' in str(t.columns):
        targets.append(t)

targets[0]  # Only two tables meet that requirement, and the first is your target

Output is your target table.
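One caveat: SEC.gov has since started rejecting automated requests that don't send a User-Agent header. If pd.read_html(url) comes back with an HTTP error, a sketch of a workaround (same URL as above; the User-Agent string is just an illustrative placeholder) is to fetch the page yourself and hand the HTML to pandas, since read_html accepts a raw HTML string as well as a URL:

import pandas as pd
import requests

url = 'https://www.sec.gov/Archives/edgar/data/1181232/000104746903038553/a2123752z424b4.htm'
# SEC asks automated clients to identify themselves; any descriptive UA string works
headers = {'User-Agent': 'research script example@example.com'}
html = requests.get(url, headers=headers).text
tables = pd.read_html(html)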
I'm trying to fit 10 rows (and three columns) of a table on one page; however, I'm running into a limitation where I can't get any more than 8 rows to fit. I've tried the following code:
table = document.add_table(rows=0, cols=3)
for row in table.rows:
    row.height = Cm(1)
However, at some point when reducing the size, there is no difference in the output. Is it possible to fit 10 rows on one page?
Here is an adapted version of my code, which iterates through a dataframe and writes columns of the dataframe to cells of a table.
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Cm, Inches, Pt

# line_items (a DataFrame) and x (a mapping) are defined earlier in my script
document = Document()

sections = document.sections
for section in sections:
    section.top_margin = Inches(0.00)
    section.bottom_margin = Inches(0.00)
    section.left_margin = Inches(0.00)
    section.right_margin = Inches(0.00)

style = document.styles['Normal']
font = style.font
font.size = Pt(8)

table = document.add_table(rows=0, cols=3)
index = 0
full_count = 1
for item_one, item_two, description, max_portion, quantity_adjusted, mods in zip(
        line_items['title'].tolist(), line_items['quantity'], line_items['description'],
        line_items['max_portion'], line_items['quantity_adjusted'], line_items['modifications']):
    count = 0
    if index % 3 == 0:
        cell_row = table.add_row()
        cell_row.height = Cm(0.1)
    row_cells = cell_row.cells
    part_one_cell = row_cells[index % 3]
    part_one_cell.height = Cm(0.1)
    #para = doc.add_paragraph().add_run('GeeksforGeeks is a Computer Science portal for geeks.')
    #para.font.size = Pt(12)
    p = part_one_cell.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    #p1 = part_one_cell.paragraphs[0].add_run(item_one.upper() + ' ' + description.upper())
    #p1.alignment = WD_ALIGN_PARAGRAPH.CENTER
    if len(item_one + description) < 40:
        p.add_run(item_one.upper() + ' ' + description.upper()).font.size = Pt(12)
    elif len(item_one + description) < 60:
        p.add_run(item_one.upper() + ' ' + description.upper()).font.size = Pt(10)
    else:
        p.add_run(item_one.upper() + ' ' + description.upper()).font.size = Pt(8)
    row1 = row_cells[index % 3]
    row2 = row1.add_paragraph(mods)
    row2.alignment = WD_ALIGN_PARAGRAPH.CENTER
    row = row_cells[index % 3]
    p1 = row.add_paragraph(f'{x[str(quantity_adjusted)]}')
    p1.alignment = WD_ALIGN_PARAGRAPH.RIGHT
    #part_one_cell.paragraphs[0].add_run(f'{x[str(item_two)]}')
    #part_one_cell.paragraphs[0].add_run(f' {str(x)}').bold = True
    index = index + 1
    full_count = full_count + 1
    if full_count % 30 == 0:
        document.add_page_break()
        table = document.add_table(rows=0, cols=3)
I have no problem getting 10 1 cm rows on a single page. I declare the number of rows when adding the table:
from docx import Document
from docx.shared import Cm

document = Document()
table = document.add_table(rows=10, cols=3)
table.style = 'Table Grid'
for row in table.rows:
    row.height = Cm(1)
document.save('demo.docx')
To add rows in a for loop:
table = document.add_table(rows=0, cols=3)
table.style = 'Table Grid'
for i in range(10):
    row = table.add_row()
    row.height = Cm(1)
document.save('demo.docx')
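If rows still come out taller than requested, it is usually because the height is treated as a minimum by default, so cell contents can push rows taller. Forcing an exact height may help; a sketch using python-docx's row height rule enum:

from docx import Document
from docx.enum.table import WD_ROW_HEIGHT_RULE
from docx.shared import Cm

document = Document()
table = document.add_table(rows=10, cols=3)
table.style = 'Table Grid'
for row in table.rows:
    row.height = Cm(1)
    # EXACTLY keeps Word from growing the row to fit its content
    row.height_rule = WD_ROW_HEIGHT_RULE.EXACTLY
document.save('demo.docx')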
I'm having trouble with my web scraper: it isn't getting the "Odds" values, and I'm not sure what is wrong. For each piece of information I use a try/except to see if the element is available, but I can't figure out why the Odds values in particular fail. Thanks for the help.
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://www.ncaagamesim.com/college-basketball-predictions.asp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')

# Get column names
headers = table.find_all('th')
cols = [x.text for x in headers]

# Get all rows in table body
table_rows = table.find_all('tr')
rows = []

# Grab the text of each td, and put into a rows list
for each in table_rows[1:]:
    odd_avail = True
    data = each.find_all('td')
    time = data[0].text.strip()

    # Get matchup and odds
    try:
        matchup, odds = data[1].text.strip().split('\xa0')
        odd_margin = float(odds.split('by')[-1].strip())
    except:
        matchup = data[1].text.strip()
        odd_margin = '-'
        odd_avail = False

    # Get favored team
    try:
        odd_team_win = data[1].find_all('img')[-1]['title']
    except:
        odd_team_win = '-'
        odd_avail = False

    # Get simulation winner
    try:
        sim_team_win = data[2].find('img')['title']
    except:
        sim_team_win = '-'
        odd_avail = False

    awayTeam = matchup.split('#')[0].strip()
    homeTeam = matchup.split('#')[1].strip()

    # Get simulation margin
    try:
        sim_margin = float(re.findall(r"\d+\.\d+", data[2].text)[-1])
    except:
        sim_margin = '-'
        odd_avail = False

    # If all variables available, determine odds, simulation margin points, and optimal bet
    if odd_avail:
        if odd_team_win == sim_team_win:
            diff = abs(sim_margin - odd_margin)
            if sim_margin > odd_margin:
                bet = odd_team_win
            else:
                if odd_team_win == homeTeam:
                    bet = awayTeam
                else:
                    bet = homeTeam
        else:
            diff = odd_margin + sim_margin
            bet = sim_team_win
    else:
        diff = -1
        bet = '-'

    # Create table
    row = {cols[0]: time, 'Matchup': matchup, 'Odds Winner': odd_team_win, 'Odds': odd_margin,
           'Simulation Winner': sim_team_win, 'Simulation Margin': sim_margin, 'Diff': diff, 'Bet': bet}
    rows.append(row)

df = pd.DataFrame(rows)
df = df.sort_values(by=['Diff'], ascending=False)
print(df.to_string())
# df.to_csv('odds.csv', index=False)
When I run this code, everything else works perfectly, but all of the Odds values in the table come out as '-'.
I added a few things to the code, to account for:
If the odds are "Even" (versus there being no odds at all)
If a team doesn't have a logo, to still put the team name
As far as the odds not showing: check the csv file to see if they're there. If they are, it might just be a preference you need to change in PyCharm (it may just be cutting off part of the string).
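If it is the console truncating the output, widening pandas' display settings before printing may help (a sketch; these are standard pandas options):

import pandas as pd

# Stop pandas from truncating wide frames / long cells when printing
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)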
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://www.ncaagamesim.com/college-basketball-predictions.asp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')

# Get column names
headers = table.find_all('th')
cols = [x.text for x in headers]

# Get all rows in table body
table_rows = table.find_all('tr')
rows = []

# Grab the text of each td, and put into a rows list
for each in table_rows[1:]:
    odd_avail = True
    data = each.find_all('td')
    time = data[0].text.strip()

    # Get matchup and odds
    try:
        matchup, odds = data[1].text.strip().split('\xa0')
        odd_margin = float(odds.split('by')[-1].strip())
    except:
        matchup = data[1].text.strip()
        if 'Even' in matchup:
            matchup, odds = data[1].text.strip().split('\xa0')
            odd_margin = 0
        else:
            odd_margin = '-'
            odd_avail = False

    awayTeam = matchup.split('#')[0].strip()
    homeTeam = matchup.split('#')[1].strip()

    # Get favored team
    try:
        odd_team_win = data[1].find_all('img')[-1]['title']
    except:
        odd_team_win = '-'
        odd_avail = False

    # Get simulation winner
    try:
        sim_team_win = data[2].find('img')['title']
    except:
        if 'wins' in data[2].text:
            sim_team_win = data[2].text.split('wins')[0].strip()
        else:
            sim_team_win = '-'
            odd_avail = False

    # Get simulation margin
    try:
        sim_margin = float(re.findall(r"\d+\.\d+", data[2].text)[-1])
    except:
        sim_margin = '-'
        odd_avail = False

    # If all variables available, determine odds and simulation margin points
    if odd_avail:
        if odd_team_win == sim_team_win:
            diff = abs(sim_margin - odd_margin)
        else:
            diff = odd_margin + sim_margin
    else:
        diff = '-'

    # Create table
    row = {cols[0]: time, 'Away Team': awayTeam, 'Home Team': homeTeam, 'Odds Winner': odd_team_win,
           'Odds': odd_margin, 'Simulation Winner': sim_team_win, 'Simulation Margin': sim_margin, 'Diff': diff}
    rows.append(row)

df = pd.DataFrame(rows)
print(df.to_string())
# df.to_csv('odds.csv', index=False)
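One side note: Diff now mixes floats and '-' strings, so if you want to restore the sort from your original script, coercing the column to numeric first avoids a TypeError (a sketch):

# '-' entries become NaN, which sort to the end with na_position='last'
df['Diff'] = pd.to_numeric(df['Diff'], errors='coerce')
df = df.sort_values(by=['Diff'], ascending=False, na_position='last')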
I'm trying to extract batting stats for all WC 2019 players, but my query gets stuck with a "list index out of range" error at this player: http://www.espncricinfo.com/india/content/player/398438.html
How can I handle the exception, or pass over it, to get complete stats for every team's players?
import urllib.request
from collections import OrderedDict

import pandas as pd
from bs4 import BeautifulSoup

# player, player_id, ctx (an ssl context), and con (a DB connection) are set up earlier in my script
url2 = 'http://stats.espncricinfo.com/ci/engine/player/' + \
    str(player_id) + \
    '.html?class=2;template=results;type=batting;view=innings'
html = urllib.request.urlopen(url2, context=ctx).read()

temp_data = OrderedDict()
list_of_dict = []

bs = BeautifulSoup(html, 'lxml')
table_body = bs.find_all('tbody')
rows = table_body[1].find_all('tr')
for row in rows:
    cols = row.find_all('td')
    cols = [x.text.strip() for x in cols]
    temp_data = OrderedDict()
    temp_data["Runs"] = cols[0]
    temp_data["Mins"] = cols[1]
    temp_data["BF"] = cols[2]
    temp_data["fours"] = cols[3]
    temp_data["sixs"] = cols[4]
    temp_data["SR"] = cols[5]
    temp_data["POS"] = cols[6]
    temp_data["Dismissal"] = cols[7]
    temp_data["Inns"] = cols[8]
    temp_data["Opposition"] = cols[10]
    temp_data["Ground"] = cols[11]
    temp_data["Date"] = cols[12]
    temp_data["player"] = player
    temp_data["playerid"] = player_id
    list_of_dict.append(temp_data)

df = pd.DataFrame(list_of_dict)
df.to_sql("dummy", con, if_exists="append")
I'd like to extract these stats for every player in each WC squad.
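The "list index out of range" error typically means some rows have fewer cells than the 13 the code indexes into. One way past it (a sketch against the same loop; rows, player, and player_id are the variables from the code above) is to check the row length and skip short rows:

from collections import OrderedDict

list_of_dict = []
for row in rows:
    cols = [x.text.strip() for x in row.find_all('td')]
    # Header/spacer rows have fewer cells than data rows; skip them instead of crashing
    if len(cols) < 13:
        continue
    keys = ["Runs", "Mins", "BF", "fours", "sixs", "SR", "POS", "Dismissal", "Inns"]
    temp_data = OrderedDict(zip(keys, cols[:9]))
    temp_data["Opposition"], temp_data["Ground"], temp_data["Date"] = cols[10], cols[11], cols[12]
    temp_data["player"] = player
    temp_data["playerid"] = player_id
    list_of_dict.append(temp_data)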
I have 150 csv files, each with two columns (time and site). I want to read each file, build a frequency dictionary ({'site': [site_number, number of occurrences of site]}), and create a DataFrame with 11 columns (user_id, site1, site2, ... site10), where user_id is parsed from the file name (../user0001.csv). Each row in the DataFrame is one session of 10 site visits. My code takes 150 seconds on 150 files (terrible). How can I improve it?
from glob import glob

import pandas as pd

def prepare_3(path_to_csv_files, session_length=10):
    word_freq = {}
    freq_dict = {}
    word_count = 0
    row = []
    columns = []
    columns.append('user_id')
    columns.extend(['site' + str(i) for i in range(1, session_length + 1)])
    lst_files = sorted(glob(path_to_csv_files))
    for csv in lst_files:
        user = int(csv[csv.find('.') - 4:csv.find('.')])
        frame = []
        frame.append(user)
        site_count = 0
        with open(csv, 'r') as f:
            f.readline()
            for line in f:
                site = line[line.find(',') + 1:].rstrip()
                site_count += 1
                if site in word_freq:
                    word_freq[site][1] += 1
                else:
                    word_count += 1
                    word_freq[site] = [word_count, 1]
                if site_count > session_length:
                    site_count = 1
                    row.append(frame)
                    frame = []
                    frame.append(user)
                    frame.append(word_freq[site][0])
                else:
                    frame.append(word_freq[site][0])
        row.append(frame)
    df = pd.DataFrame(data=row, columns=columns, dtype=int)
    df.fillna(0, inplace=True)
    return df, word_freq
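If the per-line Python is the bottleneck, one direction is to read each file with pandas and build the sessions by reshaping the sequence of site ids. A sketch, not a drop-in replacement: it assumes each csv has a header row and a 'site' column, parses user_id from the digits before '.csv', and zero-pads the last session the way fillna(0) does above:

import re
from glob import glob

import numpy as np
import pandas as pd

def prepare_fast(path_to_csv_files, session_length=10):
    freq = {}   # site -> [site_number, occurrences], as in the original
    rows = []
    for path in sorted(glob(path_to_csv_files)):
        user = int(re.search(r'(\d+)\.csv$', path).group(1))
        sites = pd.read_csv(path)['site']
        ids = []
        for site in sites:
            if site not in freq:
                freq[site] = [len(freq) + 1, 0]
            freq[site][1] += 1
            ids.append(freq[site][0])
        # Zero-pad to a multiple of session_length, then cut into sessions
        pad = (-len(ids)) % session_length
        sessions = np.array(ids + [0] * pad).reshape(-1, session_length)
        for session in sessions:
            rows.append([user] + session.tolist())
    columns = ['user_id'] + ['site' + str(i) for i in range(1, session_length + 1)]
    return pd.DataFrame(rows, columns=columns), freq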
I am trying to create an Excel spreadsheet from a pandas.pivot_table.
I don't want to use to_excel as it:
Renders at half the speed
Doesn't allow cell formatting
Doesn't compute cell widths
Doesn't allow me to create other cells with formulas
etc.
Instead, my approach is to:
create a dataframe from a list of lists
convert it to a pivot table with pivot_table
convert that to records with to_records
Now I can create a worksheet, but I need the column headers. The index headers I can save, since I specified them when creating the pivot table, but how do I get the inferred column names deduced from the distinct values?
import datetime
import logging
import math
import random

import pandas
import xlsxwriter
from dateutil.relativedelta import relativedelta

logging.basicConfig(level=logging.INFO)

def get_sales_data(*, num_records, num_custs, num_products, date_from, number_of_months):
    print("number of months %s" % number_of_months)
    sales = {}
    rows = []
    for cust_nbr in range(0, num_custs):
        cust_name = "cust " + "{0:0>3}".format(cust_nbr)
        for month_delta in range(0, number_of_months):
            ship_date = date_from + relativedelta(months=month_delta)
            for product_nbr in range(0, num_products):  # was (0, num_products), which only yielded two values
                product = "product " + str(product_nbr)
                qty = random.randint(0, 20)
                if qty > 0:
                    key = (cust_name, product, ship_date)
                    shipment = (cust_name, product, ship_date, qty)
                    sales[key] = shipment
    for shipment in sales.values():
        rows.append(shipment)
    return rows

def to_excel(workbook, sheetname, dataframe):
    worksheet = workbook.add_worksheet(sheetname)
    row_index = 1
    max_widths = [None] * len(dataframe[0])
    for row in dataframe:
        col_index = 0
        for datum in row:
            if datum is not None:
                if isinstance(datum, float):
                    if not math.isnan(datum):
                        worksheet.write(row_index, col_index, datum)
                else:
                    worksheet.write(row_index, col_index, datum)
                if max_widths[col_index] is None or len(str(datum)) > max_widths[col_index]:
                    max_widths[col_index] = len(str(datum))
            col_index += 1
        row_index += 1
    col_index = 0
    for width in max_widths:
        worksheet.set_column(col_index, col_index, width + 1)
        col_index += 1

# Get a list of lists
from_date = datetime.date(2015, 1, 1)
to_date = datetime.date(2017, 7, 1)
matrix = get_sales_data(num_records=3000, num_products=3, date_from=from_date, number_of_months=30, num_custs=1000)

# Get a dataframe
labels = ["cust_name", "product", "ship_date", "qty"]
dataframe = pandas.DataFrame.from_records(matrix, columns=labels)
print(dataframe)

# Get the pivot
pivot_table = pandas.pivot_table(dataframe, columns='ship_date', values='qty', index=['cust_name', 'product'])
print(pivot_table)

# Convert back to records
records = pivot_table.to_records()

# Write the excel file
output = open("/tmp/crosstab.xlsx", "wb")
workbook = xlsxwriter.Workbook(output)
to_excel(workbook, "Cust Product By Month", records)
workbook.close()
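As for the header question: the record array returned by to_records() carries every header name in its dtype, the index levels first and then one field per distinct ship_date value. A sketch of pulling them out and writing them as row 0 (using xlsxwriter's get_worksheet_by_name and write_row), to be run before the workbook.close() call above:

# Index level names come first, then the inferred column names (the distinct ship_dates)
header = records.dtype.names
print(header)  # e.g. ('cust_name', 'product', '2015-01-01 00:00:00', ...)

# to_excel() starts writing data at row_index = 1, so row 0 is free for these headers
worksheet = workbook.get_worksheet_by_name("Cust Product By Month")
worksheet.write_row(0, 0, header)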