Splitting HTML text by <br> while using beautifulsoup - python

HTML code:
<td> <label class="identifier">Speed (avg./max):</label> </td> <td class="value"> <span class="block">4.5 kn<br>7.1 kn</span> </td>
I need to get values 4.5 kn and 7.1 as separate list items so I could append them separately. I do not want to split it I wanted to split the text string using re.sub, but it does not work. I tried too use replace to replace br, but it did not work. Can anybody provide any insight?
Python code:
def NameSearch(shipLink, mmsi, shipName):
from bs4 import BeautifulSoup
import urllib2
import csv
import re
values = []
values.append(mmsi)
values.append(shipName)
regex = re.compile(r'[\n\r\t]')
i = 0
with open('Ship_indexname.csv', 'wb')as f:
writer = csv.writer(f)
while True:
try:
shipPage = urllib2.urlopen(shipLink, timeout=5)
except urllib2.URLError:
continue
except:
continue
break
soup = BeautifulSoup(shipPage, "html.parser") # Read the web page HTML
#soup.find('br').replaceWith(' ')
#for br in soup('br'):
#br.extract()
table = soup.find_all("table", {"id": "vessel-related"}) # Finds table with class table1
for mytable in table: #Loops tables with class table1
table_body = mytable.find_all('tbody') #Finds tbody section in table
for body in table_body:
rows = body.find_all('tr') #Finds all rows
for tr in rows: #Loops rows
cols = tr.find_all('td') #Finds the columns
for td in cols: #Loops the columns
checker = td.text.encode('ascii', 'ignore')
check = regex.sub('', checker)
if check == ' Speed (avg./max): ':
i = 1
elif i == 1:
print td.text
pat=re.compile('<br\s*/>')
print pat.sub(" ",td.text)
values.append(td.text.strip("\n").encode('utf-8')) #Takes the second columns value and assigns it to a list called Values
i = 0
#print values
return values
NameSearch('https://www.fleetmon.com/vessels/kind-of-magic_0_3478642/','230034570','KIND OF MAGIC')

Locate the "Speed (avg./max)" label first and then go to the value via .find_next():
from bs4 import BeautifulSoup
data = '<td> <label class="identifier">Speed (avg./max):</label> </td> <td class="value"> <span class="block">4.5 kn<br>7.1 kn</span> </td>'
soup = BeautifulSoup(data, "html.parser")
label = soup.find("label", class_="identifier", text="Speed (avg./max):")
value = label.find_next("td", class_="value").get_text(strip=True)
print(value) # prints 4.5 kn7.1 kn
Now, you can extract the actual numbers from the string:
import re
speed_values = re.findall(r"([0-9.]+) kn", value)
print(speed_values)
Prints ['4.5', '7.1'].
You can then further convert the values to floats and unpack into separate variables:
avg_speed, max_speed = map(float, speed_values)

Related

How to get numbers from html?

I want to get
the number after: page=
the number after: "new">
the number after: /a>-
<td> </td>
<td> qwqwqwqwqw <br/> qwqwqwqwqw 4449-4450<br/> </td>
<td> </td>
<td> qwqwqwqwqw <br/> qwqwqwqwqw 5111-5550<br/> </td>
<td> </td>
...
My code
tables = soup.find_all('a', attrs={'target': 'new'})
gives my only a list (see below) without the third number
[4449,
5111,
...]
her is how i would try to extract the 3 numbers from my list, once it has the third digit in it.
list_of_number1 = []
list_of_number2 = []
list_of_number3 = []
regex = re.compile("page=(\d+)")
for table in tables:
number1 = filter(regex.match, tables)
number2 = table.next_sibling.strip()
number3 =
list_of_number1.append(number1)
list_of_number2.append(number2)
list_of_number3.append(number3)
Do i use beautifulsoup for the third number or is it feasible to regex through the whole html for any number following "/a>-"
Here is how you can obtain your result using just the information that you need to get the numbers in the specific a element and in the text node immediately on the right:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
tables = soup.find_all('a', attrs={'target': 'new'})
print([(t.text, t["href"].split('=')[-1], t.next_sibling.replace('-', '')) for t in tables])
# => [('4449', '99', '4450'), ('5111', '77', '5550')]
You may certainly go the harder way with regexps:
import re
#... the same initialization code as above
for t in tables:
page = ""
page_m = re.search(r"[#?]page=(\d+)", t["href"])
if page_m:
page = page_m.group(1)
else:
page = ""
num = "".join([x for x in t.next_sibling if x.isdigit()])
results.append((int(t.text), int(page), int(num)))
print(results)
# => [(4449, 99, 4450), (5111, 77, 5550)]
NOTE:
t.text - gets the element text
t["href"] - gets the href attribute value of the t element
t.next_sibling - gets the next node after current one that is on the same hierarchy level.
You can also try:
for b in soup.select('a'):
print(b.attrs['href'].split('=')[1], b.text, b.nextSibling)
Output:
99 4449 -4450
77 5111 -5550

Extract text in proper format (with spaces in between) from <td> tags using beautiful soup

I am trying to extract column headings from one of the tables from ABBV 10-k sec filing (`Issuer Purchases of Equity Securities' table on page 25 - below the graph.)
inside <td> tag in the column heading <tr> tag, text is in separate <div> tags as in the example below
<tr>
<td>
<div>string1</div>
<div>string2</div>
<div>string3</div>
</td>
</tr>
when trying to extract all text fro a tag, there is no space separation between texts (e.g. for the above html output will be string1string3string3 expectedstring1 string3 string3).
Using below code to extract column headings from table
url = 'https://www.sec.gov/Archives/edgar/data/1551152/000155115218000014/abbv-20171231x10k.htm'
htmlpage = requests.get(url)
soup = BeautifulSoup(htmlpage.text, "lxml")
table = soup.find_all('table')[76]
rows = table.find_all('tr')
table_data = []
for tr in rows[2:3]:
row_data=[]
cells = tr.find_all(['td', 'th'], recursive=False)
for cell in cells[1:4]:
row_data.append(cell.text.encode('utf-8'))
table_data.append([x.decode('utf-8').strip() for x in row_data])
print(table_data)
output:[['(a) TotalNumberof Shares(or Units)Purchased', '', '(b) AveragePricePaid per Share(or Unit)']]
Expected output:[['(a) Total Number of Shares (or Units) Purchased', '', '(b) Average Price Paid per Share (or Unit)']] (each word separated bay a space)
use the separator parameter with .get_text():
html = '''<tr>
<td>
<div>string1</div>
<div>string2</div>
<div>string3</div>
</td>
</tr>'''
import bs4
soup = bs4.BeautifulSoup(html, 'html.parser')
td = soup.find('td')
td.get_text(separator=' ')
Here's how it looks with your code:
from bs4 import BeautifulSoup
import requests
url = 'https://www.sec.gov/Archives/edgar/data/1551152/000155115218000014/abbv-20171231x10k.htm'
htmlpage = requests.get(url)
soup = BeautifulSoup(htmlpage.text, "lxml")
table = soup.find_all('table')[76]
rows = table.find_all('tr')
table_data = []
for tr in rows[2:3]:
row_data=[]
cells = tr.find_all(['td', 'th'], recursive=False)
for cell in cells[1:4]:
row_data.append(cell.get_text(separator=' ').encode('utf-8'))
table_data.append([x.decode('utf-8').strip() for x in row_data])
print(table_data)
Output:
print(table_data)
[['(a) Total Number of Shares (or Units) Purchased', '', '(b) Average Price Paid per Share (or Unit)']]

Converting a HTML table to a CSV in Python

I am trying to convert a table in HTML to a csv in Python. The table I am trying to extract is this one:
<table class="tblperiode">
<caption>Dades de període</caption>
<tr>
<th class="sortable"><span class="tooltip" title="Període (Temps Universal)">Període</span><br/>TU</th>
<th><span class="tooltip" title="Temperatura mitjana (°C)">TM</span><br/>°C</th>
<th><span class="tooltip" title="Temperatura màxima (°C)">TX</span><br/>°C</th>
<th><span class="tooltip" title="Temperatura mínima (°C)">TN</span><br/>°C</th>
<th><span class="tooltip" title="Humitat relativa mitjana (%)">HRM</span><br/>%</th>
<th><span class="tooltip" title="Precipitació (mm)">PPT</span><br/>mm</th>
<th><span class="tooltip" title="Velocitat mitjana del vent (km/h)">VVM (10 m)</span><br/>km/h</th>
<th><span class="tooltip" title="Direcció mitjana del vent (graus)">DVM (10 m)</span><br/>graus</th>
<th><span class="tooltip" title="Ratxa màxima del vent (km/h)">VVX (10 m)</span><br/>km/h</th>
<th><span class="tooltip" title="Irradiància solar global mitjana (W/m2)">RS</span><br/>W/m<sup>2</sup></th>
</tr>
<tr>
<th>
00:00 - 00:30
</th>
<td>16.2</td>
<td>16.5</td>
<td>15.4</td>
<td>93</td>
<td>0.0</td>
<td>6.5</td>
<td>293</td>
<td>10.4</td>
<td>0</td>
</tr>
<tr>
<th>
00:30 - 01:00
</th>
<td>16.4</td>
<td>16.5</td>
<td>16.1</td>
<td>90</td>
<td>0.0</td>
<td>5.8</td>
<td>288</td>
<td>8.6</td>
<td>0</td>
</tr>
And I want it to look something like this:
To achieve so, what I have tried is to parse the html and I have managed to build a dataframe with the data correctly doing the following:
from bs4 import BeautifulSoup
import csv
html = open("table.html").read()
soup = BeautifulSoup(html)
table = soup.select_one("table.tblperiode")
output_rows = []
for table_row in table.findAll('tr'):
columns = table_row.findAll('td')
output_row = []
for column in columns:
output_row.append(column.text)
output_rows.append(output_row)
df = pd.DataFrame(output_rows)
print(df)
However, I would like to have the columns name and a column indicating the interval of time, in the example of html above just two of them appear 00:00-00:30 and 00:30 1:00. Therefore my table should have two rows, one corresponding with the observations of 00:00-00:30 and another one with the observations of 00:30 and 1:00.
How could I get this information from my HTML?
Here's a way of doing it, it's probably not the nicest way but it works! You can read through the comments to figure out what the code is doing!
from bs4 import BeautifulSoup
import csv
#read the html
html = open("table.html").read()
soup = BeautifulSoup(html, 'html.parser')
# get the table from html
table = soup.select_one("table.tblperiode")
# find all rows
rows = table.findAll('tr')
# strip the header from rows
headers = rows[0]
header_text = []
# add the header text to array
for th in headers.findAll('th'):
header_text.append(th.text)
# init row text array
row_text_array = []
# loop through rows and add row text to array
for row in rows[1:]:
row_text = []
# loop through the elements
for row_element in row.findAll(['th', 'td']):
# append the array with the elements inner text
row_text.append(row_element.text.replace('\n', '').strip())
# append the text array to the row text array
row_text_array.append(row_text)
# output csv
with open("out.csv", "w") as f:
wr = csv.writer(f)
wr.writerow(header_text)
# loop through each row array
for row_text_single in row_text_array:
wr.writerow(row_text_single)
With this script:
import csv
from bs4 import BeautifulSoup
html = open('table.html').read()
soup = BeautifulSoup(html, features='lxml')
table = soup.select_one('table.tblperiode')
rows = []
for i, table_row in enumerate(table.findAll('tr')):
if i > 0:
periode = [' '.join(table_row.findAll('th')[0].text.split())]
data = [x.text for x in table_row.findAll('td')]
rows.append(periode + data)
header = ['Periode', 'TM', 'TX', 'TN', 'HRM', 'PPT', 'VVM', 'DVM', 'VVX', 'PM', 'RS']
with open('result.csv', 'w', newline='') as f:
w = csv.writer(f)
w.writerow(header)
w.writerows(rows)
I've managed to generate following CSV file on output:
Periode,TM,TX,TN,HRM,PPT,VVM,DVM,VVX,PM,RS
00:00 - 00:30,16.2,16.5,15.4,93,0.0,6.5,293,10.4,0
00:30 - 01:00,16.4,16.5,16.1,90,0.0,5.8,288,8.6,0
import csv
from bs4 import BeautifulSoup
import pandas as pd
html = open('test.html').read()
soup = BeautifulSoup(html, features='lxml')
#Specify table name which you want to read.
#Example: <table class="queryResults" border="0" cellspacing="1">
table = soup.select_one('table.queryResults')
def get_all_tables(soup):
return soup.find_all("table")
tbls = get_all_tables(soup)
for i, tablen in enumerate(tbls, start=1):
print(i)
print(tablen)
def get_table_headers(table):
headers = []
for th in table.find("tr").find_all("th"):
headers.append(th.text.strip())
return headers
head = get_table_headers(table)
#print(head)
def get_table_rows(table):
rows = []
for tr in table.find_all("tr")[1:]:
cells = []
# grab all td tags in this table row
tds = tr.find_all("td")
if len(tds) == 0:
# if no td tags, search for th tags
# can be found especially in wikipedia tables below the table
ths = tr.find_all("th")
for th in ths:
cells.append(th.text.strip())
else:
# use regular td tags
for td in tds:
cells.append(td.text.strip())
rows.append(cells)
return rows
table_rows = get_table_rows(table)
#print(table_rows)
def save_as_csv(table_name, headers, rows):
pd.DataFrame(rows, columns=headers).to_csv(f"{table_name}.csv")
save_as_csv("Test_table", head, table_rows)

How can I extract the value inside the curly brackets in python beautiful soup?

HTML excerpt
This is my code so far:
s = BS(r.content, 'lxml')
findDiv = s.find('div', {'id':'BodyContentPlaceholder_T00DF23F8005_Col00'})
findTable = findDiv.findAll('table', {'style':'width: 100%; border-collapse: collapse;'})
for table in findTable:
rows = table.findAll('tr')
rows = rows[1:]
for row in rows:
cells = row.findAll('td')
extension = cells[3].button.input.text
print('docs page ' + str(extension))
#extention = re.compile(pattern, text)
docpage_url_ending = cells[3].find('value')
print('url ' + str(docpage_url_ending))
I'm trying to get navigateUrl text from this
<input id="ctl00_BodyContentPlaceholder_C013_RadListView1_ctrl1_rlb1_ClientState" name="ctl00_BodyContentPlaceholder_C013_RadListView1_ctrl1_rlb1_ClientState" type="hidden" autocomplete="off" value="{"text":"Documents","value":"","target":"","navigateUrl":"/procurement/procurement-bids/constructionprocurementdetail?Title=Lakeland Adult Daycare Center Roof Replacement 18-785","primary":false}">
import json
import bs4
import urllib
data = '<input id="ctl00_BodyContentPlaceholder_C013_RadListView1_ctrl1_rlb1_ClientState" name="ctl00_BodyContentPlaceholder_C013_RadListView1_ctrl1_rlb1_ClientState" type="hidden" autocomplete="off" value="{"text":"Documents","value":"","target":"","navigateUrl":"/procurement/procurement-bids/constructionprocurementdetail?Title=Lakeland Adult Daycare Center Roof Replacement 18-785","primary":false}">'
# cook up some soup
soup = bs4.BeautifulSoup(data)
# extract the relevant attribute
vals_as_string = soup.html.body.input.attrs['value']
# it's urlencoded, so decode it
unquoted_vals_as_string = urllib.parse.unquote(vals_as_string)
# turns out, it's json
vals_as_json = json.loads(unquoted_vals_as_string)
# well, json converts to dict, so there's our target
navigateUrl = vals_as_json['navigateUrl']
That's how you get to your navigateUrl:
import json
from bs4 import BeautifulSoup as BS
s = BS(r.content, 'lxml')
findDiv = s.find('div', {'id':'BodyContentPlaceholder_T00DF23F8005_Col00'})
findTable = findDiv.findAll('table', {'style':'width: 100%; border-collapse: collapse;'})
for table in findTable:
rows = table.findAll('tr')
rows = rows[1:]
for row in rows:
cells = row.findAll('td')
extension = cells[3].button.input.text
print('docs page ' + str(extension))
#extention = re.compile(pattern, text)
docpage_url_ending = json.loads(cells[3].button.input.attr['value'])['navigateUrl']
print('url ' + docpage_url_ending)

BeautifulSoup: Can't Access Info Within TD

I'm looking at the following website:
https://modules.ussquash.com/ssm/pages/leagues/League_Information.asp?leagueid=1859
I want to extract the name of each university and the href associated with it. So for the first entry, I'd like to get Stanford and https://modules.ussquash.com/ssm/pages/leagues/Team_Information.asp?id=18564
I've gotten to the point where I have all of the TDs, using BeautifulSoup. I'm just having difficulty extracting the school and its href.
Here's my attempt:
def main():
r = requests.get('https://modules.ussquash.com/ssm/pages/leagues/League_Information.asp?leagueid=1859')
data = r.text
soup = BeautifulSoup(data)
table = soup.find_all('table')[1]
rows = table.find_all('tr')[1:]
for row in rows:
cols = row.find_all('td')
print(cols)
When I try to access cols[0], I get:
IndexError: list index out of range
Any idea how to fix this would be awesome!
Thanks
The first two tr's are in the thead which have no td tags, you want to skip the first two tr's:
rows = table.find_all('tr')[2:]
To get what you want, we can simplify using css selectors:
table = soup.find_all('table', limit=2)[1]
# skip first two tr's
rows = table.select("tr + tr + tr")
for row in rows:
# anchor we want is inside the first td
a = row.select_one("td a") # or a = row.find("td").a
print(a.text,a["href"])
Also the href is a relative path so you need to join it to a base url:
import requests
from bs4 import BeautifulSoup
from urllib.urlparse import urljoin
def main():
base = "https://modules.ussquash.com/ssm/pages/leagues/"
r = requests.get('https://modules.ussquash.com/ssm/pages/leagues/League_Information.asp?leagueid=1859')
data = r.text
soup = BeautifulSoup(data)
table = soup.find_all('table', limit=2)[1]
# skip first two tr's
rows = table.select("tr + tr + tr")
for row in rows:
a = row.select_one("td a")
print(a.text, urljoin(base, a["href"]))

Categories

Resources