I want to scrape the URLs from an HTML table on this website. I was able to gather LOCATION | DATE | SUMMARY | DEADLINE, but the SUMMARY field contains a URL to another page. I want to scrape the entire table along with this URL so my scraped data becomes LOCATION | DATE | SUMMARY | DEADLINE | URL.
import requests as rq
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://www.tendersinfo.com/global-information-technology-tenders-{}.php'
amount_of_pages = 4796 #5194
rows = []
for i in range(1,amount_of_pages):
    response = rq.get(url.format(i))
    if response.status_code == 200:
        soup = BeautifulSoup(response.text,'html.parser')
        table = soup.find('table',{'id':'datatable'})
        headers = []
        for th in table.find("tr").find_all("th"):
            headers.append(th.text.strip())
        for tr in table.find_all("tr")[1:]:
            cells = []
            tds = tr.find_all("td")
            if len(tds) == 0:
                ths = tr.find_all("th")
                for th in ths:
                    cells.append(th.text.strip())
                links = [th.findAll('a')]
            else:
                for td in tds:
                    cells.append(td.text.strip())
                links = [td.findAll('a')]
            rows.append(cells)
You'll need to get the <a> tag under the <td> tag, and pull out the href attribute.
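The key step in isolation looks roughly like this (td is one cell from your existing loop, cells is the row you are building):

a_tag = td.find('a')              # the <a> element inside the cell, if the cell has one
if a_tag and a_tag.has_attr('href'):
    cells.append(a_tag['href'])   # append the URL as an extra column

Full script with that change folded in: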
import requests as rq
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://www.tendersinfo.com/global-information-technology-tenders-{}.php'
amount_of_pages = 4796 #5194
rows = []
headers = []
for i in range(1,amount_of_pages+1): # <-- if there are 4796 pages, your range needs to go to 4797; range(start, end) is not inclusive of the end value
    response = rq.get(url.format(i))
    print(i)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text,'html.parser')
        table = soup.find('table',{'id':'datatable'})
        if len(headers) == 0:
            for th in table.find("tr").find_all("th"):
                headers.append(th.text.strip())
            headers.append('URL')
        for tr in table.find_all("tr")[1:]:
            cells = []
            tds = tr.find_all("td")
            for td in tds:
                cells.append(td.text.strip())
                if td.find('a'):
                    link = td.find('a')['href']
            cells = cells + [link]
            rows.append(cells)

df = pd.DataFrame(rows, columns=headers)
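Once the frame is built you will probably want to save it and, with close to 4,800 pages, pause between requests; a minimal follow-up sketch (the filename and the one-second delay are my own choices, not part of the code above):

import time

# inside the page loop you could add, after rows.append(cells):
#     time.sleep(1)   # be polite to the server across thousands of requests

df.to_csv('tenders.csv', index=False)  # persist the scraped table
print(df.shape)                        # quick sanity check: (n_rows, n_columns)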
I am trying to scrape the contents of the hyperlinks on the left side of this page. I can already scrape the contents of a single page, so now I am trying to run the script on each individual hyperlink on the left side of the page.
URL: https://bitinfocharts.com/top-100-richest-dogecoin-addresses-3.html
I think the url needs to be a dynamic variable that a loop updates for each of the hyperlinks on the page above, although I'm not sure this is the best approach, as this is my first project.
Any advice is greatly appreciated.
Here is the code that I am trying to plug this into.
import csv
import requests
from bs4 import BeautifulSoup as bs
url = 'https://bitinfocharts.com/dogecoin/address/DN5Hp2kCkvCsdwr5SPmwHpiJgjKnC5wcT7'
headers = {"User-Agent": "Mozilla/5.0"}
r = requests.get(url, headers=headers)
soup = bs(r.content, 'lxml')
table = soup.find(id="table_maina")
headers = []
datarows = []
#Get crypto address for the filename
item = soup.find('h1').text
newitem = item.replace('Dogecoin','')
finalitem = newitem.replace('Address','')
for row in table.find_all('tr'):
    heads = row.find_all('th')
    if heads:
        headers = [th.text for th in heads]
    else:
        datarows.append([td.text for td in row.find_all('td')])
fcsv = csv.writer(open(f'{finalitem}.csv', 'w', newline=''))
fcsv.writerow(headers)
fcsv.writerows(datarows)
A simple way would be to make an initial request and extract all the links in the second column of the table.
Then loop over those links, make the requests, and continue with your existing code, except also handle the case where no table is present.
import csv
import requests
from bs4 import BeautifulSoup as bs
headers = []
datarows = []

with requests.Session() as s:
    s.headers = {"User-Agent": "Safari/537.36"}
    r = s.get('https://bitinfocharts.com/top-100-richest-dogecoin-addresses-3.html')
    soup = bs(r.content, 'lxml')
    address_links = [i['href'] for i in soup.select('.table td:nth-child(2) > a')]

    for url in address_links:
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        table = soup.find(id="table_maina")
        if table:
            datarows = []  # reset per address so each CSV only holds this address's rows
            item = soup.find('h1').text
            newitem = item.replace('Dogecoin','')
            finalitem = newitem.replace('Address','')
            for row in table.find_all('tr'):
                heads = row.find_all('th')
                if heads:
                    headers = [th.text for th in heads]
                else:
                    datarows.append([td.text for td in row.find_all('td')])
            fcsv = csv.writer(open(f'{finalitem}.csv', 'w', newline=''))
            fcsv.writerow(headers)
            fcsv.writerows(datarows)
        else:
            print('no table for: ', url)
I want to scrape all data-oid tags from this page, but nothing is returned in the output.
Code
import requests
from bs4 import BeautifulSoup

url = 'https://www.betexplorer.com/soccer/south-korea/k-league-2/bucheon-fc-1995-jeonnam/EDwej14E/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', class_='table-main')

for rows in table.find_all('tr')[1:]:
    for row in rows.find_all('td'):
        data = row.get_attrs['data-oid']
        print(data)
The table part of the page is loaded from an external URL via JavaScript. To get the data along with the tags that have data-oid= parameters, you can use this example:
import requests
from bs4 import BeautifulSoup
url = "https://www.betexplorer.com/soccer/south-korea/k-league-2/bucheon-fc-1995-jeonnam/EDwej14E/"
match_id = "EDwej14E" # <-- this is the last part of URL
api_url = "https://www.betexplorer.com/match-odds/{}/1/1x2/".format(match_id)
headers = {"Referer": "https://www.betexplorer.com"}
data = requests.get(api_url, headers=headers).json()
soup = BeautifulSoup(data["odds"], "html.parser")
# your code:
table = soup.find("table", class_="table-main")
for rows in table.find_all("tr")[1:]:
    for row in rows.select("td[data-oid]"):
        data = row["data-oid"]
        print(data)
Prints:
...
4kqjpxv464x0xc6aif
4kqjpxv464x0xc6aie
4kqjpxv498x0x0
4kqjpxv464x0xc6aif
4kqjpxv464x0xc6aie
4kqjpxv498x0x0
4kqjpxv464x0xc6aif
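If you also want the odds rows themselves rather than only the data-oid attributes, a minimal sketch is to feed the same HTML fragment to pandas.read_html (this assumes the fragment returned under "odds" contains a parseable <table>; the column layout is whatever the site ships):

from io import StringIO
import pandas as pd

# re-request the fragment (the loop above reuses the name `data` for the attribute values)
odds_html = requests.get(api_url, headers=headers).json()["odds"]
odds_tables = pd.read_html(StringIO(odds_html))  # parses every <table> in the fragment
print(odds_tables[0].head())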
I'm trying to scrape a table into a dataframe. My attempt only returns the table name and not the data within rows for each region.
This is what I have so far:
from bs4 import BeautifulSoup as bs4
import requests
url = 'https://www.eia.gov/todayinenergy/prices.php'
r = requests.get(url)
soup = bs4(r.text, "html.parser")
table_regions = soup.find('table', {'class': "t4"})
regions = table_regions.find_all('tr')
for row in regions:
    print(row)
Ideal outcome I'd like to get:
region | price
---------------|-------
new england | 2.59
new york city | 2.52
Thanks for any assistance.
If you check your HTML response (soup) you will see that the table tag you get in the line table_regions = soup.find('table', {'class': "t4"}) is closed before the rows that contain the information you need (the ones that contain the td's with the class names up, dn, d1 and s1).
So how about using the raw td tags like this:
from bs4 import BeautifulSoup as bs4
import requests
import pandas as pd
url = 'https://www.eia.gov/todayinenergy/prices.php'
r = requests.get(url)
soup = bs4(r.text, "html.parser")
a = soup.find_all('tr')
rows = []
subel = []
for tr in a[42:50]:
    b = tr.find_all('td')
    for td in b:
        subel.append(td.string)
    rows.append(subel)
    subel = []
df = pd.DataFrame(rows, columns=['Region','Price_1', 'Percent_change_1', 'Price_2', 'Percent_change_2', 'Spark Spread'])
Notice that I use just the a[42:50] slice of the results, because a contains every tr on the page. You can use the rest too if you need to.
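If the hard-coded a[42:50] slice feels brittle (it shifts whenever the page layout changes), a hedged alternative is to keep only the tr elements whose cell count matches the six columns you expect; the count of 6 is an assumption taken from the column list above, and you may still need to filter further on the cell classes (s1, d1, up, dn) mentioned earlier:

expected_cols = 6  # Region, two price/percent-change pairs, Spark Spread
candidate_rows = [tr for tr in soup.find_all('tr')
                  if len(tr.find_all('td')) == expected_cols]
rows = [[td.get_text(strip=True) for td in tr.find_all('td')] for tr in candidate_rows]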
Does anyone know how to scrape a list of URLs from the same website with BeautifulSoup? list = ['url1', 'url2', 'url3'...]
My code to extract a list of urls:
import requests
from bs4 import BeautifulSoup

url = 'http://www.hkjc.com/chinese/racing/selecthorsebychar.asp?ordertype=2'
url1 = 'http://www.hkjc.com/chinese/racing/selecthorsebychar.asp?ordertype=3'
url2 = 'http://www.hkjc.com/chinese/racing/selecthorsebychar.asp?ordertype=4'

r = requests.get(url)
r1 = requests.get(url1)
r2 = requests.get(url2)

data = r.text
soup = BeautifulSoup(data, 'lxml')
links = []
for link in soup.find_all('a', {'class': 'title_text'}):
    links.append(link.get('href'))

data1 = r1.text
soup = BeautifulSoup(data1, 'lxml')
for link in soup.find_all('a', {'class': 'title_text'}):
    links.append(link.get('href'))

data2 = r2.text
soup = BeautifulSoup(data2, 'lxml')
for link in soup.find_all('a', {'class': 'title_text'}):
    links.append(link.get('href'))

new = ['http://www.hkjc.com/chinese/racing/']*1123
url_list = ['{}{}'.format(x,y) for x,y in zip(new,links)]
Code to extract data from a single URL:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import pandas as pd
url = 'myurl'
r = requests.get(url)
r.encoding = 'utf-8'
html_content = r.text
soup = BeautifulSoup(html_content, 'lxml')
soup.findAll('tr')[27].findAll('td')
column_headers = [th.getText() for th in
                  soup.findAll('tr')[27].findAll('td')]
data_rows =soup.findAll('tr')[29:67]
data_rows
player_data = [[td.getText() for td in data_rows[i].findAll('td', {'class':['htable_text', 'htable_eng_text']})]
               for i in range(len(data_rows))]
player_data_02 = []
for i in range(len(data_rows)):
    player_row = []
    for td in data_rows[i].findAll('td'):
        player_row.append(td.getText())
    player_data_02.append(player_row)
df = pd.DataFrame(player_data, columns=column_headers[:18])
Based on a subset of your links, collecting the table data goes like this:
from bs4 import BeautifulSoup as BS
import requests
import pandas as pd
url_list = ['http://www.hkjc.com/english/racing/horse.asp?HorseNo=S217','http://www.hkjc.com/english/racing/horse.asp?HorseNo=A093','http://www.hkjc.com/english/racing/horse.asp?HorseNo=V344','http://www.hkjc.com/english/racing/horse.asp?HorseNo=V077', 'http://www.hkjc.com/english/racing/horse.asp?HorseNo=P361', 'http://www.hkjc.com/english/racing/horse.asp?HorseNo=T103']
for link in url_list:
    r = requests.get(link)
    r.encoding = 'utf-8'
    html_content = r.text
    soup = BS(html_content, 'lxml')

    table = soup.find('table', class_='bigborder')
    if not table:
        continue

    trs = table.find_all('tr')
    if not trs:
        continue  # if trs are not found, start the next iteration with another link

    headers = trs[0]
    headers_list = []
    for td in headers.find_all('td'):
        headers_list.append(td.text)
    headers_list += ['Season']
    headers_list.insert(19, 'pseudocol1')
    headers_list.insert(20, 'pseudocol2')
    headers_list.insert(21, 'pseudocol3')

    res = []
    row = []
    season = ''
    for tr in trs[1:]:
        if 'Season' in tr.text:
            season = tr.text
        else:
            tds = tr.find_all('td')
            for td in tds:
                row.append(td.text.strip('\n').strip('\r').strip('\t').strip('"').strip())  # clean data
            row.append(season.strip())
            res.append(row)
            row = []

    res = [i for i in res if i[0] != '']
    df = pd.DataFrame(res, columns=headers_list)
    del df['pseudocol1'], df['pseudocol2'], df['pseudocol3']
    del df['VideoReplay']
    df.to_csv('/home/username/' + str(url_list.index(link)) + '.csv')
If you want to store data from all tables in one dataframe, this little modification will do the trick:
from bs4 import BeautifulSoup as BS
import requests
import pandas as pd
url_list = ['http://www.hkjc.com/english/racing/horse.asp?HorseNo=S217','http://www.hkjc.com/english/racing/horse.asp?HorseNo=A093','http://www.hkjc.com/english/racing/horse.asp?HorseNo=V344','http://www.hkjc.com/english/racing/horse.asp?HorseNo=V077', 'http://www.hkjc.com/english/racing/horse.asp?HorseNo=P361', 'http://www.hkjc.com/english/racing/horse.asp?HorseNo=T103']
res=[] #placing res outside of loop
for link in url_list:
    r = requests.get(link)
    r.encoding = 'utf-8'
    html_content = r.text
    soup = BS(html_content, 'lxml')

    table = soup.find('table', class_='bigborder')
    if not table:
        continue

    trs = table.find_all('tr')
    if not trs:
        continue  # if trs are not found, start the next iteration with another link

    headers = trs[0]
    headers_list = []
    for td in headers.find_all('td'):
        headers_list.append(td.text)
    headers_list += ['Season']
    headers_list.insert(19, 'pseudocol1')
    headers_list.insert(20, 'pseudocol2')
    headers_list.insert(21, 'pseudocol3')

    row = []
    season = ''
    for tr in trs[1:]:
        if 'Season' in tr.text:
            season = tr.text
        else:
            tds = tr.find_all('td')
            for td in tds:
                row.append(td.text.strip('\n').strip('\r').strip('\t').strip('"').strip())
            row.append(season.strip())
            res.append(row)
            row = []

res = [i for i in res if i[0] != '']  # outside of loop
df = pd.DataFrame(res, columns=headers_list)  # outside of loop
del df['pseudocol1'], df['pseudocol2'], df['pseudocol3']
del df['VideoReplay']
df.to_csv('/home/Username/' + 'tables.csv')  # outside of loop
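An alternative design is to build one DataFrame per link and merge them at the end with pd.concat instead of carrying one shared res list; a small self-contained sketch of the pattern (the two stand-in frames and their values are purely illustrative, not scraped data):

import pandas as pd

per_link_frames = []  # in the loop you would append pd.DataFrame(res, columns=headers_list) for each link
per_link_frames.append(pd.DataFrame({'HorseNo': ['S217'], 'Season': ['17/18']}))  # stand-in frame
per_link_frames.append(pd.DataFrame({'HorseNo': ['A093'], 'Season': ['17/18']}))  # stand-in frame
combined = pd.concat(per_link_frames, ignore_index=True)  # columns are aligned by name
combined.to_csv('tables.csv', index=False)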
First, there is a leading "1" in the returned string and I'm having trouble iterating past it - I've tried the [0:] slicing method and get stuck somewhere. I'd like to drop it or skip it to get to the second value, which is the id value in the scraped table.
Additionally, when trying to format the returned items from the table for storage, I've been getting index-out-of-range errors. I've been using def store().
import requests
from bs4 import BeautifulSoup
import MySQLdb
#mysql portion
mydb = MySQLdb.connect(host='****',
                       user='****',
                       passwd='****',
                       db='****')
cur = mydb.cursor()

def store(id, ticker):
    cur.execute('INSERT IGNORE INTO TEST (id, ticker) VALUES (\"%s\", \"%s\")', (id, ticker))
    cur.connection.commit()
base_url = 'http://finviz.com/screener.ashx?v=152&s=ta_topgainers&o=price&c=0,1,2,3,4,5,6,24,25,63,64,65,66,67'
html = requests.get(base_url)
soup = BeautifulSoup(html.content, "html.parser")
main_div = soup.find('div', attrs = {'id':'screener-content'})
table = main_div.find('table')
sub = table.findAll('tr')
cells = sub[5].findAll('td')
for cell in cells:
    link = cell.a
    if link is not None:
        link = link.get_text()
        id = link[0]
        ticker = link[1]
        store(id, ticker)
        print(link)
I don't know what you're really trying to do, but this works for me:
import requests
from bs4 import BeautifulSoup
base_url = 'http://finviz.com/screener.ashx?v=152&s=ta_topgainers&o=price&c=0,1,2,3,4,5,6,24,25,63,64,65,66,67'
html = requests.get(base_url)
soup = BeautifulSoup(html.content, "html.parser")
rows = soup.find_all('tr', class_=["table-dark-row-cp", "table-light-row-cp"])
for row in rows:
    columns = row.find_all('td')
    id_ = columns[0].a.get_text()
    ticker = columns[1].a.get_text()
    company = columns[2].a.get_text()
    sector = columns[3].a.get_text()
    industry = columns[4].a.get_text()
    print(id_, ticker, company, sector, industry)
Or even using just the a tags:
for row in rows:
    columns = row.find_all('a')
    id_ = columns[0].get_text()
    ticker = columns[1].get_text()
    company = columns[2].get_text()
    sector = columns[3].get_text()
    industry = columns[4].get_text()
    print(id_, ticker, company, sector, industry)
BTW: you can also use CSS selectors:
rows = soup.select('#screener-content table[bgcolor="#d3d3d3"] tr[class]')
or
rows = soup.select('#screener-content table[bgcolor="#d3d3d3"] tr')
# skip first row with headers
rows = rows[1:]
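If you would rather end up with a dataframe than printed values, a minimal sketch building on the rows selected above (the column names mirror the variables in the loop earlier and are my own labels, not the site's):

import pandas as pd

records = []
for row in rows:
    cells = [a.get_text() for a in row.find_all('a')]
    if len(cells) >= 5:            # skip any row without the expected links
        records.append(cells[:5])

df = pd.DataFrame(records, columns=['id', 'ticker', 'company', 'sector', 'industry'])
print(df.head())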