Find .nextsibling with Beautifulsoup4 - python

I am trying to get (some) contents of a table from a URL.
So far I have managed to get two desired contents of the page, but there is a third one (the third column) whose text I would also like to get. The problem is that the underlying link exists elsewhere on the page (with different text), so if I want to load the table into an SQL database, the contents of the third column won't match the first two columns.
import urllib2
from bs4 import BeautifulSoup

startURL = "http://some.url/website.html"
page = urllib2.urlopen(startURL).read()
soup = BeautifulSoup(page, "html.parser")

for links in soup.findAll("a"):
    if "href" in links.attrs:
        www = links.attrs.values()
        if not "https://" in www[0]:  # to exclude all non-relative links, e.g. external links
            if "view/" in www[0]:  # to get only my desired links of column 1
                link_of_column1 = www[0]  # this is now my wanted link
Okay, so with this code I can get the second column. Where and how would I have to apply .next_sibling to get the next link in the next (3rd) column?
Edit:
As I have been asked: The URL is https://myip.ms/browse/web_hosting/World_Web_Hosting_Global_Statistics.html and I want to get the contents of Columns 2 and 3, which are "Hosting Company" (link text and link) and "Country" (text only).
Edit2:
Another thing I forgot: how can I extract the information that it's 137,157 records?

First find the table which has all the info using its id=web_hosting_tbl attribute. Then iterate over the rows of the table. But, if you look at the page source, the rows you need are not consecutive but alternate, and they don't have any class names. Also, the first row of the table is the header row, so we have to skip it.
After getting the required rows (using table.find_all('tr')[1::2]), find all the columns and then pull the required information from the corresponding ones.
Code:
import requests
from bs4 import BeautifulSoup

r = requests.get('https://myip.ms/browse/web_hosting/World_Web_Hosting_Global_Statistics.html')
soup = BeautifulSoup(r.text, 'lxml')
table = soup.find('table', id='web_hosting_tbl')

for row in table.find_all('tr')[1::2]:
    all_columns = row.find_all('td')
    name = all_columns[1].a.text
    link = all_columns[1].a['href']
    country = all_columns[2].a.text
    print(name, link, country, sep=' | ')
Partial output:
Godaddy.com, LLC | /view/web_hosting/2433/Godaddy_com_LLC.html | USA
Cloudflare, Inc | /view/web_hosting/4638/Cloudflare_Inc.html | USA
Amazon.com, Inc | /view/web_hosting/615/Amazon_com_Inc.html | USA
Ovh Sas | /view/web_hosting/7593/Ovh_Sas.html | France
Hetzner Online Ag | /view/web_hosting/45081/Hetzner_Online_Ag.html | Germany
Hostgator.com Llc | /view/web_hosting/26757/Hostgator_com_Llc.html | USA
Google Inc | /view/web_hosting/617/Google_Inc.html | USA
Bluehost Inc | /view/web_hosting/3886/Bluehost_Inc.html | USA
...
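Regarding Edit2 (the 137,157 records figure): this answer does not identify the element that holds the total, so the following is only a hedged sketch. It reuses the soup object from the code above and searches the page text for a number followed by the word "records"; the exact wording on the page is an assumption, so adjust the pattern if needed.
import re

# Assumption: the total appears somewhere in the page text as e.g. "137,157 records".
match = re.search(r'([\d,]+)\s+records', soup.get_text(), re.IGNORECASE)
if match:
    total_records = int(match.group(1).replace(',', ''))
    print(total_records)  # e.g. 137157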

Code (Python 3.6+, uses f-strings):
import urllib.parse
from collections import namedtuple
from datetime import datetime

import bs4
import requests

HostingCompany = namedtuple('HostingCompany',
                            ('name', 'country', 'websites', 'usage', 'usage_by_top', 'update_time'))


class MyIpLink:
    url_base = 'https://myip.ms'

    def __init__(self, tag: bs4.element.Tag, *, is_anchor=False):
        a_tag = tag.find('a')
        if is_anchor:  # treat `tag` as an anchor tag
            a_tag = tag
        self.text = tag.text.strip()
        self.url = urllib.parse.urljoin(self.url_base, a_tag['href'])

    def __repr__(self):
        return f'{self.__class__.__name__}(text={repr(self.text)}, url={repr(self.url)})'


url = 'https://myip.ms/browse/web_hosting/World_Web_Hosting_Global_Statistics.html'
html = requests.get(url).text
soup = bs4.BeautifulSoup(html, 'html.parser')

rows = soup.select('#web_hosting_tbl > tbody > tr')[::2]  # skips "more info" rows

companies = []
for row in rows:
    tds = row.find_all('td')
    name = MyIpLink(tds[1])
    country = MyIpLink(tds[2])
    websites = [MyIpLink(a, is_anchor=True) for a in tds[3].find_all('a')]
    usage = MyIpLink(tds[4])
    usage_by_top = MyIpLink(tds[5])
    update_time = datetime.strptime(tds[6].text.strip(), '%d %b %Y, %H:%M')
    company = HostingCompany(name, country, websites, usage, usage_by_top, update_time)
    companies.append(company)

import pprint
pprint.pprint(companies)
print(companies[0].name.text)
print(companies[0].name.url)
print(companies[0].country.text)
Output:
[HostingCompany(name=MyIpLink(text='Godaddy.com, LLC', url='https://myip.ms/view/web_hosting/2433/Godaddy_com_LLC.html'), country=MyIpLink(text='USA', url='https://myip.ms/view/best_hosting/USA/Best_Hosting_in_USA.html'), websites=[MyIpLink(text='www.godaddy.com', url='https://myip.ms/go.php?1229687315_ITg7Im93dCkWE0kNAhQSEh0FUeHq5Q==')], usage=MyIpLink(text='512,701 sites', url='https://myip.ms/browse/sites/1/ownerID/2433/ownerIDii/2433'), usage_by_top=MyIpLink(text='951 sites', url='https://myip.ms/browse/sites/1/rankii/100000/ownerID/2433/ownerIDii/2433'), update_time=datetime.datetime(2018, 5, 2, 5, 17)),
HostingCompany(name=MyIpLink(text='Cloudflare, Inc', url='https://myip.ms/view/web_hosting/4638/Cloudflare_Inc.html'), country=MyIpLink(text='USA', url='https://myip.ms/view/best_hosting/USA/Best_Hosting_in_USA.html'), websites=[MyIpLink(text='www.cloudflare.com', url='https://myip.ms/go.php?840626136_OiEsK2ROSxAdGl4QGhYJG+Tp6fnrv/f49w==')], usage=MyIpLink(text='488,119 sites', url='https://myip.ms/browse/sites/1/ownerID/4638/ownerIDii/4638'), usage_by_top=MyIpLink(text='16,160 sites', url='https://myip.ms/browse/sites/1/rankii/100000/ownerID/4638/ownerIDii/4638'), update_time=datetime.datetime(2018, 5, 2, 5, 10)),
HostingCompany(name=MyIpLink(text='Amazon.com, Inc', url='https://myip.ms/view/web_hosting/615/Amazon_com_Inc.html'), country=MyIpLink(text='USA', url='https://myip.ms/view/best_hosting/USA/Best_Hosting_in_USA.html'), websites=[MyIpLink(text='www.amazonaws.com', url='https://myip.ms/go.php?990446041_JyYhKGFxThMQHUMRHhcDExHj8vul7f75')], usage=MyIpLink(text='453,230 sites', url='https://myip.ms/browse/sites/1/ownerID/615/ownerIDii/615'), usage_by_top=MyIpLink(text='9,557 sites', url='https://myip.ms/browse/sites/1/rankii/100000/ownerID/615/ownerIDii/615'), update_time=datetime.datetime(2018, 5, 2, 5, 4)),
...
]
Godaddy.com, LLC
https://myip.ms/view/web_hosting/2433/Godaddy_com_LLC.html
USA
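Since the original question mentions loading the table into an SQL database, here is a hedged follow-up sketch (not part of the original answer; the pandas/sqlite3 usage, file name and table name are my assumptions) that flattens the companies list into a DataFrame and writes it to SQLite:
import sqlite3
import pandas as pd

# Flatten the namedtuples from `companies` into plain columns.
df = pd.DataFrame([
    {
        'name': c.name.text,
        'name_url': c.name.url,
        'country': c.country.text,
        'websites': ', '.join(w.text for w in c.websites),
        'usage': c.usage.text,
        'usage_by_top': c.usage_by_top.text,
        'update_time': c.update_time,
    }
    for c in companies
])

# 'hosting.db' and 'hosting_companies' are arbitrary names for this example.
with sqlite3.connect('hosting.db') as conn:
    df.to_sql('hosting_companies', conn, if_exists='replace', index=False)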

Try the approach below. It should give you the text from column 2, the links from column 2, and the text from column 3 of that table. I used lxml instead of BeautifulSoup to make it faster. Thanks.
import requests
from urllib.parse import urljoin
from lxml.html import fromstring

URL = 'https://myip.ms/browse/web_hosting/World_Web_Hosting_Global_Statistics.html'
res = requests.get(URL)
root = fromstring(res.text)

for items in root.cssselect('#web_hosting_tbl tr:not(.expand-child)')[1:]:
    name = items.cssselect("td.row_name a")[0].text
    link = urljoin(URL, items.cssselect("td.row_name a")[0].attrib['href'])
    country = items.cssselect("td a[href^='/view/best_hosting/']")[0].text
    print(name, link, country)
Results:
Godaddy.com, LLC https://myip.ms/view/web_hosting/2433/Godaddy_com_LLC.html USA
Cloudflare, Inc https://myip.ms/view/web_hosting/4638/Cloudflare_Inc.html USA
Amazon.com, Inc https://myip.ms/view/web_hosting/615/Amazon_com_Inc.html USA
Ovh Sas https://myip.ms/view/web_hosting/7593/Ovh_Sas.html France
Hetzner Online Ag https://myip.ms/view/web_hosting/45081/Hetzner_Online_Ag.html Germany
Hostgator.com Llc https://myip.ms/view/web_hosting/26757/Hostgator_com_Llc.html USA
Google Inc https://myip.ms/view/web_hosting/617/Google_Inc.html USA

Related

Web scraping headlines and subtitles from a specific section with a specific tag id

I tried the following code to get the headlines and subtitles from the section https://www.theguardian.com/uk/environment with the id environment/wildlife, for October, November and December 2022.
Any ideas on what could be wrong? Thanks for your help.
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.theguardian.com/uk/environment')
soup = BeautifulSoup(r.text, 'html.parser')
elements = soup.find_all('gu-island')
filtered_elements = [element for element in elements if element['props']['id'] == 'environment/wildlife']
headlines = []
for element in filtered_elements:
    headlines.extend(element.find_all('h1'))
texts = [headline.text for headline in headlines]
print(texts)
I also tried with the keyword wildlife, but got no results.
r = requests.get('https://www.theguardian.com/uk/environment')
soup = BeautifulSoup(r.text, 'html.parser')
elements = soup.find_all('gu-island')
filtered_elements = [element for element in elements if 'wildlife' in element['props']['tags']]
headlines = []
for element in filtered_elements:
    headlines.extend(element.find_all('h1'))
texts = [headline.text for headline in headlines]
print(texts)
The following code, with the URL of a specific article, does extract the headline and subtitle, but also the entire text of the article, which I'm not requesting.
r = requests.get('https://www.theguardian.com/environment/2022/dec/30/tales-of-killer-wild-boar-in-uk-are-hogwash-say-environmentalists')
soup = BeautifulSoup(r.text, 'html.parser')
headlines = soup.find_all('h1')
standfirst = soup.find_all('p')
for headline in headlines:
    print(headline.text)
for standfirst in standfirst:
    print(standfirst.text)
Just inspect your page. I'm assuming you want the title of every article on the main page, as you didn't specify it clearly.
With that information, you can search the section as follows:
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.theguardian.com/uk/environment')
soup = BeautifulSoup(r.text, 'html.parser')
items = soup.find_all("div", {"class": "fc-item__content"})
for item in items:
    print(item.text.strip())
Tip: use .strip() to get the inner text within the A tag.
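If you also want each article's link alongside the title, a hedged extension of the loop above (it assumes every fc-item__content block contains an anchor, which may not hold for all items or after the Guardian changes its markup):
for item in items:
    a = item.find('a')  # assumption: the article link is the first anchor in the block
    if a and a.get('href'):
        print(item.text.strip(), '|', a['href'])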
The following script will extract all of the headlines from the environment/wildlife pages.
An example url from the 3rd Oct 2022 would be:
https://www.theguardian.com/environment/wildlife/2022/oct/03/all
You can modify the script to specify the required start_date and end_date.
Please note, you will have to specify an end_date that is one day beyond the end date you want.
All of the headlines within those dates will be stored in the headlines variable.
I have also introduced a sleep time of 10 seconds between page reads, to avoid being blocked by the website.
Code:
from bs4 import BeautifulSoup
import requests
from datetime import date, timedelta
import pandas as pd
from time import sleep

def get_headings(dt):
    p = dt.strftime("%Y-%b-%d").split('-')
    r = requests.get(f'https://www.theguardian.com/environment/wildlife/{p[0]}/{p[1].lower()}/{p[2]}/all')
    soup = BeautifulSoup(r.text, 'html.parser')
    elements = soup.select('div.fc-slice-wrapper')
    headings = [h.text for h in elements[0].find_all(class_="js-headline-text")][::2]
    return headings

def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

start_date = date(2022, 10, 1)
end_date = date(2022, 10, 4)

headlines = []
for single_date in daterange(start_date, end_date):
    headlines.extend(get_headings(single_date))
    sleep(10)  # sleep 10 seconds between each page to avoid being blocked

for h in headlines:
    print(h)
Output:
Gardeners beware: household chemicals banned overseas are still used in Australia
Cop15 is an opportunity to save nature. We can’t afford another decade of failure
Prince Harry wildlife NGO under fire after elephants kill three in Malawi
Country diary: Mysterious birdsong fills the air with sweetness
Tory MPs dismiss critical RSPB campaign as ‘marketing strategy’
Australia announces plan to halt extinction crisis and save 110 species
Sixty endangered greater gliders found in Victorian forests tagged for logging
Wales unveils plans to triple rate of peatland restoration
Europe and UK hit by ‘unprecedented’ number of bird flu cases this summer
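The script above imports pandas but never uses it; if you want the collected headlines as a DataFrame or CSV, a small follow-on (my addition, not part of the original answer):
df = pd.DataFrame(headlines, columns=['headline'])
df.to_csv('wildlife_headlines.csv', index=False)  # file name is arbitrary
print(df.head())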

Python University Names and Abbreviations and Weblink

I want to prepare a dataframe of universities, their abbreviations and website links.
My code:
import requests
import pandas as pd

abb_url = 'https://en.wikipedia.org/wiki/List_of_colloquial_names_for_universities_and_colleges_in_the_United_States'
abb_html = requests.get(abb_url).content
abb_df_list = pd.read_html(abb_html)
Present answer:
ValueError: No tables found
Expected answer:
df =
|   | university_full_name                | uni_abb | uni_url                                                           |
|---|-------------------------------------|---------|-------------------------------------------------------------------|
| 0 | Albert Einstein College of Medicine | AECOM   | https://en.wikipedia.org/wiki/Albert_Einstein_College_of_Medicine |
That's one funky page you have there...
First, there are indeed no tables in there. Second, some organizations don't have links, others have redirect links and still others use the same abbreviation for more than one organization.
So you need to bring in the heavy artillery: xpath...
import pandas as pd
import requests
from lxml import html as lh

url = "https://en.wikipedia.org/wiki/List_of_colloquial_names_for_universities_and_colleges_in_the_United_States"
response = requests.get(url)
doc = lh.fromstring(response.text)

rows = []
for uni in doc.xpath('//h2[./span[@class="mw-headline"]]//following-sibling::ul//li'):
    info = uni.text.split(' – ')
    abb = info[0]
    # for those with no links
    if not uni.xpath('.//a'):
        rows.append((abb, " ", info[1]))
    # now to account for those using the same abbreviation for multiple organizations
    for a in uni.xpath('.//a'):
        dat = a.xpath('./@*')
        # for those with redirects
        if len(dat) == 3:
            del dat[1]
        link = f"https://en.wikipedia.org{dat[0]}"
        rows.append((abb, link, dat[1]))

# and now, at last, to the dataframe
cols = ['abb', 'url', 'full name']
df = pd.DataFrame(rows, columns=cols)
df
Output:
abb url full name
0 AECOM https://en.wikipedia.org/wiki/Albert_Einstein_... Albert Einstein College of Medicine
1 AFA https://en.wikipedia.org/wiki/United_States_Ai... United States Air Force Academy
etc.
Note: you can rearrange the order of columns in the dataframe, if you are so inclined.
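For example (a one-liner using the column names defined above):
df = df[['full name', 'abb', 'url']]  # reorder columns to taste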
Select and iterate over only the expected <li> elements and extract their information, but be aware there is a university without an <a> (SUI – State University of Iowa), so this should be handled with an if-statement, as in the example below:
for e in soup.select('h2 + ul li'):
    data.append({
        'abb': e.text.split('-')[0],
        'full_name': e.text.split('-')[-1],
        'url': 'https://en.wikipedia.org' + e.a.get('href') if e.a else None
    })
Example
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://en.wikipedia.org/wiki/List_of_colloquial_names_for_universities_and_colleges_in_the_United_States"
response = requests.get(url)
soup = BeautifulSoup(response.text)

data = []
for e in soup.select('h2 + ul li'):
    data.append({
        'abb': e.text.split('-')[0],
        'full_name': e.text.split('-')[-1],
        'url': 'https://en.wikipedia.org' + e.a.get('href') if e.a else None
    })

pd.DataFrame(data)
Output:
  | abb                          | full_name                                      | url
0 | AECOM                        | Albert Einstein College of Medicine            | https://en.wikipedia.org/wiki/Albert_Einstein_College_of_Medicine
1 | AFA                          | United States Air Force Academy                | https://en.wikipedia.org/wiki/United_States_Air_Force_Academy
2 | Annapolis                    | U.S. Naval Academy                             | https://en.wikipedia.org/wiki/United_States_Naval_Academy
3 | A&M                          | Texas A&M University, but also others; see A&M | https://en.wikipedia.org/wiki/Texas_A%26M_University
4 | A&M-CC or A&M-Corpus Christi | Corpus Christi                                 | https://en.wikipedia.org/wiki/Texas_A%26M_University%E2%80%93Corpus_Christi
...
There are no tables on this page, only lists. So the goal will be to go through the <ul> and then the <li> tags, skipping the lists you are not interested in (the first and those after the 26th).
You can extract the abbreviation of the university this way:
uni_abb = li.text.strip().replace(' - ', ' – ').replace(' — ', ' – ').split(' – ')[0]
while to get the url you have to access the 'href' and 'title' attributes inside the <a> tag:
for a in li.find_all('a', href=True):
    title = a['title']
    url = f"https://en.wikipedia.org/{a['href']}"
Accumulate the extracted information into a list, and finally create the dataframe by assigning appropriate column names.
Here is the complete code, in which I use BeautifulSoup:
import requests
import pandas as pd
from bs4 import BeautifulSoup

abb_url = 'https://en.wikipedia.org/wiki/List_of_colloquial_names_for_universities_and_colleges_in_the_United_States'
abb_html = requests.get(abb_url).content
soup = BeautifulSoup(abb_html)

l = []
for ul in soup.find_all("ul")[1:26]:
    for li in ul.find_all("li"):
        uni_abb = li.text.strip().replace(' - ', ' – ').replace(' — ', ' – ').split(' – ')[0]
        for a in li.find_all('a', href=True):
            l.append((a['title'], uni_abb, f"https://en.wikipedia.org/{a['href']}"))

df = pd.DataFrame(l, columns=['university_full_name', 'uni_abb', 'uni_url'])
Result:
university_full_name uni_abb uni_url
0 Albert Einstein College of Medicine AECOM https://en.wikipedia.org//wiki/Albert_Einstein...
1 United States Air Force Academy AFA https://en.wikipedia.org//wiki/United_States_A...

Scrape zoho-analytics externally stored table. Is it possible?

I am trying to scrape a zoho-analytics table from this webpage for a project at university. For the moment I have no ideas. I can't see the values in the inspector, and therefore I cannot use BeautifulSoup in Python (my favourite).
Does anybody have any idea?
Thanks a lot,
Joseph
I tried it with BeautifulSoup; it seems like you can't soup the values inside the table because they are not on the website itself but stored externally(?)
EDIT:
https://analytics.zoho.com/open-view/938032000481034014
This is the link where the table and its data are stored.
So I tried scraping from it with bs4 and it works.
The class of the rows is "zdbDataRowDiv"
Try:
container = page_soup.findAll("div", {"class": "zdbDataRowDiv"})
Code explanation:
container # the variable where your data is stored, name it how you like
page_soup # your html page you souped with BeautifulSoup
findAll("tag",{"attribute":"value"}) # this function finds every tag which has the specific value inside its attribute
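Putting that together, a minimal end-to-end sketch using the open-view URL from the edit. It assumes the row divs are present in the HTML that the server returns; if the widget is rendered client-side by JavaScript, requests alone will not see them and you would need something like Selenium instead:
import requests
from bs4 import BeautifulSoup

url = 'https://analytics.zoho.com/open-view/938032000481034014'
r = requests.get(url)
page_soup = BeautifulSoup(r.text, 'html.parser')

# Each row of the table is a div with the class "zdbDataRowDiv"
container = page_soup.findAll('div', {'class': 'zdbDataRowDiv'})
for row in container:
    print(row.get_text(separator=' | ', strip=True))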
They are stored within the <script> tags in json format. Just a matter of pulling those out and parsing:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import json

url = 'https://flo.uri.sh/visualisation/4540617/embed'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

scripts = soup.find_all('script')
for script in scripts:
    if 'var _Flourish_data_column_names = ' in script.text:
        json_str = script.text
        col_names = json_str.split('var _Flourish_data_column_names = ')[-1].split(',\n')[0]
        cols = json.loads(col_names)
        data = json_str.split('_Flourish_data = ')[-1].split(',\n')[0]

        # Trim trailing JavaScript until the remaining string parses as JSON
        loop = True
        while loop == True:
            try:
                jsonData = json.loads(data)
                loop = False
                break
            except:
                data = data.rsplit(';', 1)[0]

rows = []
headers = cols['rows']['columns']
for row in jsonData['rows']:
    rows.append(row['columns'])

table = pd.DataFrame(rows, columns=headers)

for col in headers[1:]:
    table.loc[table[col] != '', col] = 'A'
Output:
print (table)
Company Climate change Forests Water security
0 Danone A A A
1 FIRMENICH SA A A A
2 FUJI OIL HOLDINGS INC. A A A
3 HP Inc A A A
4 KAO Corporation A A A
.. ... ... ... ...
308 Woolworths Limited A
309 Workspace Group A
310 Yokogawa Electric Corporation A A
311 Yuanta Financial Holdings A
312 Zalando SE A
[313 rows x 4 columns]

Beautiful Soup scrape table with table breaks

I'm trying to scrape a table into a dataframe. My attempt only returns the table name and not the data within rows for each region.
This is what I have so far:
from bs4 import BeautifulSoup as bs4
import requests

url = 'https://www.eia.gov/todayinenergy/prices.php'
r = requests.get(url)
soup = bs4(r.text, "html.parser")

table_regions = soup.find('table', {'class': "t4"})
regions = table_regions.find_all('tr')
for row in regions:
    print(row)
The ideal outcome I'd like to get:
region        | price
--------------|------
new england   | 2.59
new york city | 2.52
Thanks for any assistance.
If you check your html response (soup) you will see that the table tag you get in the line table_regions = soup.find('table', {'class': "t4"}) is closed before the rows that contain the information you need (the ones that contain the td's with the class names up, dn, d1 and s1).
So how about using the raw td tags like this:
from bs4 import BeautifulSoup as bs4
import requests
import pandas as pd

url = 'https://www.eia.gov/todayinenergy/prices.php'
r = requests.get(url)
soup = bs4(r.text, "html.parser")

a = soup.find_all('tr')

rows = []
subel = []
for tr in a[42:50]:
    b = tr.find_all('td')
    for td in b:
        subel.append(td.string)
    rows.append(subel)
    subel = []

df = pd.DataFrame(rows, columns=['Region', 'Price_1', 'Percent_change_1', 'Price_2', 'Percent_change_2', 'Spark Spread'])
Notice that I use just the a[42:50] slice of the results, because a contains all the tr's of the website. You can use the rest too if you need to.
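To get just the region | price view from the question's ideal outcome, you can slice the resulting DataFrame (using the column names defined above):
print(df[['Region', 'Price_1']])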

How to scrape websites with Python and beautiful soup

I am trying to scrape results from the BBC Sport website. I've got the scores working, but when I try to add team names the program prints out None 1-0 None (for example). This is the code:
from bs4 import BeautifulSoup
import urllib.request
import csv

url = 'http://www.bbc.co.uk/sport/football/teams/derby-county/results'
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page)

for match in soup.select('table.table-stats tr.report'):
    team1 = match.find('span', class_='team-home')
    team2 = match.find('span', class_='team-away')
    score = match.abbr
    print(team1.string, score.string, team2.string)
It looks like you are searching for tags that are not there. For instance class_="team-home teams" is in the html, but class_='team-home' is not. The following code prints the first team name:
tables = soup.find_all("table", class_="table-stats")
tables[0].find("span", class_="team-home teams").text
# u' Birmingham '
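A hedged extension of the same idea, reusing the soup from the question's code and looping over the report rows with the full class names (the BBC page structure may have changed since this was written):
for match in soup.select('table.table-stats tr.report'):
    team1 = match.find('span', class_='team-home teams')
    team2 = match.find('span', class_='team-away teams')
    score = match.abbr
    if team1 and team2 and score:
        print(team1.text.strip(), score.text.strip(), team2.text.strip())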
Here is a possible solution which gets the home and away team names, the final score, the match date and the competition name via BeautifulSoup and puts it in a DataFrame.
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Get the relevant webpage and set the data up for parsing
url = "http://www.bbc.co.uk/sport/football/teams/derby-county/results"
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")

# Set up a function to parse the "soup" for each category of information and put it in a DataFrame
def get_match_info(soup, tag, class_name, column_name):
    info_array = []
    for info in soup.find_all('%s' % tag, attrs={'class': '%s' % class_name}):
        info_array.append({'%s' % column_name: info.text})
    return pd.DataFrame(info_array)

# For each category, pass the above function the relevant information, i.e. tag names
date = get_match_info(soup, "td", "match-date", "Date")
home_team = get_match_info(soup, "span", "team-home teams", "Home Team")
score = get_match_info(soup, "span", "score", "Score")
away_team = get_match_info(soup, "span", "team-away teams", "Away Team")
competition = get_match_info(soup, "td", "match-competition", "Competition")

# Concatenate the DataFrames to present a final table of all the above info
match_info = pd.concat([date, home_team, score, away_team, competition], ignore_index=False, axis=1)
print(match_info)
