The code below finds all the links containing gameId and puts them in a DataFrame. My issue is that I am not sure how to store them with the corresponding date. In this case the h2 is the parent tag, with a child tag holding the links. The code below gets the links, but how do I get the date for each gameId?
import pandas as pd
import requests
from bs4 import BeautifulSoup
gmdf = pd.DataFrame(columns=['link','gamedate'])
url = 'https://www.espn.com/nfl/schedule/_/week/1/year/2020'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.select('a')
for link in links:
    if 'gameId' in link.get('href'):
        print(link.get('href'))
        hlink = 'https://www.espn.com' + link.get('href')
        gmdf = gmdf.append({'link': hlink}, ignore_index=True)
This line gets the dates on the page, but I need each date paired with its corresponding gameId in the DataFrame:
soup.select('h2')
Here is an alternative to the method Dhivakar has already provided. I add the h2 tag to the original BeautifulSoup selection, then set the date whenever the selected element has no href: since every element must be either an h2 or an a tag, and the h2 tags contain the dates.
import pandas as pd
import requests
from bs4 import BeautifulSoup
gmdf = pd.DataFrame(columns=['link','gamedate'])
url = 'https://www.espn.com/nfl/schedule/_/week/1/year/2020'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.select('a, h2')
date = ""
for link in links:
    if link.get('href') is None:
        date = link.text
        print(date)
    elif 'gameId' in link.get('href'):
        print(date)
        print(link.get('href'))
        hlink = 'https://www.espn.com' + link.get('href')
        # note: DataFrame.append was removed in pandas 2.0; use pd.concat on newer versions
        gmdf = gmdf.append({'link': hlink, 'gamedate': date}, ignore_index=True)

print(gmdf)
Output:
You can simply use a nested loop within a list comprehension: loop over the date headers, then find the next table and grab the hrefs containing the substring of interest (gameId). That way you have the relevant date listed against each link in a list of tuples you convert to a DataFrame:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
r = requests.get('https://www.espn.com/nfl/schedule/_/week/1/year/2020')
soup = bs(r.text, 'lxml')
df = pd.DataFrame([('https://www.espn.com' + j['href'], i.text)
                   for i in soup.select('#sched-container .table-caption')
                   for j in i.find_next('table').select('[href*=gameId]')],
                  columns=['link', 'date'])
print(df)
You can grab parents and siblings of elements, just like in JavaScript. Add this after links = soup.select('a'):
import datetime

schedule_year = soup.select_one('.automated-header h1').text.split("- ")[-1]  # the schedule year

for link in links:
    if 'gameId' in link.get('href'):
        schedule_date = link.parent.parent.parent.parent.parent.previous_sibling.text.split(", ")[-1] + " " + schedule_year  # grabs the h2 tag's date text
        schedule_date = datetime.datetime.strptime(schedule_date, "%B %d %Y")  # convert to a datetime object for manipulation
        hlink = 'https://www.espn.com' + link.get('href')
        gmdf = gmdf.append({'link': hlink, 'gamedate': schedule_date}, ignore_index=True)
I am trying to extract the data from a drop-down using Python.
link:
From that page, the drop-downs are:
State
Category
District
CTSO
Division
Map Type
I want to extract data from all of them by selecting the options one by one, using BeautifulSoup or the requests library.
In my code, I gave the district name statically, but I want to select each one in turn and extract its data.
I tried the following, but it doesn't work:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

districts = ["Akola", "Amravati", "Buldana", "yavatmal", "washim"]  # renamed from `list`, which shadows the builtin
url = "http://igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName="
for district in districts:
    urls = url + district
    # print(urls)
    response = requests.get(urls)
    # print(response)
    soup = BeautifulSoup(response.text, "html.parser")
    # print(soup)
    # soups = soup.find_all("div", {"id": "level_text_2"})
    # print(soups)
    # for ids in soup.find_all(attrs={'id': 'location_table'}):
    #     print(ids)
    #     ids = ids.text.strip()
    #     print(ids)
    for option in soup.find_all('option'):
        print(option.text)
    for tag in soup.find_all(class_="panel-body"):
        # print(tag.get('ctl00_ContentPlaceHolder5_ddlDistrict'))
        print(tag)
I want:
the district name
all taluka names
all village names
Something like this:
import requests
from bs4 import BeautifulSoup
for dist_name in ["Akola", "Amravati", "Buldana", "yavatmal", "washim"]:
    print('-- {} --'.format(dist_name))
    r = requests.get('http://igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName={}'.format(dist_name))
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, "html.parser")
        select_list = soup.find_all('select')
        for select in select_list:
            print('Select name: ' + select.attrs['name'])
            option_list = select.find_all('option')
            for option in option_list:
                print('\t option: ' + option.attrs['value'])
I'm trying to extract a link under "a href="link"...".
As there are multiple rows, I iterate over every one of them. The first link per row is the one I need, so I use find_all('tr') and find('a').
I know find('a') can return a NoneType, but I do not know how to work around this.
I had a piece of code that worked but was inefficient (in the comments).
import bs4 as bs
import urllib.request

sauce = urllib.request.urlopen('https://morocco.observation.org/soortenlijst_wg_v3.php')
soup = bs.BeautifulSoup(sauce, 'lxml')
tabel = soup.find('table', {'class': 'tablesorter'})

for i in tabel.find_all('tr'):
    # if 'view' in i.get('href'):
    #     link_list.append(i.get('href'))
    link = i.find('a')
    # <a class="z1" href="/soort/view/164?from=1987-12-05&to=2019-05-31">Common Reed Bunting - <em>Emberiza schoeniclus</em></a>
How do I retrieve the link under href, working around the NoneType, to get only /soort/view/164?from=1987-12-05&to=2019-05-31?
Thanks in advance
A logical way is to use nth-of-type to isolate the target column
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://morocco.observation.org/soortenlijst_wg_v3.php')
soup = bs(r.content, 'lxml')
base = 'https://morocco.observation.org'
urls = [base + item['href'] for item in soup.select('#mytable_S td:nth-of-type(3) a')]
You could also pass a list of classes
urls = [base + item['href'] for item in soup.select('.z1, .z2,.z3,.z4')]
Or even use starts with, ^, operator for class
urls = [base + item['href'] for item in soup.select('[class^=z]')]
Or contains, *, operator for href
urls = [base + item['href'] for item in soup.select('[href*=view]')]
Read about different css selector methods here: https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors
link = i.find('a')
_href = link['href']
print(_href)
O/P:
"/soort/view/164?from=1987-12-05&to=2019-05-31?"
This is not a proper URL; you should concatenate it with the domain name:
new_url = "https://morocco.observation.org"+_href
print(new_url)
O/p:
https://morocco.observation.org/soort/view/164?from=1987-12-05&to=2019-05-31?
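Alternatively, urllib.parse.urljoin from the standard library handles the joining (and leading slashes) for you:

```python
from urllib.parse import urljoin

base = "https://morocco.observation.org"
_href = "/soort/view/164?from=1987-12-05&to=2019-05-31"

# urljoin resolves the relative href against the base URL
new_url = urljoin(base, _href)
print(new_url)
# https://morocco.observation.org/soort/view/164?from=1987-12-05&to=2019-05-31
```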
Update:
from bs4 import BeautifulSoup
from bs4.element import Tag
import requests
resp = requests.get("https://morocco.observation.org/soortenlijst_wg_v3.php")
soup = BeautifulSoup(resp.text, 'lxml')
tabel = soup.find('table', {'class': 'tablesorter'})
base_url = "https://morocco.observation.org"
for i in tabel.find_all('tr'):
    link = i.find('a', href=True)
    if link is None or not isinstance(link, Tag):
        continue
    url = base_url + link['href']
    print(url)
O/P:
https://morocco.observation.org/soort/view/248?from=1975-05-05&to=2019-06-01
https://morocco.observation.org/soort/view/174?from=1989-12-15&to=2019-06-01
https://morocco.observation.org/soort/view/57?from=1975-05-05&to=2019-06-01
https://morocco.observation.org/soort/view/19278?from=1975-05-13&to=2019-06-01
https://morocco.observation.org/soort/view/56?from=1993-03-25&to=2019-06-01
https://morocco.observation.org/soort/view/1504?from=1979-05-25&to=2019-06-01
https://morocco.observation.org/soort/view/78394?from=1975-05-09&to=2019-06-01
https://morocco.observation.org/soort/view/164?from=1987-12-05&to=2019-06-01
I have a list of divs, and I'm trying to get certain info from each of them. The div classes are the same, so I'm not sure how to go about this.
I have tried for loops but have been getting various errors.
Code to get the list of divs:
import requests
from bs4 import BeautifulSoup
import re
url = 'https://sneakernews.com/release-dates/'
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, "lxml")
soup1 = soup.find("div", {'class': 'popular-releases-block'})
soup1 = str(soup1.find("div", {'class': 'row'}))
soup1 = soup1.split('</div>')
print(soup1)
Code I want to loop for each item in the soup1 list:
linkinfo = soup1.find('a')['href']
date = str(soup1.find('span'))
name = soup1.find('a')
non_decimal = re.compile(r'[^\d.]+')
date = non_decimal.sub('', date)
name = str(name)
name = re.sub('</a>', '', name)
link, name = name.split('>')
link = re.sub('<a href="', '', link)
link = re.sub('"', '', link)
name = name.split(' ')
name = str(name[-1])
date = str(date)
link = str(link)
print(link)
print(name)
print(date)
Based on the URL you posted above, I imagine you are interested in something like this:
import requests
from bs4 import BeautifulSoup
url = requests.get('https://sneakernews.com/release-dates/').text
soup = BeautifulSoup(url, 'html.parser')
tags = soup.find_all('div', {'class': 'col lg-2 sm-3 popular-releases-box'})
for tag in tags:
    link = tag.find('a').get('href')
    print(link)
    print(tag.text)
    # anything else you want to do
If you are using the BeautifulSoup library, you do not need regex to parse through HTML tags. Instead, use the handy methods that accompany BeautifulSoup. If you want to apply a regex to the text output from the tags you locate via BeautifulSoup, to accomplish a more specific task, that would be reasonable.
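For example, the fields the question extracts with re.sub can be read directly from the parsed tag. A small sketch, using an inline HTML snippet modeled on one release box (the markup here is an assumption, not the live page):

```python
from bs4 import BeautifulSoup

# Inline sample standing in for one of the release boxes on the page
html = ('<div class="col lg-2 sm-3 popular-releases-box">'
        '<a href="https://sneakernews.com/example">Air Max</a>'
        '<span>7/4</span></div>')
soup = BeautifulSoup(html, 'html.parser')

tag = soup.find('div')
link = tag.find('a')['href']   # no regex needed to peel the tag off
name = tag.find('a').text
date = tag.find('span').text
print(link, name, date)
# https://sneakernews.com/example Air Max 7/4
```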
My understanding is that you want to loop your code for each item within a list.
An example of this:
my_list = ["John", "Fred", "Tom"]
for name in my_list:
print(name)
This will loop for each name in my_list and print out each item (referred to here as name). You could do something similar with your code:
for item in soup1:
    # perform some action
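Applied to the soup1 list from the question (which, after the split, is a list of HTML fragment strings), each item can be re-parsed and its tags pulled out, with a guard for fragments that lack a link. A sketch with made-up fragments standing in for the real ones:

```python
from bs4 import BeautifulSoup

# Stand-ins for the fragments produced by soup1.split('</div>')
soup1 = ['<a href="https://sneakernews.com/one">Shoe One</a><span>7/1</span>',
         'no link in this fragment']

for item in soup1:
    frag = BeautifulSoup(item, 'html.parser')
    a = frag.find('a')
    if a is None:          # some fragments hold no link; skip them
        continue
    print(a['href'], a.text, frag.find('span').text)
# https://sneakernews.com/one Shoe One 7/1
```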
I am parsing an HTML page and am having a hard time figuring out how to pull a certain p tag without a class or an id. I am trying to reach the p tag with the lat and long. Here is my current code:
import bs4
from urllib import urlopen as uReq #this opens the URL
from bs4 import BeautifulSoup as soup #parses/cuts the html
my_url = 'http://www.fortwiki.com/Battery_Adair'
print(my_url)
uClient = uReq(my_url) #opens the HTML and stores it in uClients
page_html = uClient.read() # reads the URL
uClient.close() # closes the URL
page_soup = soup(page_html, "html.parser") #parses/cuts the HTML
containers = page_soup.find_all("table")
for container in containers:
    title = container.tr.p.b.text.strip()
    history = container.tr.p.text.strip()
    lat_long = container.tr.table
    print(title)
    print(history)
    print(lat_long)
Link to website: http://www.fortwiki.com/Battery_Adair
The <p> tag you're looking for is very common in the document, and it doesn't have any unique attributes, so we can't select it directly.
A possible solution would be to select the tag by index, as in bloopiebloopie's answer.
However that won't work unless you know the exact position of the tag.
Another possible solution would be to find a neighbouring tag that has distinguishing attributes/text and select our tag in relation to that.
In this case we can find the previous tag with text: "Maps & Images", and use find_next to select the next tag.
import requests
from bs4 import BeautifulSoup
url = 'http://www.fortwiki.com/Battery_Adair'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
b = soup.find('b', text='Maps & Images')
if b:
    lat_long = b.find_next().text
This method should find the coordinates data in any www.fortwiki.com page that has a map.
You can use re to match partial text inside a tag.
import re
import requests
from bs4 import BeautifulSoup
url = 'http://www.fortwiki.com/Battery_Adair'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
lat_long = soup.find('p', text=re.compile(r'Lat:\s\d+\.\d+\sLong:')).text
print(lat_long)
# Lat: 24.5477038 Long: -81.8104541
I am not exactly sure what you want, but this works for me. There are probably neater ways of doing it; I am new to Python.
soup = BeautifulSoup(requests.get("http://www.fortwiki.com/Battery_Adair").content, "html.parser")
x = soup.find("div", id="mw-content-text").find("table").find_all("p")[8]
x = x.get_text()
x = x.split("Long:")
lat = x[0].split(" ")[1]
long = x[1]
print("LAT = " + lat)
# LAT = 24.5477038
print("LNG = " + long)
# LNG = -81.8104541
I am trying to fetch all the Netflix movie/show links, and their country names, from http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html. For example, from the page source I want http://www.netflix.com/WiMovie/80048948, USA, etc. I have done the following, but it returns all links instead of just the Netflix ones I want. I am a little new to regex. How should I go about this?
from BeautifulSoup import BeautifulSoup
import urllib2
import re
html_page = urllib2.urlopen('http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html')
soup = BeautifulSoup(html_page)
for link in soup.findAll('a'):
    ## reqlink = re.search('netflix', link.get('href'))
    ## if reqlink:
    print link.get('href')

for link in soup.findAll('img'):
    if link.get('alt') == 'UK' or link.get('alt') == 'USA':
        print link.get('alt')
If I uncomment the lines above, I get the following error:
TypeError: expected string or buffer
What should I do?
from BeautifulSoup import BeautifulSoup
import urllib2
import re
import requests
url = 'http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html'
r = requests.get(url, stream=True)
count = 1
title=[]
country=[]
for line in r.iter_lines():
    if count == 746:
        urllib2.urlopen('http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html')
        soup = BeautifulSoup(line)
        for link in soup.findAll('a', href=re.compile('netflix')):
            title.append(link.get('href'))
        for link in soup.findAll('img'):
            print link.get('alt')
            country.append(link.get('alt'))
    count = count + 1
print len(title), len(country)
The previous error has been fixed. Now the only thing left is handling films with multiple countries, and how to get them together.
e.g. for 10.0 Earthquake, link = http://www.netflix.com/WiMovie/80049286, country = UK, USA.
Your code can be simplified to a couple of selects:
import requests
from bs4 import BeautifulSoup
url = 'http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html'
r = requests.get(url)
soup = BeautifulSoup(r.content)
for a in soup.select("a[href*=netflix]"):
    print(a["href"])
And for the img:
co = {"UK", "USA"}
for img in soup.select("img[alt]"):
    if img["alt"] in co:
        print(img)
As for the first question: it failed for links that didn't have an href value, so instead of a string you got None.
The following works:
from BeautifulSoup import BeautifulSoup
import urllib2
import re
html_page = urllib2.urlopen('http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html')
soup = BeautifulSoup(html_page)
for link in soup.findAll('a'):
    link_href = link.get('href')
    if link_href:
        reqlink = re.search('netflix', link_href)
        if reqlink:
            print link_href

for link in soup.findAll('img'):
    if link.get('alt') == 'UK' or link.get('alt') == 'USA':
        print link.get('alt')
As for the second question, I would recommend building a dictionary mapping each movie to the list of countries it appears in; then it is easier to format it as a string the way you want.
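For instance, with made-up movie data (not the actual scrape output), such a dictionary could be formatted like this:

```python
# Hypothetical scrape result: movie title -> list of countries it appears in
movies = {'10.0 Earthquake': ['UK', 'USA'],
          'Some Other Film': ['USA']}

for title, countries in movies.items():
    print('{} - {}'.format(title, ', '.join(countries)))
# 10.0 Earthquake - UK, USA
# Some Other Film - USA
```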
I think you'd have an easier time iterating through the listing rows and using a generator to assemble the data structure you're looking for (ignore the minor differences in my code; I'm using Python 3):
from bs4 import BeautifulSoup
import requests
url = 'http://netflixukvsusa.netflixable.com/2016/07/' \
'complete-alphabetical-list-k-sat-jul-9.html'
r = requests.get(url)
soup = BeautifulSoup(r.content)
rows = soup.select('span[class="listings"] tr')
def get_movie_info(rows):
    netflix_url_prefix = 'http://www.netflix.com/'
    for row in rows:
        link = row.find('a', href=lambda href: href and netflix_url_prefix in href)
        if link is not None:
            link = link['href']
            countries = [img['alt'] for img in row('img', class_='flag')]
            yield link, countries
print('\n'.join(map(str, get_movie_info(rows))))
Edit: Or if you're looking for a dict instead of a list:
def get_movie_info(rows):
    output = {}
    netflix_url_prefix = 'http://www.netflix.com/'
    for row in rows:
        link = row.find('a', href=lambda href: href and netflix_url_prefix in href)
        if link is not None:
            name = link.text
            link = link['href']
            countries = [img['alt'] for img in row('img', class_='flag')]
            output[name or 'some_default'] = {'link': link, 'countries': countries}
    return output
print('\n'.join(map(str, get_movie_info(rows).items())))
import re
import requests
from BeautifulSoup import BeautifulSoup

url = 'http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html'
r = requests.get(url, stream=True)
count = 1
final = []
for line in r.iter_lines():
    if count == 746:
        soup = BeautifulSoup(line)
        for row in soup.findAll('tr'):
            url = row.find('a', href=re.compile('netflix'))
            if url:
                t = url.string
                u = url.get('href')
                one = []
                for country in row.findAll('img'):
                    one.append(country.get('alt'))
                final.append({'Title': t, 'Url': u, 'Countries': one})
    count = count + 1
final is the final list.