Parsing date with BeautifulSoup - python

I'm getting info from a page with BeautifulSoup and I obtained the link:
[<span class="field-content">Friday, September 11, 2015</span>]
with the commands
links = soup.find_all('div', attrs={'class':'views-row'})
link = links[0]
link.find('span', attrs={'class':'views-field views-field-created'}).select('span')
but I need to parse the date. How can I get Friday, September 11, 2015 out of this?

I've found it, it's link.find('span', attrs={'class':'views-field views-field-created'}).select_one('span').text

To answer the example from your question: pick the last element of your result set:
link.find('span', attrs={'class':'views-field views-field-created'}).select('span')[-1].text
or shorter:
link.find_all("span")[-1].text
But if you want to extract all the information and store it as structured data, a better approach is to use stripped_strings.
Example
import requests
from bs4 import BeautifulSoup
# Archived snapshot of the listing page to scrape.
url = 'https://web.archive.org/web/20150913224145/http://www.newyorksocialdiary.com/party-pictures'
res = requests.get(url)
soup = BeautifulSoup(res.text, "html.parser")


def _to_record(item):
    """Build one title/date/url dict from a matched ``div``.

    The first stripped string of the element is used as the title and the
    last one as the date. The url is whatever follows the third '/' of the
    first link's href.
    """
    texts = list(item.stripped_strings)
    return {
        'title': texts[0],
        'date': texts[-1],
        'url': item.a['href'].split('/', 3)[-1],
    }


# One record per div found below the element with class "view-content".
data = [_to_record(item) for item in soup.select('.view-content div')]
print(data)
Output
[{'title': 'Kicks offs, sing offs, and pro ams', 'date': 'Friday, September 11, 2015', 'url': 'http://www.newyorksocialdiary.com/party-pictures/2015/kicks-offs-sing-offs-and-pro-ams'}, {'title': 'Grand Finale of the Hampton Classic Horse Show', 'date': 'Tuesday, September 1, 2015', 'url': 'http://www.newyorksocialdiary.com/party-pictures/2015/grand-finale-of-the-hampton-classic-horse-show'}, {'title': 'Riders, Spectators, Horses, and More ...', 'date': 'Wednesday, August 26, 2015', 'url': 'http://www.newyorksocialdiary.com/party-pictures/2015/riders-spectators-horses-and-more'}, {'title': 'Artist and Writers (and Designers)', 'date': 'Thursday, August 20, 2015', 'url': 'http://www.newyorksocialdiary.com/party-pictures/2015/artist-and-writers-and-designers'}, {'title': 'Garden Parties Kickoffs and Summer Benefits', 'date': 'Monday, August 17, 2015', 'url': 'http://www.newyorksocialdiary.com/party-pictures/2015/garden-parties-kickoffs-and-summer-benefits'}, {'title': 'The Summer Set', 'date': 'Wednesday, August 12, 2015', 'url': 'http://www.newyorksocialdiary.com/party-pictures/2015/the-summer-set'}, {'title': 'Midsummer Parties', 'date': 'Wednesday, August 5, 2015', 'url': 'http://www.newyorksocialdiary.com/party-pictures/2015/midsummer-parties'}, {'title': 'The Watermill Center and The Parrish', 'date': 'Wednesday, July 29, 2015', 'url': 'http://www.newyorksocialdiary.com/party-pictures/2015/the-watermill-center-and-the-parrish'}, {'title': 'Unconditional Love', 'date': 'Thursday, July 23, 2015', 'url': 'http://www.newyorksocialdiary.com/party-pictures/2015/unconditional-love'}, {'title': "Women's Health, Boys & Girls, Cancer Research, and Just Plain Summer Fun", 'date': 'Friday, July 17, 2015', 'url': 'http://www.newyorksocialdiary.com/party-pictures/2015/womens-health-boys-girls-cancer-research-and-just-plain-summer-fun'},...]

Related

Scraping Table Data from Multiple Pages

So I think this is going to be complex...hoping someone is up for a challenge.
Basically, I'm trying to visit all HREF tags on a specific URL and then print their "profile-box" class into a Google Sheet.
I have a working example with a different link below. This code goes to each of the URLs, visits the Player Link, and then returns their associated data:
import requests
from bs4 import BeautifulSoup
import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1DpasSS8yC1UX6WqAbkQ515BwEEjdDL-x74T0eTW8hLM')
worksheet = sh.get_worksheet(3)
# AddValue = ["Test", 25, "Test2"]
# worksheet.insert_row(AddValue, 3)
def get_links(url):
    """Collect profile details for every player linked from a listing page.

    Fetches ``url``, follows the link inside each ``<td data-th="Player">``
    cell to the player's page on basketball.realgm.com, and turns the
    ``key: value`` paragraphs of that page's ``profile-box`` div into a dict.

    Args:
        url: Address of the player listing page.

    Returns:
        A list with one dict per player. Every dict has "Name" and "URL",
        plus one entry per ``key: value`` paragraph found in the profile box.
    """
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")
    for td in soup.find_all('td', {'data-th': 'Player'}):
        a_tag = td.a
        if a_tag is None:
            # A cell without a link has no profile page to visit.
            continue
        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")
        req_player_url = requests.get(
            f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")
        div_profile_box = soup_player.find("div", class_="profile-box")
        row = {"Name": name, "URL": player_url}
        # find() returns None when the page has no profile box; keep the bare
        # Name/URL row in that case instead of failing on None.find_all().
        paragraphs = (
            div_profile_box.find_all("p") if div_profile_box is not None else []
        )
        for p in paragraphs:
            key, sep, value = p.get_text(strip=True).partition(':')
            if not sep:  # not all entries have values
                continue
            row[key.strip()] = value.strip()
        data.append(row)
    return data
# Listing pages to crawl, one per season.
urls = [
    'https://basketball.realgm.com/dleague/players/2022',
    'https://basketball.realgm.com/dleague/players/2021',
    'https://basketball.realgm.com/dleague/players/2020',
    'https://basketball.realgm.com/dleague/players/2019',
    'https://basketball.realgm.com/dleague/players/2018',
]
res = []
for url in urls:
    print(f"Getting: {url}")
    res.extend(get_links(url))
if res:
    # Rows do not all carry the same keys, so build the header from every
    # row (first-seen order) rather than from the first row only; otherwise
    # fields that appear only on later players would be dropped.
    header = list(dict.fromkeys(key for row in res for key in row))
    # Missing or empty fields are written as empty cells.
    values = [header, *[[row.get(key) or "" for key in header] for row in res]]
    worksheet.append_rows(values, value_input_option="USER_ENTERED")
RESULTS OF THIS CODE (CORRECT):
Secondarily - I have a working code that takes a separate URL, loops through 66 pages, and returns the table data:
import requests
import pandas as pd
url = 'https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/player/All/desc'
# Number of result pages to fetch; pages are numbered starting at 1.
NUM_PAGES = 66
res = []
# range() excludes its stop value, so stop at NUM_PAGES + 1 to include the
# last page as well.
for count in range(1, NUM_PAGES + 1):
    # pd.read_html accepts a URL too so no need to make a separate request
    df_list = pd.read_html(f"{url}/{count}")
    # Keep only the last table found on the page.
    res.append(df_list[-1])
pd.concat(res).to_csv('my data.csv')
This returns the table data from the URL and works perfectly:
So... this brings me to my current issue:
I'm trying to take this same link (https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/player/All/desc)
and repeat the same action as the first code.
Meaning, I want to visit each profile (on all 66 or x number of pages), and print the profile data just like in the first code.
I thought/hoped, I'd be able to just replace the original D League URLS with this URL and it would work - it doesn't. I'm a little confused why, because the table data seems to be the same set up?
I started trying to re-work this, but struggling. I have very basic code, but think I'm taking steps backwards:
import requests
from bs4 import BeautifulSoup
url = "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/player/All/desc"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.find_all("a"):
profile_url = link.get("href")
profile_response = requests.get(profile_url)
profile_soup = BeautifulSoup(profile_response.text, "html.parser")
profile_box = profile_soup.find("div", class_="profileBox")
if profile_box:
print(profile_box)
Any thoughts on this? Like I said, ultimately trying to recreate the same action as the first script, just for the 2nd URL.
Thanks in advance.
You can largely reuse the code from your first example, with a slight modification to the first find_all loop. Instead of find_all, use a CSS selector to select all of the table cells that have the nowrap class, then test whether each cell has a descendant link; from there the rest of your function works the same as before.
Here is an example:
import requests
from bs4 import BeautifulSoup
def get_links2(url):
    """Collect profile details for every player linked from a stats page.

    Fetches ``url``, looks at every ``td.nowrap`` cell, and for each cell that
    contains a link follows it to the player's page on basketball.realgm.com.
    The ``key: value`` paragraphs of that page's ``profile-box`` div are
    turned into a dict.

    Args:
        url: Address of the stats page.

    Returns:
        A list with one dict per linked player. Every dict has "Name" and
        "URL", plus one entry per ``key: value`` paragraph in the profile box.
    """
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")
    for td in soup.select('td.nowrap'):
        a_tag = td.a
        if a_tag is None:
            # Not every nowrap cell holds a player link.
            continue
        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")
        req_player_url = requests.get(
            f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")
        div_profile_box = soup_player.find("div", class_="profile-box")
        row = {"Name": name, "URL": player_url}
        # find() returns None when the page has no profile box; keep the bare
        # Name/URL row in that case instead of failing on None.find_all().
        paragraphs = (
            div_profile_box.find_all("p") if div_profile_box is not None else []
        )
        for p in paragraphs:
            key, sep, value = p.get_text(strip=True).partition(':')
            if not sep:  # not all entries have values
                continue
            row[key.strip()] = value.strip()
        data.append(row)
    return data
# Stats pages to crawl.
urls2 = ["https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/player/All/desc"]
# Accumulates the player rows from every page.
res2 = []
for url in urls2:
    res2.extend(get_links2(url))
print(res2)
OUTPUT:
[{'Name': 'Jaroslaw Zyskowski', 'URL': '/player/Jaroslaw-Zyskowski/Summary/32427', 'Current Team': 'Trefl Sopot', 'Born': 'Jul 16, 1992(30 years old)', 'Birthplace/Hometown': 'Wroclaw, Poland', 'Natio
nality': 'Poland', 'Height': '6-7 (201cm)Weight:220 (100kg)', 'Current NBA Status': 'Unrestricted Free Agent', 'Agent': 'Manuel Capicchioni', 'Draft Entry': '2014 NBA Draft', 'Drafted': 'Undrafted', '
Pre-Draft Team': 'Kotwica Kolobrzeg (Poland)'}, {'Name': 'Ferdinand Zylka', 'URL': '/player/Ferdinand-Zylka/Summary/76159', 'Full Name': 'Ferdinand Leontin Zylka', 'Current Team': 'Basic-Fit Brussels
Basketball', 'Born': 'Apr 11, 1998(24 years old)', 'Birthplace/Hometown': 'Berlin, Germany', 'Nationality': 'Germany', 'Height': '6-3 (191cm)Weight:170 (77kg)', 'Current NBA Status': 'Unrestricted Fre
e Agent', 'Draft Entry': '2020 NBA Draft', 'Drafted': 'Undrafted', 'Pre-Draft Team': 'Mitteldeutscher BC (Germany)'}, {'Name': 'Dainius Zvinklys', 'URL': '/player/Dainius-Zvinklys/Summary/151962', 'Cu
rrent Team': 'BBG Herford', 'Born': 'Nov 27, 1990(32 years old)', 'Birthplace/Hometown': 'Kretniga, Lithuania', 'Nationality': 'Lithuania', 'Height': '6-8 (203cm)Weight:187 (85kg)', 'Current NBA Statu
s': 'Unrestricted Free Agent', 'Draft Entry': '2012 NBA Draft', 'Drafted': 'Undrafted'}, {'Name': 'Markuss Zvinis', 'URL': '/player/Markuss-Zvinis/Summary/183480', 'Current Team': 'BK Valmiera', 'Born
': 'Apr 26, 2005(17 years old)', 'Nationality': 'Latvia', 'Height': '6-4 (193cm)Weight:N/A', 'Current NBA Status': 'Draft Eligible in 2027', 'Draft Entry': '2027 NBA Draft'}, {'Name': 'Ivars Zvigrus',
'URL': '/player/Ivars-Zvigrus/Summary/204634', 'Current Team': 'Flyyingen BBK', 'Born': 'Oct 17, 1995(27 years old)', 'Birthplace/Hometown': 'Riga, Latvia', 'Nationality': 'Latvia', 'Height': '6-7 (2
01cm)Weight:204 (93kg)', 'Current NBA Status': 'Unrestricted Free Agent', 'Draft Entry': '2017 NBA Draft', 'Drafted': 'Undrafted'}, {'Name': 'Nikita Zverev', 'URL': '/player/Nikita-Zverev/Summary/3279
1', 'Current Team': 'Samara', 'Born': 'Apr 6, 1994(28 years old)', 'Nationality': 'Russia', 'Height': '6-10 (208cm)Weight:225 (102kg)', 'Current NBA Status': 'Unrestricted Free Agent', 'Draft Entry':
'2016 NBA Draft', 'Drafted': 'Undrafted', 'Pre-Draft Team': 'Khimki BC U18 (Russia)'}, {'Name': 'Fernando Zurbriggen', 'URL': '/player/Fernando-Zurbriggen/Summary/76271', 'Full Name': 'Fernando Zurbri
ggen', 'Current Team': 'Monbus Obradoiro', 'Born': 'Oct 20, 1997(25 years old)', 'Birthplace/Hometown': 'Santa Fe, Argentina', 'Nationality': 'Argentina', 'Height': '6-1 (185cm)Weight:190 (86kg)', 'Cu
rrent NBA Status': 'Unrestricted Free Agent', 'Agent': 'Franisco Javier Martin', 'Draft Entry': '2019 NBA Draft', 'Drafted': 'Undrafted', 'Pre-Draft Team': 'Obras Sanitarias (Argentina)'}, {'Name': 'A
lejandro Zurbriggen', 'URL': '/player/Alejandro-Zurbriggen/Summary/42671', 'Current Team': 'Sant Antoni Ibiza Feeling', 'Born': 'Mar 18, 1995(27 years old)', 'Birthplace/Hometown': 'Santa Fe, Argentin
a', 'Nationality': 'Argentina', 'Height': '6-5 (196cm)Weight:N/A', 'Current NBA Status': 'Unrestricted Free Agent', 'Agent': 'Franisco Javier Martin', 'Draft Entry': '2017 NBA Draft', 'Drafted': 'Undr
afted', 'Pre-Draft Team': 'Regatas Corrientes (Argentina)'}, {'Name': 'Nejc Zupan', 'URL': '/player/Nejc-Zupan/Summary/41700', 'Current Team': 'KK Tajfun Sentjur', 'Born': 'Apr 12, 1996(26 years old)'
, 'Birthplace/Hometown': 'Koper, Slovenia', 'Nationality': 'Slovenia', 'Height': '6-8 (203cm)Weight:N/A', 'Current NBA Status': 'Unrestricted Free Agent', 'Agent': 'Sead Galijasevic', 'Draft Entry': '
2018 NBA Draft', 'Drafted': 'Undrafted', 'Pre-Draft Team': 'Sixt Primorska (Slovenia)'}, {'Name': 'Zhennian Zuo', 'URL': '/player/Zhennian-Zuo/Summary/92765', 'Current Team': 'Sichuan Blue Whales', 'B
orn': 'Jan 26, 1996(27 years old)', 'Nationality': 'China', 'Height': '6-8 (203cm)Weight:215 (98kg)', 'Current NBA Status': 'Unrestricted Free Agent', 'Draft Entry': '2018 NBA Draft', 'Drafted': 'Undr
afted'}, {'Name': 'Matija Zunic', 'URL': '/player/Matija-Zunic/Summary/156440', 'Current Team': 'HKK Zrinjski', 'Born': 'Jun 7, 1996(26 years old)', 'Nationality': 'Serbia', 'Height': '6-4 (193cm)Weig
ht:N/A', 'Current NBA Status': 'Unrestricted Free Agent', 'Draft Entry': '2018 NBA Draft', 'Drafted': 'Undrafted'}, {'Name': 'Kyle Zunic', 'URL': '/player/Kyle-Zunic/Summary/107186', 'Current Team': '
Perth', 'Born': 'Mar 4, 1999(23 years old)', 'Birthplace/Hometown': 'Wollongong, Australia', 'Nationality': 'Australia', 'Height': '6-2 (188cm)Weight:195 (88kg)', 'Current NBA Status': 'Unrestricted F
ree Agent', 'Draft Entry': '2022 NBA Draft', 'Drafted': 'Undrafted', 'High School': 'Lake Ginniderra High School[Burnie, Tasmania (Australia)]'}, {'Name': 'Karlis Zunda', 'URL': '/player/Karlis-Zunda/
Summary/123596', 'Current Team': 'Betsafe/Liepaja', 'Born': 'Aug 28, 1997(25 years old)', 'Nationality': 'Latvia', 'Height': '6-6 (198cm)Weight:187 (85kg)', 'Current NBA Status': 'Unrestricted Free Ag
ent', 'Draft Entry': '2019 NBA Draft', 'Drafted': 'Undrafted'}, {'Name': 'Zhang Zuming', 'URL': '/player/Zhang-Zuming/Summary/83723', 'Current Team': 'Qingdao', 'Born': 'Jan 27, 1995(28 years old)', '
Nationality': 'China', 'Height': '6-9 (206cm)Weight:198 (90kg)', 'Current NBA Status': 'Unrestricted Free Agent', 'Draft Entry': '2017 NBA Draft', 'Drafted': 'Undrafted', 'Pre-Draft Team': 'Ningbo Roc
kets (China)'}, {'Name': 'Otoniel Zulueta', 'URL': '/player/Otoniel-Zulueta/Summary/184006', 'Current Team': 'N/A', 'Nationality': 'Mexico', 'Height': '6-6 (198cm)Weight:205 (93kg)', 'Current NBA Stat
us': 'Unrestricted Free Agent', 'Draft Entry': '2019 NBA Draft', 'Drafted': 'Undrafted'}, {'Name': 'Nathan Zulemie', 'URL': '/player/Nathan-Zulemie/Summary/175816', 'Current Team': 'Espoirs Nanterre',
'Born': 'Sep 7, 2004(18 years old)', 'Nationality': 'France', 'Height': '5-8 (173cm)Weight:N/A', 'Current NBA Status': 'Draft Eligible in 2026', 'Draft Entry': '2026 NBA Draft'}, {'Name': 'Mantvydas
Zukauskas', 'URL': '/player/Mantvydas-Zukauskas/Summary/75749', 'Current Team': 'Vilkaviskio Perlas', 'Born': 'Oct 19, 1998(24 years old)', 'Birthplace/Hometown': 'Kaunas, Lithuania', 'Nationality': '
Lithuania', 'Height': '6-3 (191cm)Weight:185 (84kg)', 'Current NBA Status': 'Unrestricted Free Agent', 'Draft Entry': '2020 NBA Draft', 'Drafted': 'Undrafted', 'Pre-Draft Team': 'Delikatesas Joniskis
(Lithuania)'}, {'Name': 'Eigirdas Zukauskas', 'URL': '/player/Eigirdas-Zukauskas/Summary/43242', 'Current Team': 'BC Wolves', 'Born': 'Jun 3, 1992(30 years old)', 'Birthplace/Hometown': 'Radviliskis,
Lithuania', 'Nationality': 'Lithuania', 'Height': '6-6 (198cm)Weight:190 (86kg)', 'Current NBA Status': 'Unrestricted Free Agent', 'Draft Entry': '2014 NBA Draft', 'Drafted': 'Undrafted', 'Pre-Draft T
eam': 'Siauliai (Lithuania)'}, {'Name': 'Ivo Zukanovic', 'URL': '/player/Ivo-Zukanovic/Summary/171804', 'Current Team': 'KK Alkar', 'Born': 'Sep 1, 2002(20 years old)', 'Nationality': 'Croatia', 'Heig
ht': '6-3 (191cm)Weight:N/A', 'Current NBA Status': 'Draft Eligible in 2024', 'Draft Entry': '2024 NBA Draft'}, {'Name': 'Kjeld Zuidema', 'URL': '/player/Kjeld-Zuidema/Summary/168658', 'Current Team':
'Donar Groningen', 'Born': 'Jun 21, 2001(21 years old)', 'Birthplace/Hometown': 'Eexterzandvoort, Netherlands', 'Nationality': 'Netherlands', 'Height': '6-5 (196cm)Weight:198 (90kg)', 'Current NBA St
atus': 'Draft Eligible in 2023', 'Draft Entry': '2023 NBA Draft'}, {'Name': 'Ruben Zugno', 'URL': '/player/Ruben-Zugno/Summary/78457', 'Current Team': 'Zeus Energy Group Rieti', 'Born': 'Mar 20, 1996(
26 years old)', 'Birthplace/Hometown': 'Cantu, Italy', 'Nationality': 'Italy', 'Height': '6-1 (185cm)Weight:182 (83kg)', 'Current NBA Status': 'Unrestricted Free Agent', 'Draft Entry': '2018 NBA Draft
', 'Drafted': 'Undrafted', 'Pre-Draft Team': 'Acqua San Bernardo Cantu (Italy)'}, {'Name': 'Luka Zugic', 'URL': '/player/Luka-Zugic/Summary/172582', 'Current Team': 'KK Milenijum Podgorica', 'Born': '
Nov 22, 2000(22 years old)', 'Birthplace/Hometown': 'Podgorica, Montenegro', 'Nationality': 'Montenegro', 'Height': '6-5 (196cm)Weight:210 (95kg)', 'Current NBA Status': 'Unrestricted Free Agent', 'Dr
aft Entry': '2022 NBA Draft', 'Drafted': 'Undrafted'}, {'Name': 'Fedor Zugic', 'URL': '/player/Fedor-Zugic/Summary/128532', 'Current Team': 'Ratiopharm Ulm', 'Born': 'Sep 18, 2003(19 years old)', 'Bir
thplace/Hometown': 'Kotor, Montenegro', 'Nationality': 'Montenegro', 'Height': '6-6 (198cm)Weight:188 (85kg)', 'Current NBA Status': 'Draft Eligible in 2025', 'Agent': 'Rade Filipovich,David Mondress'
, 'Draft Entry': '2025 NBA Draft', 'Early Entry Info': '2022 Early Entrant(Withdrew)', 'Pre-Draft Team': 'Ratiopharm Ulm (Germany)'}, {'Name': 'Andrey Zubkov', 'URL': '/player/Andrey-Zubkov/Summary/25
944', 'Current Team': 'Zenit Saint Petersburg', 'Born': 'Jun 29, 1991(31 years old)', 'Birthplace/Hometown': 'Chelyabinsk, Russia', 'Nationality': 'Russia', 'Height': '6-9 (206cm)Weight:195 (88kg)', '
Current NBA Status': 'Unrestricted Free Agent', 'Agent': 'Obrad Fimic', 'Draft Entry': '2013 NBA Draft', 'Drafted': 'Undrafted', 'Pre-Draft Team': 'Lokomotiv Kuban (Russia)'}, {'Name': 'Aleksandr Zubk
ov', 'URL': '/player/Aleksandr-Zubkov/Summary/183206', 'Current Team': 'Runa-2', 'Born': 'Apr 7, 2002(20 years old)', 'Nationality': 'Russia', 'Height': '5-11 (180cm)Weight:N/A', 'Current NBA Status':
'Draft Eligible in 2024', 'Draft Entry': '2024 NBA Draft'}, {'Name': 'Aitor Zubizarreta', 'URL': '/player/Aitor-Zubizarreta/Summary/39787', 'Current Team': 'Acunsa GBC', 'Born': 'Mar 6, 1995(27 years
old)', 'Birthplace/Hometown': 'Azpeitia, Spain', 'Nationality': 'Spain', 'Height': '6-4 (193cm)Weight:195 (88kg)', 'Current NBA Status': 'Unrestricted Free Agent', 'Draft Entry': '2017 NBA Draft', 'D
rafted': 'Undrafted', 'Pre-Draft Team': 'College of Idaho (Sr)'}, {'Name': 'Tomislav Zubcic', 'URL': '/player/Tomislav-Zubcic/Summary/2427', 'Current Team': 'London Lions', 'Born': 'Jan 17, 1990(33 ye
ars old)', 'Birthplace/Hometown': 'Zadar, Croatia', 'Nationality': 'Croatia', 'Height': '6-10 (208cm)Weight:230 (104kg)', 'Current NBA Status': 'Unrestricted Free Agent', 'Agent': 'Bill Duffy', 'Draft
Entry': '2012 NBA Draft', 'Early Entry Info': '2011 Early Entrant(Withdrew)', 'Drafted': 'Round 2, Pick 26, Toronto Raptors', 'Draft Rights Trade': 'TOR to OKC, Jun 30, 2015', 'Pre-Draft Team': 'KK C
ibona (Croatia)'}, {'Name': 'Jure Zubac', 'URL': '/player/Jure-Zubac/Summary/38326', 'Current Team': 'Belfius Mons-Hainaut', 'Born': 'Mar 15, 1995(27 years old)', 'Birthplace/Hometown': 'Mostar, Bosni
a and Herzegovina', 'Nationality': 'Bosnia and Herzegovina', 'Height': '6-8 (203cm)Weight:N/A', 'Current NBA Status': 'Unrestricted Free Agent', 'Draft Entry': '2017 NBA Draft', 'Drafted': 'Undrafted'
, 'Pre-Draft Team': 'BC Siroki (Bosnia and Herzegovina)'}, {'Name': 'Peter Zsiros', 'URL': '/player/Peter-Zsiros/Summary/98310', 'Current Team': 'Zalakeramia-ZTE KK', 'Born': 'Jun 22, 1994(28 years ol
d)', 'Nationality': 'Hungary', 'Height': '6-7 (201cm)Weight:198 (90kg)', 'Current NBA Status': 'Unrestricted Free Agent', 'Draft Entry': '2016 NBA Draft', 'Drafted': 'Undrafted'}, {'Name': 'Harun Zrno
', 'URL': '/player/Harun-Zrno/Summary/188930', 'Current Team': 'OKK Spars Sarajevo', 'Born': 'Mar 1, 2004(18 years old)', 'Nationality': 'Bosnia and Herzegovina', 'Height': '6-6 (198cm)Weight:N/A', 'C
urrent NBA Status': 'Draft Eligible in 2026', 'Draft Entry': '2026 NBA Draft'}, {'Name': 'Evangelos Zougris', 'URL': '/player/Evangelos-Zougris/Summary/183106', 'Current Team': 'Peristeri BC', 'Born':
'Oct 14, 2004(18 years old)', 'Nationality': 'Greece', 'Height': '6-8 (203cm)Weight:N/A', 'Current NBA Status': 'Draft Eligible in 2026', 'Draft Entry': '2026 NBA Draft'}, {'Name': 'Vitaliy Zotov', '
URL': '/player/Vitaliy-Zotov/Summary/54539', 'Current Team': 'BC Budivelnik', 'Born': 'Mar 3, 1997(25 years old)', 'Birthplace/Hometown': 'Lozovaya, Ukraine', 'Nationality': 'Ukraine', 'Height': '6-2
(188cm)Weight:185 (84kg)', 'Current NBA Status': 'Unrestricted Free Agent', 'Agent': 'Misko Raznatovic', 'Draft Entry': '2019 NBA Draft', 'Drafted': 'Undrafted'}, {'Name': 'Jan Zorvan', 'URL': '/playe
r/Jan-Zorvan/Summary/108564', 'Current Team': 'MBK Lucenec', 'Born': 'Dec 22, 1995(27 years old)', 'Nationality': 'Slovakia', 'Height': '6-7 (201cm)Weight:208 (94kg)', 'Current NBA Status': 'Unrestric
ted Free Agent', 'Draft Entry': '2017 NBA Draft', 'Drafted': 'Undrafted'}, {'Name': 'Kristers Zoriks', 'URL': '/player/Kristers-Zoriks/Summary/54343', 'Current Team': 'BC VEF Riga', 'Born': 'May 25, 1
998(24 years old)', 'Birthplace/Hometown': 'Dobele, Latvia', 'Nationality': 'Latvia', 'Height': '6-4 (193cm)Weight:190 (86kg)', 'Current NBA Status': 'Unrestricted Free Agent', 'Draft Entry': '2022 NB
A Draft', 'Drafted': 'Undrafted', 'Pre-Draft Team': 'BC VEF Riga (Latvia)', 'High School': 'New Hampton School[New Hampton, New Hampshire (United States)]'}, {'Name': 'Yovel Zoosman', 'URL': '/player/
Yovel-Zoosman/Summary/75937', 'Current Team': 'ALBA Berlin', 'Born': 'May 12, 1998(24 years old)', 'Birthplace/Hometown': 'Kfar Saba, Israel', 'Nationality': 'Israel', 'Height': '6-7 (201cm)Weight:198
(90kg)', 'Current NBA Status': 'Unrestricted Free Agent', 'Agent': 'Andrew Vye,Guillermo Bermejo,Brian Jungreis,Nadav Mor', 'Draft Entry': '2019 NBA Draft', 'Early Entry Info': '2019 Early Entrant',
'Drafted': 'Undrafted', 'Pre-Draft Team': 'Maccabi FOX Tel Aviv (Israel)'}, {'Name': 'Marcell Zoltan Volgyi', 'URL': '/player/Marcell-Zoltan-Volgyi/Summary/93730', 'Current Team': 'Budapesti Honved Se
', 'Born': 'Apr 22, 1998(24 years old)', 'Birthplace/Hometown': 'Nagykanizsa, Hungary', 'Nationality': 'Hungary', 'Height': '6-6 (198cm)Weight:200 (91kg)', 'Current NBA Status': 'Unrestricted Free Age
nt', 'Draft Entry': '2020 NBA Draft', 'Drafted': 'Undrafted', 'Pre-Draft Team': 'Zalakeramia-ZTE KK (Hungary)'}, {'Name': 'Przemyslaw Zolnierewicz', 'URL': '/player/Przemyslaw-Zolnierewicz/Summary/531
22', 'Current Team': 'Enea Zastal BC Zielona', 'Born': 'Jul 3, 1995(27 years old)', 'Birthplace/Hometown': 'Paslek, Poland', 'Nationality': 'Poland', 'Height': '6-4 (193cm)Weight:200 (91kg)', 'Current
NBA Status': 'Unrestricted Free Agent', 'Agent': 'Rade Filipovich', 'Draft Entry': '2017 NBA Draft', 'Drafted': 'Undrafted', 'Pre-Draft Team': 'Asseco Arka Gdynia (Poland)'}, {'Name': 'Laurent Raphae
l Zoccoletti', 'URL': '/player/Laurent-Raphael-Zoccoletti/Summary/95274', 'Current Team': 'SAM Basket Massagno', 'Born': 'Nov 17, 1999(23 years old)', 'Birthplace/Hometown': 'Wettingen, Switzerland',
'Nationality': 'Switzerland', 'Height': '6-7 (201cm)Weight:N/A', 'Current NBA Status': 'Unrestricted Free Agent', 'Draft Entry': '2021 NBA Draft', 'Drafted': 'Undrafted', 'Pre-Draft Team': 'BBC Nyon (
Switzerland)'}, ....

Fill in missing values for missing dates in dataframe

I have the following dataframe:
df = pd.DataFrame(
{
'status': ['open', 'closed', 'open', 'closed', 'open', 'closed', 'open', 'closed'],
'month': ['January 2020', 'January 2020', 'February 2020', 'February 2020', 'April 2020', 'April 2020', 'August 2020', 'August 2020'],
'counts': [10, 12, 32, 12, 19, 40, 10, 11]
}
)
status month counts
0 open January 2020 10
1 closed January 2020 12
2 open February 2020 32
3 closed February 2020 12
4 open April 2020 19
5 closed April 2020 40
6 open August 2020 10
7 closed August 2020 11
I'm trying to get a stacked bar plot using seaborn:
sns.histplot(df, x='month', weights='counts', hue='status', multiple='stack')
The purpose is to get a plot with a continuous timeseries without missing months. How can I fill in the missing rows with values so that the dataframe would look like below?
status month counts
open January 2020 10
closed January 2020 12
open February 2020 32
closed February 2020 12
open March 2020 0
closed March 2020 0
open April 2020 19
closed April 2020 40
open May 2020 0
closed May 2020 0
open June 2020 0
closed June 2020 0
open July 2020 0
closed July 2020 0
open August 2020 10
closed August 2020 11
You could pivot the dataframe, and then reindex with the desired months.
import pandas as pd
df = pd.DataFrame({'status': ['open', 'closed', 'open', 'closed', 'open', 'closed', 'open', 'closed'],
'month': ['January 2020', 'January 2020', 'February 2020', 'February 2020', 'April 2020', 'April 2020', 'August 2020', 'August 2020'],
'counts': [10, 12, 32, 12, 19, 40, 10, 11]})
months = [f'{m} 2020' for m in ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August']]
df_pivoted = df.pivot(values='counts', index='month', columns='status').reindex(months).fillna(0)
ax = df_pivoted.plot.bar(stacked=True, width=1, ec='black', rot=0, figsize=(12, 5))
A seaborn solution could use order=. However, order= only works with barplot, not with histplot, and barplot doesn't stack bars.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
df = pd.DataFrame({'status': ['open', 'closed', 'open', 'closed', 'open', 'closed', 'open', 'closed'],
'month': ['January 2020', 'January 2020', 'February 2020', 'February 2020', 'April 2020', 'April 2020', 'August 2020', 'August 2020'],
'counts': [10, 12, 32, 12, 19, 40, 10, 11]})
plt.figure(figsize=(12, 5))
months = [f'{m} 2020' for m in ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August']]
ax = sns.barplot(data=df, x='month', y='counts', hue='status', order=months)
plt.tight_layout()
plt.show()

set and sort function in loop

I need to be able to use function (get_years) to iterate a list of reviews such as:
{'rating': 5.0,
'reviewer_name': 'Karen',
'product_id': 'B00004RFRV',
'review_title': 'Bialetti is the Best!',
'review_time': '11 12, 2017',
'images': ['https://images-na.ssl-images-amazon.com/images/I/81+XxFRGyBL._SY88.jpg'],
'styles': {'Size:': ' 12-Cup', 'Color:': ' Silver'}}
{'rating': 3.0,
'reviewer_name': 'Peter DP',
'product_id': 'B00005OTXM',
'review_title': "Mr. Coffee DWX23 12-cup doesn't have the quality feel as my 13 year old nearly identical 12-cup Mr. Coffee",
'review_time': '04 17, 2015',
'images': ['https://images-na.ssl-images-amazon.com/images/I/71sFKwTW9sL._SY88.jpg'],
'styles': {'Style Name:': ' COFFEE MAKER ONLY'}}
{'rating': 5.0,
'reviewer_name': 'B. Laska',
'product_id': 'B00004RFRV',
'review_title': 'Love my Moka pots!',
'review_time': '07 9, 2015',
'images': ['https://images-na.ssl-images-amazon.com/images/I/719NCqw4GML._SY88.jpg'],
'styles': {'Size:': ' 1-Cup', 'Color:': ' Silver'}}
to be able to return:
print(get_years(reviews)) # [2007, 2008, 2009, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
print(type(get_years(reviews))) # <class 'list'>
I have:
def get_years(review):
    """Return the distinct review years in ascending order.

    Args:
        review: Iterable of review dicts. Each dict needs a 'review_time'
            string whose last four characters are the year, for example
            '11 12, 2017'.

    Returns:
        A sorted list of the unique years as ints, for example [2015, 2017].
    """
    review_years_set = set()
    # Walk the argument itself rather than a module-level ``reviews`` name,
    # so the function works for whatever iterable it is given.
    for entry in review:
        # Convert to int so the result is a list of numbers, not strings.
        review_years_set.add(int(entry['review_time'][-4:]))
    review_years_list = list(review_years_set)
    review_years_list.sort()
    return review_years_list
which gives me what I want but it seems like the longer route. Is there a more Pythonic or efficient way to get a sorted list of set values?
Given an iterable of string-formatted dates, e.g.:
dates = ['07 9, 2007', '04 1, 2008', '01 2, 2007', '08 2, 2014', '01 3, 2004', '01 4, 2004']
A concise way to produce a sorted list of unique years is as follows using set comprehension:
sorted_dates = sorted({int(date[-4:]) for date in dates})
print(sorted_dates)
Output:
[2004, 2007, 2008, 2014]
try this.
def get_years(reviews):
    """Return the distinct review years in ascending order.

    Args:
        reviews: Iterable of review dicts. Each dict needs a 'review_time'
            string whose last four characters are the year, for example
            '11 12, 2017'.

    Returns:
        A sorted list of the unique years as ints, for example [2015, 2017].
    """
    # The set comprehension removes duplicate years; int() makes the result
    # numeric rather than a list of strings.
    return sorted({int(review['review_time'][-4:]) for review in reviews})
print(get_years(reviews))

BeautifulSoup webpage scraping

I am trying to scrape a webpage.
from bs4 import BeautifulSoup
import requests
page = requests.get('https://www.mql5.com/en/economic-calendar/united-states')
soup = BeautifulSoup(page.content, 'html.parser')
calender = soup.find(id="economicCalendarTable")
items = calender.find_all(class_="ec-table__title")
print(items)
However, it prints an empty list, although the webpage has many entries with class="ec-table__title". What I found is that the tags inside the id="economicCalendarTable" element are all on one (very long) line, so calender.find_all seems to skip everything.
I am trying to get all tags inside id="economicCalendarTable".
Is there a way to do this?
You can use selenium:
from selenium import webdriver
import re
from bs4 import BeautifulSoup as soup
d = webdriver.Chrome()
try:
    d.get('https://www.mql5.com/en/economic-calendar/united-states')
    # Parse the page source as reported by the browser.
    s = soup(d.page_source, 'lxml')
finally:
    # Close the browser even if loading or parsing fails.
    d.quit()
time = s.find('span', {'id':'economicCalendarTableColumnTime'}).text
title = s.find('div', {'class':'ec-table__title'}).text
# Class of the div to read for each output column, in output order.
classes = ['ec-table__col_time', 'ec-table__curency-name', 'ec-table__col_event', 'ec-table__col_forecast', 'prevValue']
# 'prevValue' stands for a class name followed by digits, so it is matched by
# pattern. Raw string: '\d' in a normal literal is an invalid escape sequence.
# Compiled once here instead of once per row and column.
prev_value_class = re.compile(r'prevValue\d+')
# For every table item, look up one div per column (None when absent).
full_data = [
    [
        i.find('div', {'class': prev_value_class if c == 'prevValue' else c})
        for c in classes
    ]
    for i in s.find_all('div', {'class':'ec-table__item'})
]
# Missing divs have no .text attribute and fall back to ''.
new_results = [
    dict(zip(
        ['time', 'name', 'event', 'forcast', 'prevous_value'],
        [getattr(i, 'text', '') for i in b],
    ))
    for b in full_data
]
Output:
[{'event': u'Chicago Fed National Activity Index', 'forcast': u'0.14', 'name': u'USD', 'prevous_value': '', 'time': u' 08:30'}, {'event': u'Markit Manufacturing PMI', 'forcast': u'56.4', 'name': u'USD', 'prevous_value': '', 'time': u' 09:45'}, {'event': u'Markit Services PMI', 'forcast': u'55', 'name': u'USD', 'prevous_value': '', 'time': u' 09:45'}, {'event': u'Markit Composite PMI', 'forcast': u'55', 'name': u'USD', 'prevous_value': '', 'time': u' 09:45'}, {'event': u'New Home Sales m/m', 'forcast': u'-1.2%', 'name': u'USD', 'prevous_value': '', 'time': u' 10:00'}, {'event': u'New Home Sales', 'forcast': u'0.639 M', 'name': u'USD', 'prevous_value': '', 'time': u' 10:00'}, {'event': u'EIA Crude Oil Stocks Change', 'forcast': u'-1.791 M', 'name': u'USD', 'prevous_value': '', 'time': u' 10:30'}, {'event': u'EIA Cushing Crude Oil Stocks Change', 'forcast': u'0.259 M', 'name': u'USD', 'prevous_value': '', 'time': u' 10:30'}, {'event': u'EIA Crude Oil Imports Change', 'forcast': u'-0.32 M', 'name': u'USD', 'prevous_value': '', 'time': u' 10:30'}, {'event': u'EIA Distillate Fuel Production Change', 'forcast': u'-0.011 M', 'name': u'USD', 'prevous_value': '', 'time': u' 10:30'}, {'event': u'EIA Distillates Stocks Change', 'forcast': u'-0.182 M', 'name': u'USD', 'prevous_value': '', 'time': u' 10:30'}, {'event': u'EIA Gasoline Production Change', 'forcast': u'0.289 M', 'name': u'USD', 'prevous_value': '', 'time': u' 10:30'}, {'event': u'EIA Heating Oil Stocks Change', 'forcast': u'-0.026 M', 'name': u'USD', 'prevous_value': '', 'time': u' 10:30'}, {'event': u'EIA Gasoline Stocks Change', 'forcast': u'-3.206 M', 'name': u'USD', 'prevous_value': '', 'time': u' 10:30'}, {'event': u'FOMC Minutes', 'forcast': u'', 'name': u'USD', 'prevous_value': '', 'time': u' 14:00'}, {'event': u'Continuing Jobless Claims', 'forcast': u'1.769 M', 'name': u'USD', 'prevous_value': '', 'time': u' 08:30'}, {'event': u'Initial Jobless Claims', 'forcast': u'216 K', 'name': u'USD', 'prevous_value': 
'', 'time': u' 08:30'}, {'event': u'Initial Jobless Claims 4-Week Average', 'forcast': u'213.814 K', 'name': u'USD', 'prevous_value': '', 'time': u' 08:30'}, {'event': u'HPI m/m', 'forcast': u'0.5%', 'name': u'USD', 'prevous_value': '', 'time': u' 09:00'}, {'event': u'Existing Home Sales', 'forcast': u'5.45 M', 'name': u'USD', 'prevous_value': '', 'time': u' 10:00'}, {'event': u'Existing Home Sales m/m', 'forcast': u'0.3%', 'name': u'USD', 'prevous_value': '', 'time': u' 10:00'}, {'event': u'EIA Natural Gas Storage Change', 'forcast': u'92 B', 'name': u'USD', 'prevous_value': '', 'time': u' 10:30'}, {'event': u'Durable Goods Orders m/m', 'forcast': u'-0.3%', 'name': u'USD', 'prevous_value': '', 'time': u' 08:30'}, {'event': u'Core Durable Goods Orders m/m', 'forcast': u'0.0%', 'name': u'USD', 'prevous_value': '', 'time': u' 08:30'}, {'event': u'Durable Goods Orders excl. Defense m/m', 'forcast': u'-6.2%', 'name': u'USD', 'prevous_value': '', 'time': u' 08:30'}, {'event': u'Nondefense Capital Goods Orders excl. 
Aircraft m/m', 'forcast': u'0.3%', 'name': u'USD', 'prevous_value': '', 'time': u' 08:30'}, {'event': u'Fed Chair Powell Speech', 'forcast': u'', 'name': u'USD', 'prevous_value': '', 'time': u' 09:20'}, {'event': u'Michigan Consumer Sentiment', 'forcast': u'98.5', 'name': u'USD', 'prevous_value': '', 'time': u' 10:00'}, {'event': u'Michigan Consumer Expectations', 'forcast': u'88.9', 'name': u'USD', 'prevous_value': '', 'time': u' 10:00'}, {'event': u'Michigan Current Conditions', 'forcast': u'112.9', 'name': u'USD', 'prevous_value': '', 'time': u' 10:00'}, {'event': u'Michigan Inflation Expectations', 'forcast': u'2.7%', 'name': u'USD', 'prevous_value': '', 'time': u' 10:00'}, {'event': u'Michigan 5-Year Inflation Expectations', 'forcast': u'2.5%', 'name': u'USD', 'prevous_value': '', 'time': u' 10:00'}, {'event': u'Baker Hughes US Oil Rig Count', 'forcast': u'', 'name': u'USD', 'prevous_value': '', 'time': u' 13:00'}, {'event': u'CFTC Copper Non-Commercial Net Positions', 'forcast': u'', 'name': u'USD', 'prevous_value': '', 'time': u' 15:30'}, {'event': u'CFTC Crude Oil Non-Commercial Net Positions', 'forcast': u'', 'name': u'USD', 'prevous_value': '', 'time': u' 15:30'}, {'event': u'CFTC S&P 500 Non-Commercial Net Positions', 'forcast': u'', 'name': u'USD', 'prevous_value': '', 'time': u' 15:30'}, {'event': u'CFTC Gold Non-Commercial Net Positions', 'forcast': u'', 'name': u'USD', 'prevous_value': '', 'time': u' 15:30'}, {'event': u'CFTC Silver Non-Commercial Net Positions', 'forcast': u'', 'name': u'USD', 'prevous_value': '', 'time': u' 15:30'}]
Here's a simple example I've put together using Selenium and BeautifulSoup:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Selenium part: drive a real browser so the page's JavaScript runs; the
# event titles are not present in the raw HTML served for this URL.
browser = webdriver.Chrome()
try:
    browser.get('https://www.mql5.com/en/economic-calendar/united-states')
    # Wait (up to 10 seconds) until at least one event title is in the DOM.
    # Reading page_source immediately after get() may capture the page
    # before the script-inserted rows exist.
    WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'ec-table__title'))
    )
    source = browser.page_source
finally:
    # Always close the browser, even when loading or waiting fails, so no
    # stray Chrome/ChromeDriver process is left behind.
    browser.quit()

# BeautifulSoup part: parse the rendered HTML snapshot.
soup = BeautifulSoup(source, 'html.parser')
calender = soup.find(id="economicCalendarTable")
if calender is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError('economicCalendarTable not found in the rendered page')
items = calender.find_all(class_="ec-table__title")
print(items)
This code lets the browser load and render the page completely, and then passes the full HTML source to BeautifulSoup.
Be sure to install Selenium and the ChromeDriver correctly before running this script.
There is no item with class ec-table__title in the base html of that page.
However, it does appear when using a DOM inspector in the browser. I am afraid this is a sure sign that it has been inserted into the DOM by JavaScript, and indeed there is some JavaScript invoked by that webpage.
May I suggest that you investigate using the selenium module in conjunction with BeautifulSoup?

escape string in mysql insert query

I am inserting a list of dictionaries into a MySQL table with the code below, but I am having trouble working out how to escape ',' and avoid other potential errors...
data=[{'index': 1, 'kite_size': u'5', 'source': 'spainkiters', 'type': 'kite', 'brand': u'rrd', 'title': u'RRD Religion 2014 10\xb45m usada solo 1 vez', 'id': u'112506', 'kite_model': u'religion', 'location': u' santa pola', 'year': u'2014', 'date_added': u' Lun Ene 27, 2014 9:52 am', 'quality': 5, 'price': None}, {'index': 1, 'kite_size': u'10', 'source': 'spainkiters', 'quality': 5, 'price': u'750', 'title': u'Vendo Kite completo Nobile Fifty 10M 2013', 'id': u'112762', 'kite_model': u'fifty', 'location': u'', 'year': u'2013', 'date_added': u' Mar Feb 11, 2014 5:38 pm', 'type': 'kite', 'brand': u'nobile'}, {'index': 1, 'kite_size': u'7', 'source': 'spainkiters', 'quality': 5, 'price': None, 'title': u'NORTH EVO 7m 2013 !!!!!!!!!!!!!', 'id': u'112789', 'kite_model': u'evo', 'location': u'', 'year': u'2013', 'date_added': u' Mie Feb 12, 2014 9:08 pm', 'type': 'kite', 'brand': u'north'}, {'index': 1, 'kite_size': u'5', 'source': 'spainkiters', 'quality': 4, 'price': u'350', 'title': u'cabrinha convers 2012 5m y tabla slingshot lunaci 1,28cm', 'id': u'112767', 'kite_model': None, 'location': u' pais vasco', 'year': u'2012', 'date_added': u' Mar Feb 11, 2014 6:41 pm', 'type': 'kite', 'brand': u'cabrinha'}, {'index': 1, 'kite_size': u'5', 'source': 'spainkiters', 'quality': 4, 'price': u'350', 'title': u'cabrinha convers 2012 5m y tabla slingshot lunaci 1,28cm', 'id': u'112766', 'kite_model': None, 'location': u' pais vasco', 'year': u'2012', 'date_added': u' Mar Feb 11, 2014 6:33 pm', 'type': 'kite', 'brand': u'cabrinha'}, {'index': 1, 'kite_size': u'8', 'source': 'spainkiters', 'type': 'kite', 'brand': u'flexifoil', 'title': u'flexifoil HADLOW ID 8 metros precio negociable', 'id': u'112512', 'kite_model': u'hadlow', 'location': u' Gran Canaria', 'year': None, 'date_added': u' Lun Ene 27, 2014 5:08 pm', 'quality': 5, 'price': None}, {'index': 1, 'kite_size': u'11', 'source': 'spainkiters', 'quality': 5, 'price': u'500', 'title': u'VENDO SWITCH COMBAT 11M A\xd1O 
2013...500E ENVIO INCLUIDO!!', 'id': u'112773', 'kite_model': u'combat', 'location': u'', 'year': u'2013', 'date_added': u' Mar Feb 11, 2014 9:11 pm', 'type': 'kite', 'brand': u'switch'}]
# Run the INSERT once for every dict in `data`; the %(name)s placeholders are
# filled from the matching dict keys by the database driver.
# NOTE(review): `%(kite_model)` has no trailing `s`, and `%%(title)s%` /
# `%%(date_added)s%` carry extra `%` signs; this is the statement that raises
# the ValueError shown in the traceback.
cursor.executemany("INSERT INTO search_kite_products_4 (brand, kite_model, kite_size, year, source, id, location, title, date_added, quality, price, link_id) VALUES (%(brand)s, %(kite_model), %(kite_size)s, %(year)s, %(source)s, %(id)s, %(location)s, %%(title)s%, %%(date_added)s%, %(quality)s, %(price)s, %(id)s)", data)
# Commit the inserts, then close `db` (presumably the MySQLdb connection that
# `cursor` came from; its creation is not shown here).
db.commit()
db.close()
I tried to add an additional % before the fields which could have such characters but it did not work...
Traceback (most recent call last):
File "C:\Program Files (x86)\JetBrains\PyCharm Community Edition 3.1.1\helpers\pydev\pydevd.py", line 1534, in <module>
debugger.run(setup['file'], None, None)
File "C:\Program Files (x86)\JetBrains\PyCharm Community Edition 3.1.1\helpers\pydev\pydevd.py", line 1145, in run
pydev_imports.execfile(file, globals, locals) #execute the script
File "C:/Users/Joao/PycharmProjects/Olympia/olympiaspt/scripts/test_sql.py", line 30, in <module>
cursor.executemany("INSERT INTO search_kite_products_4 (brand, kite_model, kite_size, year, source, id, location, title, date_added, quality, price, link_id) VALUES (%(brand)s, %(kite_model), %(kite_size)s, %(year)s, %(source)s, %(id)s, %(location)s, %%(title)s%, %%(date_added)s%, %(quality)s, %(price)s, %(id)s)", data)
File "C:\Python27\lib\site-packages\MySQLdb\cursors.py", line 252, in executemany
self.errorhandler(self, exc, value)
File "C:\Python27\lib\site-packages\MySQLdb\connections.py", line 36, in defaulterrorhandler
raise errorclass, errorvalue
ValueError: unsupported format character ',' (0x2c) at index 25
You missed out an s:
%(kite_model),
should be
%(kite_model)s,
You also added too many % characters here:
%%(title)s%, %%(date_added)s%
that should be just:
%(title)s, %(date_added)s
If you need to insert literal % characters around your column data, add these to your Python strings or use SQL functions to explicitly concatenate these server side.

Categories

Resources