Find data dictionary behind URL
https://www.coingecko.com/fr/pi%C3%A8ces/1/markets_tab --> BTC
https://www.coingecko.com/fr/pi%C3%A8ces/2/markets_tab --> LTC
https://www.coingecko.com/fr/pi%C3%A8ces/3/markets_tab --> AUR
https://www.coingecko.com/fr/pi%C3%A8ces/?/markets_tab --> ?
https://www.coingecko.com/fr/pi%C3%A8ces/100/markets_tab --> XLM
from bs4 import BeautifulSoup
from time import sleep
import requests
# Walk coin ids sequentially and fetch each markets page.
# Fixes vs. the original: start at id 1 and step by 1 (the old code
# special-cased i == 0 and then stepped by 2, skipping every odd id),
# and catch only network errors instead of a bare `except:` that would
# also swallow KeyboardInterrupt.
i = 1
while True:
    try:
        url = "https://www.coingecko.com/fr/pi%C3%A8ces/{}/markets_tab".format(i)
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        print(url)
        sleep(2)  # be polite to the server between requests
        i += 1
    except requests.RequestException:
        break
I want to scan all numbers from 1 to 100 in order to find the associate coin using python.
I'm not sure exactly what you are looking for — the question is unclear. In any case, the following code lets you loop over the pages, and from there you can do whatever you need.
import requests
from bs4 import BeautifulSoup
# Reuse one TCP connection for all 100 sequential requests.
with requests.Session() as req:
    for item in range(1, 101):
        # BUG FIX: the original f-string used an empty placeholder "{}",
        # which is a SyntaxError in an f-string; the loop variable must
        # be named inside the braces.
        r = req.get(f"https://www.coingecko.com/fr/pi%C3%A8ces/{item}/markets_tab")
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'html.parser')
            # Do whatever.
Related
It only scrapes the first table and I'm not sure on how to get it to scrape the second, they both have the same class.
from bs4 import BeautifulSoup
import requests
def getCalendarData(url):
    """Fetch *url* and print series, circuit, month and day for each event row."""
    response = requests.get(url)
    page = BeautifulSoup(response.text, 'html.parser')
    for schedule in page.find_all('table', class_='ms-schedule-table ms-schedule-table--your'):
        for row in schedule.find_all('tbody'):
            details = (
                row.find('div', class_='ms-schedule-table-item-main__title').text.strip(),
                row.find('div', class_='ms-schedule-table-item-main__event').text.strip(),
                row.find('span', class_='ms-schedule-table-date__month').text.strip(),
                row.find('span', class_='ms-schedule-table-date__day').text.strip(),
            )
            print(*details)
getCalendarData('https://www.motorsport.com/all/schedule/2022/upcoming/')
Your question is misleading: there is no second table on this page — there is only an option to load more data.
Unless you want to switch to selenium, you can also address the resource from which the data is dynamically reloaded.
# Request both paginated calendar pages (p=1 and p=2) directly.
for page_number in (1, 2):
    getCalendarData(f'https://www.motorsport.com/all/schedule/2022/upcoming/?all_event_types=1&p={page_number}')
Example
A bit more generic with while-loop, to check if there is a load more button:
from bs4 import BeautifulSoup
import requests
url = 'https://www.motorsport.com/all/schedule/2022/upcoming/'
def getCalendarData(table):
    """Print title, event, month and day for every row of one schedule table."""
    for row in table.find_all('tbody'):
        title = row.find('div', class_='ms-schedule-table-item-main__title').text.strip()
        event = row.find('div', class_='ms-schedule-table-item-main__event').text.strip()
        month = row.find('span', class_='ms-schedule-table-date__month').text.strip()
        day = row.find('span', class_='ms-schedule-table-date__day').text.strip()
        print(title, event, month, day)
# Keep following the "load more" link until the page has no nextPage button.
while True:
    print(f'Scraping url: {url}')
    page = BeautifulSoup(requests.get(url).text, 'html.parser')
    getCalendarData(page.find('table', class_='ms-schedule-table ms-schedule-table--your'))
    next_link = page.select_one('[data-id="nextPage"]')
    if next_link is None:
        break
    url = 'https://www.motorsport.com/' + next_link.get('href')
I am trying to web scrape bus stop names for a given line, here is an example page for line 212 https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212. I want to have as an output two lists, one with bus stop names in one direction and the other list with another direction. (It's clearly seen on the web page). I managed to get all names in one list with
import requests
from bs4 import BeautifulSoup
def download_bus_schedule(bus_number):
    """Return all stop names from the timetable page, in page order.

    Duplicate names are kept only once.
    """
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content,
                         'html5lib')
    # (Removed the leftover debug `print(soup.prettify())`, which dumped
    # the entire page to stdout on every call.)
    all_bus_stops = []
    seen = set()  # O(1) membership test instead of rescanning the list
    for element in soup.find_all('a'):
        name = element.get_text()
        if name not in seen:
            seen.add(name)
            all_bus_stops.append(name)
    return all_bus_stops
print(download_bus_schedule('212'))
I guess the solution would be to somehow divide the soup into two parts.
You can use the bs4.element.Tag.findAll method:
import requests
from bs4 import BeautifulSoup
def download_bus_schedule(bus_number):
    """Return one list of stop names per direction (per .holo-list block)."""
    all_bus_stops = []
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    soup = BeautifulSoup(requests.get(URL).content, 'html.parser')
    for direction in soup.select(".holo-list"):
        stops = []
        for item in direction.findAll("li"):
            if item.text not in stops:
                stops.append(item.text)
        all_bus_stops.append(stops)
    return all_bus_stops
print(download_bus_schedule('212'))
Output:
[['Pl.Hallera', 'Pl.Hallera', 'Darwina', 'Namysłowska', 'Rondo Żaba', 'Rogowska', 'Kołowa', 'Dks Targówek', 'Metro Targówek Mieszkaniowy', 'Myszkowska', 'Handlowa', 'Metro Trocka', 'Bieżuńska', 'Jórskiego', 'Łokietka', 'Samarytanka', 'Rolanda', 'Żuromińska', 'Targówek-Ratusz', 'Św.Wincentego', 'Malborska', 'Ch Targówek'],
['Ch Targówek', 'Ch Targówek', 'Malborska', 'Św.Wincentego', 'Targówek-Ratusz', 'Żuromińska', 'Gilarska', 'Rolanda', 'Samarytanka', 'Łokietka', 'Jórskiego', 'Bieżuńska', 'Metro Trocka', 'Metro Trocka', 'Metro Trocka', 'Handlowa', 'Myszkowska', 'Metro Targówek Mieszkaniowy', 'Dks Targówek', 'Kołowa', 'Rogowska', 'Rondo Żaba', '11 Listopada', 'Bródnowska', 'Szymanowskiego', 'Pl.Hallera', 'Pl.Hallera']]
import requests
from bs4 import BeautifulSoup
def download_bus_schedule(bus_number):
    """Return a (direction_1_stops, direction_2_stops) tuple for the line."""
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content,
                         'html5lib')
    bus_stops_1 = []
    bus_stops_2 = []
    directions = soup.find_all("ul", {"class": "holo-list"})
    # BUG FIX: the original tested `stop not in bus_stops_1`, where `stop`
    # is a bs4 Tag but the list holds strings, so the membership test was
    # always False-matching and duplicates were never filtered.  Compare
    # the stripped text instead.
    for stop in directions[0].find_all("a"):
        name = stop.text.strip()
        if name not in bus_stops_1:
            bus_stops_1.append(name)
    for stop in directions[1].find_all("a"):
        name = stop.text.strip()
        if name not in bus_stops_2:
            bus_stops_2.append(name)
    all_bus_stops = (bus_stops_1, bus_stops_2)
    return all_bus_stops
# Fetch once and print both directions (the original called the function
# twice, downloading the page twice).
schedule = download_bus_schedule('212')
print(schedule[0])
print(schedule[1])
I may have misunderstood as I do not know Polish but see if this helps.
from bs4 import BeautifulSoup
import requests
url = 'https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212'
resp = requests.get(url)
soup = BeautifulSoup(resp.content, "html.parser")
# Map each direction heading (the <h2> text) to its ordered,
# de-duplicated list of stop names.
d = {}
for h2 in soup.select('h2.holo-divider'):
    d[h2.text] = []
    # NOTE(review): assumes the <ul> of stops is the node *immediately*
    # following the <h2>.  A whitespace text node between them would make
    # next_sibling a NavigableString and break .select() — confirm
    # against the live markup before reusing elsewhere.
    ul = h2.next_sibling
    for li in ul.select('li'):
        if li.a.text not in d[h2.text]:
            d[h2.text].append(li.a.text)
from pprint import pprint
pprint(d)
As all stops are encapsulated in the next un-ordered list, you could use the find_next function of bs4.
e.g.
# NOTE(review): this snippet is a function-body fragment — the bare
# `return` at the bottom only works once the code is pasted inside a
# function that receives `bus_number`.
URL = f"http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l={bus_number}"
r = requests.get(URL)
soup = BeautifulSoup(r.content,
                     'html5lib')
# The two direction headings, used as text anchors to locate each stop list.
directions = ["Ch Targówek","Pl.Hallera"]
result = {}
for direction in directions:
    # Find the heading text, then the first <ul> after it (the stop list).
    header = soup.find(text=direction)
    list = header.find_next("ul")  # NOTE(review): shadows the builtin `list`
    stops_names = [stop.get_text() for stop in list]
    result[direction] = stops_names
return result
Plus you might want to use f-string to format your strings as it improves reading and is less error prone.
Good morning world
I'm new to python and trying out things. I'm trying to remove duplicate links from the below run.
Currently there are 253 links retrieved. Can someone please help me with this?
import requests
from bs4 import BeautifulSoup
import csv
# Download the Census population-estimates page and count every <a> tag
# it contains (duplicates included).
page = "https://www.census.gov/programs-surveys/popest.html"
soup = BeautifulSoup(requests.get(page).text, 'html.parser')
links = soup.find_all("a")
print ('Number of links retrieved: ', len (links))
Convert it to a set and it will remove duplicates:
links = set(soup.find_all("a"))
out:
Number of links retrieved: 244
set will not care about the sort order.
Therefore I used a list and cleaned the href values properly.
Now the len is 123
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Collect every href on the page as a unique, fully-resolved URL.
r = requests.get("https://www.census.gov/programs-surveys/popest.html")
soup = BeautifulSoup(r.text, 'html.parser')
links = []
for item in soup.findAll("a", href=True):
    # BUG FIX: the original prepended "https://www.census.gov/" to any
    # href not starting with "h", producing malformed doubled-slash URLs
    # such as "https://www.census.gov//popest" (visible in its own
    # output).  urljoin resolves absolute, root-relative and
    # page-relative hrefs correctly against the fetched page URL.
    absolute = urljoin(r.url, item.get("href"))
    if absolute not in links:
        links.append(absolute)
        print(absolute)
print(len(links))
Output:
https://www.census.gov/#content
https://www.census.gov/en.html
https://www.census.gov/topics/population/age-and-sex.html
https://www.census.gov/businessandeconomy
https://www.census.gov/topics/education.html
https://www.census.gov/topics/preparedness.html
https://www.census.gov/topics/employment.html
https://www.census.gov/topics/families.html
https://www.census.gov/topics/population/migration.html
https://www.census.gov/geo
https://www.census.gov/topics/health.html
https://www.census.gov/topics/population/hispanic-origin.html
https://www.census.gov/topics/housing.html
https://www.census.gov/topics/income-poverty.html
https://www.census.gov/topics/international-trade.html
https://www.census.gov/topics/population.html
https://www.census.gov/topics/population/population-estimates.html
https://www.census.gov/topics/public-sector.html
https://www.census.gov/topics/population/race.html
https://www.census.gov/topics/research.html
https://www.census.gov/topics/public-sector/voting.html
https://www.census.gov/about/index.html
https://www.census.gov/data
https://www.census.gov/academy
https://www.census.gov/about/what/admin-data.html
https://www.census.gov/data/data-tools.html
https://www.census.gov/developers/
https://www.census.gov/data/experimental-data-products.html
https://www.census.gov/data/related-sites.html
https://www.census.gov/data/software.html
https://www.census.gov/data/tables.html
https://www.census.gov/data/training-workshops.html
https://www.census.gov/library/visualizations.html
https://www.census.gov/library.html
https://www.census.gov/AmericaCounts
https://www.census.gov/library/audio.html
https://www.census.gov/library/fact-sheets.html
https://www.census.gov/library/photos.html
https://www.census.gov/library/publications.html
https://www.census.gov/library/video.html
https://www.census.gov/library/working-papers.html
https://www.census.gov/programs-surveys/are-you-in-a-survey.html
https://www.census.gov/programs-surveys/decennial-census/2020census-redirect.html
https://www.census.gov/2020census
https://www.census.gov/programs-surveys/acs
https://www.census.gov/programs-surveys/ahs.html
https://www.census.gov/programs-surveys/abs.html
https://www.census.gov/programs-surveys/asm.html
https://www.census.gov/programs-surveys/cog.html
https://www.census.gov/programs-surveys/cbp.html
https://www.census.gov/programs-surveys/cps.html
https://www.census.gov/EconomicCensus
https://www.census.gov/internationalprograms
https://www.census.gov/programs-surveys/metro-micro.html
https://www.census.gov/popest
https://www.census.gov/programs-surveys/popproj.html
https://www.census.gov/programs-surveys/saipe.html
https://www.census.gov/programs-surveys/susb.html
https://www.census.gov/programs-surveys/sbo.html
https://www.census.gov/sipp/
https://www.census.gov/programs-surveys/surveys-programs.html
https://www.census.gov/newsroom.html
https://www.census.gov/partners
https://www.census.gov/programs-surveys/sis.html
https://www.census.gov/NAICS
https://www.census.gov/library/reference/code-lists/schedule/b.html
https://www.census.gov/data/developers/data-sets/Geocoding-services.html
https://www.census.gov/about-us
https://www.census.gov/about/who.html
https://www.census.gov/about/what.html
https://www.census.gov/about/business-opportunities.html
https://www.census.gov/careers
https://www.census.gov/fieldjobs
https://www.census.gov/about/history.html
https://www.census.gov/about/policies.html
https://www.census.gov/privacy
https://www.census.gov/regions
https://www.census.gov/about/contact-us/staff-finder.html
https://www.census.gov/about/contact-us.html
https://www.census.gov/about/faqs.html
https://www.commerce.gov/
https://www.census.gov//en.html
https://www.census.gov//programs-surveys.html
https://www.census.gov//popest
https://www.census.gov//programs-surveys/popest/about.html
https://www.census.gov//programs-surveys/popest/data.html
https://www.census.gov//programs-surveys/popest/geographies.html
https://www.census.gov//programs-surveys/popest/guidance.html
https://www.census.gov//programs-surveys/popest/guidance-geographies.html
https://www.census.gov//programs-surveys/popest/library.html
https://www.census.gov//programs-surveys/popest/news.html
https://www.census.gov//programs-surveys/popest/technical-documentation.html
https://www.census.gov//programs-surveys/popest/data/tables.html
https://www.census.gov//programs-surveys/popest/about/schedule.html
https://www.census.gov//newsroom/press-releases/2019/popest-nation.html
https://www.census.gov//newsroom/press-releases/2019/popest-nation/popest-nation-spanish.html
https://www.census.gov//newsroom/press-releases/2019/new-years-2020.html
https://www.census.gov//data/tables/time-series/demo/popest/pre-1980-national.html
https://www.census.gov//data/tables/time-series/demo/popest/pre-1980-state.html
https://www.census.gov//data/tables/time-series/demo/popest/pre-1980-county.html
https://www.census.gov//library/publications/2015/demo/p25-1142.html
https://www.census.gov//library/publications/2010/demo/p25-1139.html
https://www.census.gov//library/publications/2010/demo/p25-1138.html
https://www.census.gov//programs-surveys/popest/library/publications.html
https://www.census.gov//library/visualizations/2020/comm/superbowl.html
https://www.census.gov//library/visualizations/2019/comm/slower-growth-nations-pop.html
https://www.census.gov//library/visualizations/2019/comm/happy-new-year-2020.html
https://www.census.gov//programs-surveys/popest/library/visualizations.html
https://www.census.gov/#
https://www.census.gov/#uscb-nav-skip-header
https://www.census.gov/newsroom/blogs.html
https://www.census.gov/newsroom/stories.html
https://www.facebook.com/uscensusbureau
https://twitter.com/uscensusbureau
https://www.linkedin.com/company/us-census-bureau
https://www.youtube.com/user/uscensusbureau
https://www.instagram.com/uscensusbureau/
https://www.census.gov/quality/
https://www.census.gov/datalinkage
https://www.census.gov/about/policies/privacy/privacy-policy.html#accessibility
https://www.census.gov/foia
https://www.usa.gov/
https://www.census.gov//
123
I am making an http request to a website and parse its content to find some attribute values. What I need to know is how do I handle exceptions if code returns [], None or nothing at all.
What I have tried:
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup
def get_url():
    """GET the test endpoint, retrying up to 5 times on 5xx responses.

    Returns the final requests.Response.
    """
    s = requests.Session()
    retries = Retry(total=5,
                    backoff_factor=10,  # BUG FIX: the missing comma here was a SyntaxError
                    status_forcelist=[500, 502, 503, 504])
    s.mount('http://', HTTPAdapter(max_retries=retries))
    # BUG FIX: return the response instead of discarding it in a local;
    # find_data() below needs it.
    return s.get('http://httpstat.us/500')

def find_data():
    """Parse the fetched page and report whether the target id exists."""
    soup = BeautifulSoup(r.text, "lxml")
    # find() returns None when the anchor is absent, so .get() raises
    # AttributeError — catch exactly that, not a bare except that would
    # also swallow KeyboardInterrupt and network errors.
    try:
        id = soup.find('a', class_="class").get('id')
    except AttributeError:
        print('id not found')

# BUG FIX: the original called get_url() and threw the response away,
# leaving `r` undefined for find_data().
r = get_url()
Basically if the id is not to find I want to make that GET request again and try to find it.
You can apply the "look before you leap" (LBYL) principle and check the result of find() - it would return None if an element was not found. You can then put the thing into a loop and exit when you have a value, also safeguarding yourself with a loop counter limit:
# Bounded retry loop: stop as soon as an <a class="class" id=...> is found.
RETRIES = 10
id = None
session = requests.Session()
for attempt in range(1, RETRIES + 1):
    response = session.get(url)
    # BUG FIX: the soup was built from "r.text", but the response object
    # here is named "response" — "r" was undefined.
    soup = BeautifulSoup(response.text, "lxml")
    element = soup.find('a', class_="class", id=True)
    if element is None:
        print("Attempt {attempt}. Element not found".format(attempt=attempt))
        continue
    else:
        id = element["id"]
        break
print(id)
A couple of notes:
id=True was set to find only elements with the id element present. You could also do an equivalent with a CSS selector soup.select_one("a.class[id]")
Session() helps to improve performance when issuing requests to the same host multiple times. See more at Session Objects
If all you want to do is make that same request a second time, you could do something like this:
import requests
from bs4 import BeautifulSoup
def find_data(url):
    """Re-fetch *url* until the target anchor's id attribute is found.

    WARNING: loops forever if the element never appears.
    """
    found_data = False
    while not found_data:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")
        # find() returns None when the element is absent, and .get() on
        # None raises AttributeError — catch exactly that instead of a
        # bare except, which would also hide network errors and Ctrl-C.
        try:
            id = soup.find('a', class_="class").get('id')
            found_data = True
        except AttributeError:
            pass
This puts you at risk of an infinite loop if the data really aren't there. You can do this to avoid that infinite loop:
import requests
from bs4 import BeautifulSoup
def find_data(url, attempts_before_fail=3):
    """Re-fetch *url* until the target anchor's id is found, with a cap.

    Raises:
        ValueError: if the id is not found within attempts_before_fail tries.
    """
    found_data = False
    while not found_data:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")
        try:
            id = soup.find('a', class_="class").get('id')
            found_data = True
        except AttributeError:  # was a bare except, which also hid real errors
            attempts_before_fail -= 1
            if attempts_before_fail == 0:
                raise ValueError("couldn't find data after all.")
I practice scraping one site.
I got some mysterious situation.
import requests
from bs4 import BeautifulSoup
import json
class n_auction(object):
    """Scrape paginated auction listings from the Naver auction endpoint."""

    def __init__(self):
        # POST payload for the listing endpoint; 'start' is the paging offset.
        self.search_request = {
            'lawsup':0,
            'lesson':0,
            'next_biddate1':'',
            'next_biddate2':'',
            'state':91,
            'b_count1':0,
            'b_count2':0,
            'b_area1':'',
            'b_area2':'',
            'special':0,
            'e_area1':'',
            'e_area2':'',
            'si':11,
            'gu':0,
            'dong':0,
            'apt_no':0,
            'order':'',
            'start':60,
            'total_record_val':850,
            'detail_search':'',
            'detail_class':'',
            'recieveCode':'',}
        self.headers = {'User-Agent':'Mozilla/5.0',
                        'Referer':'http://goodauction.land.naver.com/auction/ca_list.php'}

    def scrape(self, max_pages):
        """POST the search payload once per page and print each response."""
        addr = []
        pageno = 0
        while pageno < max_pages:
            # BUG FIX: the original set 'start' once before the loop and
            # never incremented pageno, so the loop re-fetched page 0
            # forever.  Advance the offset and the counter each pass.
            self.search_request['start'] = pageno
            payload = json.dumps(self.search_request)
            r = requests.post('http://goodauction.land.naver.com/auction/ax_list.php',
                              data=payload, headers=self.headers)
            print(r.text)
            # Name the parser explicitly: with the default (lxml) parts of
            # this document were dropped, which was the reported problem;
            # html.parser keeps the full content.
            s = BeautifulSoup(r.text, 'html.parser')
            print(s)
            pageno += 1
# Script entry point: scrape up to 30 pages when run directly.
if __name__ == '__main__':
    scraper = n_auction()
    scraper.scrape(30)
when I print(r.text), I got full text.like below picture.
But after passing through beautifulsoup,
I lost some values like below picture.
It's very embarrassing. Help me~~
Switching the parser from the default, lxml, to html.parser worked for me.
Try: s = BeautifulSoup(r.text, 'html.parser')