I am trying to scrape osu! stats from my friend's profile, but when I run the code I get "None". Here is the code:
from bs4 import BeautifulSoup
import requests
html_text = requests.get('https://osu.ppy.sh/users/17906919').text
soup = BeautifulSoup(html_text, 'lxml')
stats = soup.find_all('dl', class_='profile-stats__entry')
print(stats)
The desired data is actually present in the HTML source, inside the following script tag:
<script id="json-user" type="application/json">
So we need to pick it up and parse it as JSON:
import requests
from bs4 import BeautifulSoup
from pprint import pprint as pp
import json
def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    goal = json.loads(soup.select_one('#json-user').string)
    pp(goal['statistics'])
main('https://osu.ppy.sh/users/17906919')
Output:
{'country_rank': 18133,
'global_rank': 94334,
'grade_counts': {'a': 159, 's': 99, 'sh': 9, 'ss': 6, 'ssh': 2},
'hit_accuracy': 97.9691,
'is_ranked': True,
'level': {'current': 83, 'progress': 84},
'maximum_combo': 896,
'play_count': 9481,
'play_time': 347925,
'pp': 3868.29,
'rank': {'country': 18133},
'ranked_score': 715205885,
'replays_watched_by_others': 0,
'total_hits': 1086843,
'total_score': 3896191620}
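Note that select_one returns None if osu! ever changes the page layout, and .string would then raise an AttributeError. A minimal guard (my own addition, reusing the same soup):
tag = soup.select_one('#json-user')
if tag is None or tag.string is None:
    raise RuntimeError('json-user script tag not found; the page layout may have changed')
goal = json.loads(tag.string)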
I'm trying to make a test project that scrapes info from a specific site, with no success.
I followed some tutorials I found, and even a post on Stack Overflow, and after all this I'm stuck!
Please help me out; I'm new to programming in Python and I don't want to abandon my project.
More info: this is a lottery website that I'm trying to scrape so I can run some analysis and pick a lucky number.
I have followed these tutorials:
https://towardsdatascience.com/how-to-collect-data-from-any-website-cb8fad9e9ec5
https://beautiful-soup-4.readthedocs.io/en/latest/
Using BeautifulSoup in order to find all "ul" and "li" elements
You all have my gratitude!
from bs4 import BeautifulSoup as bs
import requests
import html5lib  # imported in another attempt; lxml is used below
# import urllib3  # another attempt to make the request to the url -- failed

url = 'https://loterias.caixa.gov.br/Paginas/Mega-Sena.aspx'

# another try to take the results from the <ul>, but I get no qualified results == None
def parse_ul(elem):  # from https://stackoverflow.com/questions/50338108/using-beautifulsoup-in-order-to-find-all-ul-and-li-elements
    result = {}
    for sub in elem.find_all('li', recursive=False):
        if sub.li is None:
            continue
        data = {k: v for k, v in sub.attrs.items()}
        if sub.ul is not None:
            # recurse down
            data['children'] = parse_ul(sub.ul)
        result[sub.li.get_text(strip=True)] = data
    return result

page = requests.get(url)  # taking info from the website
print(page.encoding)  # == UTF-8
soup = bs(page.content, features="lxml")  # takes all info from the url and organizes it == Beautiful Soup
numbers = soup.find(id='ulDezenas')  # search the content for this specific id; another try: soup.find('ul', {'class': ''})
result = parse_ul(soup)  # try to parse the info, but none is found, EVEN WITH THE ORIGINAL ONE
print(numbers)  # the result is below:
'''<ul class="numbers megasena" id="ulDezenas">
<li ng-repeat="dezena in resultado.listaDezenas ">{{dezena.length > 2 ? dezena.slice(1) : dezena}}</li>
</ul>'''
print(result)  # == "{}", nothing found

# with open(r'D:\Documents\python\_abretesesame.txt', 'wb') as fd:
#     for chunk in page.iter_content(chunk_size=128):
#         fd.write(chunk)
# ======= writing the document (HTML) to a file -- still no success in getting the numbers
The main issue is that the content is rendered dynamically by JavaScript, but you can get the information via another URL:
jsonData = requests.get('https://servicebus2.caixa.gov.br/portaldeloterias/api/megasena/').json()
This will give you the following JSON:
{'tipoJogo': 'MEGA_SENA', 'numero': 2468, 'nomeMunicipioUFSorteio': 'SÃO PAULO, SP', 'dataApuracao': '02/04/2022', 'valorArrecadado': 158184963.0, 'valorEstimadoProximoConcurso': 3000000.0, 'valorAcumuladoProximoConcurso': 0.0, 'valorAcumuladoConcursoEspecial': 36771176.89, 'valorAcumuladoConcurso_0_5': 33463457.98, 'acumulado': False, 'indicadorConcursoEspecial': 1, 'dezenasSorteadasOrdemSorteio': ['022', '041', '053', '042', '035', '057'], 'listaResultadoEquipeEsportiva': None, 'numeroJogo': 2, 'nomeTimeCoracaoMesSorte': '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 'tipoPublicacao': 3, 'observacao': '', 'localSorteio': 'ESPAÇO DA SORTE', 'dataProximoConcurso': '06/04/2022', 'numeroConcursoAnterior': 2467, 'numeroConcursoProximo': 2469, 'valorTotalPremioFaixaUm': 0.0, 'numeroConcursoFinal_0_5': 2470, 'listaDezenas': ['022', '035', '041', '042', '053', '057'], 'listaDezenasSegundoSorteio': None, 'listaMunicipioUFGanhadores': [{'posicao': 1, 'ganhadores': 1, 'municipio': 'SANTOS', 'uf': 'SP', 'nomeFatansiaUL': '', 'serie': ''}], 'listaRateioPremio': [{'faixa': 1, 'numeroDeGanhadores': 1, 'valorPremio': 122627171.8, 'descricaoFaixa': '6 acertos'}, {'faixa': 2, 'numeroDeGanhadores': 267, 'valorPremio': 34158.18, 'descricaoFaixa': '5 acertos'}, {'faixa': 3, 'numeroDeGanhadores': 20734, 'valorPremio': 628.38, 'descricaoFaixa': '4 acertos'}], 'id': None, 'ultimoConcurso': True, 'exibirDetalhamentoPorCidade': True, 'premiacaoContingencia': None}
Simply extract listaDezenas (or dezenasSorteadasOrdemSorteio, if you want the numbers in draw order) and process it with a list comprehension:
[n if len(n) < 2 else n[1:] for n in jsonData['listaDezenas']]
The result will be:
['22', '35', '41', '42', '53', '57']
Example
import requests
jsonData = requests.get('https://servicebus2.caixa.gov.br/portaldeloterias/api/megasena/').json()
print([n if len(n) < 2 else n[1:] for n in jsonData['listaDezenas']])
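The same JSON also carries the prize breakdown under listaRateioPremio (visible in the dump above), so a small follow-up sketch of my own could print the prize tiers as well:
import requests

jsonData = requests.get('https://servicebus2.caixa.gov.br/portaldeloterias/api/megasena/').json()

# each entry holds the tier description, the number of winners, and the prize value
for faixa in jsonData['listaRateioPremio']:
    print(f"{faixa['descricaoFaixa']}: {faixa['numeroDeGanhadores']} winner(s), R$ {faixa['valorPremio']:.2f}")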
When I was using the BeautifulSoup and requests modules to scrape the imgs' src attributes, all of them came back empty, so I'm assuming the src values are generated by JavaScript. Hence, I tried the requests_html module instead. However, when I scrape the same information after the response is rendered, only two of the img src values are filled in and the rest are empty. The problem is that when I check the website in the developer tools, it seems that the other imgs' src attributes should have values. May I know what the problem is here?
Code for bs4 and requests:
from bs4 import BeautifulSoup
import requests
biliweb = requests.get('https://www.bilibili.com/ranking/bangumi/13/0/3').text
bilisoup = BeautifulSoup(biliweb,'lxml')
for item in bilisoup.find_all('div', class_='lazy-img'):
    image_html = item.find('img')
    print(image_html)
Code for requests_html:
from requests_html import HTML, HTMLSession
session = HTMLSession()
biliweb = session.get('https://www.bilibili.com/ranking/bangumi/13/0/3')
biliweb.html.render()
for item in biliweb.html.find('.lazy-img.cover > img'):
    print(item.html)
I will only show the first five results because the list is quite lengthy
With Beautifulsoup and requests
<img alt="Re:从零开始的异世界生活 第二季" src=""/>
<img alt="刀剑神域 爱丽丝篇 异界战争 -终章-" src=""/>
<img alt="没落要塞 / DECA-DENCE" src=""/>
<img alt="某科学的超电磁炮T" src=""/>
<img alt="宇崎学妹想要玩!" src=""/>
With requests_html
<img alt="Re:从零开始的异世界生活 第二季" src="https://i0.hdslb.com/bfs/bangumi/image/f2425cbdb07cc93bd0d3ba1c0099bfe78f5dc58a.png#90w_120h.webp"/>
<img alt="刀剑神域 爱丽丝篇 异界战争 -终章-" src="https://i0.hdslb.com/bfs/bangumi/image/54d9ca94ca84225934e0108417c2a1cc16be38fb.png#90w_120h.webp"/>
<img alt="没落要塞 / DECA-DENCE" src=""/>
<img alt="某科学的超电磁炮T" src=""/>
<img alt="宇崎学妹想要玩!" src=""/>
All the data is stored in a JavaScript variable called __INITIAL_STATE__. The following script saves the data to a JSON file; once you have this, you can easily download the images (a sketch of that follows the output below).
import requests, json
from bs4 import BeautifulSoup

page = requests.get('https://www.bilibili.com/ranking/bangumi/13/0/3')
soup = BeautifulSoup(page.content, 'html.parser')

script = None
for s in soup.find_all("script"):
    if "__INITIAL_STATE__" in s.text:
        script = s.get_text(strip=True)
        break

# slice out the JSON object: from the first '{' up to just before the
# ';(function(){...' boilerplate that follows it
data = json.loads(script[script.index('{'):script.index('function')-2])

with open("data.json", "w") as f:
    json.dump(data, f)

print(data)
Output:
{'rankList': [{'badge': '会员抢先', 'badge_info': {'bg_color': '#FB7299', 'bg_color_night': '#BB5B76', 'text': '会员抢先'}, 'badge_type': 0, 'copyright': 'bilibili', 'cover': 'http://i0.hdslb.com/bfs/bangumi/image/f2425cbdb07cc93bd0d3ba1c0099bfe78f5dc58a.png', 'new_ep': {'cover': 'http://i0.hdslb.com/bfs/archive/2f5bf4840747fc7c09932d2793e96a178cd05905.jpg', 'index_show': '更新至第5话'}, 'pts': 1903981, 'rank': 1, 'season_id': 33802, 'stat': {'danmaku': 814356, 'follow': 7135303, 'series_follow': 7267882, 'view': 33685387}, 'title': 'Re:从零开始的异世界生活 第二季', 'url': 'https://www.bilibili.com/bangumi/play/ss33802', 'pic': 'http://i0.hdslb.com/bfs/bangumi/image/f2425cbdb07cc93bd0d3ba1c0099bfe78f5dc58a.png', 'play': 33685387, 'video_review': 814356}, {'badge': '会员抢先', 'badge_info': {'bg_color': '#FB7299', 'bg_color_night': '#BB5B76', 'text': '会员抢先'}, 'badge_type': 0, 'copyright': 'bilibili', 'cover': 'http://i0.hdslb.com/bfs/bangumi/image/54d9ca94ca84225934e0108417c2a1cc16be38fb.png', 'new_ep': {'cover': 'http://i0.hdslb.com/bfs/archive/a772451f1f031ee1a3b78e31e4fb0b851517817f.jpg', 'index_show': '更新至第16话'}, 'pts': 483317, 'rank': 2, 'season_id': 32781, 'stat': {'danmaku': 514174, 'follow': 6195736, 'series_follow': 6733547, 'view': 36351270}, 'title': '刀剑神域 爱丽丝篇 异界战争 -终章-', 'url': 'https://www.bilibili.com/bangumi/play/ss32781', 'pic': 'http://i0.hdslb.com/bfs/bangumi/image/54d9ca94ca84225934e0108417c2a1cc16be38fb.png', 'play': 36351270, 'video_review': 514174}, {'badge': '会员抢先', 'badge_info': {'bg_color': '#FB7299', 'bg_color_night': '#BB5B76', 'text': '会员抢先'}, 'badge_type': 0, 'copyright': 'bilibili', 'cover': 'http://i0.hdslb.com/bfs/bangumi/image/d5d7441c20614dc5ddc69f333f1906a09eddcee2.png', 'new_ep': {'cover': 'http://i0.hdslb.com/bfs/archive/fe191e9ffa2422103bffcd8615446f5885074c0b.jpg', 'index_show': '更新至第5话'}, 'pts': 455170, 'rank': 3, 'season_id': 33803, 'stat': ....
...
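As a follow-up sketch of my own (not part of the original answer): each entry in data['rankList'] carries a cover URL, so downloading the images could look roughly like this:
import requests, json

with open("data.json") as f:
    data = json.load(f)

for entry in data['rankList'][:5]:  # first five covers only
    url = entry['cover']
    filename = url.rsplit('/', 1)[-1]  # e.g. 'f2425cbd...png'
    resp = requests.get(url)
    resp.raise_for_status()
    with open(filename, 'wb') as img:
        img.write(resp.content)
    print('saved', filename)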
I am trying to scrape data from Fangraphs. The tables are split into 21 pages, but all of the pages use the same URL. I am very new to web scraping (and to Python in general), but Fangraphs does not have a public API, so scraping the page seems to be my only option. I am currently using BeautifulSoup to parse the HTML and I am able to scrape the initial table, but that only contains the first 30 players and I want the entire player pool. After two days of web searching I am stuck. The link and my current code are below. I know they have a link to download a CSV file, but that gets tedious throughout the season and I would like to expedite the data harvesting process. Any direction would be helpful, thank you.
https://www.fangraphs.com/projections.aspx?pos=all&stats=bat&type=fangraphsdc
import requests
import pandas as pd
from bs4 import BeautifulSoup  # restored: needed for the BeautifulSoup call below

url = 'https://www.fangraphs.com/projections.aspx?pos=all&stats=bat&type=fangraphsdc&team=0&lg=all&players=0'
response = requests.get(url, verify=False)

# Use BeautifulSoup to parse the HTML code
soup = BeautifulSoup(response.content, 'html.parser')

# restored: grab the projection grid (its id appears in the page source and in the answer below)
stat_table = soup.find_all('table', id='ProjectionBoard1_dg1_ctl00')

# changes stat_table from ResultSet to a Tag
stat_table = stat_table[0]

# Convert html table to list
rows = []
for tr in stat_table.find_all('tr')[1:]:
    cells = []
    tds = tr.find_all('td')
    if len(tds) == 0:
        ths = tr.find_all('th')
        for th in ths:
            cells.append(th.text.strip())
    else:
        for td in tds:
            cells.append(td.text.strip())
    rows.append(cells)

# convert table to df
table = pd.DataFrame(rows)
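The page is an ASP.NET WebForms app, so the whole table can be fetched in one request by posting back the grid's page-size command (note the FireCommand ... PageSize;1000 event argument below) along with the __VIEWSTATE and __EVENTVALIDATION tokens scraped from the initial GET: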
import requests
from bs4 import BeautifulSoup
import pandas as pd

params = {
    "pos": "all",
    "stats": "bat",
    "type": "fangraphsdc"
}
data = {
    'RadScriptManager1_TSM': 'ProjectionBoard1$dg1',
    "__EVENTTARGET": "ProjectionBoard1$dg1",
    '__EVENTARGUMENT': 'FireCommand:ProjectionBoard1$dg1$ctl00;PageSize;1000',
    '__VIEWSTATEGENERATOR': 'C239D6F0',
    '__SCROLLPOSITIONX': '0',
    '__SCROLLPOSITIONY': '1366',
    "ProjectionBoard1_tsStats_ClientState": "{\"selectedIndexes\":[\"0\"],\"logEntries\":[],\"scrollState\":{}}",
    "ProjectionBoard1_tsPosition_ClientState": "{\"selectedIndexes\":[\"0\"],\"logEntries\":[],\"scrollState\":{}}",
    "ProjectionBoard1$rcbTeam": "All+Teams",
    "ProjectionBoard1_rcbTeam_ClientState": "",
    "ProjectionBoard1$rcbLeague": "All",
    "ProjectionBoard1_rcbLeague_ClientState": "",
    "ProjectionBoard1_tsProj_ClientState": "{\"selectedIndexes\":[\"5\"],\"logEntries\":[],\"scrollState\":{}}",
    "ProjectionBoard1_tsUpdate_ClientState": "{\"selectedIndexes\":[],\"logEntries\":[],\"scrollState\":{}}",
    "ProjectionBoard1$dg1$ctl00$ctl02$ctl00$PageSizeComboBox": "30",
    "ProjectionBoard1_dg1_ctl00_ctl02_ctl00_PageSizeComboBox_ClientState": "",
    "ProjectionBoard1$dg1$ctl00$ctl03$ctl01$PageSizeComboBox": "1000",
    "ProjectionBoard1_dg1_ctl00_ctl03_ctl01_PageSizeComboBox_ClientState": "{\"logEntries\":[],\"value\":\"1000\",\"text\":\"1000\",\"enabled\":true,\"checkedIndices\":[],\"checkedItemsTextOverflows\":false}",
    "ProjectionBoard1_dg1_ClientState": ""
}

def main(url):
    with requests.Session() as req:
        r = req.get(url, params=params)
        soup = BeautifulSoup(r.content, 'html.parser')
        # the ASP.NET state tokens must be echoed back in the POST
        data['__VIEWSTATE'] = soup.find("input", id="__VIEWSTATE").get("value")
        data['__EVENTVALIDATION'] = soup.find("input", id="__EVENTVALIDATION").get("value")
        r = req.post(url, params=params, data=data)
        df = pd.read_html(r.content, attrs={'id': 'ProjectionBoard1_dg1_ctl00'})[0]
        df.drop(df.columns[1], axis=1, inplace=True)
        print(df)
        df.to_csv("data.csv", index=False)

main("https://www.fangraphs.com/projections.aspx")
Output: the full table is printed and also written to data.csv (too long to reproduce here).
My code:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

myUrl = 'https://www.rebuy.de/kaufen/videospiele-nintendo-switch?page=1'

# opening up connection, grabbing the page
uClient = uReq(myUrl)
page_html = uClient.read()
uClient.close()

# html parsing
page_soup = soup(page_html, "html.parser")

# grabs each product
containers = page_soup.find_all("div", class_="ry-product__item ry-product__item--large")
I want to extract the item containers that hold the image, title, and price from this website. When I run this code it returns an empty list:
[]
I am sure the code works, because when I type, for example, class_="row", it returns the tags that contain that class.
I want to extract all the containers that have this class (screenshot below), but it seems like I am choosing the wrong class, or the problem is that there are multiple classes in this <div> tag. What am I doing wrong?
The site loads the products dynamically through Ajax. Looking at the Chrome/Firefox network inspector reveals the address of the API from which the site loads the product data (https://www.rebuy.de/api/search?page=1&categorySanitizedPath=videospiele-nintendo-switch):
import requests
import json
from pprint import pprint
headers = {}
# headers = {"Host":"www.rebuy.de",
# "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Cookie":"SET THIS TO PREVENT ACCESS DENIED",
# "Accept-Encoding":"gzip,deflate,br",
# "User-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36"}
url = "https://www.rebuy.de/api/search?page={}&categorySanitizedPath=videospiele-nintendo-switch"
page = 1
r = requests.get(url.format(page), headers=headers)
data = json.loads(r.text)
pprint(data['products'])
# print(json.dumps(data, indent=4, sort_keys=True))
Prints:
{'docs': [{'avg_rating': 5,
'badges': [],
'blue_price': 1999,
'category_id': {'0': 94, '1': 3098},
'category_is_accessory': False,
'category_name': 'Nintendo Switch',
'category_sanitized_name': 'nintendo-switch',
'cover_updated_at': 0,
'has_cover': True,
'has_percent_category': False,
'has_variant_in_stock': True,
'id': 10725297,
'name': 'FIFA 18',
'num_ratings': 1,
'price_min': 1999,
'price_recommended': 0,
'product_sanitized_name': 'fifa-18',
'root_category_name': 'Videospiele',
'variants': [{'label': 'A1',
'price': 2199,
'purchasePrice': 1456,
'quantity': 2},
{'label': 'A2',
'price': 1999,
'purchasePrice': 1919,
'quantity': 7},
{'label': 'A3',
'price': 1809,
'purchasePrice': 1919,
'quantity': 0},
{'label': 'A4',
'price': 1409,
'purchasePrice': 1919,
'quantity': 0}]},
...and so on.
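To collect more than the first page, here is a rough pagination sketch of my own (it assumes the docs list simply comes back empty once you run past the last page):
import requests

url = "https://www.rebuy.de/api/search?page={}&categorySanitizedPath=videospiele-nintendo-switch"
all_docs = []
page = 1
while True:
    data = requests.get(url.format(page)).json()
    docs = data['products']['docs']
    if not docs:  # assumed stop condition: an empty page after the last real one
        break
    all_docs.extend(docs)
    page += 1
print(len(all_docs), 'products collected')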
One caveat: when many requests are made, the site returns Access Denied. To prevent this, set headers with a Cookie taken from your browser (to get the cookie, look inside the Chrome/Firefox network inspector).
A better solution would be to use Selenium.
The issue is that these DOM elements were loaded dynamically via AJAX. If you view the source code of this site, you won't be able to find any of these classes because they haven't been created yet. One solution is to make the same request that the page does and extract the data from the response as shown here.
Another approach is to use a tool like Selenium to load these elements and interact with them dynamically.
Here's some code to retrieve and print the fields you're interested in. Hopefully this will get you started. This requires installing Chromedriver.
Note that I took the liberty of parsing the results with a bit of regex, but that's not critical.
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get("https://www.rebuy.de/kaufen/videospiele-nintendo-switch")

for product in driver.find_elements_by_tag_name("product"):
    name_elem = product.find_element_by_class_name("ry-product-item-content__name")
    print("name:\t", name_elem.get_attribute("innerHTML"))
    # the image URL lives in the CSS background-image property as url(...)
    image_elem = product.find_element_by_class_name("ry-product-item__image")
    image = str(image_elem.value_of_css_property("background-image"))
    print("image:\t", re.search(r"^url\((.*)\)$", image).group(1))
    price_elem = product.find_element_by_class_name("ry-price__amount")
    price = str(price_elem.get_attribute("innerHTML").encode("utf-8"))
    print("price:\t", re.search(r"\d?\d,\d\d", price).group(0), "\n")
Output (60 results):
name: Mario Kart 8 Deluxe
image: "https://d2wr8zbg9aclns.cloudfront.net/products/010/574/253/covers/205.jpeg?time=0"
price: 43,99
name: Super Mario Odyssey
image: "https://d2wr8zbg9aclns.cloudfront.net/products/010/574/263/covers/205.jpeg?time=1508916366"
price: 40,69
...
name: South Park: Die Rektakuläre Zerreißprobe
image: "https://d2wr8zbg9aclns.cloudfront.net/products/default/205.jpeg?time=0"
price: 35,99
name: Cars 3: Driven To Win [Internationale Version]
image: "https://d2wr8zbg9aclns.cloudfront.net/products/010/967/629/covers/205.jpeg?time=1528267000"
price: 30,99
JSON RESPONSE FROM WEBSITE. I am new to Python, Scrapy, and JSON. I am trying to scrape the JSON response from the URL below, but it is showing an error. The code I used is:
import scrapy
import json
import re  # restored: needed for the re.findall call below

class BlackSpider(scrapy.Spider):
    name = 'black'
    start_urls = ['https://appworld.blackberry.com/cas/content/2360/reviews/2.17.2?page=1&pagesize=100&sortby=newest&callback=_content_2360_reviews_2_17_2&_=1499161778751']

    def parse(self, response):
        data = re.findall('(\{.+\})\);', response.body_as_unicode())
        a = json.loads(data[0])
        item = MyItem()  # MyItem is assumed to be defined in the project's items.py
        item["Reviews"] = a["reviews"][4]["review"]
        return item
The error it is showing is:
ValueError("No JSON object could be decoded")
The response you are getting is a JavaScript (JSONP) callback with some JSON inside it:
_content_2360_reviews_2_17_2(\r\n{"some":"json"}]});\r\n
To extract the data from this, you can use a simple regex solution:
import re
import json
data = re.findall('(\{.+\})\);', response.body_as_unicode())
json.loads(data[0])
It translates to: select everything between { and } that ends with );
Edit: the results I'm getting with this:
{'platform': None,
'reviews': [{'createdDate': '2017-07-04',
'model': 'London',
'nickname': 'aravind14-92362',
'rating': 6,
'review': 'Very bad ',
'title': 'My WhatsApp no update '}],
'totalReviews': 569909,
'version': '2.17.2'}
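As a quick way to test the extraction outside Scrapy, here is a sketch of my own using plain requests instead of the spider:
import re
import json
from pprint import pprint
import requests

url = ('https://appworld.blackberry.com/cas/content/2360/reviews/2.17.2'
       '?page=1&pagesize=100&sortby=newest&callback=_content_2360_reviews_2_17_2&_=1499161778751')
body = requests.get(url).text

# same regex as above: grab everything between { and } that ends with );
data = re.findall(r'(\{.+\})\);', body)
parsed = json.loads(data[0])
pprint(parsed['reviews'][:2])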