I'm trying to extract data from this API: https://www.balldontlie.io/#get-all-stats with the following Python code:
import requests
import json
import time

total_results = []
pages_to_read = 11000
counter = 0
for page_num in range(1, pages_to_read + 1):
    url = "https://balldontlie.io/api/v1/stats?per_page=100&page=" + str(page_num)
    print("reading", url)
    response = requests.get(url)
    data = response.json()
    total_results = total_results + data['data']
    counter = counter + 1
    print(counter)
    if counter == 59:
        counter = 0
        print('break')
        time.sleep(60)
print("Total of", len(total_results), "results")
with open('test.json', 'w', encoding='utf-8') as d:
    json.dump(total_results, d, ensure_ascii=False, indent=4)
However, I always get this error: https://cdn1.gnarususercontent.com.br/1/292460/5c2858ca-33df-4bd8-9b44-0d50a48ab3e0.png
The API is supposed to allow 60 requests per minute; sometimes I even get past 60 requests, but in the end it always fails with this error. Does anyone have any suggestions?
PS: I need the data from all 11000 pages of 'stats', and only the 'data' part of each page's JSON, not the 'meta' section or the page number.
Found this answer:
import requests
import json
import time
from datetime import datetime

resultados_totais = []
paginas_totais_para_ler = 11486

def get_api_data(page_num):
    url = "https://balldontlie.io/api/v1/stats?per_page=100&page=" + str(page_num)
    current_time = datetime.now().strftime("%H:%M:%S")
    print("{} - Reading {}".format(current_time, url))
    start_request = datetime.now()
    response = requests.get(url)
    end_request = datetime.now()
    delta = end_request - start_request
    if delta.seconds <= 0:
        time.sleep(1)
    if response.status_code == 200:
        data = response.json()
        response.raise_for_status()
        return data['data']
    else:
        current_time = datetime.now().strftime("%H:%M:%S")
        print("{} - ({}) Request limit reached ... waiting 15 seconds to try again".format(current_time, response.status_code))
        time.sleep(15)
        return get_api_data(page_num)

for page_num in range(1, paginas_totais_para_ler + 1):
    resultados_totais.extend(get_api_data(page_num))
    if page_num == paginas_totais_para_ler:
        print('\nSaving json file..')
        with open('all_stats.json', 'w', encoding='utf-8') as d:
            json.dump(resultados_totais, d, ensure_ascii=False, indent=4)
        print('\nFile saved!!')
        print("\nWe have a total of", len(resultados_totais), "results.")
Related
I am having some trouble with my program: when it reaches the end of the third() function, it continues trying to execute transactions. I tried having it return None to break out of the seemingly infinite loop it is in, with no success. I am sure I am missing something very simple here, and I am guessing it has something to do with the recursion that I used. Thanks for any help you can provide.
import asyncio
import base64
import json
import os
import os.path
import time
import httpcore
import requests
from typing import Awaitable
import solana
import httpx
from rich import print
from solana.keypair import Keypair
from solana.publickey import PublicKey
from solana.rpc.api import Client
from solana.rpc.async_api import AsyncClient
from solana.rpc.commitment import Confirmed
from solana.rpc.types import TxOpts
from solana.transaction import Transaction
# Notes
# This is meant as a bare bones hello world and as such does not have :
#
# - error handling on http calls
# - checks / retries to ensure solana transactions go through
# - logging - just your basic print statement here. But at least you get the Rich pretty printing variant :)
#
# Libraries used
# - https://www.python-httpx.org/ - cause it's shinier and better than requests
# - https://michaelhly.github.io/solana-py/
# - https://github.com/Textualize/rich for pretty printing - because it rocks.
# I use poetry to manage dependencies but am not including the project file here for brevity.
# Mint constants
USDC_MINT = "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", 6
SOL_MINT = "So11111111111111111111111111111111111111112", 9
FAB_MINT = "EdAhkbj5nF9sRM7XN7ewuW8C9XEUMs8P7cnoQ57SYE96", 9
FUSD_MINT = "B7mXkkZgn7abwz1A3HnKkb18Y6y18WcbeSkh1DuLMkee", 8
# This works ok - most of the time
rpc_host = "https://api.mainnet-beta.solana.com"
filename = r"C:\Users\myname\.config\solana\burner.json"
def get_wallet_keypair(filename: str) -> Keypair:
    """Load a keypair from a filesystem wallet."""
    if not os.path.isfile(filename):
        raise Exception(f"Wallet file '{filename}' is not present.")
    with open(filename) as json_file:
        data = json.load(json_file)
    mid = len(data) // 2
    secret_key = data[:mid]
    secret_bytes = bytes(secret_key)
    keypair = Keypair.from_secret_key(secret_bytes)
    print(f"Public Key is: {keypair.public_key}")
    return keypair

async def get_quote(
        input_mint: str, output_mint: str, amount: int, slippage: int = 0.2):
    url_query = f"https://quote-api.jup.ag/v1/quote?outputMint={output_mint}&inputMint={input_mint}&amount={amount}&slippage={slippage}"
    print(url_query)
    async with httpx.AsyncClient() as client:
        r = await client.get(url_query)
        return r.json()

async def get_transaction(route: dict, user_key: str) -> dict:
    swap_url = "https://quote-api.jup.ag/v1/swap"
    input = {"route": route, "userPublicKey": user_key, "wrapUnwrapSOL": True}
    print(json.dumps(input, indent=2))
    async with httpx.AsyncClient() as client:
        r = await client.post(swap_url, json=input,
                              timeout=6.0)  # slightly longer timeout as the free rpc server can be a bit laggy
        return r.json()

def send_transaction(payer: Keypair, cc: Client, swap_transaction: str, opts: TxOpts) -> str:
    """ Send a serialized transaction to the RPC node """
    trans = Transaction.deserialize(base64.b64decode(swap_transaction))
    result = cc.send_transaction(trans, payer, opts=opts)
    txid = result["result"]
    print(f"transaction details :https://solscan.io/tx/{txid}")
    return txid
async def async_main(from_mint, from_decimals, to_mint, quantity):
    cc = Client(rpc_host)
    print(f" Converting {quantity} {from_mint} to {to_mint} with {from_decimals} Decimals")
    quote_quantity = quantity * (10 ** from_decimals)
    r = await get_quote(str(from_mint), str(to_mint), quote_quantity, slippage=2)
    quote, outAmount = r["data"][0], int(r['data'][0]['outAmountWithSlippage']) / (10 ** from_decimals)
    print("Out Amount =", outAmount)
    if quote := r["data"][0]:
        print(quote)
    # get the relevant transaction details
    trans = await get_transaction(quote, str(pubkey))
    setup_transaction = trans["setupTransaction"] if "setupTransaction" in trans else None
    swap_transaction = trans["swapTransaction"] if "swapTransaction" in trans else None
    cleanup_transaction = trans["cleanupTransaction"] if "cleanupTransaction" in trans else None
    opts = TxOpts(skip_preflight=True)
    # Setup transaction. Will create any missing accounts if required.
    if setup_transaction:
        print("Sending setup transaction")
        #print(setup_transaction)
        send_transaction(payer, cc, setup_transaction, opts)
    # This one actually does the business
    if swap_transaction:
        print("Sending swap transaction")
        txid = send_transaction(payer, cc, swap_transaction, opts)
        # Wait for the transaction to complete before looking it up on chain.
        # Clearly this is *not* the right way to do this. Retry in a loop or something fancy.
        await asyncio.sleep(20)
        result = cc.get_transaction(txid, commitment=Confirmed)
        print(result)
    # Haven't seen one of these needed yet. Hopefully the jup.ag devs can explain when it's required.
    if cleanup_transaction:
        print("Sending send transaction")
        send_transaction(payer, cc, cleanup_transaction, opts)
    print("Swap Complete !")
    return outAmount
def get_balance(input_mint):
    url = "https://api.mainnet-beta.solana.com"
    headers = {'Content-type': 'application/json'}
    if input_mint == "So11111111111111111111111111111111111111112":
        data = {"jsonrpc": "2.0", "id": 1, "method": "getBalance", "params": [f"{pubkey}"]}
        response = requests.post(url, data=json.dumps(data), headers=headers)
        response = response.text
        parsed = json.loads(response)
        # print(json.dumps(parsed, indent=4, sort_keys=True))
        accountBal = (parsed['result']['value']) / 10 ** SOL_MINT[1]
        print(accountBal)
    else:
        data = {"jsonrpc": "2.0", "id": 1, "method": "getTokenAccountsByOwner",
                "params": [f"{pubkey}",
                           {"mint": f"{input_mint}"}, {"encoding": "jsonParsed"}]}
        response = requests.post(url, data=json.dumps(data), headers=headers)
        response = response.text
        parsed = json.loads(response)
        # print(json.dumps(parsed, indent=4, sort_keys=True))
        accountBal = parsed['result']['value'][0]['account']['data']['parsed']['info']['tokenAmount']['uiAmount']
        print(accountBal)
    return accountBal
# usdc buys fusd fusd is sold for sol sol is sold for usdc
# (from_mint, from_decimals, to_mint, quantity):
class swaps:
    def __init__(self, input_mint, decimals, output_mint, amount):
        self.input_mint = input_mint
        self.decimals = decimals
        self.output_mint = output_mint
        self.amount = amount

    def swap(self):
        asyncio.run(async_main(self.input_mint, self.decimals, self.output_mint, self.amount))
def first(count, previous = 0):
    try:
        if get_balance(USDC_MINT[0]) <= 1:
            time.sleep(1)
            count += 1
            if count >= 60:
                third(0)
            first(count)
    except TypeError:
        first(0)
    step1 = swaps(USDC_MINT[0], USDC_MINT[1], FUSD_MINT[0], get_balance(USDC_MINT[0]) if previous == 0 else previous)
    try:
        step1.swap()
    except httpx.ReadTimeout:
        print("Retrying")
        time.sleep(10)
        first(0)
    second(0)
def second(count, previous = 0):
    try:
        if get_balance(FUSD_MINT[0]) <= 1:
            time.sleep(1)
            count += 1
            if count >= 60:
                first(0)
            second(count)
    except TypeError:
        second(0)
    step2 = swaps(FUSD_MINT[0], FUSD_MINT[1], SOL_MINT[0], get_balance(FUSD_MINT[0]) if previous == 0 else previous)
    try:
        step2.swap()
    except:
        print("Retrying")
        time.sleep(10)
        second(0)
    count = 0
    third(0)
def third(count, previous = 0):
    if get_balance(SOL_MINT[0]) < .6:
        time.sleep(1)
        count += 1
        if count >= 60:
            second(0)
        third(count)
    step3 = swaps(SOL_MINT[0], SOL_MINT[1], USDC_MINT[0], get_balance(SOL_MINT[0]) - 0.5 if previous == 0 else previous)
    try:
        step3.swap()
    except:
        print("Retrying")
        time.sleep(10)
        third(previous)
    print("All Swaps Completed")
    return None
payer = get_wallet_keypair(filename)
pubkey = payer.public_key
loops = 0
if __name__ == "__main__":
previousBalence = get_balance(USDC_MINT[0])
print(f"Starting Balence: {previousBalence}")
#for loops in range(5):
first(0)
loops += 1
endBalance = get_balance((USDC_MINT[0]))
print(f"End balence is {endBalance}")
totalProfit = endBalance-previousBalence
print(f"Total Profit is: {totalProfit}")
Edit: when the code keeps going, the output shows it trying to swap fUSD for SOL and SOL for USDC over and over again.
Solution: https://pastebin.com/8id7gfe4
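The symptom in the edit is typical of the mutual recursion between first(), second(), and third(): every retry or balance-wait stacks another call, and when the innermost call finally returns, each caller underneath resumes and fires more swaps. A minimal sketch of one common fix, replacing the recursion with a single explicit loop (names are reused from the code above; the balance-wait logic is omitted, and this is not the pastebin solution):

def run_cycle():
    """One USDC -> fUSD -> SOL -> USDC round trip, expressed as a flat sequence of legs."""
    steps = [
        (USDC_MINT, FUSD_MINT, lambda: get_balance(USDC_MINT[0])),
        (FUSD_MINT, SOL_MINT, lambda: get_balance(FUSD_MINT[0])),
        (SOL_MINT, USDC_MINT, lambda: get_balance(SOL_MINT[0]) - 0.5),
    ]
    for src, dst, amount_fn in steps:
        while True:
            try:
                swaps(src[0], src[1], dst[0], amount_fn()).swap()
                break            # this leg succeeded, move on to the next one
            except Exception:
                print("Retrying")
                time.sleep(10)   # wait and retry the same leg instead of recursing

if __name__ == "__main__":
    run_cycle()                  # control returns here exactly once, after all three legs
    print("All Swaps Completed")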
I've followed a tutorial to scrape a Facebook profile and I keep getting this error:
JSONDecodeError: Extra data: line 1 column 8 (char 7)
Does anyone know what the problem might be?
Here is my python script:
def get_bs(session, url):
    #Makes a GET request using the given Session object and returns a BeautifulSoup object.
    r = None
    while True:
        r = session.get(url)
        if r.ok:
            break
    return BeautifulSoup(r.text, 'lxml')
#To login
def make_login(session, base_url, credentials):
    #Returns a Session object logged in with credentials.
    login_form_url = '/login/device-based/regular/login/?refsrc=https%3A'\
        '%2F%2Fmobile.facebook.com%2Flogin%2Fdevice-based%2Fedit-user%2F&lwv=100'
    params = {'email': credentials['email'], 'pass': credentials['pass']}
    while True:
        time.sleep(3)
        logged_request = session.post(base_url+login_form_url, data=params)
        if logged_request.ok:
            logging.info('[*] Logged in.')
            break
#Crawling FB
def crawl_profile(session, base_url, profile_url, post_limit):
    #Goes to profile URL, crawls it and extracts posts URLs.
    profile_bs = get_bs(session, profile_url)
    n_scraped_posts = 0
    scraped_posts = list()
    posts_id = None
    while n_scraped_posts < post_limit:
        try:
            posts_id = 'recent'
            posts = profile_bs.find('div', id=posts_id).div.div.contents
        except Exception:
            posts_id = 'structured_composer_async_container'
            posts = profile_bs.find('div', id=posts_id).div.div.contents
        posts_urls = [a['href'] for a in profile_bs.find_all('a', text='Full Story')]
        for post_url in posts_urls:
            # print(post_url)
            try:
                post_data = scrape_post(session, base_url, post_url)
                scraped_posts.append(post_data)
            except Exception as e:
                logging.info('Error: {}'.format(e))
            n_scraped_posts += 1
            if posts_completed(scraped_posts, post_limit):
                break
        show_more_posts_url = None
        if not posts_completed(scraped_posts, post_limit):
            show_more_posts_url = profile_bs.find('div', id=posts_id).next_sibling.a['href']
            profile_bs = get_bs(session, base_url+show_more_posts_url)
            time.sleep(3)
        else:
            break
    return scraped_posts
def get_bs(session, url):
    #Makes a GET request using the given Session object and returns a BeautifulSoup object.
    r = None
    while True:
        r = session.get(url)
        time.sleep(3)
        if r.ok:
            break
    return BeautifulSoup(r.text, 'lxml')
#Scraping FB
def scrape_post(session, base_url, post_url):
    #Goes to post URL and extracts post data.
    post_data = OrderedDict()
    post_bs = get_bs(session, base_url+post_url)
    time.sleep(5)
    # Here we populate the OrderedDict object
    post_data['url'] = post_url
    #Find Post main element
    try:
        post_text_element = post_bs.find('div', id='u_0_0').div
        string_groups = [p.strings for p in post_text_element.find_all('p')]
        strings = [repr(string) for group in string_groups for string in group]
        post_data['text'] = strings
    except Exception:
        post_data['text'] = []
    #Extract post media URL
    try:
        post_data['media_url'] = post_bs.find('div', id='u_0_0').find('a')['href']
    except Exception:
        post_data['media_url'] = ''
    #Extract remaining data
    try:
        post_data['comments'] = extract_comments(session, base_url, post_bs, post_url)
    except Exception:
        post_data['comments'] = []
    return dict(post_data)
#Functions for the profile URLs and credentials for FB
def json_to_obj(filename):
    #Extracts data from JSON file and saves it on Python object
    obj = None
    with open(filename) as json_file:
        obj = json.loads(json_file.read())
    return obj

def save_data(data):
    #Converts data to JSON.
    with open('profile_posts_data.json', 'w') as json_file:
        json.dump(data, json_file, indent=4)
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
base_url = 'https://mobile.facebook.com'
session = requests.session()
# Extracts credentials for the login and all of the profiles URL to scrape
credentials = json_to_obj('credentials.json')
profiles_urls = json_to_obj('profiles_urls.json')
make_login(session, base_url, credentials)
posts_data = None
for profile_url in profiles_urls:
posts_data = crawl_profile(session, base_url, profile_url, 25)
logging.info('[!] Scraping finished. Total: {}'.format(len(posts_data)))
logging.info('[!] Saving.')
save_data(posts_data)
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
base_url = 'https://mobile.facebook.com'
session = requests.session()
# Extracts credentials for the login and all of the profiles URL to scrape
credentials = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\credentials.json")
profiles_urls = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\profiles_urls.json")
make_login(session, base_url, credentials)
posts_data = None
for profile_url in profiles_urls:
posts_data = crawl_profile(session, base_url, profile_url, 25)
logging.info('[!] Scraping finished. Total: {}'.format(len(posts_data)))
logging.info('[!] Saving.')
save_data(posts_data)
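For what it's worth, "Extra data: line 1 column 8 (char 7)" usually means json.loads was handed a string that contains something after the first JSON value, for example two JSON objects concatenated in credentials.json or profiles_urls.json, or an HTML error page instead of JSON. A small diagnostic sketch (the file name is just an example) that shows where parsing stops:

import json

def inspect_json_file(filename):
    """Parse only the first JSON value in the file and report whatever trails it."""
    with open(filename) as f:
        raw = f.read().lstrip()
    obj, end = json.JSONDecoder().raw_decode(raw)
    trailing = raw[end:].strip()
    if trailing:
        print("First value parsed fine; unexpected extra data starts with:", repr(trailing[:40]))
    else:
        print("File contains a single valid JSON value.")
    return obj

inspect_json_file('credentials.json')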
I have this function that fetches a bunch of images:
def get_player_images_with_api():
    url = 'https://footballapi.pulselive.com/football/players?pageSize=30&compSeasons=274&altIds=true&page={page}&type=player&id=-1&compSeasonId=274'
    img_url = 'https://resources.premierleague.com/premierleague/photos/players/250x250/{player_id}.png'
    headers = {'Origin': 'https://www.premierleague.com'}
    page = 0
    while True:
        try:
            data = requests.get(url.format(page=page), headers=headers).json()
            for player in data['content']:
                print('{:<50} {}'.format(player['name']['display'], img_url.format(player_id=player['altIds']['opta'])))
            sleep(2)
            page += 1
        except:
            break
How do I dynamically save each image in a 'path/to/image' folder, using the player['name'].png format?
Here you go :)
import requests
from time import sleep
import urllib.request

def get_player_images_with_api():
    url = 'https://footballapi.pulselive.com/football/players?pageSize=30&compSeasons=274&altIds=true&page={page}&type=player&id=-1&compSeasonId=274'
    img_url = 'https://resources.premierleague.com/premierleague/photos/players/250x250/{player_id}.png'
    headers = {'Origin': 'https://www.premierleague.com'}
    page = 0
    while True:
        try:
            data = requests.get(url.format(page=page), headers=headers).json()
            for player in data['content']:
                print('{:<50} {}'.format(
                    player['name']['display'],
                    img_url.format(player_id=player['altIds']['opta'])))
                urllib.request.urlretrieve(
                    img_url.format(player_id=player['altIds']['opta']),
                    player['name']['display'] + ".png")
            sleep(2)
            page += 1
        except:
            break
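To cover the 'path/to/image' part of the question, here is a small variant of the same idea that writes into a target folder using requests instead of urllib (the folder name and the use of the display name as the file name are assumptions, not part of the API):

import os
import requests

def save_player_image(player, img_url_template, folder='path/to/image'):
    """Download one player's portrait into the given folder as '<display name>.png'."""
    os.makedirs(folder, exist_ok=True)
    image_url = img_url_template.format(player_id=player['altIds']['opta'])
    filename = os.path.join(folder, player['name']['display'] + '.png')
    response = requests.get(image_url)
    if response.ok:
        with open(filename, 'wb') as f:
            f.write(response.content)

# usage inside the loop above: save_player_image(player, img_url)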
I wrote the following script to access the age of pixels in Reddit's r/place experiment:
import requests as req
import json
from time import sleep
from requests.adapters import HTTPAdapter
import sys

username = sys.argv[1]
password = sys.argv[2]
location = sys.argv[3]

s = req.Session()
s.mount('https://www.reddit.com', HTTPAdapter(max_retries=5))
s.headers["User-Agent"] = "WebDev Alliance"

r = s.post("https://www.reddit.com/api/login/{}".format(username),
           data={"user": username, "passwd": password, "api_type": "json"})
s.headers['x-modhash'] = r.json()["json"]["data"]["modhash"]

def main(rows, columns):
    mylist = [[0 for x in range(columns)] for x in range(rows)]
    r = {}
    dic1 = {}
    for i in range(rows):
        for j in range(columns):
            mylist[i][j] = '%s;%s' % (i, j)
    for j in range(columns):
        for i in range(rows):
            message = "Probing absolute pixel x=" + str(i) + "&y=" + str(j)
            print(message)
            while True:
                r = s.get('https://www.reddit.com/api/place/pixel.json?x=' + str(i) + '&y=' + str(j), timeout=60)
                if r.status_code == 200:
                    dic1[str(i) + ' ' + str(j)] = r.json()
                    sleep(0.1)
                    break
                else:
                    sleep(1)
                    print("ERROR: ", r, r.text)
    with open(location, 'w') as outfile:
        json.dump(dic1, outfile, sort_keys=True, indent=4)

main(1000, 1000)
However, there are two clear problems with the script:
It will take ages to complete at best (0.1 sec * 1000 * 1000 ~= 28 hours)
My IP might be banned
plus the code itself does not handle exceptions.
That is why I am wondering if there is a better method of accessing the last modification date of each of the pixels on reddit.com/r/place
Thanks!
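I don't know of a documented bulk endpoint for pixel ages, but at a minimum the per-pixel request can be wrapped so that network errors are retried with a backoff instead of killing the run. A sketch of that pattern, reusing the session s and the URL from the script above:

import time
import requests

def get_pixel(session, x, y, max_retries=5):
    """Fetch one pixel's metadata, retrying with exponential backoff on errors."""
    url = 'https://www.reddit.com/api/place/pixel.json?x={}&y={}'.format(x, y)
    for attempt in range(max_retries):
        try:
            r = session.get(url, timeout=60)
            if r.status_code == 200:
                return r.json()
            print("HTTP", r.status_code, "- backing off")
        except requests.RequestException as e:
            print("Network error:", e)
        time.sleep(2 ** attempt)   # 1, 2, 4, 8, ... seconds
    return None   # give up on this pixel rather than abort the whole crawl

# usage inside the existing loops: dic1[str(i) + ' ' + str(j)] = get_pixel(s, i, j)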
I want to create a simple website for PNR status checking. The code works, but it only does one thing at a time: either render the result to the other page or send the mail. When I use a thread, it sends the mail in the background, but until then the page keeps loading.
Can anyone give me a suggestion? I also want to run this on Google App Engine, so I haven't tried Celery.
from django.http import HttpResponse
from bs4 import BeautifulSoup
import re
import requests
from django.shortcuts import render
from functools import partial, wraps
from django.core.mail import send_mail
import time
import thread
def checkpnr(request):
    return render(request, 'checkpnr.html')

def check(string, pnr, sleeptime, lock, *args):
    while 1:
        # entering critical section
        lock.acquire()
        # time1=request.get_all("notif")
        url_pnr = "pnr url"
        r = requests.get(url_pnr)
        data = r.text
        soup = BeautifulSoup(data)
        train = str(soup.find("ul", attrs={"class": "train_info"}))
        train_number = soup.find("li", attrs={"class": "first"}).text
        source = str(soup.find("travellers"))
        route = str(soup.findAll("li")[1]).replace(
            '<li>', '').replace('</li>', '')
        #head, sep, tail = route.partition(' -')
        travel_date = str(soup.findAll("li")[2].text)
        date, sep, total = travel_date.partition('|')
        rows = soup.findAll("td", attrs={"class": "pax"})
        rowlength = len(rows)
        chart_status = str(soup.findAll("tr")[rowlength + 1].findAll("td")[0]).replace(
            '<td colspan="3"><strong>', '').replace('</strong>', '').replace('</td>', '')
        passengers = []
        status = []
        coach = []
        tot = []
        w = ''
        i = 1
        j = 1
        while i <= rowlength:
            j = str(soup.findAll("tr")[i].findAll(
                "td")[0].text).replace(':', '')
            passengers.append(j)
            s = str(soup.findAll("tr")[i].findAll("td")[1].text)
            w = w + ',' + s
            status.append(s)
            c = str(soup.findAll("tr")[i].findAll("td")[2].text)
            coach.append(c)
            tot.append(i)
            i += 1
        time.sleep(sleeptime)
        emailMsg = status
        subject = pnr + '-' + w
        send_mail(
            subject, 'emailMsg', 'email-from',
            [email], fail_silently=False)
        lock.release()
        if (status[rowlength - 1] == "CONFIRMED"):
            time.sleep(sleeptime)
        else:
            time.sleep(1000000000000000000000000)
def fetch(request):
    pnr = request.POST['pnr']
    if len(pnr) != 10:
        msg = "PNR must be of 10 digits ..."
        return render(request, 'checkpnr.html', {'msg': msg})
    email = request.POST['email']
    e = request.POST['ntime']
    if (e != ''):
        n_time = int(e)
        n = request.POST['notify']
        if (n != ''):
            notify = int(n)
            sleeptim = notify * n_time
    sleeptime = 10
    # time1=request.get_all("notif")
    url_pnr = "pnr url"
    try:
        r = requests.get(url_pnr)
        data = r.text
        soup = BeautifulSoup(data)
        train = str(soup.find("ul", attrs={"class": "train_info"}))
        train_number = soup.find("li", attrs={"class": "first"}).text
        source = str(soup.find("travellers"))
        route = str(soup.findAll("li")[1]).replace(
            '<li>', '').replace('</li>', '')
        #head, sep, tail = route.partition(' -')
        travel_date = str(soup.findAll("li")[2].text)
        date, sep, total = travel_date.partition('|')
        rows = soup.findAll("td", attrs={"class": "pax"})
        rowlength = len(rows)
        chart_status = str(soup.findAll("tr")[rowlength + 1].findAll("td")[0]).replace(
            '<td colspan="3"><strong>', '').replace('</strong>', '').replace('</td>', '')
        passengers = []
        status = []
        coach = []
        tot = []
        w = ''
        i = 1
        j = 1
        while i <= rowlength:
            j = str(soup.findAll("tr")[i].findAll(
                "td")[0].text).replace(':', '')
            passengers.append(j)
            s = str(soup.findAll("tr")[i].findAll("td")[1].text)
            w = w + ',' + s
            status.append(s)
            c = str(soup.findAll("tr")[i].findAll("td")[2].text)
            coach.append(c)
            tot.append(i)
            i += 1
        msg = "Mail not Sent"
        msg1 = ''
        if (email != ''):
            emailMsg = status
            subject = pnr + '-' + w
            send_mail(
                subject, 'emailMsg', 'ashutosh8nitjsr#gmail.com',
                [email], fail_silently=False)
            msg = "mail sent.."
        if __name__ == "__main__":
            lock = thread.allocate_lock()
            thread.start_new_thread(
                check, ("Thread No:1", pnr, email, sleeptime, lock))
            msg1 = "thread created"
            time.sleep(sleeptime)
            while 1:
                pass
        detail2 = {
            'train_number': train_number, 'route': route, 'date': date, 'chart_status': chart_status, 'tot': tot,
            'passengers': passengers, 'status': status, 'coach': coach, 'msg': msg}
        return render(request, 'status.html', detail2)
    except:
        msg = "there was error. please try again..."
        return render(request, 'checkpnr.html', {'msg': msg})
You can try using Task Queues for this purpose on App Engine: see the Task Queue API.
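A minimal sketch of that idea on the (older, Python 2) App Engine standard environment: the view enqueues the work and returns immediately, while a separate worker handler sends the mail. The URL name, handler wiring, and message body here are assumptions, not a drop-in replacement for the code above:

from google.appengine.api import taskqueue
from django.http import HttpResponse
from django.core.mail import send_mail
from django.shortcuts import render

def fetch(request):
    pnr = request.POST['pnr']
    email = request.POST['email']
    # enqueue the slow work and render the page right away
    taskqueue.add(url='/tasks/send_pnr_mail', params={'pnr': pnr, 'email': email})
    return render(request, 'status.html', {'msg': 'Status mail will be sent shortly.'})

def send_pnr_mail_worker(request):
    """Handler mapped to /tasks/send_pnr_mail; runs outside the user-facing request."""
    pnr = request.POST['pnr']
    email = request.POST['email']
    # ... scrape the PNR status here, as in check() above ...
    send_mail('PNR ' + pnr, 'status goes here', 'email-from', [email], fail_silently=False)
    return HttpResponse('ok')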