I have this function that fetches a bunch of images:
def get_player_images_with_api():
    url = 'https://footballapi.pulselive.com/football/players?pageSize=30&compSeasons=274&altIds=true&page={page}&type=player&id=-1&compSeasonId=274'
    img_url = 'https://resources.premierleague.com/premierleague/photos/players/250x250/{player_id}.png'
    headers = {'Origin': 'https://www.premierleague.com'}
    page = 0
    while True:
        try:
            data = requests.get(url.format(page=page), headers=headers).json()
            for player in data['content']:
                print('{:<50} {}'.format(player['name']['display'], img_url.format(player_id=player['altIds']['opta'])))
            sleep(2)
            page += 1
        except:
            break
How do I dynamically save each image to a 'path/to/image' folder, named in the player['name'].png format?
Here you go :)
import requests
from time import sleep
import urllib.request

def get_player_images_with_api():
    url = 'https://footballapi.pulselive.com/football/players?pageSize=30&compSeasons=274&altIds=true&page={page}&type=player&id=-1&compSeasonId=274'
    img_url = 'https://resources.premierleague.com/premierleague/photos/players/250x250/{player_id}.png'
    headers = {'Origin': 'https://www.premierleague.com'}
    page = 0
    while True:
        try:
            data = requests.get(url.format(page=page), headers=headers).json()
            for player in data['content']:
                print('{:<50} {}'.format(
                    player['name']['display'],
                    img_url.format(player_id=player['altIds']['opta'])))
                # Download the 250x250 portrait and save it as "<display name>.png"
                urllib.request.urlretrieve(
                    img_url.format(player_id=player['altIds']['opta']),
                    player['name']['display'] + ".png")
            sleep(2)
            page += 1
        except:
            break
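The snippet above saves each file in the current working directory. To drop the files into the 'path/to/image' folder from the question instead, here is a small variation (a sketch only; the helper name is mine, the folder path is the one you mentioned, and note that some display names contain characters that are not valid in filenames on every OS):

import os
import requests

def save_player_image(player, img_url, headers, folder='path/to/image'):
    # Create the target folder on first use.
    os.makedirs(folder, exist_ok=True)
    # Name the file after the player's display name, as asked.
    filename = os.path.join(folder, player['name']['display'] + '.png')
    resp = requests.get(img_url.format(player_id=player['altIds']['opta']),
                        headers=headers)
    if resp.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(resp.content)

Calling save_player_image(player, img_url, headers) inside the for loop then replaces the urllib.request.urlretrieve call.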
Related
The code below creates the sitemap, but it includes URLs that return a 404 error. How do I exclude them from the sitemap?
from usp.tree import sitemap_tree_for_homepage
import xml.etree.cElementTree as ET
import simplejson as json
from datetime import date

tree = sitemap_tree_for_homepage('')

root = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")

for page in tree.all_pages():
    url = page.url
    prio = json.dumps(page.priority, use_decimal=True)
    # format YYYY-MM-DDThh:mmTZD see: https://www.w3.org/TR/NOTE-datetime
    lm = date.today().strftime("%Y-%m-%d")
    cf = page.change_frequency.value
    urlel = ET.SubElement(root, "url")
    ET.SubElement(urlel, "loc").text = url
    ET.SubElement(urlel, "lastmod").text = lm
    ET.SubElement(urlel, "changefreq").text = cf
    ET.SubElement(urlel, "priority").text = prio

ET.indent(root, " ")  # pretty print
xmltree = ET.ElementTree(root)
xmltree.write("sitemap.xml", encoding="utf-8", xml_declaration=True)
You have to use requests (external module) or urllib.request (built-in module) to check whether a URL returns status 404:
import urllib.request
import urllib.error

url = 'https://stackoverflow.com/fake_url'  # wrong url
#url = 'https://stackoverflow.com/'          # correct url

try:
    r = urllib.request.urlopen(url)
    print('adding url:', url)
    # ... add `url` to sitemap ...
except urllib.error.HTTPError as ex:
    print('ex:', ex)
    print('wrong url:', url)
or
import requests

url = 'https://stackoverflow.com/fake_url'
#url = 'https://stackoverflow.com/'

response = requests.get(url)

#if response.status_code != 404:
if response.status_code == 200:
    print('adding url:', url)
    # ... add `url` to sitemap ...
else:
    print('wrong url:', url)
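If you only need the status code and not the page body, a HEAD request is usually enough (a sketch; it assumes the server answers HEAD the same way it answers GET, which most servers do but not all):

import requests

url = 'https://stackoverflow.com/fake_url'

# HEAD skips the response body, so the check is cheaper than a full GET.
response = requests.head(url, allow_redirects=True)
if response.status_code == 200:
    print('adding url:', url)
else:
    print('wrong url:', url)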
EDIT:
You could wrap it in a function that returns True/False:
import urllib.request
import urllib.error

def is_correct_url(url):
    try:
        r = urllib.request.urlopen(url)
        return True
    except urllib.error.HTTPError as ex:
        print('ex:', ex)
        return False

# ---

for page in tree.all_pages():
    url = page.url
    if is_correct_url(url):
        print('adding url:', url)
        # ... add `url` to sitemap ...
    else:
        print('wrong url:', url)
import requests

def is_correct_url(url):
    response = requests.get(url)
    #return response.status_code != 404
    return response.status_code == 200

# ---

for page in tree.all_pages():
    url = page.url
    if is_correct_url(url):
        print('adding url:', url)
        # ... add `url` to sitemap ...
    else:
        print('wrong url:', url)
This way, only the non-broken links will be added:
from usp.tree import sitemap_tree_for_homepage
import xml.etree.cElementTree as ET
import simplejson as json
from datetime import date
import requests

tree = sitemap_tree_for_homepage('')

root = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")

for page in tree.all_pages():
    url = page.url
    response = requests.get(url)
    if response.status_code == 200:
        prio = json.dumps(page.priority, use_decimal=True)
        # format YYYY-MM-DDThh:mmTZD see: https://www.w3.org/TR/NOTE-datetime
        lm = date.today().strftime("%Y-%m-%d")
        cf = page.change_frequency.value
        urlel = ET.SubElement(root, "url")
        ET.SubElement(urlel, "loc").text = url
        ET.SubElement(urlel, "lastmod").text = lm
        ET.SubElement(urlel, "changefreq").text = cf
        ET.SubElement(urlel, "priority").text = prio

ET.indent(root, " ")  # pretty print
xmltree = ET.ElementTree(root)
xmltree.write("sitemap.xml", encoding="utf-8", xml_declaration=True)
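One hedged caveat: requests.get() raises an exception (for example requests.exceptions.ConnectionError) instead of returning a status code when a host is unreachable, so an unreliable URL could still crash the loop. A small wrapper guards against that; the helper name is just for illustration:

import requests

def url_is_ok(url, timeout=10):
    # Returns True only for a clean 200; treats network errors as "not ok".
    try:
        return requests.get(url, timeout=timeout).status_code == 200
    except requests.RequestException:
        return False

Using if url_is_ok(url): in place of the explicit status check keeps the loop running even if one URL times out.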
My code is downloading the files perfectly, but it stops when the server returns a 404 error. How do I change this code to add only URLs that don't have a 404 error to the list?
import requests
import httplib2
import os
from bs4 import BeautifulSoup, SoupStrainer

artigos = []
pdfs = []

http = httplib2.Http()
status, response = http.request('https://www.snh2021.anpuh.org/site/anais')

for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        artigos.append(link['href'])

for x in artigos:
    if x.endswith('pdf'):
        pdfs.append(x)
print(pdfs)

def baixa_arquivo(url, endereco):
    resposta = requests.get(url)
    if resposta.status_code == requests.codes.OK:
        with open(endereco, 'wb') as novo_arquivo:
            novo_arquivo.write(resposta.content)
        print('Download concluído. Salvo em {}'.format(endereco))
    else:
        resposta.raise_for_status()

if __name__ == '__main__':
    url_basica = 'https://www.snh2021.anpuh.org/{}'
    output = 'Download'
    for i in range(1, len(pdfs)):
        nome_do_arquivo = os.path.join(output, 'artigo{}.pdf'.format(i))
        a = pdfs[i]
        baixa_arquivo(url_basica.format(a), nome_do_arquivo)
I was able to fix the problem by adding a condition at the end of the code. I requested the status_code and canceled the download if the status was 404.
import requests
import httplib2
import os
from bs4 import BeautifulSoup, SoupStrainer

artigos = []
pdfs = []

http = httplib2.Http()
status, response = http.request('https://www.snh2021.anpuh.org/site/anais')

for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        artigos.append(link['href'])

for x in artigos:
    if x.endswith('pdf'):
        pdfs.append(x)
print(pdfs)

def baixa_arquivo(url, endereco):
    resposta = requests.get(url)
    if resposta.status_code == requests.codes.OK:
        with open(endereco, 'wb') as novo_arquivo:
            novo_arquivo.write(resposta.content)
        print('Download concluído. Salvo em {}'.format(endereco))
    else:
        resposta.raise_for_status()

if __name__ == '__main__':
    url_basica = 'https://www.snh2021.anpuh.org/{}'
    output = 'Download'
    for i in range(550, len(pdfs)):
        nome_do_arquivo = os.path.join(output, 'artigo{}.pdf'.format(i))
        a = pdfs[i]
        z = url_basica.format(a)
        # Check the status first and only download when it is not a 404.
        y = requests.get(z)
        if y.status_code != 404:
            baixa_arquivo(z, nome_do_arquivo)
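A side note on this fix: the loop now issues two GET requests per file (one for the check, one inside baixa_arquivo), so each PDF is effectively fetched twice. A leaner variant, under the same assumptions, checks the status of the response it already has and simply skips on failure:

import requests

def baixa_arquivo(url, endereco):
    resposta = requests.get(url)
    if resposta.status_code == requests.codes.OK:
        with open(endereco, 'wb') as novo_arquivo:
            novo_arquivo.write(resposta.content)
        print('Download concluído. Salvo em {}'.format(endereco))
    else:
        # Skip instead of raising, so a 404 no longer stops the loop.
        print('Pulando {} (status {})'.format(url, resposta.status_code))

With that, the __main__ loop can call baixa_arquivo(z, nome_do_arquivo) directly, without the extra requests.get.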
I'm trying to extract data from this API: https://www.balldontlie.io/#get-all-stats with the following code in Python:
import requests
import json
import time

total_results = []
pages_to_read = 11000
counter = 0

for page_num in range(1, pages_to_read + 1):
    url = "https://balldontlie.io/api/v1/stats?per_page=100&page=" + str(page_num)
    print("reading", url)
    response = requests.get(url)
    data = response.json()
    total_results = total_results + data['data']
    counter = counter + 1
    print(counter)
    if counter == 59:
        counter = 0
        print('break')
        time.sleep(60)

print("Total of", len(total_results), "results")

with open('test.json', 'w', encoding='utf-8') as d:
    json.dump(total_results, d, ensure_ascii=False, indent=4)
However, I always get this error: https://cdn1.gnarususercontent.com.br/1/292460/5c2858ca-33df-4bd8-9b44-0d50a48ab3e0.png
The API should support 60 requests per minute, and sometimes it even gets past 60 requests, but in the end it always hits this error. Does anyone have any suggestions to help me?
PS: I need the data from all 11000 pages of 'stats': only the 'data' part of each page's JSON, not the 'meta' block or the page number.
Found this answer:
import requests
import json
import time
from datetime import datetime

resultados_totais = []
paginas_totais_para_ler = 11486

def get_api_data(page_num):
    url = "https://balldontlie.io/api/v1/stats?per_page=100&page=" + str(page_num)
    current_time = datetime.now().strftime("%H:%M:%S")
    print("{} - Lendo {}".format(current_time, url))

    start_request = datetime.now()
    response = requests.get(url)
    end_request = datetime.now()

    delta = end_request - start_request
    if delta.seconds <= 0:
        time.sleep(1)

    if response.status_code == 200:
        data = response.json()
        response.raise_for_status()
        return data['data']
    else:
        current_time = datetime.now().strftime("%H:%M:%S")
        print("{} - ({}) Limite de request ... aguardando 15 seg para tentar novamente".format(current_time, response.status_code))
        time.sleep(15)
        return get_api_data(page_num)

for page_num in range(1, paginas_totais_para_ler + 1):
    resultados_totais.extend(get_api_data(page_num))
    if page_num == paginas_totais_para_ler:
        print('\nSalvando Arquivo json..')
        with open('all_stats.json', 'w', encoding='utf-8') as d:
            json.dump(resultados_totais, d, ensure_ascii=False, indent=4)
        print('\nArquivo Salvo!!')

print("\nTemos um total de", len(resultados_totais), "resultados.")
I've followed a tutorial to scrape a Facebook profile and I keep getting this error:
JSONDecodeError: Extra data: line 1 column 8 (char 7)
Does anyone know what the problem might be?
Here is my Python script:
# Imports needed by the script below (they were missing from the snippet).
import json
import logging
import time
import requests
from collections import OrderedDict
from bs4 import BeautifulSoup

def get_bs(session, url):
    # Makes a GET request using the given Session object and returns a BeautifulSoup object.
    r = None
    while True:
        r = session.get(url)
        if r.ok:
            break
    return BeautifulSoup(r.text, 'lxml')
#To login
def make_login(session, base_url, credentials):
    # Returns a Session object logged in with credentials.
    login_form_url = '/login/device-based/regular/login/?refsrc=https%3A'\
                     '%2F%2Fmobile.facebook.com%2Flogin%2Fdevice-based%2Fedit-user%2F&lwv=100'
    params = {'email': credentials['email'], 'pass': credentials['pass']}
    while True:
        time.sleep(3)
        logged_request = session.post(base_url+login_form_url, data=params)
        if logged_request.ok:
            logging.info('[*] Logged in.')
            break
#Crawling FB
def crawl_profile(session, base_url, profile_url, post_limit):
    # Goes to profile URL, crawls it and extracts posts URLs.
    profile_bs = get_bs(session, profile_url)
    n_scraped_posts = 0
    scraped_posts = list()
    posts_id = None
    while n_scraped_posts < post_limit:
        try:
            posts_id = 'recent'
            posts = profile_bs.find('div', id=posts_id).div.div.contents
        except Exception:
            posts_id = 'structured_composer_async_container'
            posts = profile_bs.find('div', id=posts_id).div.div.contents
        posts_urls = [a['href'] for a in profile_bs.find_all('a', text='Full Story')]
        for post_url in posts_urls:
            # print(post_url)
            try:
                post_data = scrape_post(session, base_url, post_url)
                scraped_posts.append(post_data)
            except Exception as e:
                logging.info('Error: {}'.format(e))
            n_scraped_posts += 1
            if posts_completed(scraped_posts, post_limit):
                break
        show_more_posts_url = None
        if not posts_completed(scraped_posts, post_limit):
            show_more_posts_url = profile_bs.find('div', id=posts_id).next_sibling.a['href']
            profile_bs = get_bs(session, base_url+show_more_posts_url)
            time.sleep(3)
        else:
            break
    return scraped_posts
def get_bs(session, url):
    # Makes a GET request using the given Session object and returns a BeautifulSoup object.
    r = None
    while True:
        r = session.get(url)
        time.sleep(3)
        if r.ok:
            break
    return BeautifulSoup(r.text, 'lxml')
#Scraping FB
def scrape_post(session, base_url, post_url):
    # Goes to post URL and extracts post data.
    post_data = OrderedDict()
    post_bs = get_bs(session, base_url+post_url)
    time.sleep(5)
    # Here we populate the OrderedDict object
    post_data['url'] = post_url
    # Find Post main element
    try:
        post_text_element = post_bs.find('div', id='u_0_0').div
        string_groups = [p.strings for p in post_text_element.find_all('p')]
        strings = [repr(string) for group in string_groups for string in group]
        post_data['text'] = strings
    except Exception:
        post_data['text'] = []
    # Extract post media URL
    try:
        post_data['media_url'] = post_bs.find('div', id='u_0_0').find('a')['href']
    except Exception:
        post_data['media_url'] = ''
    # Extract remaining data
    try:
        post_data['comments'] = extract_comments(session, base_url, post_bs, post_url)
    except Exception:
        post_data['comments'] = []
    return dict(post_data)
#Function for profile URL and credentials for FB
def json_to_obj(filename):
    # Extracts data from JSON file and saves it on a Python object
    obj = None
    with open(filename) as json_file:
        obj = json.loads(json_file.read())
    return obj

def save_data(data):
    # Converts data to JSON.
    with open('profile_posts_data.json', 'w') as json_file:
        json.dump(data, json_file, indent=4)
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    base_url = 'https://mobile.facebook.com'
    session = requests.session()

    # Extracts credentials for the login and all of the profiles URL to scrape
    credentials = json_to_obj('credentials.json')
    profiles_urls = json_to_obj('profiles_urls.json')

    make_login(session, base_url, credentials)

    posts_data = None
    for profile_url in profiles_urls:
        posts_data = crawl_profile(session, base_url, profile_url, 25)
    logging.info('[!] Scraping finished. Total: {}'.format(len(posts_data)))
    logging.info('[!] Saving.')
    save_data(posts_data)
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    base_url = 'https://mobile.facebook.com'
    session = requests.session()

    # Extracts credentials for the login and all of the profiles URL to scrape
    credentials = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\credentials.json")
    profiles_urls = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\profiles_urls.json")

    make_login(session, base_url, credentials)

    posts_data = None
    for profile_url in profiles_urls:
        posts_data = crawl_profile(session, base_url, profile_url, 25)
    logging.info('[!] Scraping finished. Total: {}'.format(len(posts_data)))
    logging.info('[!] Saving.')
    save_data(posts_data)
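A hedged debugging note: "Extra data" is what json.loads raises when a file contains one valid JSON value followed by more text (for example two JSON objects pasted back to back), and the message alone does not say which file was being read. A variant of json_to_obj that names the offending file (illustrative only, same behaviour otherwise) narrows that down:

import json

def json_to_obj(filename):
    # Same as above, but reports which file failed to parse.
    with open(filename) as json_file:
        raw = json_file.read()
    try:
        return json.loads(raw)
    except json.JSONDecodeError as err:
        raise ValueError('Could not parse {}: {}'.format(filename, err)) from err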
I am working on a tool that posts in-game news updates from games to your Twitter account. For the last few days I have been looking for a way to check whether an image is already in the cache, so that when a game's news feed updates, only the new items get posted and the old ones are skipped (right now it posts every active news item again), but I can't get it to work. I have tested it about 100 times and it still won't work. I really hope one of you can help me with this issue, because it would be fantastic if this tool worked with a method like that. Thanks in advance for any help.
Here is my code:
import tweepy
import time
import requests  # needed for the API calls below (missing from the original imports)
import os        # needed for os.remove when saveImages is disabled
from colorama import *

init()

auth = tweepy.OAuthHandler('API', 'APISECRET')
auth.set_access_token('ACESS', 'ACESSSECRET')

response = requests.get('https://fortnite-api.com/v2/news/br')
newsData = response.json()["data"]

#-----
footer = '#Fortnite'
delay = 5
saveImages = True
#-----

while 1:
    response = requests.get('https://fortnite-api.com/v2/news/br')
    if response:
        newsDataLoop = response.json()["data"]
        print("2 - Checking for change in news feed...")
        if newsData != newsDataLoop:
            #if loop == True:
            print("News Feed has changed...")
            for i in newsDataLoop["motds"]:
                try:
                    print("Saving: "+i["id"])
                    url = i["image"]
                    r = requests.get(url, allow_redirects=True)
                    open("NewsImages/"+i["id"]+'.png', 'wb').write(r.content)
                    print("Saved: "+i["id"])
                    try:
                        api = tweepy.API(auth)
                        api.update_with_media("NewsImages/"+i["id"]+'.png', "Fortnite News Update:\n\n"+i["title"]+":\n"+i["body"]+"\n\n"+footer)
                        print("Tweeted: "+i["id"])
                    except:
                        print("Failed to tweet: "+i["id"])
                    if saveImages == 'False':  # note: saveImages is a bool, so this never matches; "if not saveImages:" is probably intended
                        os.remove("NewsImages/"+i["id"]+'.png')
                    response = requests.get('https://fortnite-api.com/v2/news/br')
                    newsData = response.json()["data"]
                except:
                    print("Error in tweeting news feed: skipping")
            print("Finished news feed publishing")
    else:
        print("FAILED TO GRAB NEWS DATA: URL DOWN")
    time.sleep(delay)
You need to check each motd to see if it existed in the old dataset.
import tweepy
import time
import requests  # needed for the API calls below (missing from the original imports)
import os        # needed for os.remove when saveImages is disabled
from colorama import *

init()

auth = tweepy.OAuthHandler('API', 'APISECRET')
auth.set_access_token('ACESS', 'ACESSSECRET')

response = requests.get('https://fortnite-api.com/v2/news/br')
newsData = response.json()["data"]

#-----
footer = '#Fortnite'
delay = 5
saveImages = True
#-----

while 1:
    response = requests.get('https://fortnite-api.com/v2/news/br')
    if response:
        newsDataLoop = response.json()["data"]
        print("2 - Checking for change in news feed...")
        if newsData != newsDataLoop:
            #if loop == True:
            print("News Feed has changed...")
            for i in newsDataLoop["motds"]:
                if i in newsData["motds"]:
                    # has already been posted
                    print("Already posted")
                    continue
                try:
                    print("Saving: "+i["id"])
                    url = i["image"]
                    r = requests.get(url, allow_redirects=True)
                    open("NewsImages/"+i["id"]+'.png', 'wb').write(r.content)
                    print("Saved: "+i["id"])
                    try:
                        api = tweepy.API(auth)
                        api.update_with_media("NewsImages/"+i["id"]+'.png', "Fortnite News Update:\n\n"+i["title"]+":\n"+i["body"]+"\n\n"+footer)
                        print("Tweeted: "+i["id"])
                    except:
                        print("Failed to tweet: "+i["id"])
                    if saveImages == 'False':  # note: saveImages is a bool, so this never matches; "if not saveImages:" is probably intended
                        os.remove("NewsImages/"+i["id"]+'.png')
                    response = requests.get('https://fortnite-api.com/v2/news/br')
                    newsData = response.json()["data"]
                except:
                    print("Error in tweeting news feed: skipping")
            print("Finished news feed publishing")
    else:
        print("FAILED TO GRAB NEWS DATA: URL DOWN")
    time.sleep(delay)
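One limitation of comparing against the newsData snapshot: if the script restarts, everything currently in the feed looks new again. A possible extension (an assumption about the desired behaviour, not part of the fix above; the filename is hypothetical) is to persist the ids that were already tweeted:

import json
import os

POSTED_IDS_FILE = 'posted_ids.json'  # hypothetical file used to remember tweeted ids

def load_posted_ids():
    # Returns the set of motd ids that have already been tweeted.
    if os.path.exists(POSTED_IDS_FILE):
        with open(POSTED_IDS_FILE) as f:
            return set(json.load(f))
    return set()

def save_posted_ids(posted_ids):
    with open(POSTED_IDS_FILE, 'w') as f:
        json.dump(sorted(posted_ids), f)

Checking if i["id"] in posted_ids (and adding the id after a successful tweet) instead of comparing whole motd entries also avoids re-posting an item whose text was merely edited.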