I am trying to scrape this website: https://www.kayak-polo.info/kphistorique.php?Group=CE&lang=en
Below is my code. I am trying to get the text inside the caption element (as shown in the screenshot). However, I believe I cannot find the tag because it has no closing tag, and I think that is why it is not returning the text.
For clarity: I already have the tournament name, but I would also like the category, which is "men" in the screenshot below.
def grab_ranking():
    tournament_list = grab_tournament_metadata()
    for item in tournament_list:
        url_to_scrape = f'https://www.kayak-polo.info/kphistorique.php?Group={item[1]}&lang=en'
        response = session.get(url_to_scrape)
        print(url_to_scrape)

        season_data = response.html.find('body > div.container-fluid > div > article')
        for season in season_data:
            season_year_raw = find_extract(season, selector='h3 > div.col-md-6.col-sm-6')
            season_year = season_year_raw.replace('Season ', '')
            print(season_year)

            # TODO: Figure out how to deal with the n1h, n2h and other French national categories being together in one place.
            category_table = season.find('div.col-md-3.col-sm-6.col-xs-12', first=True)
            umbrella_competition_name = find_extract(category_table, selector='caption')
            competition_name = umbrella_competition_name + " " + season_year
I tried multiple things, such as getting the HTML of that element and then doing a .split on certain parts. However, it seems that when I use .html I get the entire page's HTML, which doesn't help my case.
I also tried .attrs in the hope of finding the right tag, but it returns nothing.
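Before restructuring the scraper it may be worth checking whether the caption can be selected directly: requests_html (via lxml) closes unclosed tags while parsing, so a missing closing tag normally does not stop a selector from matching. A minimal sketch that would slot into the inner loop above (untested against the live page, and it assumes the caption sits inside the element that category_table points at):

caption = category_table.find('caption', first=True)
if caption is not None:
    print(caption.text)           # expected to be something like "ECA European Championships ... Men"
else:
    print(category_table.html)    # .html on the element itself, to inspect just that fragment of markup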
Here is one possible solution:
from time import time
from typing import Generator

from requests_html import HTMLSession
from requests_html import HTMLResponse


def get_competition_types(html: HTMLResponse) -> Generator[str, None, None]:
    return (i.attrs.get('value') for i in html.html.find('select[name="Group"] option'))


def get_competition_urls(url: str, comp_types: Generator[str, None, None]) -> Generator[str, None, None]:
    return (f'{url}?Group={_type}&lang=en' for _type in comp_types)


def get_data(competition_url: str, session: HTMLSession) -> None:
    response = session.get(competition_url)
    print(competition_url)
    article_data = response.html.find('article.tab-pane')
    for article in article_data:
        for data in (i.text.split('\n') for i in article.find('div caption')):
            if len(data) > 1:
                print(f"{data[0]} {article.find('h3')[0].text.split()[1]} {data[1]}\n")
            else:
                print(f"{data[0]} {article.find('h3')[0].text.split()[1]}\n")


session = HTMLSession()
url = 'https://www.kayak-polo.info/kphistorique.php'
html = session.get(url)

start = time()
competition_types = get_competition_types(html)
competition_urls = get_competition_urls(url, competition_types)
for url in competition_urls:
    get_data(url, session)
print(f"Total time: {round(time()-start, 3)}")
The performance of this solution (processing all 4960 elements) is 55 sec.
Output:
ECA European Championships - Catania (ITA) 2021 Men
ECA European Championships - Catania (ITA) 2021 Women
ECA European Championships - Catania (ITA) 2021 U21 Men
Solution based on ThreadPoolExecutor:
from time import time
from itertools import repeat
from typing import Generator

from requests_html import HTMLSession
from requests_html import HTMLResponse
from concurrent.futures import ThreadPoolExecutor


def get_competition_types(html: HTMLResponse) -> Generator[str, None, None]:
    return (i.attrs.get('value') for i in html.html.find('select[name="Group"] option'))


def get_competition_urls(url: str, comp_types: Generator[str, None, None]) -> Generator[str, None, None]:
    return (f'{url}?Group={_type}&lang=en' for _type in comp_types)


def get_data(competition_url: str, session: HTMLSession) -> None:
    response = session.get(competition_url)
    print(competition_url)
    article_data = response.html.find('article.tab-pane')
    for article in article_data:
        for data in (i.text.split('\n') for i in article.find('div caption')):
            if len(data) > 1:
                print(f"{data[0]} {article.find('h3')[0].text.split()[1]} {data[1]}\n")
            else:
                print(f"{data[0]} {article.find('h3')[0].text.split()[1]}\n")


session = HTMLSession()
url = 'https://www.kayak-polo.info/kphistorique.php'
html = session.get(url)

start = time()
competition_types = get_competition_types(html)
competition_urls = get_competition_urls(url, competition_types)
with ThreadPoolExecutor() as executor:
    executor.map(get_data, list(competition_urls), repeat(session))
print(f"Total time: {round(time()-start, 3)}")
The performance of this solution (processing all 4960 elements) is ~35 sec.
And of course, since this solution works with threads, the output order will be mixed; a sketch of how to collect the results in input order follows the example output below.
Output:
European Championships - Sheffield (GBR) 1993 Women
Coupe d'Europe des Nations - Strasbourg (FRA) 1990 Men
European Club Championship - Duisbourg (GER) 2021 Men
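If the mixed ordering is a problem, one option is to lean on executor.map, which yields results in the order of the input URLs even though the requests run concurrently. This is a sketch under the assumption that the per-URL work is refactored to return its rows instead of printing them (get_rows below is a hypothetical variant of get_data, reusing session, competition_urls and the imports from the script above):

from itertools import repeat
from typing import List
from concurrent.futures import ThreadPoolExecutor

def get_rows(competition_url: str, session: HTMLSession) -> List[str]:
    # Hypothetical variant of get_data that collects lines instead of printing them.
    response = session.get(competition_url)
    rows = []
    for article in response.html.find('article.tab-pane'):
        year = article.find('h3')[0].text.split()[1]
        for data in (i.text.split('\n') for i in article.find('div caption')):
            rows.append(' '.join([data[0], year] + data[1:]))
    return rows

with ThreadPoolExecutor() as executor:
    # executor.map preserves input order, so the printed blocks come back URL by URL.
    for rows in executor.map(get_rows, competition_urls, repeat(session)):
        print('\n'.join(rows))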
I am trying to extract data from a Twitter JSON file retrieved by using tweepy streaming.
Here is my code for streaming:
class MyListener(Stream):
    t_count = 0

    def on_data(self, data):
        print(data)
        self.t_count += 0
        # stop by
        if self.t_count >= 5000:
            sys.exit("exit")
        return True

    def on_error(self, status):
        print(status)

if __name__ == '__main__':
    stream = MyListener(consumer_key, consumer_secret, access_token, access_token_secret)
    stream.filter(track=['corona'], languages=["en"])
Here is my code for reading the file:
with open("covid-test-out", "r") as f:
count = 0
for line in f:
data = json.loads(line)
Then I got the error
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Here is one line in the JSON file. I noticed that there is a b prefix in front of each line, but when I check the type of the line it is not a bytes object but still a string object. And I am not even sure if this is the reason that I cannot get the correct data.
b'{"created_at":"Mon Nov 22 07:37:46 +0000 2021","id":1462686730956333061,"id_str":"1462686730956333061","text":"RT #corybernardi: Scientists 'mystified'. \n\nhttps:\/\/t.co\/rvTYCUEQ74","source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eTwitter Web App\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1336870146242056192,"id_str":"1336870146242056192","name":"Terence Byrnes","screen_name":"byrnes_terence","location":null,"url":null,"description":"Retired Aussie. Against mandatory vaccinations, government interference in our lives, and the climate cult. Now on Gab Social as a backup : Terence50","translator_type":"none","protected":false,"verified":false,"followers_count":960,"friends_count":1012,"listed_count":3,"favourites_count":15163,"statuses_count":171876,"created_at":"Thu Dec 10 03:08:01 +0000 2020","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":null,"contributors_enabled":false,"is_translator":false,"profile_background_color":"F5F8FA","profile_background_image_url":"","profile_background_image_url_https":"","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/1428994180458508292\/fT2Olt4J_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/1428994180458508292\/fT2Olt4J_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1336870146242056192\/1631520259","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null,"withheld_in_countries":[]},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Sun Nov 21 19:42:14 +0000 2021","id":1462506658421112834,"id_str":"1462506658421112834","text":"Scientists 'mystified'. \n\nhttps:\/\/t.co\/rvTYCUEQ74","source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eTwitter Web App\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":80965423,"id_str":"80965423","name":"CoryBernardi.com.au","screen_name":"corybernardi","location":"Adelaide ","url":"http:\/\/www.corybernardi.com.au","description":"Get your free Weekly Dose of Common Sense email at https:\/\/t.co\/MAJpp7iZJy.\n\nLaughing at liars and leftists since 2006. 
Tweets deleted weekly to infuriate losers.","translator_type":"none","protected":false,"verified":true,"followers_count":47794,"friends_count":63,"listed_count":461,"favourites_count":112,"statuses_count":55,"created_at":"Thu Oct 08 22:54:55 +0000 2009","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":null,"contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/1446336496827387904\/Ay6QRHQt_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/1446336496827387904\/Ay6QRHQt_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/80965423\/1633668973","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null,"withheld_in_countries":[]},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"quote_count":5,"reply_count":30,"retweet_count":40,"favorite_count":136,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/rvTYCUEQ74","expanded_url":"https:\/\/apnews.com\/article\/coronavirus-pandemic-science-health-pandemics-united-nations-fcf28a83c9352a67e50aa2172eb01a2f","display_url":"apnews.com\/article\/corona\u2026","indices":[26,49]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/rvTYCUEQ74","expanded_url":"https:\/\/apnews.com\/article\/coronavirus-pandemic-science-health-pandemics-united-nations-fcf28a83c9352a67e50aa2172eb01a2f","display_url":"apnews.com\/article\/corona\u2026","indices":[44,67]}],"user_mentions":[{"screen_name":"corybernardi","name":"CoryBernardi.com.au","id":80965423,"id_str":"80965423","indices":[3,16]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1637566666722"}'
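For what it's worth, the b'...' wrapper suggests that on_data received raw bytes and print() wrote their repr to the file, so each line is the textual representation of a bytes object rather than plain JSON; decoding the payload before writing (for example data.decode('utf-8')) would avoid that on future runs. For the file that already exists, here is a minimal recovery sketch, assuming every line looks like the sample above:

import ast
import json

tweets = []
with open("covid-test-out", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # Lines written as print(bytes) look like b'{"created_at": ...}'.
        # ast.literal_eval safely turns that text back into a bytes object.
        if line.startswith("b'") or line.startswith('b"'):
            line = ast.literal_eval(line).decode("utf-8")
        tweets.append(json.loads(line))

print(len(tweets), "tweets loaded")
print(tweets[0]["created_at"], tweets[0]["text"][:60])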
I am learning Python and have a question regarding for and if loops. This is my scenario:
I have an endpoint that I make API calls to with requests.get
I need to retrieve all the historic data
I have a start date (2017-06-17)
So I need to make multiple API calls because they have a limit of a 60-day period. So I made my code like this:
date = datetime.strptime("2017-06-17", "%Y-%m-%d")  # Start date
current_date = date.date()  # timedelta needs a date object, so convert the datetime to a date
days_after = (current_date+timedelta(days=60)).isoformat()  # days_after is set 60 days ahead because of the API limit
date_string = current_date.strftime('%Y-%m-%d')  # back to a string, since the API needs a string, not a date object
So this is how I build the dates for a 60-day period, starting from 2017-06-17 and going 60 days ahead.
This is how I make the API request:
response = requests.get("https://reporting-api/campaign?token=xxxxxxxxxx&format=json&fromDate="+date_string+"&toDate="+days_after)
response_data = response.json() # Added this because I am temporarily writing to a JSON file
This is how I write to the JSON file:
if response_data:
    print("WE GOT DATA")  # Debugging
    data = response.json()  # This is a duplicate?
    with open('data.json', 'w') as f:  # Open my data.json file for writing
        json.dump(data, f)  # Dump my JSON data from the API into the file
else:
    print("NO DATA")  # Debugging if no data / response. Should make a skip statement here
So my question is: how can I proceed with my code so that every time I make an API call, starting from 2017-06-17, date_string and days_after move 60 days forward for each call, and the data gets appended to data.json? I would maybe need some for loops or something?
Please note I have been using Python for 3 days now, so be gentle.
Thanks!
You could use a while loop that changes the start and end date until a specified condition is met. You can also append the response to a file on every run. In the example below I used today's date as the stop condition:
import os
import json
import requests
from datetime import datetime, timedelta

x = 0
y = 60
date = datetime.strptime("2017-06-17", "%Y-%m-%d")
current_date = date.date()
date_start = current_date+timedelta(days=x)

while date_start < datetime.now().date():
    date_start = current_date+timedelta(days=x)
    days_after = current_date+timedelta(days=y)
    x = x + 60
    y = y + 60
    response = requests.get("https://reporting-api/campaign?token=xxxxxxxxxx&format=json&fromDate="+date_start.isoformat()+"&toDate="+days_after.isoformat())
    response_data = response.json()
    if response_data:
        print("WE GOT DATA")
        data = response.json()
        # Create a file if it does not exist, or append new data to it.
        if os.path.exists('data.json'):
            append_write = 'a'  # append if already exists
        else:
            append_write = 'w'  # make a new file if not
        with open('data.json', append_write) as f:
            json.dump(data, f)
    else:
        print("NO DATA")
Basically, on every iteration the start and end dates move forward by 60 days and the response is appended to the data.json file.
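One caveat worth noting: calling json.dump repeatedly in append mode produces several JSON documents back to back in one file, which json.load cannot read in a single call. A small variation (a sketch, not tied to this particular API, and it assumes each response is a list of record dicts) is to write newline-delimited JSON, one object per line:

import json

def append_records(path, records):
    # One JSON object per line (NDJSON) keeps the file easy to read back later.
    with open(path, 'a') as f:
        for record in records:
            f.write(json.dumps(record) + '\n')

def read_records(path):
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]

# Example usage inside the loop above, assuming response_data is a list of dicts:
# append_records('data.json', response_data)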
I've been using the script below to download technical videos for later analysis. The script has worked well for me and retrieves the highest resolution version available for the videos that I have needed.
Now I've come across a 4K YouTube video, and my script only saves an mp4 with 1280x720.
I'd like to know if there is a way to adjust my current script to download higher-resolution versions of this video. I understand there are Python packages that might address this, but right now I would like to stick to this step-by-step method if possible.
above: info from Quicktime and OSX
"""
length: 175 seconds
quality: hd720
type: video/mp4; codecs="avc1.64001F, mp4a.40.2"
Last-Modified: Sun, 21 Aug 2016 10:41:48 GMT
Content-Type: video/mp4
Date: Sat, 01 Apr 2017 16:50:16 GMT
Expires: Sat, 01 Apr 2017 16:50:16 GMT
Cache-Control: private, max-age=21294
Accept-Ranges: bytes
Content-Length: 35933033
Connection: close
Alt-Svc: quic=":443"; ma=2592000
X-Content-Type-Options: nosniff
Server: gvs 1.
"""
import urlparse, urllib2

vid = "vzS1Vkpsi5k"
save_title = "YouTube SpaceX - Booster Number 4 - Thaicom 8 06-06-2016"

url_init = "https://www.youtube.com/get_video_info?video_id=" + vid
resp = urllib2.urlopen(url_init, timeout=10)
data = resp.read()
info = urlparse.parse_qs(data)
title = info['title']
print "length: ", info['length_seconds'][0] + " seconds"

stream_map = info['url_encoded_fmt_stream_map'][0]
vid_info = stream_map.split(",")
mp4_filename = save_title + ".mp4"

for video in vid_info:
    item = urlparse.parse_qs(video)
    print 'quality: ', item['quality'][0]
    print 'type: ', item['type'][0]
    url_download = item['url'][0]
    resp = urllib2.urlopen(url_download)
    print resp.headers
    length = int(resp.headers['Content-Length'])
    my_file = open(mp4_filename, "w+")
    done, i = 0, 0
    buff = resp.read(1024)
    while buff:
        my_file.write(buff)
        done += 1024
        percent = done * 100.0 / length
        buff = resp.read(1024)
        if not i % 1000:
            percent = done * 100.0 / length
            print str(percent) + "%"
        i += 1
    break
Ok, so I have not taken the time to get to the bottom of this. However, I did find that when you do:
stream_map = info['url_encoded_fmt_stream_map'][0]
Somehow you only get a selection of a single 720p option, one 'medium' and two 'small'.
However, if you change that line into:
stream_map = info['adaptive_fmts'][0]
you will get all the available versions, including the 2160p one, i.e. the 4K one.
PS: You'd have to comment out the print quality and print type commands, since those labels aren't always available in the new stream map. With those commented out, and your script adapted as explained above, I was able to successfully download the 4K version.
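For reference, here is a rough sketch of that change in the same Python 2 style as the script above; the field names other than url are assumptions on my part (adaptive format entries do not always carry the same labels), and picking the highest-bitrate entry is my own heuristic rather than part of the original answer:

# Swap in the adaptive formats and pick the entry with the highest bitrate.
stream_map = info['adaptive_fmts'][0]
vid_info = stream_map.split(",")

best_item, best_bitrate = None, -1
for video in vid_info:
    item = urlparse.parse_qs(video)
    # 'quality' / 'type' may be absent here, so read keys defensively.
    bitrate = int(item.get('bitrate', ['0'])[0])
    if bitrate > best_bitrate:
        best_item, best_bitrate = item, bitrate

print 'chosen: ', best_item.get('size', ['?'])[0], best_item.get('type', ['?'])[0]
url_download = best_item['url'][0]
# ...then reuse the existing download loop with this url_download.

Note that adaptive (DASH) formats generally carry video and audio separately, so the highest-bitrate entry is typically a video-only stream.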
Indeed, info['adaptive_fmts'][0] returns the information for the whole video, but the URL is not usable directly, nor is the progress bar.
Need to read Pylons session data (just read, not write to) in node.js
Once I decode the base64, I'm left with a string containing a serialized Python object, which is a pain to parse in node.js.
How can I get Beaker to serialize to JSON instead? That would be far easier for node.js to handle.
I had to look inside Beaker to find that what you call "serialized Python objects" are Python pickles.
I don't think it would take more than a few lines of changes to get it to use JSON to store the dict.
Here is a patch against https://bitbucket.org/bbangert/beaker/src/257f147861c8:
diff -r 257f147861c8 beaker/session.py
--- a/beaker/session.py Mon Apr 18 11:38:53 2011 -0400
+++ b/beaker/session.py Sat Apr 30 14:19:12 2011 -0400
@@ -489,10 +489,10 @@
             nonce = b64encode(os.urandom(40))[:8]
             encrypt_key = crypto.generateCryptoKeys(self.encrypt_key,
                                                     self.validate_key + nonce, 1)
-            data = util.pickle.dumps(self.copy(), 2)
+            data = util.json.dumps(self.copy())
             return nonce + b64encode(crypto.aesEncrypt(data, encrypt_key))
         else:
-            data = util.pickle.dumps(self.copy(), 2)
+            data = util.json.dumps(self.copy())
             return b64encode(data)

     def _decrypt_data(self):
@@ -504,10 +504,10 @@
                                                     self.validate_key + nonce, 1)
             payload = b64decode(self.cookie[self.key].value[8:])
             data = crypto.aesDecrypt(payload, encrypt_key)
-            return util.pickle.loads(data)
+            return util.json.loads(data)
         else:
             data = b64decode(self.cookie[self.key].value)
-            return util.pickle.loads(data)
+            return util.json.loads(data)

     def save(self, accessed_only=False):
         """Saves the data for this session to persistent storage"""
diff -r 257f147861c8 beaker/util.py
--- a/beaker/util.py Mon Apr 18 11:38:53 2011 -0400
+++ b/beaker/util.py Sat Apr 30 14:19:12 2011 -0400
@@ -24,6 +24,11 @@
     import pickle
 else:
     import cPickle as pickle
+
+try:
+    import json
+except ImportError:
+    import simplejson as json

 from beaker.converters import asbool
 from beaker import exceptions
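With that change, an unencrypted session cookie is just base64-encoded JSON, so the reading side only needs a base64 decode followed by a JSON parse. A quick illustration in Python of what the patched, unencrypted code path produces and how to read it back (the session contents here are made up; in node.js the equivalent is Buffer.from(value, 'base64') plus JSON.parse):

import json
from base64 import b64encode, b64decode

# Stand-in for what the patched code returns: b64encode(util.json.dumps(self.copy()))
cookie_value = b64encode(json.dumps({"_accessed_time": 1304200000.0, "user_id": 42}).encode())

# The reading side only needs to reverse those two steps.
session_dict = json.loads(b64decode(cookie_value))
print(session_dict)  # {'_accessed_time': 1304200000.0, 'user_id': 42}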