Here is my code:
import base64
import requests
import json
import csv
USERNAME, PASSWORD = 'Username', 'Password'
req = requests.get(
    url="https://api.mysportsfeeds.com/v1.1/pull/nhl/2017-2018-regular/game_startinglineup.json?gameid=20171109-EDM-NJD",
    params={
        "fordate": "20171009"
    },
    headers={
        "Authorization": "Basic " +
            base64.b64encode('{}:{}'.format(USERNAME, PASSWORD)
                             .encode('utf-8')).decode('ascii')
    }
)
data = req.json()
for i in range(1):
    team_home = data['gamestartinglineup']['teamLineup'][i]['expected']['starter']
for i in range(2):
    team_away = data['gamestartinglineup']['teamLineup'][i]['expected']['starter']

# headers = ["HomeTeam", "AwayTeam"]
for i in range(18):
    homeplayer = team_home[i]['position']
    awayplayer = team_away[i]['position']
    homename = team_home[i]['player']['LastName']
    awayname = team_away[i]['player']['LastName']
    my_data = []
    my_data.append([homeplayer, homename, awayplayer, awayname])
print(my_data)
Here is what I am requesting:
{"gamestartinglineup":{"lastUpdatedOn":"2017-12-12 11:56:50 PM","game":{"id":"41009","date":"2017-11-09","time":"7:00PM","awayTeam":{"ID":"24","City":"Edmonton","Name":"Oilers","Abbreviation":"EDM"},"homeTeam":{"ID":"7","City":"New Jersey","Name":"Devils","Abbreviation":"NJD"},"location":"Prudential Center"},"teamLineup":[{"team":{"ID":"24","City":"Edmonton","Name":"Oilers","Abbreviation":"EDM"},"expected":{"starter":[{"position":"Goalie-Backup","player":{"ID":"5552","LastName":"Brossoit","FirstName":"Laurent","JerseyNumber":"1","Position":"G"}},{"position":"ForwardLine1-RW","player":{"ID":"4854","LastName":"Maroon","FirstName":"Patrick","JerseyNumber":"19","Position":"LW"}},{"position":"ForwardLine2-C","player":{"ID":"4993","LastName":"Nugent-Hopkins","FirstName":"Ryan","JerseyNumber":"93","Position":"C"}},{"position":"ForwardLine4-C","player":{"ID":"4730","LastName":"Letestu","FirstName":"Mark","JerseyNumber":"55","Position":"C"}},{"position":"ForwardLine3-LW","player":{"ID":"11308","LastName":"Caggiula","FirstName":"Drake","JerseyNumber":"91","Position":"LW"}},{"position":"ForwardLine4-RW","player":{"ID":"5875","LastName":"Khaira","FirstName":"Jujhar","JerseyNumber":"16","Position":"LW"}},{"position":"ForwardLine3-C","player":{"ID":"3637","LastName":"Jokinen","FirstName":"Jussi","JerseyNumber":"36","Position":"LW"}},{"position":"ForwardLine3-RW","player":{"ID":"4997","LastName":"Strome","FirstName":"Ryan","JerseyNumber":"18","Position":"C"}},{"position":"ForwardLine1-C","player":{"ID":"5576","LastName":"McDavid","FirstName":"Connor","JerseyNumber":"97","Position":"C"}},{"position":"ForwardLine1-LW","player":{"ID":"5417","LastName":"Draisaitl","FirstName":"Leon","JerseyNumber":"29","Position":"C"}},{"position":"ForwardLine2-LW","player":
My Output is: [['DefensePair1-R', 'Klefbom', 'DefensePair1-R', 'Severson']]
I created for loops for home and away and also for the players within them. There are 20 per home and away team, but it is only returning item 18. I thought the expression is 0-18, not just 18, so there must be something else in the script that is poorly written.
for i in range(18):
    homeplayer = team_home[i]['position']
    awayplayer = team_away[i]['position']
    homename = team_home[i]['player']['LastName']
    awayname = team_away[i]['player']['LastName']
    my_data = []  ## HERE!
    my_data.append([homeplayer, homename, awayplayer, awayname])
print(my_data)
Bring that out of the loop: because my_data is re-created on every iteration, only the last appended row survives.
my_data = []  ## HERE!
for i in range(18):
    homeplayer = team_home[i]['position']
    awayplayer = team_away[i]['position']
    homename = team_home[i]['player']['LastName']
    awayname = team_away[i]['player']['LastName']
    my_data.append([homeplayer, homename, awayplayer, awayname])
print(my_data)
Edit:
Found another snippet that is possibly an error:
for i in range(2):
    team_away = data['gamestartinglineup']['teamLineup'][i]['expected']['starter']
In this snippet you are overwriting team_away on every pass through the loop, so only the last team's starters are kept. Not sure if that is what you want.
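For what it's worth, here is a minimal sketch of how both fixes could look together, keyed off the JSON sample above: the lineups are matched to home/away by team abbreviation instead of by list position, and my_data is created once before the loop.

# data is the parsed response from req.json() above
lineups = data['gamestartinglineup']['teamLineup']
game = data['gamestartinglineup']['game']

# Match each lineup entry to home/away by abbreviation rather than list order.
starters_by_abbr = {entry['team']['Abbreviation']: entry['expected']['starter']
                    for entry in lineups}
team_home = starters_by_abbr[game['homeTeam']['Abbreviation']]
team_away = starters_by_abbr[game['awayTeam']['Abbreviation']]

my_data = []  # created once, outside the loop
for home, away in zip(team_home, team_away):
    my_data.append([home['position'], home['player']['LastName'],
                    away['position'], away['player']['LastName']])
print(my_data)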
I'm stuck on a little problem and hope you can help.
I want to create a df by scraping from two parts of a web page. I seem to be stuck on the second part.
My requirement is to get a df with each Horse name and the associated odds.
e.g.
Horse Odds
name1 odd1
name2 odd2
I've used a sample page in the script but it will be the same for any
: base url https://www.racingtv.com/racecards/tomorrow
: then select any time to get another page with the horse name and odds details etc.
import requests
import pandas as pd
from bs4 import BeautifulSoup

def main():
    # base url is https://www.racingtv.com/racecards/tomorrow
    # select any time to get the horse name and odds details etc.
    url = 'https://www.racingtv.com/racecards/catterick-bridge/372180-watch-racing-tv-now-novices-hurdle-gbb-race?'
    res = requests.get(url)
    soup = BeautifulSoup(res.content, "html.parser")
    strike = soup.select('div', class_='data-strike-out-group')
    # this bit seems to be working
    for data in soup.find_all('div',
                              class_='racecard__runner__column racecard__runner__name'):
        for a in data.find_all('a'):
            print(a.text)
    # this bit sort of works but it seems to repeat the first three items of data
    for odds in soup.find_all('div',
                              class_='racecard__runner__column racecard__runner__column--price'):
        for odd1 in odds.find_all('ruk-odd'):
            print(odd1.text)
    # I tried this to work out how to stop getting the three duplicates but it does not work
    for odds in strike.select('div',
                              class_='racecard__runner__column racecard__runner__column--price'):
        for odd1 in odds.find_all('ruk-odd'):
            print(odd1.text)
    return

if __name__ == '__main__':
    main()
class_='data-strike-out-group'
this isn't a class; check the raw HTML. It's an attribute of the div... weird
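If it helps, here is a small sketch of selecting by that attribute instead of by class; it assumes the div really does carry a data-strike-out-group attribute as described, and reuses the race page URL from the question.

import requests
from bs4 import BeautifulSoup

url = 'https://www.racingtv.com/racecards/catterick-bridge/372180-watch-racing-tv-now-novices-hurdle-gbb-race?'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# CSS attribute selector: any div that has the attribute, whatever its value.
groups = soup.select('div[data-strike-out-group]')
# Equivalent find_all form.
groups_alt = soup.find_all('div', attrs={'data-strike-out-group': True})
print(len(groups), len(groups_alt))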
Glad you posted this, might end up using this site for a personal project. Figured you'd be interested in this code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

headers = {
    'accept': '*/*',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
}
url = 'https://www.racingtv.com/racecards/catterick-bridge/372180-watch-racing-tv-now-novices-hurdle-gbb-race?'
resp = requests.get(url, headers=headers)
print(resp)
soup = BeautifulSoup(resp.text, 'html.parser')

table = soup.find('div', {'class': 'page__content__section racecard'})
race_id = url.split('/')[-1].split('-')[0]
race_name = soup.find('div', class_='race__name').text.strip()
race_date = soup.find('div', class_='race__date').text.strip()
clean_date = datetime.strptime(race_date, '%d %b %Y').strftime('%Y%m%d')
race_info1 = soup.find_all('div', class_='race__subtitle')[0].text.strip()
race_info2 = soup.find_all('div', class_='race__subtitle')[1].text.strip()

final = []
for row in table.find_all('div', class_='racecard__runner--content'):
    try:
        num = row.find('div', class_='racecard__runner__cloth-number').text.strip()
        last_days_ugly = row.find('div', class_='racecard__runner__name').find('a').find('sup').text
        horse_name = row.find('div', class_='racecard__runner__name').find('a').text.strip().replace(last_days_ugly, '')
        horse_link = 'http://www.racingtv.com' + row.find('div', class_='racecard__runner__name').find('a')['href']
        last_race_days = last_days_ugly.strip().replace('(', '').replace(')', '')
        for people in row.find_all('div', class_='racecard__runner__person'):
            if 'J:' in people.getText():
                jockey = people.find('a').text.strip()
                jockey_link = 'http://www.racingtv.com' + people.find('a')['href']
            if 'T:' in people.getText():
                trainer = people.find('a').text.strip()
                trainer_link = 'http://www.racingtv.com' + people.find('a')['href']
        form = row.find('div', class_='racecard__runner__column--form_lr').find_all('div')[0].text.strip()
        equip = row.find('div', class_='racecard__runner__column--form_lr').find_all('div')[1].text.strip()
        weight = row.find('div', class_='racecard__runner__column--weight_age').find_all('div')[0].text.strip()
        age = row.find('div', class_='racecard__runner__column--weight_age').find_all('div')[1].text.strip()
        o_r = row.find('div', class_='racecard__runner__column--or').text.strip()
        odds = row.find('div', class_='racecard__runner__column--price').getText()
        odds_dec = row.find('div', class_='racecard__runner__column--price').find('ruk-odd')['data-js-odds-decimal']
        odds_data = row.find('div', class_='racecard__runner__column--price').find('ruk-odd')['data-js-odd-alternatives']
    except AttributeError:  # skip blank starting gates
        continue
    item = {
        'race_url': url,
        'race_id': race_id,
        'race_name': race_name,
        'race_date': clean_date,
        'race_info1': race_info1,
        'race_info2': race_info2,
        'num': num,
        'horse_name': horse_name,
        'horse_link': horse_link,
        'last_race_days': last_race_days,
        'jockey': jockey,
        'jockey_link': jockey_link,
        'trainer': trainer,
        'trainer_link': trainer_link,
        'form': form,
        'equip': equip,
        'weight': weight,
        'age': age,
        'o_r': o_r,
        'odds': odds,
        'odds_dec': odds_dec,
        'odds_data': odds_data
    }
    final.append(item)

df = pd.DataFrame(final)
df.to_csv('racingtv.csv', index=False)
print('Saved to racingtv.csv')
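If you only want the two columns from the original question, one way is to slice them out of the resulting frame afterwards (column names as defined in the script above):

# Keep just the horse names and odds, renamed to match the requested layout.
df_odds = df[['horse_name', 'odds']].rename(columns={'horse_name': 'Horse', 'odds': 'Odds'})
print(df_odds)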
Following on from the script kindly supplied by bushcat69, and my subsequent question "how to get the race time into the df", I have cobbled together some code (cut and paste from other sites). I thought you may be interested. It may not be elegant but it seems to work. The section:
race_data.extend(get_racecards_data(url_race, date, racetime))
is used to pass the url etc. to the bushcat69 script.
Thanks again.
# Relies on the imports, headers and get_racecards_data() from the script above,
# plus the globals date and date_ext set elsewhere.
def get_meetings():
    global date
    global date_ext
    odds_date = date_ext
    url = f'https://www.racingtv.com/racecards/{date_ext}'
    try:
        res = requests.get(url, headers=headers)
    except:
        print('Date or Connection error occurred! \nTry again!!')
        return
    soup = BeautifulSoup(res.text, 'html.parser')
    meetings = soup.select('.race-selector__times__race')
    course_num = len(meetings)
    meetings1 = [a['href'] for a in soup.select('.race-selector__times__race')]
    course_num = len(meetings1)
    cnt01 = 0
    if course_num == 0:
        print('Provide an upcoming valid date')
        return
    for track in meetings1[:course_num]:
        cnt01 = cnt01 + 1
        trackref = track.split("/")[2]
        print(cnt01, ": ", trackref)
    need = input(f'{course_num} courses found \nHow many courses to scrape? Press \'a\' for all :\n')
    if need == 'a':
        n = course_num
    else:
        try:
            n = int(need)
        except:
            print('Invalid input !')
            return
    cnt01 = 0
    race_data = []
    for mtm in meetings[:course_num]:
        cnt01 = cnt01 + 1
        racetime = mtm.text
        href = mtm.attrs
        htxt = Text(href)  # NOTE: Text() is not defined in this snippet (presumably it just stringifies the attrs dict)
        url_race = htxt.partition("/")[2]
        url_race = "/" + url_race.rpartition("'")[0]
        print(cnt01, racetime, url_race)
        time.sleep(1)
        race_data.extend(get_racecards_data(url_race, date, racetime))
        print(f"Meeting {url_race.split('/')[2]} scraping completed")
        if cnt01 == n:
            break
    df_race = pd.DataFrame(race_data)
    df = df_race
I'm trying to run a Python script where I'm calling an API to extract data and upload it to a CSV file. The CSV file is created fine, but the script throws an error. Can someone please let me know what I might be doing wrong here?
Error Message:
Code:
import http.client
import json
import csv
import os

conn = http.client.HTTPSConnection("api.betterimpact.com")
conn1 = http.client.HTTPSConnection("api.betterimpact.com")

if os.path.exists("CSVVolunteerOutput.csv"):
    os.remove("CSVVolunteerOutput.csv")

headers = {
    'Authorization': 'Basic XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX',
    'Cookie': '; TrackingClientId=AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
}

conn.request("GET", "/v1/enterprise/users/?volunteer_status=accepted&include_custom_fields=false&include_qualifications=false&page_Size=250&include_verified_volunteers_background_check_results=false", headers=headers)
res = conn.getresponse()
data = json.load(res)

conn1.request("GET", "/v1/enterprise/users/?volunteer_status=accepted&include_custom_fields=false&include_qualifications=false&page_Size=250&include_verified_volunteers_background_check_results=false&page_number=0", headers=headers)
res1 = conn1.getresponse()
data1 = json.load(res1)

if data == None or data == "" or len(data) == 0:
    print("Check API Credentials..")
    exit()

volunteer_status = "Accepted"
pageNum = 0
_page_count = data1['header']['page_count']

while True:
    pageNum += 1
    with open('CSVVolunteerOutput.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["SyncID", "FirstName", "LastName", "Jobtitle", "Division", "BusinessUnit", "RegionArea", "WorkLocation", "Department", "WorkEmailAddressPrimary",
                         "PersonalEmailAddress", "PersonalMobilePhonePrimary", "WorkCountry"])
        for user in data['users']:
            _id = user['user_id']
            _firstName = user['first_name']
            _surName = user['last_name']
            _emailAddress = user['email_address']
            _emailAddressSec = user['secondary_email_address']
            _cellPhone = user['cell_phone']
            _country = user['country']
            for details in user['memberships']:
                _orgName = details['organization_name']
                _volunteerStatus = details['volunteer_status']
                if volunteer_status == _volunteerStatus:
                    writer.writerow([_id, _firstName, _surName, "Volunteer", "", "", "", _orgName, "", _emailAddress,
                                     _emailAddressSec, _cellPhone, _country])
    if pageNum > int(_page_count):
        break
    else:
        conn.request("GET", "/v1/enterprise/users/?volunteer_status=accepted&include_custom_fields=false&include_qualifications=false&page_Size=250&include_verified_volunteers_background_check_results=false&page_number=" + str(pageNum), headers=headers)
        res = conn.getresponse()
        data = json.load(res)

print("CSV file created successfully")
API Documentation is here: https://www.betterimpact.com/volunteer-impact-help/it-api/
Thanks.
I can't run the code, so I'm guessing.
You have data = ... in two places:
before the while loop
inside the while loop
like this:
# --- first `data` ---
conn.request(...)
res = conn.getresponse()
data = json.load(res)

# ... code ...

while True:
    # ... code ...
    for user in data['users']:
        # ... code ...
    if pageNum > int(_page_count):
        break
    else:
        # --- second `data` ---
        conn.request(...)
        res = conn.getresponse()
        data = json.load(res)
It seems you checked for users in data only after the first data = ..., but you didn't check it for the second data = ..., and that one can give you data without users.
You could check both in one place:
if "users" in data:
for user in data['users']:
# ... code ...
By the way: if you want to append to the file in a loop, it is better to write the header row before the loop. In the current version you add the header before every page of rows. Or you could append all the data to a list and write everything at once after the loop.
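A minimal sketch of that suggestion, with placeholder pages standing in for the API responses (the real paging and connection code is left out): the header row is written once before the loop, and pages without a "users" key are skipped.

import csv

# Placeholder data shaped like the responses in the question (an assumption).
pages = [
    {"users": [{"user_id": 1, "first_name": "Ann", "last_name": "Lee"}]},
    {"header": {"page_count": 2}},  # a page without "users" is simply skipped
]

with open('CSVVolunteerOutput.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["SyncID", "FirstName", "LastName"])  # header written once
    for data in pages:
        if "users" not in data:
            continue
        for user in data["users"]:
            writer.writerow([user["user_id"], user["first_name"], user["last_name"]])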
I'm trying to send a "scammer" a bunch of fake emails and passwords, but I get this output:
Milan
91@protonmail.com
This is the script:
import requests
import random

val = 1
url = 'https://repens.serveo.net/login.php'
while val == 1:
    file = open("/home/user/Documents/scam/names.json").readlines()
    random_name = random.choice(file)
    random_number = random.randint(0, 99)
    email_provider = ["@yahoo.com", "@gmail.com", "@walla.com"]
    random_email_provider = random.choice(email_provider)
    name = random_name
    username = "%s%s%s" % (name, random_number, random_email_provider)
    password = random.randint(0, 9999999)
    print(username)
    requests.post(url, allow_redirects=False, data={
        'username': username,
        'password': password})
This is what my names file looks like:
Liam
Noah
William
James
Logan
I also tried:
[
"Liam",
"Noah",
"William",
"James",
"Logan",
]
To get the 3 strings appended on a single line, you need the name without the trailing newline character. You are getting the name from a file that you split with readlines(). You can put the names in a space-separated format so you can get your names like this:
names = open("/home/user/Documents/scam/names.json").read().split()
You could also do something like this to read the whole file into one string:
with open('data.txt', 'r') as myfile:
    data = myfile.read().replace('\n', '')
reference: How to read a text file into a string variable?
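A different small fix, not mentioned above, is to keep one name per line and just strip the newline from each picked name; a minimal sketch, assuming the names file looks like the one in the question:

import random

# Path taken from the question; adjust as needed.
with open("/home/user/Documents/scam/names.json") as f:
    names = [line.strip() for line in f if line.strip()]

random_name = random.choice(names)
username = "%s%s%s" % (random_name, random.randint(0, 99), "@gmail.com")
print(username)  # e.g. Liam42@gmail.com, all on one line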
import requests
import random
import string
import json
import os

# setting variables
chars = string.ascii_letters + string.digits + '!@#$%^&*()'
random.seed(os.urandom(1024))
# setting up different domain extensions
domain = ['.com', '.gov', '.us', '.edu', '.org', '.ru', '.tw', '.live', '.io', '.blog', '.biz', '.blog', '.co']
# url to overflow
url = 'https://www.stealmylogin.com/demo.html'
# loading and reading json files
names = json.loads(open('names.json').read())
org = json.loads(open('domain.json').read())

# setting up random users and passwords
for name in names:
    name_extra = ''.join(random.choice(string.digits))
    userId = name.lower() + name_extra + '@' + random.choice(org) + random.choice(domain)
    userPassword = ''.join(random.choice(chars) for i in range(8))
    # sending user/password to the url above
    requests.post(url, allow_redirects=False, data={
        'auid2yjauysd2uasdasdasd': userId,
        'kjauysd6sAJSDhyui2yasd': userPassword
    })
    # print the results - example: sending username noah8@HIPHOP.edu and password ankCRzk8
    print('sending username %s and password %s' % (userId, userPassword))
The script I have exports all users, but I only want to export users whose type attribute matches a particular value. There are two types of users in the directory, type a and type b, and I only want to export the users whose type is b.
Please help me add a clause/statement to the script so it only pulls users with type "B" and ignores users with any other type.
import requests
import json
import re
import sys
import csv

orgName = ""
apiKey = ""
api_token = "SSWS " + apiKey
headers = {'Accept': 'application/json', 'Content-Type': 'application/json', 'Authorization': api_token}

def GetPaginatedResponse(url):
    response = requests.request("GET", url, headers=headers)
    returnResponseList = []
    responseJSON = json.dumps(response.json())
    responseList = json.loads(responseJSON)
    returnResponseList = returnResponseList + responseList
    if "errorCode" in responseJSON:
        print "\nYou encountered following Error: \n"
        print responseJSON
        print "\n"
        return "Error"
    else:
        headerLink = response.headers["Link"]
        while str(headerLink).find("rel=\"next\"") > -1:
            linkItems = str(headerLink).split(",")
            nextCursorLink = ""
            for link in linkItems:
                if str(link).find("rel=\"next\"") > -1:
                    nextCursorLink = str(link)
            nextLink = str(nextCursorLink.split(";")[0]).strip()
            nextLink = nextLink[1:]
            nextLink = nextLink[:-1]
            url = nextLink
            response = requests.request("GET", url, headers=headers)
            responseJSON = json.dumps(response.json())
            responseList = json.loads(responseJSON)
            returnResponseList = returnResponseList + responseList
            headerLink = response.headers["Link"]
        returnJSON = json.dumps(returnResponseList)
        return returnResponseList

def DownloadSFUsers():
    url = "https://" + orgName + ".com/api/v1/users"
    responseJSON = GetPaginatedResponse(url)
    if responseJSON != "Error":
        userFile = open("Only-Okta_Users.csv", "wb")
        writer = csv.writer(userFile)
        writer.writerow(["login", "type"])
        for user in responseJSON:
            login = user[u"profile"][u"login"]
            type = user[u"credentials"][u"provider"][u"type"]
            writer.writerow([login, type])

if __name__ == "__main__":
    DownloadSFUsers()
Wrap your statement that writes a user to the csv file in an if statement that tests for the correct type.
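For example, a small sketch of that if statement, written for Python 3 with placeholder records shaped like the response handled in the question; "B" stands in for whatever type value you actually want to keep:

import csv

# Placeholder records (an assumption) shaped like the question's responseJSON.
responseJSON = [
    {"profile": {"login": "a@example.com"}, "credentials": {"provider": {"type": "A"}}},
    {"profile": {"login": "b@example.com"}, "credentials": {"provider": {"type": "B"}}},
]

with open("Only-Okta_Users.csv", "w") as userFile:
    writer = csv.writer(userFile)
    writer.writerow(["login", "type"])
    for user in responseJSON:
        login = user["profile"]["login"]
        user_type = user["credentials"]["provider"]["type"]
        if user_type == "B":  # only write users whose type matches
            writer.writerow([login, user_type])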
I am working on a script to scrape a website. The problem is that it works normally when I run it with the interpreter, but after compiling it (PyInstaller or py2exe) it fails; it appears that mechanize / requests both fail to keep the session alive.
I have hidden my username and password here, but I did put them in correctly in the compiled code.
import requests
from bs4 import BeautifulSoup as bs
from sys import argv
import re
import logging

url = argv[1]
payload = {"userName": "real_username", "password": "realpassword"}
session = requests.session()
resp = session.post("http://website.net/login.do", data=payload)
if "forgot" in resp.content:
    logging.error("Login failed")
    exit()
resp = session.get(url)
soup = bs(resp.content)
urlM = url[:url.find("?") + 1] + "page=(PLACEHOLDER)&" + \
       url[url.find("?") + 1:]
# Get number of pages
regex = re.compile("\|.*\|\sof\s(\d+)")
script = str(soup.findAll("script")[1])
epNum = int(re.findall(regex, script)[0])  # Number of EPs
pagesNum = epNum // 50
links = []
# Get list of links
# If number of EPs > 50, more than one page
if pagesNum == 0:
    links = [url]
else:
    for i in range(1, pagesNum + 2):
        url = urlM.replace("(PLACEHOLDER)", str(i))
        links.append(url)
# Loop over the links and extract info: ID, NAME, START_DATE, END_DATE
raw_info = []
for pos, link in enumerate(links):
    print "Processing page %d" % (pos + 1)
    sp = bs(session.get(link).content)
    table = sp.table.table
    raw_info.extend(table.findAll("td"))
epURL = "http://www.website.net/exchange/viewep.do?operation"\
        "=executeAction&epId="
# Final data extraction
raw_info = map(str, raw_info)
ids = [re.findall("\d+", i)[0] for i in raw_info[::4]]
names = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[1::4]]
start_dates = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[2::4]]
end_dates = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[3::4]]
emails = []
eplinks = [epURL + str(i) for i in ids]
print names
The error happens at the epNum variable, which I take to mean that the HTML page returned is not the one I requested. On Linux it works normally both as a script and compiled; on Windows it works as a script but fails when compiled.
The py2exe tutorial mentions that you need MSVCR90.dll; did you check that it's present on the PC?