I'm trying to run Python code that calls an API to extract data and write it to a CSV file. The CSV file is generated fine, but the script throws an error. Can someone please let me know what I might be doing wrong here?
Error Message:
Code:
import http.client
import json
import csv
import os
conn = http.client.HTTPSConnection("api.betterimpact.com")
conn1 = http.client.HTTPSConnection("api.betterimpact.com")
if os.path.exists("CSVVolunteerOutput.csv"):
    os.remove("CSVVolunteerOutput.csv")
headers = {
    'Authorization': 'Basic XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX',
    'Cookie': '; TrackingClientId=AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
}
conn.request("GET", "/v1/enterprise/users/?volunteer_status=accepted&include_custom_fields=false&include_qualifications=false&page_Size=250&include_verified_volunteers_background_check_results=false", headers=headers)
res = conn.getresponse()
data = json.load(res)
conn1.request("GET", "/v1/enterprise/users/?volunteer_status=accepted&include_custom_fields=false&include_qualifications=false&page_Size=250&include_verified_volunteers_background_check_results=false&page_number=0", headers = headers)
res1 = conn1.getresponse()
data1 = json.load(res1)
if data == None or data == "" or len(data) == 0:
    print("Check API Credentials..")
    exit()
volunteer_status = "Accepted"
pageNum = 0
_page_count = data1['header']['page_count']
while True:
    pageNum += 1
    with open('CSVVolunteerOutput.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["SyncID", "FirstName", "LastName", "Jobtitle", "Division", "BusinessUnit",
                         "RegionArea", "WorkLocation", "Department", "WorkEmailAddressPrimary",
                         "PersonalEmailAddress", "PersonalMobilePhonePrimary", "WorkCountry"])
        for user in data['users']:
            _id = user['user_id']
            _firstName = user['first_name']
            _surName = user['last_name']
            _emailAddress = user['email_address']
            _emailAddressSec = user['secondary_email_address']
            _cellPhone = user['cell_phone']
            _country = user['country']
            for details in user['memberships']:
                _orgName = details['organization_name']
                _volunteerStatus = details['volunteer_status']
                if volunteer_status == _volunteerStatus:
                    writer.writerow([_id, _firstName, _surName, "Volunteer", "", "", "", _orgName, "", _emailAddress,
                                     _emailAddressSec, _cellPhone, _country])
    if pageNum > int(_page_count):
        break
    else:
        conn.request("GET", "/v1/enterprise/users/?volunteer_status=accepted&include_custom_fields=false&include_qualifications=false&page_Size=250&include_verified_volunteers_background_check_results=false&page_number="+str(pageNum), headers=headers)
        res = conn.getresponse()
        data = json.load(res)
print("CSV file created successfully")
API Documentation is here: https://www.betterimpact.com/volunteer-impact-help/it-api/
Thanks.
I can't run code so I'm guessing.
You have data = ... in two places:
before the while loop
inside the while loop
like this:
# --- first `data` ---
conn.request(...)
res = conn.getresponse()
data = json.load(res)

# ... code ...

while True:
    # ... code ...

    for user in data['users']:
        # ... code ...

    if pageNum > int(_page_count):
        break
    else:
        # --- second `data` ---
        conn.request(...)
        res = conn.getresponse()
        data = json.load(res)
It seems you only checked for users in data after the first data = ..., but you didn't check it for the second data = ..., and that second response can come back without users.
You could check both in one place:
if "users" in data:
for user in data['users']:
# ... code ...
By the way: if you want to append to the file in a loop, write the header row once, before the loop. In the current version you write the header row on every pass through the loop, so it is repeated before each page of data. Alternatively, append all rows to a list and write everything at once after the loop.
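For example, here is a minimal sketch of that restructuring, reusing the connection, headers and field names from the question. The assumption that page numbers start at 1 is mine, so check the API docs if the first page is actually 0:

import csv
import http.client
import json

conn = http.client.HTTPSConnection("api.betterimpact.com")
base_url = ("/v1/enterprise/users/?volunteer_status=accepted&include_custom_fields=false"
            "&include_qualifications=false&page_Size=250"
            "&include_verified_volunteers_background_check_results=false&page_number=")

with open('CSVVolunteerOutput.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    # header row written exactly once
    writer.writerow(["SyncID", "FirstName", "LastName", "Jobtitle", "Division", "BusinessUnit",
                     "RegionArea", "WorkLocation", "Department", "WorkEmailAddressPrimary",
                     "PersonalEmailAddress", "PersonalMobilePhonePrimary", "WorkCountry"])
    pageNum = 1  # assumption: the first page is 1
    while True:
        conn.request("GET", base_url + str(pageNum), headers=headers)  # headers as defined in the question
        data = json.load(conn.getresponse())
        # guard every page, not only the first one
        if not data or 'users' not in data:
            break
        for user in data['users']:
            for details in user['memberships']:
                if details['volunteer_status'] == "Accepted":
                    writer.writerow([user['user_id'], user['first_name'], user['last_name'],
                                     "Volunteer", "", "", "", details['organization_name'], "",
                                     user['email_address'], user['secondary_email_address'],
                                     user['cell_phone'], user['country']])
        if pageNum >= int(data['header']['page_count']):
            break
        pageNum += 1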
As of right now I have working code: a web scraper that queries the Indeed job search site. My issue is that I need to create a CSV file that shows every single job position that was found; currently it only gives me the number of positions available and the description of one of them. I hope I can get some help, I would greatly appreciate it.
import re
import csv
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint
from datetime import datetime  # needed for datetime.today() in get_record() below
jk_pattern = re.compile(r"jk:\'([a-zA-Z0-9]+)'")
params = { "q": "mechanical+engineer", "l": "united+states", "start": 0 }
url = "https://www.indeed.com/jobs"
job_keys = set()
for x in range(10):
    response = requests.get(url, params=params)
    if not response.status_code == 200:
        break
    else:
        keys = jk_pattern.findall(response.text)
        if len(keys) > 0:
            for key in keys:
                job_keys.add(key)
    params['start'] += 20
    sleep(randint(0, 3))

print(len(job_keys))  # how many unique job keys were collected
template = "https://www.indeed.com/viewjob?jk={}"
jk = job_keys.pop()
job_url = template.format(jk)
response = requests.get(job_url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.find("div", id="jobDescriptionText").text)
def get_record(card):
    """Extract job data from a single record"""
    job_title = card.h2.a.get('title')
    company = card.find('span', 'company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    post_date = card.find('span', 'date').text
    today = datetime.today().strftime('%Y-%m-%d')
    summary = card.find('div', 'summary').text.strip().replace('\n', ' ')
    job_url = 'https://www.indeed.com' + card.h2.a.get('href')
    # this does not exist for all jobs, so handle the exceptions
    salary_tag = card.find('span', 'salaryText')
    if salary_tag:
        salary = salary_tag.text.strip()
    else:
        salary = ''
    record = (job_title, company, job_location, post_date, today, summary, salary, job_url)
    return record
def main(position, location):
    """Run the main program routine"""
    records = []
    url = get_url(position, location)  # NOTE: get_url() is not shown in the snippet above
    # extract the job data
    while True:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'jobsearch-SerpJobCard')
        for card in cards:
            record = get_record(card)
            records.append(record)
        try:
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except AttributeError:
            break
    # save the job data
    with open('results.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary', 'JobUrl'])
        writer.writerows(records)
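To tie the two pieces above together, here is a hedged sketch that continues straight from the scraping loop: it visits every collected job key through the viewjob template and writes one row per posting. The positions.csv file name and the column layout are assumptions; only the jobDescriptionText selector already used above is relied on.

with open('positions.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['JobKey', 'JobUrl', 'Description'])
    for jk in job_keys:
        job_url = template.format(jk)
        response = requests.get(job_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        desc_tag = soup.find("div", id="jobDescriptionText")
        description = desc_tag.text.strip() if desc_tag else ''
        writer.writerow([jk, job_url, description])
        sleep(randint(0, 3))  # stay polite between requests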
I have a data set of tweets retrieved via the Twitter streaming API.
However, I regularly want to be updated about how their public metrics change. Therefore, I wrote this code to request those public metrics:
import json

import pandas as pd
import requests

def create_url():
    tweet_fields = "tweet.fields=public_metrics"
    tweets_data_path = 'dataset.txt'
    tweets_data = []
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except:
            continue
    df = pd.DataFrame.from_dict(pd.json_normalize(tweets_data), orient='columns')
    df_id = (str(str((df['id'].tolist()))[1:-1])).replace(" ", "")
    ids = "ids=" + df_id
    url = "https://api.twitter.com/2/tweets?{}&{}".format(ids, tweet_fields)
    return url

def bearer_oauth(r):
    r.headers["Authorization"] = f"Bearer {'AAAAAAAAAAAAAAAAAAAAAN%2B7QwEAAAAAEG%2BzRZkmZ4HGizsKCG3MkwlaRzY%3DOwuZeaeHbeMM1JDIafd5riA1QdkDabPiELFsguR4Zba9ywzzOQ'}"
    r.headers["User-Agent"] = "v2TweetLookupPython"
    return r

def connect_to_endpoint(url):
    response = requests.request("GET", url, auth=bearer_oauth)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(
                response.status_code, response.text
            )
        )
    return response.json()

def main():
    url = create_url()
    json_response = connect_to_endpoint(url)
    print(json.dumps(json_response, indent=3, sort_keys=True))

if __name__ == "__main__":
    main()
Unfortunately, my data set has more than 100 IDs in it and I want to retrieve the metrics for all of them. As I can only request 100 IDs at a time, can you help me with how to do that?
Also, I would like to make the request daily at midnight and then store the result in a txt file; maybe you can help me with that as well?
You can chunk your data and send it in batches using itertools.islice.
test.py:
import reprlib
from itertools import islice

import pandas as pd

BASE_URL = "https://api.twitter.com/2/tweets"
CHUNK = 100

def req(ids):
    tmp = reprlib.repr(ids)  # Used here just to shorten the output
    print(f"{BASE_URL}?ids={tmp}")

def main():
    df = pd.DataFrame({"id": range(1000)})
    it = iter(df["id"])
    while chunk := tuple(islice(it, CHUNK)):
        ids = ",".join(map(str, chunk))
        req(ids)

if __name__ == "__main__":
    main()
Test:
$ python test.py
https://api.twitter.com/2/tweets?ids='0,1,2,3,4,5,...5,96,97,98,99'
https://api.twitter.com/2/tweets?ids='100,101,102,...6,197,198,199'
https://api.twitter.com/2/tweets?ids='200,201,202,...6,297,298,299'
https://api.twitter.com/2/tweets?ids='300,301,302,...6,397,398,399'
https://api.twitter.com/2/tweets?ids='400,401,402,...6,497,498,499'
https://api.twitter.com/2/tweets?ids='500,501,502,...6,597,598,599'
https://api.twitter.com/2/tweets?ids='600,601,602,...6,697,698,699'
https://api.twitter.com/2/tweets?ids='700,701,702,...6,797,798,799'
https://api.twitter.com/2/tweets?ids='800,801,802,...6,897,898,899'
https://api.twitter.com/2/tweets?ids='900,901,902,...6,997,998,999'
Note: You'll make multiple requests with this approach so keep in mind any rate limits.
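If it helps, here is a hedged sketch of how the same chunking could feed the connect_to_endpoint function from the question and append every response to a dated text file. The metrics_... file name and the one-JSON-document-per-line layout are assumptions, not part of the Twitter API:

import json
from datetime import date
from itertools import islice

CHUNK = 100  # the v2 tweet lookup endpoint accepts at most 100 IDs per request

def fetch_all_metrics(tweet_ids):
    """Yield the JSON response for each batch of up to 100 tweet IDs."""
    it = iter(tweet_ids)
    while chunk := tuple(islice(it, CHUNK)):
        ids = ",".join(map(str, chunk))
        url = f"https://api.twitter.com/2/tweets?ids={ids}&tweet.fields=public_metrics"
        yield connect_to_endpoint(url)  # reuse the function from the question

def run_once(tweet_ids):
    out_path = f"metrics_{date.today()}.txt"  # assumed naming scheme
    # append one JSON document per line so the file can be re-read later
    with open(out_path, "a") as f:
        for response in fetch_all_metrics(tweet_ids):
            f.write(json.dumps(response) + "\n")

For the daily run at midnight, the usual approach is to schedule the script externally, for example with cron on Linux/macOS or Task Scheduler on Windows, rather than keeping a Python process running.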
I'm currently trying to put two things together when checking multiple websites from my input CSV file:
Check HTTP status
Check if Website displays specific keyword
then save the results to a new CSV file.
My input.csv:
id url
1 https://example123.com
2 https://envato.com/blog/30-outstanding-coming-soon-and-under-construction-website-templates/
3 https://mundoshoponline.com
My Code:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import asyncio
import re
from concurrent.futures import ProcessPoolExecutor, as_completed
df = pd.read_csv('path/to/my/input.csv')
#my csv has urls in the 1st column
urls = df.T.values.tolist()[1]
results = {}
status = []
async def scrape(url):
    try:
        r = requests.get(url, timeout=(3, 6))
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
        #all keywords to check on the website
        data = {
            "coming soon": soup.body.findAll(text = re.compile("coming soon", re.I)),
            "Opening Soon": soup.body.findAll(text = re.compile("Opening Soon", re.I)),
            "Forbidden": soup.body.findAll(text = re.compile("Forbidden", re.I)),
            "Page not found": soup.body.findAll(text = re.compile("Page not found", re.I)),
            "Under Construction": soup.body.findAll(text = re.compile("Under Construction", re.I)),
            "Currently Unavailable": soup.body.findAll(text = re.compile("Currently Unavailable", re.I))}
        results[url] = data
    #check for http status and save to status list
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        status.append("Down")
    except requests.exceptions.HTTPError:
        status.append("Other")
    else:
        status.append("OK")
async def main():
    await asyncio.wait([scrape(url) for url in urls])
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
loop.close()
comingList= []
openingList = []
forbiddenList= []
notfoundList = []
underList = []
currentlyList = []
#mark x if there are any hits for specific keyword
for url in results:
    comingList.append("x" if len(results[url]["coming soon"]) > 0 else "")
    openingList.append("x" if len(results[url]["Opening Soon"]) > 0 else "")
    forbiddenList.append("x" if len(results[url]["Forbidden"]) > 0 else "")
    notfoundList.append("x" if len(results[url]["Page not found"]) > 0 else "")
    underList.append("x" if len(results[url]["Under Construction"]) > 0 else "")
    currentlyList.append("x" if len(results[url]["Currently Unavailable"]) > 0 else "")
df["comingSoon"] = pd.DataFrame(comingList, columns=['comingSoon'])
df["openingSoon"] = pd.DataFrame(openingList, columns=['openingSoon'])
df["forbidden"] = pd.DataFrame(forbiddenList, columns=['forbidden'])
df["notfound2"] = pd.DataFrame(notfoundList, columns=['notfound2'])
df["underConstruction"] = pd.DataFrame(underList, columns=['underConstruction'])
df["currentlyUnavailable"] = pd.DataFrame(currentlyList, columns=['currentlyUnavailable'])
df['status'] = status
print(df)
df.to_csv('path/to/my/output.csv', index=False)
However, whenever I run the above script with for url in urls:, it throws this error for some of my urls, the script breaks, and output.csv is not generated:
Traceback (most recent call last):
  File "path/to/myscan.py", line 51, in <module>
    comingList.append("x" if len(results[url]["coming soon"]) > 0 else "")
KeyError: 'http://example123.com'
and when running it with for url in results: output.csv is as follows:
(screenshot of the generated output.csv omitted)
This seems erroneous: the first row has keywords marked as present (comingSoon, underConstruction columns) and status = Down, but that website doesn't contain the 'coming soon' or 'under construction' strings.
Would someone be able to help me with this? I believe there might be an issue in my loop or the try/except part of the code. I'm happy to provide more information if the above is not sufficient. Thank you in advance.
I think your main problem is that you are iterating over all of the urls, some of which may have failed and therefore do not exist in results as a key.
A much safer way is to iterate only over the urls that you are sure succeeded and have a key in results, so instead of
for url in urls:
you could make it
for url in results:
To make the final results consistent with the input order of your urls:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import asyncio
import re
from concurrent.futures import ProcessPoolExecutor, as_completed
df = pd.read_csv('./input.csv')
#my csv has urls in the 4th column
urls = [ 'example123.com', 'https://envato.com/blog/30-outstanding-coming-soon-and-under-construction-website-templates/', 'http://alotechgear.com']
results = {}
status = {}
async def scrape(url):
    try:
        r = requests.get(url, timeout=(3, 6))
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
        #all keywords to check on the website
        data = {
            "coming soon": soup.body.findAll(text = re.compile("coming soon", re.I)),
            "Opening Soon": soup.body.findAll(text = re.compile("Opening Soon", re.I)),
            "Forbidden": soup.body.findAll(text = re.compile("Forbidden", re.I)),
            "Page not found": soup.body.findAll(text = re.compile("Page not found", re.I)),
            "Under Construction": soup.body.findAll(text = re.compile("Under Construction", re.I)),
            "Currently Unavailable": soup.body.findAll(text = re.compile("Currently Unavailable", re.I))}
        results[url] = data
    #check for http status and save to status list
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout, requests.exceptions.MissingSchema):
        status[url] = "Down"
    except requests.exceptions.HTTPError:
        status[url] = "Other"
    else:
        status[url] = "OK"
async def main():
    await asyncio.wait([scrape(url) for url in urls])
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
loop.close()
comingList= []
openingList = []
forbiddenList= []
notfoundList = []
underList = []
currentlyList = []
statusList = []
#mark x if there are any hits for specific keyword
for url in urls:
    if(not results.get(url)):
        statusList.append(status.get(url))
        notfoundList.append("x")
        comingList.append("-")
        openingList.append("-")
        forbiddenList.append("-")
        underList.append("-")
        currentlyList.append("-")
    else:
        statusList.append(status.get(url))
        comingList.append("x" if len(results[url].get("coming soon")) > 0 else "-")
        openingList.append("x" if len(results[url].get("Opening Soon")) > 0 else "-")
        forbiddenList.append("x" if len(results[url].get("Forbidden")) > 0 else "-")
        notfoundList.append("x" if len(results[url].get("Page not found")) > 0 else "-")
        underList.append("x" if len(results[url].get("Under Construction")) > 0 else "-")
        currentlyList.append("x" if len(results[url].get("Currently Unavailable")) > 0 else "-")
df["comingSoon"] = pd.DataFrame(comingList, columns=['comingSoon'])
df["openingSoon"] = pd.DataFrame(openingList, columns=['openingSoon'])
df["forbidden"] = pd.DataFrame(forbiddenList, columns=['forbidden'])
df["notfound2"] = pd.DataFrame(notfoundList, columns=['notfound2'])
df["underConstruction"] = pd.DataFrame(underList, columns=['underConstruction'])
df["currentlyUnavailable"] = pd.DataFrame(currentlyList, columns=['currentlyUnavailable'])
df['status'] = pd.DataFrame(statusList, columns=['Status'])
print(df)
df.to_csv('./output.csv', index=False)
sample result:
id url comingSoon openingSoon forbidden notfound2 underConstruction currentlyUnavailable status
0 1 https://example123.com - - - x - - Down
1 2 https://envato.com/blog/30-outstanding-c... x - - - x - OK
2 3 https://mundoshoponline.com - - - x - - Down
The script I have exports all users, but I want to export only users who have type = xyz. There are two types of users in the directory, type A and type B, and I only want to export users whose type attribute matches B.
Please help me add a clause/statement to the script so it only pulls users with type "B" and ignores users with any other type.
import requests
import json
import re
import sys
import csv
orgName = ""
apiKey = ""
api_token = "SSWS "+ apiKey
headers = {'Accept':'application/json','Content-Type':'application/json','Authorization':api_token}
def GetPaginatedResponse(url):
    response = requests.request("GET", url, headers=headers)
    returnResponseList = []
    responseJSON = json.dumps(response.json())
    responseList = json.loads(responseJSON)
    returnResponseList = returnResponseList + responseList
    if "errorCode" in responseJSON:
        print "\nYou encountered following Error: \n"
        print responseJSON
        print "\n"
        return "Error"
    else:
        headerLink = response.headers["Link"]
        while str(headerLink).find("rel=\"next\"") > -1:
            linkItems = str(headerLink).split(",")
            nextCursorLink = ""
            for link in linkItems:
                if str(link).find("rel=\"next\"") > -1:
                    nextCursorLink = str(link)
            nextLink = str(nextCursorLink.split(";")[0]).strip()
            nextLink = nextLink[1:]
            nextLink = nextLink[:-1]
            url = nextLink
            response = requests.request("GET", url, headers=headers)
            responseJSON = json.dumps(response.json())
            responseList = json.loads(responseJSON)
            returnResponseList = returnResponseList + responseList
            headerLink = response.headers["Link"]
        returnJSON = json.dumps(returnResponseList)
        return returnResponseList
def DownloadSFUsers():
    url = "https://"+orgName+".com/api/v1/users"
    responseJSON = GetPaginatedResponse(url)
    if responseJSON != "Error":
        userFile = open("Only-Okta_Users.csv", "wb")
        writer = csv.writer(userFile)
        writer.writerow(["login", "type"])
        for user in responseJSON:
            login = user[u"profile"][u"login"]
            type = user[u"credentials"][u"provider"][u"type"]
            writer.writerow([login, type])

if __name__ == "__main__":
    DownloadSFUsers()
Wrap your statement that writes a user to the csv file in an if statement that tests for the correct type.
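For example, a minimal sketch of that filter inside DownloadSFUsers, assuming the type value lives where your script already reads it (under credentials.provider.type) and that the literal value you want to keep is "B"; adjust both if your directory stores the type elsewhere, e.g. in the profile:

for user in responseJSON:
    login = user[u"profile"][u"login"]
    user_type = user[u"credentials"][u"provider"][u"type"]
    # only write users whose type matches "B"; everyone else is skipped
    if user_type == "B":
        writer.writerow([login, user_type])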
Here is my code:
import base64
import requests
import json
import csv
USERNAME, PASSWORD = 'Username', 'Password'
req = requests.get(
    url="https://api.mysportsfeeds.com/v1.1/pull/nhl/2017-2018-regular/game_startinglineup.json?gameid=20171109-EDM-NJD",
    params={
        "fordate": "20171009"
    },
    headers={
        "Authorization": "Basic " +
            base64.b64encode('{}:{}'.format(USERNAME, PASSWORD)
                             .encode('utf-8')).decode('ascii')
    }
)
data = req.json()
for i in range(1):
    team_home = data['gamestartinglineup']['teamLineup'][i]['expected']['starter']

for i in range(2):
    team_away = data['gamestartinglineup']['teamLineup'][i]['expected']['starter']

#headers = ["HomeTeam", "AwayTeam"]

for i in range(18):
    homeplayer = team_home[i]['position']
    awayplayer = team_away[i]['position']
    homename = team_home[i]['player']['LastName']
    awayname = team_away[i]['player']['LastName']
    my_data = []
    my_data.append([homeplayer, homename, awayplayer, awayname])
    print(my_data)
Here is what I am requesting:
{"gamestartinglineup":{"lastUpdatedOn":"2017-12-12 11:56:50 PM","game":{"id":"41009","date":"2017-11-09","time":"7:00PM","awayTeam":{"ID":"24","City":"Edmonton","Name":"Oilers","Abbreviation":"EDM"},"homeTeam":{"ID":"7","City":"New Jersey","Name":"Devils","Abbreviation":"NJD"},"location":"Prudential Center"},"teamLineup":[{"team":{"ID":"24","City":"Edmonton","Name":"Oilers","Abbreviation":"EDM"},"expected":{"starter":[{"position":"Goalie-Backup","player":{"ID":"5552","LastName":"Brossoit","FirstName":"Laurent","JerseyNumber":"1","Position":"G"}},{"position":"ForwardLine1-RW","player":{"ID":"4854","LastName":"Maroon","FirstName":"Patrick","JerseyNumber":"19","Position":"LW"}},{"position":"ForwardLine2-C","player":{"ID":"4993","LastName":"Nugent-Hopkins","FirstName":"Ryan","JerseyNumber":"93","Position":"C"}},{"position":"ForwardLine4-C","player":{"ID":"4730","LastName":"Letestu","FirstName":"Mark","JerseyNumber":"55","Position":"C"}},{"position":"ForwardLine3-LW","player":{"ID":"11308","LastName":"Caggiula","FirstName":"Drake","JerseyNumber":"91","Position":"LW"}},{"position":"ForwardLine4-RW","player":{"ID":"5875","LastName":"Khaira","FirstName":"Jujhar","JerseyNumber":"16","Position":"LW"}},{"position":"ForwardLine3-C","player":{"ID":"3637","LastName":"Jokinen","FirstName":"Jussi","JerseyNumber":"36","Position":"LW"}},{"position":"ForwardLine3-RW","player":{"ID":"4997","LastName":"Strome","FirstName":"Ryan","JerseyNumber":"18","Position":"C"}},{"position":"ForwardLine1-C","player":{"ID":"5576","LastName":"McDavid","FirstName":"Connor","JerseyNumber":"97","Position":"C"}},{"position":"ForwardLine1-LW","player":{"ID":"5417","LastName":"Draisaitl","FirstName":"Leon","JerseyNumber":"29","Position":"C"}},{"position":"ForwardLine2-LW","player":
My output is: [['DefensePair1-R', 'Klefbom', 'DefensePair1-R', 'Severson']]
I created for loops for home and away and also for the players within them. There are 20 per home and away, but it is only returning 18. I thought the expression covers 0 through 18, not only 18, so there must be something else in the script that is poorly written.
for i in range(18):
    homeplayer = team_home[i]['position']
    awayplayer = team_away[i]['position']
    homename = team_home[i]['player']['LastName']
    awayname = team_away[i]['player']['LastName']
    my_data = []  ## HERE!
    my_data.append([homeplayer, homename, awayplayer, awayname])
    print(my_data)
Bring that out of the loop:
my_data = []  ## HERE!
for i in range(18):
    homeplayer = team_home[i]['position']
    awayplayer = team_away[i]['position']
    homename = team_home[i]['player']['LastName']
    awayname = team_away[i]['player']['LastName']
    my_data.append([homeplayer, homename, awayplayer, awayname])
print(my_data)
Edit:
Found another snippet that is possibly an error:
for i in range(2):
    team_away = data['gamestartinglineup']['teamLineup'][i]['expected']['starter']
In this snippet, you are overwriting team_away on every iteration, so after the loop it only holds the lineup at index 1. Not sure if that is what you want; see the sketch below.
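For what it's worth, here is a minimal sketch of picking the two lineups without the loops. The assumption that the structure of the second teamLineup entry mirrors the first comes from the sample JSON above (where the first entry is EDM, the away team); matching on the team abbreviation is safer than relying on list order:

lineups = data['gamestartinglineup']['teamLineup']
away_abbr = data['gamestartinglineup']['game']['awayTeam']['Abbreviation']

# pick each lineup by comparing its team abbreviation instead of relying on list order
team_away = next(l['expected']['starter'] for l in lineups
                 if l['team']['Abbreviation'] == away_abbr)
team_home = next(l['expected']['starter'] for l in lineups
                 if l['team']['Abbreviation'] != away_abbr)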