How can I add an exception handler for KeyError? - Python

I'm trying to make a program to interface with the Google CSE API, iterate over a list of people with companies, and pull LinkedIn profile information from the structured data. It then runs a regex to pull information from the results and appends it to a text file. In testing it works up to a point, but once I reach a certain name in the list, I get this error:
Traceback (most recent call last):
  File "C:\Users\svillamil\Desktop\CSE2.py", line 27, in <module>
    results = google_search("Gene Grochala Capital Health", my_api_key, my_cse_id, num=1)
  File "C:\Users\svillamil\Desktop\CSE2.py", line 17, in google_search
    return res['items']
KeyError: 'items'
Investigating it on the CSE shows that the name and company yield no results. So I added an exception handler for the KeyError:
except KeyError:
    pass
This did not work, so I tried:
except Exception as e:
    pass
and even:
except:
    pass
with no luck.
Is there something wrong with my code that's stopping this error from being caught? Or could it be an issue with the initial input?
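For reference, a try/except only catches an exception if the statement that raises it runs inside the try block, even when the raise happens deeper inside a called function. A minimal sketch of wrapping the call itself, using the same names as the program below:

try:
    results = google_search(name, my_api_key, my_cse_id, num=1)
except KeyError:
    results = []  # the response had no 'items' key, so treat it as no results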
Here is my program for reference:
from googleapiclient.discovery import build
import pprint
import csv
import re
import time
import os

os.chdir('C:\\users\\name\\Desktop')

my_api_key = "xxxxx"
my_cse_id = "xxxxx"

def google_search(search_term, api_key, cse_id, **kwargs):
    # initializes an instance of the custom search service with the build module
    service = build("customsearch", "v1", developerKey=api_key)
    # executes cse().list() on service to return metadata on the search performed,
    # the specific CSE it is calling, and any other variable that might be added
    # when calling the function as a whole
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    return res['items']

a = 0
with open('list.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        a += 1
        name = row[1] + ' ' + row[2] + ' at ' + row[4]
        print("This trial will search for", name)
        results = google_search(name, my_api_key, my_cse_id, num=1)
        try:
            for result in results:
                fn = r"fn':\s'(.+?)'"
                pt = r"pagemap':.+'title.+?\s'(.*?)'"
                role = r"role':\W+(.+?)'"
                org = r"org\W+(.+?)'"
                with open("cse_result.txt", "a+") as nameLookup:
                    if re.search(str(fn), str(result)) is not None:
                        name2 = re.search(str(fn), str(result)).group(1)
                        nameLookup.write("Trial " + str(a) + '\n')
                        nameLookup.write("The name being searched for in this trial is " + name + '.\n')
                        nameLookup.write("The name found is " + str(name2) + "\n")
                        nameLookup.write('\n')
                    else:
                        nameLookup.write("Trial " + str(a) + '\n')
                        nameLookup.write("We could not find a name on this trial." + '\n')
                        nameLookup.write('\n')
                    if re.search(str(pt), str(result)) is not None:
                        position_title = re.search(str(pt), str(result)).group(1)
                        nameLookup.write("The position found at this trial is " + position_title + '.\n')
                        nameLookup.write('\n')
                    else:
                        nameLookup.write('We could not find a position title at this trial.')
                        nameLookup.write('\n')
                    if re.search(str(role), str(result)) is not None:
                        role_title = re.search(str(role), str(result)).group(1)
                        nameLookup.write("The position found at this trial is " + role_title + '.\n')
                        nameLookup.write('\n')
                    else:
                        nameLookup.write('We could not return a position at this trial.')
                        nameLookup.write('\n')
                    if re.search(str(org), str(result)) is not None:
                        orginization = re.search(str(org), str(result)).group(1)
                        nameLookup.write("The orginization found at this trial is " + orginization + '.\n')
                        nameLookup.write('\n')
                    else:
                        nameLookup.write('We could not return an orginization at this trial.')
                        nameLookup.write('\n')
                    nameLookup.write('\n')
                    nameLookup.write('==========================')
                    nameLookup.write('\n')
        except KeyError:
            pass
        #time.sleep(1)
This still yielded the same error.
=======================================================
Here is the code edited with some changes based on the comments:
def google_search(search_term, api_key, cse_id, **kwargs):
    # initializes an instance of the custom search service with the build module
    service = build("customsearch", "v1", developerKey=api_key)
    # executes cse().list() on service to return metadata on the search performed,
    # the specific CSE it is calling, and any other variable that might be added
    # when calling the function as a whole
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    return res.get('items', [])

a = 0

def is_empty(any_structure):
    if any_structure:
        return False
    else:
        return True

with open('list.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        a += 1
        name = row[1] + ' ' + row[2] + ' at ' + row[4]
        print("This trial will search for", name)
        results = google_search(name, my_api_key, my_cse_id, num=1)
        for result in results:
            fn = r"fn':\s'(.+?)'"
            pt = r"pagemap':.+'title.+?\s'(.*?)'"
            role = r"role':\W+(.+?)'"
            org = r"org\W+(.+?)'"
            with open("cse_result.txt", "a+") as nameLookup:
                if is_empty(result) == True:
                    nameLookup.write('We could not return any data at this trial. Please see linkedin. This is trial ' + a)
                    nameLookup.write('\n')
                if re.search(str(fn), str(result)) is not None:
                    name2 = re.search(str(fn), str(result)).group(1)
                    nameLookup.write("Trial " + str(a) + '\n')
                    nameLookup.write("The name being searched for in this trial is " + name + '.\n')
                    nameLookup.write("The name found is " + str(name2) + "\n")
                    nameLookup.write('\n')
                else:
                    nameLookup.write("Trial " + str(a) + '\n')
                    nameLookup.write("We could not find a name on this trial." + '\n')
                    nameLookup.write('\n')
                if re.search(str(pt), str(result)) is not None:
                    position_title = re.search(str(pt), str(result)).group(1)
                    nameLookup.write("The position found at this trial is " + position_title + '.\n')
                    nameLookup.write('\n')
                else:
                    nameLookup.write('We could not find a position title at this trial.')
                    nameLookup.write('\n')
                if re.search(str(role), str(result)) is not None:
                    role_title = re.search(str(role), str(result)).group(1)
                    nameLookup.write("The position found at this trial is " + role_title + '.\n')
                    nameLookup.write('\n')
                else:
                    nameLookup.write('We could not return a position at this trial.')
                    nameLookup.write('\n')
                if re.search(str(org), str(result)) is not None:
                    orginization = re.search(str(org), str(result)).group(1)
                    nameLookup.write("The orginization found at this trial is " + orginization + '.\n')
                    nameLookup.write('\n')
                else:
                    nameLookup.write('We could not return an orginization at this trial.')
                    nameLookup.write('\n')
                nameLookup.write('\n')
                nameLookup.write('==========================')
                nameLookup.write('\n')
The problem now is that it does not append the notice that no data was found when the dictionary object is empty.
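Note that when res.get('items', []) comes back empty, results is an empty list and the body of for result in results: never executes, so the is_empty check inside it can never run. A minimal sketch of testing the list itself before entering the loop, reusing the names above (and using str(a) so the concatenation does not raise a TypeError):

if not results:
    with open("cse_result.txt", "a+") as nameLookup:
        nameLookup.write('We could not return any data at this trial. Please see linkedin. This is trial ' + str(a) + '\n')
for result in results:
    ...  # the existing per-result processing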

Related

Python Multiprocessing in web crawler

I am trying to implement multiprocessing in my web crawler. What I usually see online is passing the URL as the argument to map, map_async, or apply_async. The data I am crawling is in a table, so I extract it with two BeautifulSoup find_all calls, one for rows and one for columns. Since the data I am crawling is sometimes on a single page that only needs one URL, I tried using the list returned by find_all as the argument for map_async, but I get the error "Fatal Python error: Cannot recover from stack overflow."
The error occurred on the following line:
return_list = pool.map_async(func, Species_all_recorded_data_List)
How can I solve this, or where would it be better to implement the multiprocessing?
The second problem is that if I put some code above the function crawl_all_data_mp, all of that code executes again when pool = Pool() runs. I worked around it by simply moving all the other code below that function, which might not be correct, since I still can't really run the code because of the first error.
I'm looking for your advice.
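For context, on Windows the multiprocessing module starts each worker by re-importing the main module, so any code sitting at module level runs again in every worker process; only definitions should live at module level, and the one-time work belongs under an if __name__ == '__main__': guard. A minimal, self-contained sketch of that pattern (not the crawler code itself):

from multiprocessing import Pool

def square(x):
    # stand-in for the real per-row worker function
    return x * x

if __name__ == '__main__':
    # everything under this guard runs only in the parent process;
    # module-level code above it is re-executed in every spawned worker
    with Pool(4) as pool:
        print(pool.map(square, range(10)))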
My code:
(1) Function to call for web crawling
from tkinter import filedialog
from tkinter import *
import csv
import os.path
from os import path
from Index import *
from Dragonfly import *
import codecs
from multiprocessing import Process, Value

# multiprocessing version
def multiprocessing_row_data(Web_rawl_Species_family_name, Web_rawl_Species_name, Total_num, Limit_CNT, expecting_CNT, oldID, page, Species_all_record_data_Data_Set):
    global DataCNT, stop_crawl_all_data_mp
    tmp_List = Species_all_record_data_Data_Set.find_all('td')
    # End conditions:
    # 1. no data in the next page
    # 2. for updates, stop once the old data is found by inspecting its ID
    # 3. the count goes over the limit count
    id = tmp_List[0].text
    if (len(id) == 0) or (DataCNT >= expecting_CNT) or (DataCNT >= Limit_CNT):
        print(' --Finish crawl--' + ' crawl to page: ' + str(page) + ", ID: " + id + ", count: " + str(DataCNT))
        stop_crawl_all_data_mp = True
        raise StopIteration
    # access the same value in memory when doing multiprocessing
    with DataCNT.getlock():
        DataCNT.value += 1
    response_DetailedInfo = session.post(general_url + Detailed_discriptions_url + id, headers=headers)
    soup2 = BeautifulSoup(response_DetailedInfo.text, 'html.parser')
    print("Current finished data >> " + str(DataCNT.value) + " /" + str(Total_num) + " (" + str(DataCNT.value * 100 / Total_num) + "%)", end='\r')
    return DetailedTableInfo(tmp_List[0].text, tmp_List[1].text, tmp_List[2].text, tmp_List[3].text, tmp_List[4].text, tmp_List[5].text, tmp_List[7].text, tmp_List[6].text,
                             soup2.find(id='R_LAT').get('value'),
                             soup2.find(id='R_LNG').get('value'),
                             Web_rawl_Species_family_name,
                             Web_rawl_Species_name,
                             soup2.find(id='R_MEMO').get('value'))

def crawl_all_data_mp(Web_rawl_Species_family_name, Web_rawl_Species_name, Total_num, Limit_CNT, expecting_CNT, oldID):
    page = 0
    DataList = []
    while not stop_crawl_all_data_mp:
        pool = multiprocessing.Pool(10)
        Species_all_recorded_data = session.post(general_url +
                                                 species_all_record_data_first_url +
                                                 species_all_record_data_page_url + str(page) +
                                                 species_all_record_data_species_url +
                                                 Species_class_key[Web_rawl_Species_family_name] +
                                                 Species_key[Web_rawl_Species_name],
                                                 headers=headers)
        soup = BeautifulSoup(Species_all_recorded_data.text, 'html.parser')
        Species_all_recorded_data_List = soup.find_all(id='theRow')
        func = partial(multiprocessing_row_data, Web_rawl_Species_family_name, Web_rawl_Species_name, Total_num, Limit_CNT, expecting_CNT, oldID, page)
        return_list = pool.map_async(func, Species_all_recorded_data_List)
        DataList.append(list(filter(None, return_list.get())))
        page += 1
    # make sure that when main is finished, the subprocesses still keep rolling on
    pool.close()
    pool.join()
    return [DataList, page]
(2) main
It goes wrong on the following line, which calls the function above:
[datatmpList, page] = crawl_all_data_mp(Input_species_famliy, Input_species, Total_num, limit_cnt, expecting_CNT, oldID)
The main code:
# --main--
if __name__ == '__main__':
    # settings
    Input_species_famliy = "細蟌科"
    Input_species = "四斑細蟌"
    limit_cnt = 6000
    folder = 'Crawl_Data\\' + Species_class_key[Input_species_famliy]
    File_name = folder + "\\" + Species_class_key[Input_species_famliy] + Species_key[Input_species] + '.csv'
    oldID = 0
    oldData_len = 0
    print("--Start crawl-- " + Input_species_famliy + " " + Input_species)
    print("[folder]: " + folder)
    stop_crawl_all_data_mp = False
    # check whether the file exists or not
    file_check = path.exists(current_path + "\\" + File_name)
    # get the old ID
    if file_check:
        file_size = os.stat(current_path + "\\" + File_name).st_size
        if not file_size == 0:
            with open(File_name, newline='', errors="ignore") as F:
                R = csv.reader(F)
                oldData = [line for line in R]
                oldID = oldData[0][0]
                oldData_len = len(oldData) - 1
    # login
    Login_Web(myaccount, mypassword)
    # find the total number of the species_input (expected to execute one time)
    Species_total_num_Dict = Find_species_total_data()
    # get the data
    Total_num = int(Species_total_num_Dict[Input_species])
    #[datatmpList, page] = crawl_all_data(Input_species_famliy, Input_species, Total_num, limit_cnt, oldID)
    expecting_CNT = Total_num - oldData_len  # the total number of data that needs to be updated or crawled
    [datatmpList, page] = crawl_all_data_mp(Input_species_famliy, Input_species, Total_num, limit_cnt, expecting_CNT, oldID)
    Data = []
    for Data_tmp in datatmpList:
        Data.append([Data_tmp.SpeciesFamily,
                     Data_tmp.Species,
                     Data_tmp.IdNumber,
                     Data_tmp.Dates,
                     Data_tmp.Times,
                     Data_tmp.User,
                     Data_tmp.City,
                     Data_tmp.Dictrict,
                     Data_tmp.Place,
                     Data_tmp.Altitude,
                     Data_tmp.Latitude,
                     Data_tmp.Longitude,
                     Data_tmp.Description
                     ])
    # automatically make the directories
    newDir = current_path + "\\" + folder
    if not os.path.isdir(newDir):
        os.mkdir(newDir)
    # 'a' stands for append, which appends the new data to the old one
    with open(File_name, mode='a', newline='', errors="ignore") as employee_file:
        employee_writer = csv.writer(employee_file, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        # init, for when the file does not exist or is empty
        if (not file_check) or (file_size == 0):
            employee_writer.writerow(CSV_Head)
            employee_writer.writerows(Data)
        # for inserting the data into the old one
        else:
            for i in range(0, len(Data)):
                oldData.insert(i, Data[i])
            employee_writer.writerows(oldData)

Access list of values from a function in another function in Python?

I have a function that gathers a list of UPCs. I created another function to take the list and search for prices. The issue I am having is that when a UPC is not found, a KeyError occurs. How do I ignore the UPCs with no match and continue with the code? The current version of the code is an infinite loop.
def trending():
    trending = requests.get('http://api.com/trends?format=json&apiKey={}'.format(apiKey))
    trendingResponse = trending.json()
    items = trendingResponse['items']
    for item in items:
        price = item['salePrice']
        name = item['name']
        upc = item['upc']
        stock = item['stock']
        image = item['largeImage']
        url = item['productUrl']
        sDescription = item['shortDescription']
        brandName = item['brandName']
        availableOnline = item['availableOnline']
        print('Current UPC = ' + str(upc))
        return upc_lookup(upc)

def upc_lookup(upc):
    products_api = mws.Products(access_key, secret_key, seller_id, region='US')
    # lookup product by upc
    products = products_api.get_matching_product_for_id(marketplaceid=marketplace_usa, type_='UPC', ids=upc)
    parse = products.parsed
    while True:
        try:
            # return asin from UPC lookup
            asin = parse['Products']['Product']['Identifiers']['MarketplaceASIN']['ASIN']['value']
            print('ASIN Found = ' + str(asin))
        except KeyError:
            print('UPC {} not Found in Amazon'.format(upc))
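The while True in upc_lookup never breaks or returns, which is why it loops forever. A minimal sketch of one way to skip UPCs with no match, reusing the names and credentials from the question (so it assumes the same mws setup):

def upc_lookup(upc):
    products_api = mws.Products(access_key, secret_key, seller_id, region='US')
    # look up the product by UPC
    products = products_api.get_matching_product_for_id(marketplaceid=marketplace_usa, type_='UPC', ids=upc)
    parse = products.parsed
    try:
        # a single lookup needs no loop at all
        asin = parse['Products']['Product']['Identifiers']['MarketplaceASIN']['ASIN']['value']
    except KeyError:
        print('UPC {} not Found in Amazon'.format(upc))
        return None
    print('ASIN Found = ' + str(asin))
    return asin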
It looks like I had to move the return in the first function out of the for loop.
def function_1():
    function_1.item = {'salePrice': 100, 'name': 'ABC', 'stock': 3, 'brand-name': 4}

def function_2():
    item = function_1.item
    sp = item['salePrice']
    name = item['name']
    stock = item['stock']
    print(sp, name, stock)

function_1()
function_2()
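To illustrate the point about the return: a return inside a for loop ends the function on its first iteration, so only the first UPC would ever be looked up, while collecting the values and returning after the loop lets every item be processed. A small sketch with generic names (not the original API calls):

def gather_upcs(items):
    upcs = []
    for item in items:
        upcs.append(item['upc'])
    return upcs  # returned after the loop, so every item is included

for upc in gather_upcs([{'upc': '0001'}, {'upc': '0002'}]):
    print('Current UPC =', upc)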

Trouble opening up a Twitter text file in Python

I gathered a bunch of tweets for analysis with Python, but when I try to open the text file, I receive this error message. I don't know if maybe something is wrong with the schema of the tweets that I collected.
JSONDecodeError: Extra data: line 2 column 1 (char 12025)
Here is the code that I compiled:
with open('tweets1.json') as dakota_file:
    dakota_j = json.loads(dakota_file.read())
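The "Extra data" error means the file contains more than one JSON document; the collection code below writes one tweet per line, so each line has to be parsed on its own. A minimal sketch of reading it that way (same tweets1.json file assumed):

import json

tweets = []
with open('tweets1.json') as dakota_file:
    for line in dakota_file:
        line = line.strip()
        if line:  # skip blank lines
            tweets.append(json.loads(line))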
Please see code:
import sys
import jsonpickle
import os

searchQuery = '#Dakota-Access-Pipeline'  # this is what we're searching for
#maxTweets = 10000000  # Some arbitrary large number
maxTweets = 6000
tweetsPerQry = 100  # this is the max the API permits
#fName = 'tweets.txt'  # We'll store the tweets in a text file.
fName = 'tweets.json'
# If results from a specific ID onwards are required, set since_id to that ID;
# else default to no lower limit and go as far back as the API allows.
sinceId = None
# If results only below a specific ID are required, set max_id to that ID;
# else default to no upper limit and start from the most recent tweet matching the search query.
max_id = -10000000
tweetCount = 0
print("Downloading max {0} tweets".format(maxTweets))
with open(fName, 'w') as f:
    while tweetCount < maxTweets:
        try:
            if (max_id <= 0):
                if (not sinceId):
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry)
                else:
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                            since_id=sinceId)
            else:
                if (not sinceId):
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                            max_id=str(max_id - 1))
                else:
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                            max_id=str(max_id - 1),
                                            since_id=sinceId)
            if not new_tweets:
                print("No more tweets found")
                break
            for tweet in new_tweets:
                f.write(jsonpickle.encode(tweet._json, unpicklable=False) +
                        '\n')
            tweetCount += len(new_tweets)
            print("Downloaded {0} tweets".format(tweetCount))
            max_id = new_tweets[-1].id
        except tweepy.TweepError as e:
            # Just exit if any error
            print("some error : " + str(e))
            break
print("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))

Scrape Facebook AttributeError

I am a beginner in Python.
How can I solve
AttributeError: module 'urllib' has no attribute 'Request'
I have looked at other posts but still can't understand how to solve the problem.
Here is the screen capture of the error.
And this is the code (I referred to https://github.com/minimaxir/facebook-page-post-scraper/blob/master/get_fb_posts_fb_page.py):
import urllib.request
import json, datetime, csv, time

app_id = "xxx"
app_secret = "xxx"  # DO NOT SHARE WITH ANYONE!
access_token = "xxx"
page_id = 'xxx'

def testFacebookPageData(page_id, access_token):
    # construct the URL string
    base = "https://graph.facebook.com/v2.4"
    node = "/" + page_id + '/feed'
    parameters = "/?access_token=%s" % access_token
    url = base + node + parameters
    # retrieve data
    response = urllib.request.urlopen(url)
    data = json.loads(response.read().decode('utf-8'))
    print(data)

def request_until_succeed(url):
    req = urllib.request.urlopen(url)
    success = False
    while success is False:
        try:
            response = urllib.urlopen(req)
            if response.getcode() == 200:
                success = True
        except Exception as e:
            print(e)
            time.sleep(5)
            print(url, datetime.datetime.now())
    return response.read()

def getFacebookPageFeedData(page_id, access_token, num_statuses):
    # construct the URL string
    base = "https://graph.facebook.com"
    node = "/" + page_id + "/feed"
    parameters = "/?fields=message,link,created_time,type,name,id,likes.limit(1).summary(true),comments.limit(1).summary(true),shares&limit=%s&access_token=%s" % (num_statuses, access_token)  # changed
    url = base + node + parameters
    # retrieve data
    data = json.loads(request_until_succeed(url))
    return data

def processFacebookPageFeedStatus(status):
    # The status is now a Python dictionary, so for top-level items,
    # we can simply call the key.
    # Additionally, some items may not always exist,
    # so must check for existence first
    status_id = status['id']
    status_message = '' if 'message' not in status.keys() else status['message'].encode('utf-8')
    link_name = '' if 'name' not in status.keys() else status['name'].encode('utf-8')
    status_type = status['type']
    status_link = '' if 'link' not in status.keys() else status['link']
    # Time needs special care since a) it's in UTC and
    # b) it's not easy to use in statistical programs.
    status_published = datetime.datetime.strptime(status['created_time'], '%Y-%m-%dT%H:%M:%S+0000')
    status_published = status_published + datetime.timedelta(hours=-5)  # EST
    status_published = status_published.strftime('%Y-%m-%d %H:%M:%S')  # best time format for spreadsheet programs
    # Nested items require chaining dictionary keys.
    num_likes = 0 if 'likes' not in status.keys() else status['likes']['summary']['total_count']
    num_comments = 0 if 'comments' not in status.keys() else status['comments']['summary']['total_count']
    num_shares = 0 if 'shares' not in status.keys() else status['shares']['count']
    # return a tuple of all processed data
    return (status_id, status_message, link_name, status_type, status_link,
            status_published, num_likes, num_comments, num_shares)

def scrapeFacebookPageFeedStatus(page_id, access_token):
    with open('%s_facebook_statuses.csv' % page_id, 'w') as file:
        w = csv.writer(file)
        w.writerow(["status_id", "status_message", "link_name", "status_type", "status_link",
                    "status_published", "num_likes", "num_comments", "num_shares"])
        has_next_page = True
        num_processed = 0  # keep a count on how many we've processed
        scrape_starttime = datetime.datetime.now()
        print(page_id, scrape_starttime)
        statuses = getFacebookPageFeedData(page_id, access_token, 100)
        while has_next_page:
            for status in statuses['data']:
                w.writerow(processFacebookPageFeedStatus(status))
                # output progress occasionally to make sure code is not stalling
                num_processed += 1
                if num_processed % 1000 == 0:
                    print(num_processed, datetime.datetime.now())
            # if there is no next page, we're done.
            if 'paging' in statuses.keys():
                statuses = json.loads(request_until_succeed(statuses['paging']['next']))
            else:
                has_next_page = False
        print(num_processed, datetime.datetime.now() - scrape_starttime)

if __name__ == '__main__':
    scrapeFacebookPageFeedStatus(page_id, access_token)
There is no urllib.Request() in Python 3 - there is urllib.request.Request().
EDIT: you have url = urllib.Request(url) in the error message, but I don't see that line in your code - maybe you are running the wrong file.
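For reference, a minimal Python 3 sketch of the working calls (standard library only, example.com as a placeholder URL):

import urllib.request

url = 'https://example.com'
req = urllib.request.Request(url)  # build the request object
with urllib.request.urlopen(req) as response:
    print(response.getcode(), len(response.read()))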

YouTube API: handling a deleted video error

I have written code to get playlists, and the video lists within them, into separate text files:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
YouTube Playlist Extractor.
A tool to extract playlists from the YouTube API, which in today's YouTube page format is very difficult to extract.
It also extracts the video list per playlist and hence takes a bit longer to run for long playlists.
"""
#from profiler import Profiler
from xml.dom.minidom import parseString
import os

try:
    import urllib.request as urlLibReq
    PY3 = True
except:
    import urllib as urlLibReq
    PY3 = False

def getInput():
    if PY3:
        return input("Enter username of YouTube channel: ")
    elif not PY3:
        return raw_input("Enter username of YouTube channel: ")

def xmlParser(url):
    page = urlLibReq.urlopen(url)
    text = page.read().decode("utf8")
    return parseString(text)

def extractplaylist(userId):
    url = "https://gdata.youtube.com/feeds/api/users/" + userId + "/playlists?v=2"
    dom = xmlParser(url)
    total = int(dom.getElementsByTagName("openSearch:totalResults")[0].firstChild.nodeValue)
    startIndex, listEntry = 1, []
    while startIndex <= total:
        url_new = url + "&max-results=50&start-index=" + str(startIndex)
        dom = xmlParser(url_new)
        entry = dom.getElementsByTagName("entry")
        for node in entry:
            id_data = node.getElementsByTagName("id")[0].firstChild.nodeValue
            id_split = id_data.split(':')
            playlist_id = id_split[5]
            playlist_title = node.getElementsByTagName("title")[0].firstChild.nodeValue
            extractvideolist(userId, playlist_id, playlist_title)
            listEntry.append(str(playlist_title))
            startIndex += 1
    listEntry.sort()
    writer = open(userId + "_playlist.txt", "w")
    writer.write("\r\n".join(map(str, listEntry)))
    writer.close()

def extractvideolist(userId, playlist_id, playlist_title):
    url = "http://gdata.youtube.com/feeds/api/playlists/" + playlist_id + "?v=2"
    dom = xmlParser(url)
    total = int(dom.getElementsByTagName("openSearch:totalResults")[0].firstChild.nodeValue)
    startIndex, listEntry = 1, []
    while startIndex <= total:
        url_new = url + "&max-results=50&start-index=" + str(startIndex)
        dom = xmlParser(url_new)
        entry = dom.getElementsByTagName("entry")
        for node in entry:
            video_title = node.getElementsByTagName("title")[0].firstChild.nodeValue
            listEntry.append(str(video_title))
            startIndex += 1
    playlist_title = playlist_title.replace("'", "\'")
    writer = open(playlist_title + "_videolist.txt", "w")
    writer.write("\r\n".join(map(str, listEntry)))
    writer.close()
    print("written", playlist_title)
    try:
        os.mkdir(userId)
    except:
        pass
    os.system('mv "' + playlist_title + '_videolist.txt" ' + userId)

if __name__ == "__main__":
    name = getInput()
    extractplaylist(name)
    #Profiler.report()
The code fails when there is a deleted video in the playlist. How do I deal with such a thing?
Try adding an else clause to your for loop to break out of the while loop when the for loop ends.
while startIndex <= total:
    url_new = url + "&max-results=50&start-index=" + str(startIndex)
    dom = xmlParser(url_new)
    entry = dom.getElementsByTagName("entry")
    for node in entry:
        id_data = node.getElementsByTagName("id")[0].firstChild.nodeValue
        id_split = id_data.split(':')
        playlist_id = id_split[5]
        playlist_title = node.getElementsByTagName("title")[0].firstChild.nodeValue
        extractvideolist(userId, playlist_id, playlist_title)
        listEntry.append(str(playlist_title))
        startIndex += 1
    else:
        break
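For context, the else attached to a for loop runs only when the loop finishes without hitting a break, which is what lets the break under it end the enclosing while loop. A tiny illustration:

for item in [1, 2, 3]:
    print(item)
else:
    # runs because the loop above completed without a break
    print('loop completed without break')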
