I'm looping over multiple requests to a web database; each iteration takes one geneId and queries the database with it. If the geneId is referenced, I can use the data I get back for another request to a second database. But if the geneId is not referenced, the database returns nothing and that breaks my functions.
So I added a None case, but now I'm getting:
TypeError: 'NoneType' object is not iterable
Here is the part of my code with the None:
def getNewID(oneGeneIdAtOnce):
    url = "http:/blablabla"
    try:
        cookieJar = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookieJar))
        opener.addheaders = getHeaders()
        resp = opener.open(url)
        data = urllib.parse.urlencode(getPost(oneGeneIdAtOnce))
        data = data.encode("UTF-8")
        resp = opener.open(url, data)
        respData = resp.read()
        jobValue = getJobValueFromCookie(cookieJar)
        downloadUrl = getUrlFromJobValue(jobValue)
        resp = opener.open(downloadUrl)
        respData = resp.read()
        string = respData.decode("UTF-8")
        if not string:
            return None
        l = string.split("\n")
        lFinal = l[1].split("\t")
        return lFinal[1]
    except HTTPError as httpError:
        print("HERE " + str(httpError))
    except TypeError:
        None
def searchGoFromDico(dictionary):
    dicoGoForEachGroup = {}
    for groupName in dico:
        taxonAndGene = dico[groupName]
        listeAllGoForOneGroup = []
        for taxon in taxonAndGene:
            geneIds = taxonAndGene[taxon]
            for geneId in geneIds:
                if geneId is not None:
                    listeGo = getGoID(getUniprotID(geneId))
                    listeAllGoForOneGroup.extend(listeGo)
        dicoGoForEachGroup[groupName] = listeAllGoForOneGroup
    return dicoGoForEachGroup
Any idea how to make my functions work properly even when one of the geneIds is not found in the database? Thanks in advance.
You can use a try/except block around the code that's causing the issue. If geneIds is None (and therefore not iterable), skip to the next iteration of the enclosing loop (untested):
def searchGoFromDico(dictionary):
    dicoGoForEachGroup = {}
    for groupName in dictionary:
        taxonAndGene = dictionary[groupName]
        listeAllGoForOneGroup = []
        for taxon in taxonAndGene:
            geneIds = taxonAndGene[taxon]
            try:
                # geneIds may be None if the database returned nothing
                for geneId in geneIds:
                    if geneId is not None:
                        listeGo = getGoID(getUniprotID(geneId))
                        listeAllGoForOneGroup.extend(listeGo)
            except TypeError:
                continue
        dicoGoForEachGroup[groupName] = listeAllGoForOneGroup
    return dicoGoForEachGroup
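Another option, instead of catching the TypeError, is to check explicitly for None at each step. Below is a minimal sketch reusing your getUniprotID/getGoID helpers; collectGoIdsForGroup is just an illustrative name for the inner part of searchGoFromDico, not something from your code:

def collectGoIdsForGroup(taxonAndGene):
    """Collect GO IDs for one group, skipping geneIds that cannot be resolved."""
    allGo = []
    for geneIds in taxonAndGene.values():
        if geneIds is None:               # the whole list may be missing
            continue
        for geneId in geneIds:
            if geneId is None:            # geneId not referenced at all
                continue
            uniprotId = getUniprotID(geneId)
            if uniprotId is None:         # lookup returned nothing for this geneId
                continue
            listeGo = getGoID(uniprotId)
            if listeGo:                   # only extend when GO terms actually came back
                allGo.extend(listeGo)
    return allGo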
Related
Can't append separate values from JSON data to lists. When trying to index them, I get this error: TypeError: 'int' object is not subscriptable.
Without indexing, it just appends ALL of the data, which I don't want.
In this part I'm getting the data:
import requests
import json

protein = []
fat = []
calories = []
sugar = []

def scrape_all_fruits():
    data_list = []
    try:
        for ID in range(1, 10):
            url = f'https://www.fruityvice.com/api/fruit/{ID}'
            response = requests.get(url)
            data = response.json()
            data_list.append(data)
    except:
        pass
    return data_list
In this part I'm trying to append the data, and I get the error mentioned above.
alist = json.dumps(scrape_all_fruits())
jsonSTr = json.loads(alist)

for i in jsonSTr:
    try:
        for value in i['nutritions'].values():
            fat.append(value['fat'])
    except KeyError:
        pass

print(fat)
You iterate through the values of nutritions, so there can't be a "fat" key in what you get back. And there's no reason to iterate over it at all; just access the key directly.
alist = json.dumps(scrape_all_fruits())
json_str = json.loads(alist)

for i in json_str:
    try:
        print(i['nutritions'])
        fat.append(i['nutritions']['fat'])
    except KeyError:
        pass

print(fat)
This works. Tested on Python 3.8
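If you want to avoid the try/except entirely, dict.get() lets you fall back to a default when a key is missing. A small sketch, assuming the same scrape_all_fruits() data as above:

fat = []
for fruit in scrape_all_fruits():
    nutritions = fruit.get('nutritions', {})   # empty dict if the key is absent
    if 'fat' in nutritions:
        fat.append(nutritions['fat'])
print(fat)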
I have a list with over 1000 IDs and I want to call an API with different endpoints for every element of the list.
Example:
customerlist = [803818, 803808, 803803,803738,803730]
I tried the following:
import json
import requests
import pandas as pd

API_BASEURL = "https://exampleurl.com/"
API_TOKEN = "abc"

HEADERS = {'content-type': 'application/json',
           'Authorization': API_TOKEN}

def get_data(endpoint):
    for i in customerlist:
        api_endpoint = endpoint
        params = {'customerid': i}
        response = requests.get(f"{API_BASEURL}/{api_endpoint}",
                                params=params,
                                headers=HEADERS)
        if response.status_code == 200:
            res = json.loads(response.text)
        else:
            raise Exception(f'API error with status code {response.status_code}')
        res = pd.DataFrame([res])
        return res

get_data(endpointexample)
This works, but it only returns the values for the first element of the list (803818). I want the function to return the values for every ID in customerlist, for the endpoint I pass as the function argument.
I found this, possibly related, question, but I couldn't figure my problem out from it.
There is probably an easy solution I'm not seeing, as I'm just starting with Python. Thanks.
The moment a function hits a return statement, it finishes immediately. Since your return statement is inside the loop, the remaining iterations never run.
To fix this, create a list outside the loop, append to it on every iteration, and then build the DataFrame from that list after the loop:
def get_data(endpoint):
    responses = []
    for i in customerlist:
        api_endpoint = endpoint
        params = {'customerid': i}
        response = requests.get(f"{API_BASEURL}/{api_endpoint}",
                                params=params,
                                headers=HEADERS)
        if response.status_code == 200:
            res = json.loads(response.text)
        else:
            raise Exception(f'API error with status code {response.status_code}')
        responses.append(res)
    return pd.DataFrame(responses)
A much cleaner solution is to make the function handle a single ID and use a list comprehension over the customer list:
def get_data(endpoint, i):
    api_endpoint = endpoint
    params = {'customerid': i}
    response = requests.get(f"{API_BASEURL}/{api_endpoint}",
                            params=params,
                            headers=HEADERS)
    if response.status_code == 200:
        res = json.loads(response.text)
    else:
        raise Exception(f'API error with status code {response.status_code}')
    return res

responses = pd.DataFrame([get_data(endpoint, i) for i in customerlist])
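As a side note, requests can decode the JSON for you and raise on HTTP errors. A hedged variant of the per-customer helper, using the same API_BASEURL and HEADERS assumptions as above:

def get_data(endpoint, customer_id):
    response = requests.get(f"{API_BASEURL}/{endpoint}",
                            params={'customerid': customer_id},
                            headers=HEADERS)
    response.raise_for_status()   # raises requests.HTTPError on 4xx/5xx responses
    return response.json()        # parsed JSON, no json.loads needed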
I'm trying to loop through all pages of an API, get multiple JSON objects, store them as tuples in a list, and return the final list.
This works fine with only one object, but I can't get it to work once I start adding more. I've tried various tweaks and switching between for and while loops, but I can't get it to work.
def star_wars_characters(url):
    all_names1 = []
    response1 = requests.get(url)
    data1 = response1.json()
    for x in data1['results']:
        all_names1.append(x['name'])
    while data1['next'] is not None:
        response1 = requests.get(data1['next'])
        data1 = response1.json()
        for x in data1['results']:
            all_names1.append(x['name'])
    return all_names1

print(star_wars_characters("https://swapi.co/api/people/?page=1"))
I'm trying to achieve output like the example below, but for all pages. This is just the result for the first page, which I managed to return by changing the for loops to while, but I couldn't get the remaining pages of data:
[('Luke Skywalker', '77'), ('C-3PO', '75'), ('R2-D2', '32'), ('Darth Vader', '136'), ('Leia Organa', '49'), ('Owen Lars', '120'), ('Beru Whitesun lars', '75'), ('R5-D4', '32'), ('Biggs Darklighter', '84'), ('Obi-Wan Kenobi', '77')]
import requests

def star_wars_characters(url):
    return_data = []
    response = requests.get(url)
    data = response.json()
    while True:
        for result in data['results']:
            return_data.append((result['name'], result['mass']))
        if data['next'] is None:
            break
        response = requests.get(data['next'])
        data = response.json()
    return return_data

print(star_wars_characters("https://swapi.co/api/people/?page=1"))
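The same pagination can also be written as a generator, which keeps the "follow next until it is None" logic in one place. A sketch under the same API assumptions as the answer above:

import requests

def iter_star_wars_characters(url):
    """Yield (name, mass) tuples, following the API's 'next' links page by page."""
    while url is not None:
        data = requests.get(url).json()
        for result in data['results']:
            yield (result['name'], result['mass'])
        url = data['next']   # None on the last page stops the loop

# Usage: print(list(iter_star_wars_characters("https://swapi.co/api/people/?page=1")))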
def get_user_data(self, start_url):
    html = self.session.get(url=start_url, headers=self.headers, cookies=self.cookies).content
    selector = etree.fromstring(html, etree.HTMLParser(encoding='utf-8'))
    all_user = selector.xpath('//div[contains(@class,"c") and contains(@id,"M")]')
    for i in all_user:
        user_id = i.xpath('./div[1]/a[@class="nk"]/@href')[0]
        content = i.xpath('./div[1]/span[1]')[0]
        contents = content.xpath('string(.)')
        times = i.xpath('./div/span[@class="ct"]/text()')[0]
        if len(i.xpath('./div[3]')):
            imgages = i.xpath('./div[2]/a/img/@src')
            praise_num = i.xpath('./div[3]/a[2]/text()')
            transmit_num = i.xpath('./div[3]/a[3]/text()')
        elif len(i.xpath('./div[2]')):
            imgages = i.xpath('./div[2]/a/img/@src')
            praise_num = i.xpath('./div[2]/a[3]/text()')
            transmit_num = i.xpath('./div[2]/a[4]/text()')
        else:
            imgages = ''
            praise_num = i.xpath('./div[1]/a[2]/text()')
            transmit_num = i.xpath('./div[1]/a[3]/text()')
        try:
            if re.search('from', times.encode().decode('utf-8')):
                month_day, time, device = times.split(maxsplit=2)
                self.data['mobile_phone'] = device
            else:
                time, device = times.split(maxsplit=1)
                self.data['month_day'] = ''
            self.data['create_time'] = month_day + ' ' + time
        except Exception as e:
            print('failure:', e)
        self.data['crawl_time'] = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        self.data['user_id'] = user_id
        self.data['contents'] = contents.encode().decode('utf-8').replace('\u200b', '')
        self.data['imgages'] = imgages
        self.data['praise_num'] = praise_num
        self.data['transmit_num'] = transmit_num
    with open('a.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(self.data) + '\n')
I'm trying to grab every page of data and save it, but I wrote it wrong: only one record per page ends up in 'a.txt'. How do I write it so that every record from every page is saved correctly in 'a.txt'?
The write operation is outside the for loop, which is why only the last iteration's data is added to the file:
with open('a.txt', 'a', encoding='utf-8') as f:
    f.write(json.dumps(self.data) + '\n')
You're overwriting the various values in self.data in every iteration of the loop.
Instead, self.data should be a list. You should create a new dictionary in each iteration and append it to the data at the end.
self.data = []
for i in all_user:
    values = {}
    ...
    values['crawl_time'] = ...
    values['user_id'] = ...
    ...
    self.data.append(values)
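Once self.data is a list of per-user dictionaries, writing them out is one JSON line per record. A small sketch; save_records is an illustrative helper name, not part of the original class:

import json

def save_records(records, path='a.txt'):
    """Append one JSON line per parsed user; ensure_ascii=False keeps non-ASCII text readable."""
    with open(path, 'a', encoding='utf-8') as f:
        for record in records:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')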
I am a beginner in Python.
How can I solve
AttributeError: module 'urllib' has no attribute 'Request'
I've looked at other posts but still can't understand how to solve the problem.
Here is the screen capture of the error.
And this is the code (adapted from https://github.com/minimaxir/facebook-page-post-scraper/blob/master/get_fb_posts_fb_page.py):
import urllib.request
import json, datetime, csv, time
app_id = "xxx"
app_secret = "xxx" # DO NOT SHARE WITH ANYONE!
access_token = "xxx"
page_id = 'xxx'
def testFacebookPageData(page_id, access_token):
    # construct the URL string
    base = "https://graph.facebook.com/v2.4"
    node = "/" + page_id + '/feed'
    parameters = "/?access_token=%s" % access_token
    url = base + node + parameters
    # retrieve data
    response = urllib.request.urlopen(url)
    data = json.loads(response.read().decode('utf-8'))
    print(data)
def request_until_succeed(url):
    req = urllib.request.urlopen(url)
    success = False
    while success is False:
        try:
            response = urllib.urlopen(req)
            if response.getcode() == 200:
                success = True
        except Exception as e:
            print(e)
            time.sleep(5)
            print(url, datetime.datetime.now())
    return response.read()
def getFacebookPageFeedData(page_id, access_token, num_statuses):
    # construct the URL string
    base = "https://graph.facebook.com"
    node = "/" + page_id + "/feed"
    parameters = "/?fields=message,link,created_time,type,name,id,likes.limit(1).summary(true),comments.limit(1).summary(true),shares&limit=%s&access_token=%s" % (num_statuses, access_token)  # changed
    url = base + node + parameters
    # retrieve data
    data = json.loads(request_until_succeed(url))
    return data
def processFacebookPageFeedStatus(status):
    # The status is now a Python dictionary, so for top-level items,
    # we can simply call the key.
    # Additionally, some items may not always exist,
    # so must check for existence first
    status_id = status['id']
    status_message = '' if 'message' not in status.keys() else status['message'].encode('utf-8')
    link_name = '' if 'name' not in status.keys() else status['name'].encode('utf-8')
    status_type = status['type']
    status_link = '' if 'link' not in status.keys() else status['link']

    # Time needs special care since a) it's in UTC and
    # b) it's not easy to use in statistical programs.
    status_published = datetime.datetime.strptime(status['created_time'], '%Y-%m-%dT%H:%M:%S+0000')
    status_published = status_published + datetime.timedelta(hours=-5)  # EST
    status_published = status_published.strftime('%Y-%m-%d %H:%M:%S')  # best time format for spreadsheet programs

    # Nested items require chaining dictionary keys.
    num_likes = 0 if 'likes' not in status.keys() else status['likes']['summary']['total_count']
    num_comments = 0 if 'comments' not in status.keys() else status['comments']['summary']['total_count']
    num_shares = 0 if 'shares' not in status.keys() else status['shares']['count']

    # return a tuple of all processed data
    return (status_id, status_message, link_name, status_type, status_link,
            status_published, num_likes, num_comments, num_shares)
def scrapeFacebookPageFeedStatus(page_id, access_token):
    with open('%s_facebook_statuses.csv' % page_id, 'w') as file:
        w = csv.writer(file)
        w.writerow(["status_id", "status_message", "link_name", "status_type", "status_link",
                    "status_published", "num_likes", "num_comments", "num_shares"])

        has_next_page = True
        num_processed = 0  # keep a count on how many we've processed
        scrape_starttime = datetime.datetime.now()

        print(page_id, scrape_starttime)

        statuses = getFacebookPageFeedData(page_id, access_token, 100)

        while has_next_page:
            for status in statuses['data']:
                w.writerow(processFacebookPageFeedStatus(status))

                # output progress occasionally to make sure code is not stalling
                num_processed += 1
                if num_processed % 1000 == 0:
                    print(num_processed, datetime.datetime.now())

            # if there is no next page, we're done.
            if 'paging' in statuses.keys():
                statuses = json.loads(request_until_succeed(statuses['paging']['next']))
            else:
                has_next_page = False

        print(num_processed, datetime.datetime.now() - scrape_starttime)

if __name__ == '__main__':
    scrapeFacebookPageFeedStatus(page_id, access_token)
There is no urllib.Request() in Python 3; it is urllib.request.Request().
EDIT: the error message shows url = urllib.Request(url), but I don't see that line in your code, so maybe you are running a different file.
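For reference, a minimal sketch of the Python 3 spelling, without the retry logic of request_until_succeed (fetch is an illustrative name):

import urllib.request

def fetch(url):
    req = urllib.request.Request(url)          # Request lives in urllib.request
    with urllib.request.urlopen(req) as resp:  # and so does urlopen
        return resp.read()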