Adding A Sentiment Analysis Loop When Collecting Twitter data - python

I am currently trying to add a sentiment-analysis loop to a Python script that collects tweets. When I run the script without the loop, it generates the tweets just fine; however, when I add the for loop (starting at "for tweet in tweets_returned"), the tweets are no longer generated and the CSV I create does not appear either. I was wondering whether this has to do with where I have placed the for loop within the script, or whether there is some error in the loop itself. Any help would be greatly appreciated, thanks!
sentiments=[]
sentiment_means=[]
# Create URL Structure
class RequestWithMethod(urllib.request.Request):
def __init__(self, base_url, method, headers={}):
self._method = method
urllib.request.Request.__init__(self, base_url, headers)
def get_method(self):
if self._method:
return self._method
else:
return urllib.request.Request.get_method(self)
#Create Endpoint & Add Credentials
def create_rules_endpoint(query):
new_url = base_url + query
base64string = ('%s:%s' % (UN, PWD)).replace('\n', '')
base = base64.b64encode(base64string.encode('ascii'))
final_final_url = urllib.request.Request(new_url)
final_final_url.add_header('Authorization', 'Basic %s' % base.decode('ascii'))
return final_final_url
# Take in the Endpoint and Make the Request
def make_request(search_endpoint):
try:
response = urllib.request.urlopen(search_endpoint)
response_data = response.read()
handle_response(response_data)
except urllib.request.HTTPError as error:
print("ERROR: %s" % error)
# Handle the Returned Data
def handle_response(data):
tweets_returned = json.loads(data.decode('utf-8'))
print(tweets_returned)
**for tweet in tweets_returned['results']:
counter=1
compound_list=[]
positive_list = []
negative_list = []
neutral_list = []
geo_list = []
compound = analyzer.polarity_scores(tweet["text"])["compound"]
pos = analyzer.polarity_scores(tweet["text"])["pos"]
neu = analyzer.polarity_scores(tweet["text"])["neu"]
neg = analyzer.polarity_scores(tweet["text"])["neg"]
compound_list.append(compound)
positive_list.append(pos)
negative_list.append(neg)
neutral_list.append(neu)
sentiments.append({"Location": tweet["geo"],
"Date": tweet["created_at"],
"Tweet": tweet["text"],
"Compound": compound,
"Positive": pos,
"Neutral": neu,
"Negative": neg,
"Tweets_Ago": counter
})
counter+=1
sentiment_means.append({
"Compound_Mean": np.mean(compound_list),
"Positive": np.mean(positive_list),
"Neutral": np.mean(negative_list),
"Negative": np.mean(neutral_list),
"Count": len(compound_list)
})**
# Create the Endpoint Variable w/ Sample Query Keyword
search_endpoint = create_rules_endpoint('Wilson%20Rackets%20has%3Ageo%20lang%3Aen')
# Make the Request by Passing in Search Endpoint
make_request(search_endpoint)
# Convert all_sentiments to DataFrame
all_sentiments_pd = pd.DataFrame.from_dict(sentiments)
all_sentiments_pd.to_csv("sentiments_array_pd.csv")
display(all_sentiments_pd)
#print(all_sentiments_pd.dtypes)
# Convert sentiment_means to DataFrame
sentiment_means_pd = pd.DataFrame.from_dict(sentiment_means)
display(sentiment_means_pd)

Related

What is the best way to return a variable or call a function to maximize code reuse?

I was wondering if I could get some input from some seasoned Python experts; I have a couple of questions.
I am extracting data from an API request and calculating the total vulnerabilities.
What is the best way to return this data so that I can use it in another function?
And how can I add up all the vulnerabilities? (Right now it only sums 500 at a time; I'd like the sum of every vulnerability.)
def _request():
third_party_patching_filer = {
"asset": "asset.agentKey IS NOT NULL",
"vulnerability" : "vulnerability.categories NOT IN ['microsoft patch']"}
headers = _headers()
print(headers)
url1 = f"https://us.api.insight.rapid7.com/vm/v4/integration/assets"
resp = requests.post(url=url1, headers=headers, json=third_party_patching_filer, verify=False).json()
jsonData = resp
#print(jsonData)
has_next_cursor = False
nextKey = ""
if "cursor" in jsonData["metadata"]:
has_next_cursor = True
nextKey = jsonData["metadata"]["cursor"]
while has_next_cursor:
url2 = f"https://us.api.insight.rapid7.com/vm/v4/integration/assets?&size=500&cursor={nextKey}"
resp2 = requests.post(url=url2, headers=headers, json=third_party_patching_filer, verify=False).json()
cursor = resp2["metadata"]
print(cursor)
if "cursor" in cursor:
nextKey = cursor["cursor"]
print(f"next key {nextKey}")
#print(desktop_support)
for data in resp2["data"]:
for tags in data['tags']:
total_critical_vul_osswin = []
total_severe_vul_osswin = []
total_modoer_vuln_osswin = []
if tags["name"] == 'OSSWIN':
print("OSSWIN")
critical_vuln_osswin = data['critical_vulnerabilities']
severe_vuln_osswin = data['severe_vulnerabilities']
modoer_vuln_osswin = data['moderate_vulnerabilities']
total_critical_vul_osswin.append(critical_vuln_osswin)
total_severe_vul_osswin.append(severe_vuln_osswin)
total_modoer_vuln_osswin.append(modoer_vuln_osswin)
print(sum(total_critical_vul_osswin))
print(sum(total_severe_vul_osswin))
print(sum(total_modoer_vuln_osswin))
if tags["name"] == 'DESKTOP_SUPPORT':
print("Desktop")
total_critical_vul_desktop = []
total_severe_vul_desktop = []
total_modorate_vuln_desktop = []
critical_vuln_desktop = data['critical_vulnerabilities']
severe_vuln_desktop = data['severe_vulnerabilities']
moderate_vuln_desktop = data['moderate_vulnerabilities']
total_critical_vul_desktop.append(critical_vuln_desktop)
total_severe_vul_desktop.append(severe_vuln_desktop)
total_modorate_vuln_desktop.append(moderate_vuln_desktop)
print(sum(total_critical_vul_desktop))
print(sum(total_severe_vul_desktop))
print(sum(total_modorate_vuln_desktop))
else:
pass
else:
has_next_cursor = False
If you have a lot of parameters to pass, consider using a dict to combine them. Then you can just return the dict and pass it along to the next function that needs that data. Another approach would be to create a class and either access the variables directly or have helper functions that do so. The latter is a cleaner solution vs a dict, since with a dict you have to quote every variable name, and with a class you can easily add additional functionally beyond just being a container for a bunch of instance variables.
If you want the total across all the data, you should put these initializations:
total_critical_vul_osswin = []
total_severe_vul_osswin = []
total_modoer_vuln_osswin = []
before the while has_next_cursor loop (and similarly for the desktop totals). The way your code is currently, they are initialized each cursor (ie, each 500 samples based on the URL).

Integrating Progress Bar for API Call

Background: I have seen lots of examples of integrating a progress bar into a for loop, however nothing for my use case, and as such am looking for some advice.
For my use case, I am calling an API and testing if meta is in the response (meta = data I need). If meta is not in the API response, then the API returns a key pair value named percent_complete, which indicates the data I am trying to return is still aggregating, and provides a value on the progress of data aggregation.
Current code:
def api_call():
key, secret, url = ini_reader()
endpoint_url = endpoint_initializer()
while True:
response = requests.get(url = endpoint_url, auth = HTTPBasicAuth(key, secret), headers = {"vendor-firm": "111"})
api_response = json.loads(response.text)
if "meta" not in api_response:
id_value = "id"
res1 = [val[id_value] for key, val in api_response.items() if id_value in val]
id_value = "".join(res1)
percent_value = "percent_complete"
res2 = api_response["data"]["attributes"].get("percent_complete", '')*100
print(f' Your data request for: {id_value} is {res2}% complete!')
time.sleep(60)
elif "meta" in api_response:
return api_response
What I am trying to achieve: res2 (percent_complete, already multiplied by 100) gives the percentage, which I would like to use as the measure of progress in a progress bar.
Can anyone suggest an appropriate dependency to use?
You can use the Enlighten library. You can keep your print statements and have multiple progress bars at the same time without making any other changes. Below is an example of how you might implement it.
Based on your example it looks like id_value changes, so I wrote the example like that. If it doesn't change you can just use it in the description. And if you have multiples, you'd probably want to create a progress bar for each. If you want to remove your progress bars after they complete, just add leave=False to manager.Counter().
The library is very customizable and the documentation has a lot of examples.
import enlighten
BAR_FORMAT = u'{id_value} {percentage:3.0f}%|{bar}| ' u'[{elapsed}<{eta}, {rate:.2f} %/s]'
manager = enlighten.get_manager()
def api_call():
pbar = manager.counter(total=100, bar_format=BAR_FORMAT)
...
while True:
...
if "meta" not in api_response:
...
pbar.count = res2
pbar.update(incr=0, id_value=id_value)
else:
...
pbar.count = 100
pbar.update(incr=0, id_value=id_value)
pbar.close()
return api_response
Thanks to Aviso, and for everyone's benefit, here is the completed function -
def api_call():
endpoint_url = endpoint_initializer()
key, secret, url = ini_reader()
BAR_FORMAT = u'{id_value} {percentage:3.0f}%|{bar}| ' u'[{elapsed}<{eta}, {rate:.2f} %/s]'
manager = enlighten.get_manager()
date = dt.datetime.today().strftime("%Y-%m-%d")
print("------------------------------------\n","API URL constructed for:", date, "\n------------------------------------")
print("-------------------------------------------------------------\n","Endpoint:", endpoint_url, "\n-------------------------------------------------------------")
pbar = manager.counter(total=100, bar_format=BAR_FORMAT)
while True:
response = requests.get(url = endpoint_url, auth = HTTPBasicAuth(key, secret), headers = {"vendor-firm": "381"})
api_response = json.loads(response.text)
if "meta" not in api_response:
id_value = "id"
res1 = [val[id_value] for key, val in api_response.items() if id_value in val]
id_value = "".join(res1)
percent_value = "percent_complete"
res2 = api_response["data"]["attributes"].get("percent_complete", '')*100
pbar.count = res2
pbar.update(incr=0, id_value=id_value)
time.sleep(60)
elif "meta" in api_response:
pbar.count = 100
pbar.update(incr=0, id_value=id_value)
pbar.close()
return api_response

Calling 2 functions within another function

I'm trying to call the extract function and the extract_url function within a function. I get name error: name 'endpoint' and name 'agg_key' is not defined. I'm doing this so I can call a script from another script so I don't need to run the command line. How would I go about doing this?
Function I'm trying to call:
def scrape_all_products(URL):
extract(endpoint, agg_key, page_range=None)
extract_url(args)
Functions I'm calling:
def extract(endpoint, agg_key, page_range=None):
r_list = list(range(page_range[0], page_range[1]+1)) if page_range else []
page = 1
agg_data = []
while True:
page_endpoint = endpoint + f'?page={str(page)}'
response = requests.get(page_endpoint, timeout=(
int(os.environ.get('REQUEST_TIMEOUT', 0)) or 10))
response.raise_for_status()
if response.url != page_endpoint: # to handle potential redirects
p_endpoint = urlparse(response.url) # parsed URL
endpoint = p_endpoint.scheme + '://' + p_endpoint.netloc + p_endpoint.path
if not response.headers['Content-Type'] == 'application/json; charset=utf-8':
raise Exception('Incorrect response content type')
data = response.json()
page_has_products = agg_key in data and len(
data[agg_key]) > 0
page_in_range = page in r_list or page_range is None
# break loop if empty or want first page
if not page_has_products or not page_in_range:
break
agg_data += data[agg_key]
page += 1
return agg_data
Other function:
def extract_url(args):
p = format_url(args.url, scheme='https', return_type='parse_result')
formatted_url = p.geturl()
agg_key = 'products'
if args.collections:
agg_key = 'collections'
fp = os.path.join(
args.dest_path, f'{p.netloc}.{agg_key}.{args.output_type}')
if args.file_path:
fp = os.path.join(
args.dest_path, f'{args.file_path}.{args.output_type}')
endpoint = f'{formatted_url}/{agg_key}.json'
ret = {
'endpoint_attempted': endpoint,
'collected_at': str(datetime.now()),
'success': False,
'error': ''
}
try:
data = extract(endpoint, agg_key, args.page_range)
except requests.exceptions.HTTPError as err:
ret['error'] = str(err)
except json.decoder.JSONDecodeError as err:
ret['error'] = str(err)
except Exception as err:
ret['error'] = str(err)
else:
ret['success'] = True
ret[agg_key] = data
if ret['success']:
ret['file_path'] = str(fp)
save_to_file(fp, data, args.output_type)
return ret
The scrape_all_products function only knows about variables created inside of that function and variables passed to it (which in this case is URL). endpoint and agg_key were both created inside of a different function. You have to pass those variables to scrape_all_products the same way you are passing URL. So do:
def scrape_all_products(URL, endpoint, agg_key, args):
And then you would have to appropriately modify anywhere scrape_all_products is called.

Updating access token (global variable) inside a function/loop problem

I'm extracting data using Spotify API wrapper. The access token (which is global variable) is valid for only 1 hour so I need to update it during the for loop in some defined function. I tried to update it using try/except, but I got the following error:
UnboundLocalError: local variable 'spotify' referenced before assignment.
Here is the relevant code:
token = credentials.get_access_token()
spotify = spotipy.Spotify(auth = token)
...
def main():
...
df_af = generate_audio_features_df(track_ids)
...
def generate_audio_features_df(track_ids):
col_list = ['id', 'danceability']
result = []
count = 0
for j in track_ids:
try:
r = spotify.audio_features(j)[0]
features_list = [r['id'], r['danceability']]
result.append(features_list)
#display progress
count += 1
print("Added ", count, " track")
except spotipy.client.SpotifyException:
token = credentials.get_access_token()
spotify = spotipy.Spotify(auth = token)
df = pd.DataFrame(data = result, columns = col_list)
return df
if __name__ == "__init__":
main()
I would like the code to update a token and get back to the loop.

Flickr API Function Issue

I am having issues with my below API request to Flickr. My function takes as input a list of 10 photo ids. However when I print the data from my function I am only getting information based on 1 photo ID. Looking at my below function any ideas on what may be causing the contents of only 1 photo ID to print? Any help would be great.
for item in get_flickr_data(word)["photos"]["photo"]:
photo_ids =item["id"].encode('utf-8')
lst_photo_ids.append(photo_ids)
print lst_photo_ids
lst_photo_ids = ['34117701526', '33347528313', '34158745075', '33315997274', '33315996984', '34028007021', '33315995844', '33347512113', '33315784134', '34024299271']
def get_photo_data(lst_photo_ids):
baseurl = "https://api.flickr.com/services/rest/"
params_d = {}
params_d["method"] = "flickr.photos.getInfo"
params_d["format"] = "json"
params_d["photo_id"] = photo_ids
params_d["api_key"] = FLICKR_KEY
unique_identifier = params_unique_combination(baseurl,params_d)
if unique_identifier in CACHE_DICTION:
flickr_data_diction = CACHE_DICTION[unique_identifier]
else:
resp = requests.get(baseurl,params_d)
json_result_text = resp.text[14:-1]
flickr_data_diction = json.loads(json_result_text)
CACHE_DICTION[unique_identifier] = flickr_data_diction
fileref = open(CACHE_FNAME,"w")
fileref.write(json.dumps(CACHE_DICTION))
fileref.close()
return flickr_data_diction
print get_photo_data(photo_ids)

Categories

Resources