can't concat string to bytes - python

I'm trying to grab Twitter user data by screen name using Python.
All the script does is loop over each of the Twitter accounts in the ids variable; for each one it grabs the profile information and adds it as a row of the output file.
But I'm getting an error.
This is my code:
# LIST OF TWITTER USER IDS
ids = "4816,9715012,13023422, 13393052, 14226882, 14235041, 14292458, 14335586, 14730894,\
15029174, 15474846, 15634728, 15689319, 15782399, 15946841, 16116519, 16148677, 16223542,\
16315120, 16566133, 16686673, 16801671, 41900627, 42645839, 42731742, 44157002, 44988185,\
48073289, 48827616, 49702654, 50310311, 50361094,"

# THE VARIABLE USERS IS A JSON FILE WITH DATA ON THE 32 TWITTER USERS LISTED ABOVE
users = t.lookup_user(user_id = ids)

# NAME OUR OUTPUT FILE - %i WILL BE REPLACED BY CURRENT MONTH, DAY, AND YEAR
outfn = "twitter_user_data_%i.%i.%i.txt" % (now.month, now.day, now.year)

# NAMES FOR HEADER ROW IN OUTPUT FILE
fields = "id screen_name name created_at url followers_count friends_count statuses_count \
favourites_count listed_count \
contributors_enabled description protected location lang expanded_url".split()

# INITIALIZE OUTPUT FILE AND WRITE HEADER ROW
outfp = open(outfn, "w")
#outfp.write(string.join(fields, "\t") + "\n") # header
outfp.write("\t".join(fields) + "\n") # header

# THIS BLOCK WILL LOOP OVER EACH OF THESE IDS, CREATE VARIABLES, AND OUTPUT TO FILE
for entry in users:
    # CREATE EMPTY DICTIONARY
    r = {}
    for f in fields:
        r[f] = ""
    # ASSIGN VALUE OF 'ID' FIELD IN JSON TO 'ID' FIELD IN OUR DICTIONARY
    r['id'] = entry['id']
    # SAME WITH 'SCREEN_NAME' HERE, AND FOR REST OF THE VARIABLES
    r['screen_name'] = entry['screen_name']
    r['name'] = entry['name']
    r['created_at'] = entry['created_at']
    r['url'] = entry['url']
    r['followers_count'] = entry['followers_count']
    r['friends_count'] = entry['friends_count']
    r['statuses_count'] = entry['statuses_count']
    r['favourites_count'] = entry['favourites_count']
    r['listed_count'] = entry['listed_count']
    r['contributors_enabled'] = entry['contributors_enabled']
    r['description'] = entry['description']
    r['protected'] = entry['protected']
    r['location'] = entry['location']
    r['lang'] = entry['lang']
    # NOT EVERY ID WILL HAVE A 'URL' KEY, SO CHECK FOR ITS EXISTENCE WITH IF CLAUSE
    if 'url' in entry['entities']:
        r['expanded_url'] = entry['entities']['url']['urls'][0]['expanded_url']
    else:
        r['expanded_url'] = ''
    print(r)
    # CREATE EMPTY LIST
    lst = []
    # ADD DATA FOR EACH VARIABLE
    for f in fields:
        lst.append(str(r[f]).replace("\/", "/"))
    # WRITE ROW WITH DATA IN LIST
    #outfp.write(string.join(lst, "\t").encode("utf-8") + "\n")
    outfp.write("\t".join(lst).encode('utf-8') + '\n')
outfp.close()
The error message:
TypeError Traceback (most recent call last)
<ipython-input-54-441137b1bb4d> in <module>()
37 #WRITE ROW WITH DATA IN LIST
38 #outfp.write(string.join(lst, "\t").encode("utf-8") + "\n")
---> 39 outfp.write("\t".join(lst).encode('utf-8') + '\n')
40
41 outfp.close()
TypeError: can't concat str to bytes
Any idea how to fix this? The Python version is 3.6.5.
Your help would be greatly appreciated. Thanks.
Edit:
This is a screenshot of part of my file after opening the output file in binary mode:

outfp.write("\t".join(lst).encode('utf-8') + '\n')
After you do .encode() on a string you get an instance of bytes. You can't add another string (like \n) to bytes. That's what the error is telling you.
So you need to add the \n before you encode the string. Like below:
outfp.write(("\t".join(lst) + '\n').encode('utf-8'))
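Alternatively, since this is Python 3, you can avoid bytes altogether. A minimal sketch, assuming you only need a UTF-8 text file: open the output file in text mode with an explicit encoding and write plain strings, with no encode() call at all.

# Sketch: text mode with an explicit encoding, so only str objects are written
outfp = open(outfn, "w", encoding="utf-8")
outfp.write("\t".join(fields) + "\n")    # header row
# ... inside the for-entry loop ...
outfp.write("\t".join(lst) + "\n")       # one tab-separated row per user
outfp.close()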

Related

AttributeError: 'Document' object has no attribute 'dfs'

I attached the full error that I am receiving below and want to know what is causing this.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-154-d356d4172c8c> in <module>
23 save_dictionary(doc.tfs, "dfs_" + str(doc.id) + ".txt")
24
---> 25 vectorize("./textfiles")
26
<ipython-input-154-d356d4172c8c> in vectorize(data_path)
14 Documents.append(doc)
15
---> 16 save_dictionary(doc.dfs, "tf_" + str(doc.id) + ".txt")
17
18 dfs = {}
AttributeError: 'Document' object has no attribute 'dfs'
I have also attached the full code of my program to help troubleshoot the issue. I am not sure what I need to do to fix it. If I change doc.dfs to doc.tfs it runs, but since dfs is a separate dictionary I don't think that is the right fix.
class Document:
    def __init__(self, doc_id):
        # create a new document with its ID
        self.id = doc_id
        # create an empty dictionary
        # that will hold the term frequency (TF) counts
        self.tfs = {}

    def tokenization(self, text):
        # split a title into words,
        # using space " " as delimiter
        words = text.lower().split(" ")
        for word in words:
            # for each word in the list
            if word in self.tfs:
                # if it has been counted in the TF dictionary
                # add 1 to the count
                self.tfs[word] = self.tfs[word] + 1
            else:
                # if it has not been counted,
                # initialize its TF with 1
                self.tfs[word] = 1

def save_dictionary(diction_data, file_path_name):
    f = open(file_path_name, "w+")
    for key in diction_data:
        # Separate the key from the frequency with a space and
        # add a newline to the end of each key value pair
        f.write(key + " " + str(diction_data[key]) + "\n")
    f.close()

def vectorize(data_path):
    Documents = []
    for i in range(1, 21):
        file_name = "./textfiles/" + str(i) + ".txt"
        # create a new document with an ID
        doc = Document(i+1)
        # Read the files
        with open(file_name, 'r') as f:
            text = f.read()
        # compute the term frequencies
        # read in the files contents
        doc.tokenization(text)
        # add the documents to the lists
        Documents.append(doc)
        save_dictionary(doc.tfs, "tf_" + str(doc.id) + ".txt")
    DFS = {}
    for doc in Documents:
        for word in doc.tfs:
            DFS[word] = DFS.get(word, 0) + 1
        save_dictionary(doc.DFS, "DFS_" + str(doc.id) + ".txt")

vectorize("./textfiles")
I have looked at the class and I think the issue is coming from there. I am unsure what exactly is causing the problem, but I want the document frequencies to be saved to dfs as well, so that when I run it I end up with both the tf files and the dfs files.
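For reference, a minimal sketch of one way the document-frequency step could look, assuming the goal is a single DFS file built from all documents. Note that DFS is a plain local dictionary here, not an attribute on Document, which is what the AttributeError is complaining about:

# Hypothetical sketch: build the document frequencies in a local dict
# and save them once, instead of reading a doc.dfs attribute that the
# Document class never defines.
DFS = {}
for doc in Documents:
    for word in doc.tfs:
        # each document contributes at most 1 per word it contains
        DFS[word] = DFS.get(word, 0) + 1
save_dictionary(DFS, "dfs.txt")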

python loop for calling API

I need to get Geo data for a bunch of IPs (eventually I will need data for 3k+ IPs). I was able to successfully get Geo data for individual IPs. Now I'm trying to create a loop that iterates through IPs stored as separate lines in a text file and calls the ipstack API to get the Geo data. But the code returns data only for the last IP in the file, with a 'missing_access_key' error for the other ones. I'm a Python beginner, so any help would be appreciated.
fh = open('IPs.txt')
for line in fh:
    ip = line
    api = 'http://api.ipstack.com/' + ip + '?access_key=' + access_key
    result = urllib.request.urlopen(api).read()
    result = result.decode()
    result = json.loads(result)
    print(result)
fh = open('IPs.txt', 'r')
Lines = fh.readlines()
for line in Lines:
    ip = line
    api = 'http://api.ipstack.com/' + ip + '?access_key=' + access_key
    result = urllib.request.urlopen(api).read()
    result = result.decode()
    result = json.loads(result)
    print(result)
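A likely culprit, judging from the symptom: each line read from the file still ends with a newline, so the newline lands in the middle of the URL and the access_key part is effectively lost; only the last line of the file has no trailing newline, which would explain why only the last IP works. A minimal sketch of the loop with the whitespace stripped (assuming access_key is defined as in the question):

import json
import urllib.request

with open('IPs.txt') as fh:
    for line in fh:
        ip = line.strip()              # drop the trailing newline/whitespace
        if not ip:
            continue                   # skip blank lines
        api = 'http://api.ipstack.com/' + ip + '?access_key=' + access_key
        result = json.loads(urllib.request.urlopen(api).read().decode())
        print(result)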

Can't figure out how to properly output my data

I'm a relative novice at Python, but I somehow managed to build a scraper for Instagram. I now want to take this one step further and output the 5 most commonly used hashtags from an IG profile into my CSV output file.
Current output:
I've managed to isolate the 5 most commonly used hashtags, but I get this result in my csv:
[('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron', 2), ('#togetherwecanchangetheworld', 1), ('#halloweenchronicles', 1)]
Desired output:
What I'm looking to end up with is 5 columns at the end of my .CSV, each outputting the X-th most commonly used value.
So something along the lines of this:
I've Googled for a while and managed to isolate them separately, but I always end up with '('#thekidfromakron', 2)' as an output. I seem to be missing some part of the puzzle :(.
Here is what I'm working with at the moment:
import csv
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from collections import Counter

ts = time.gmtime()

def get_csv_header(top_numb):
    fieldnames = ['USER','MEDIA COUNT','FOLLOWERCOUNT','TOTAL LIKES','TOTAL COMMENTS','ER','ER IN %', 'BIO', 'ALL CAPTION TEXT','HASHTAGS COUNTED','MOST COMMON HASHTAGS']
    return fieldnames

def write_csv_header(filename, headers):
    with open(filename, 'w', newline='') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=headers)
        writer.writeheader()
    return

def read_user_name(t_file):
    with open(t_file) as f:
        user_list = f.read().splitlines()
    return user_list

if __name__ == '__main__':
    # HERE YOU CAN SPECIFY YOUR USERLIST FILE NAME,
    # Which contains a list of usernames's BY DEFAULT <current working directory>/userlist.txt
    USER_FILE = 'userlist.txt'

    # HERE YOU CAN SPECIFY YOUR DATA FILE NAME, BY DEFAULT (data.csv)', Where your final result stays
    DATA_FILE = 'users_with_er.csv'
    MAX_POST = 12  # MAX POST

    print('Starting the engagement calculations... Please wait until it finishes!')

    users = read_user_name(USER_FILE)
    """ Writing data to csv file """
    csv_headers = get_csv_header(MAX_POST)
    write_csv_header(DATA_FILE, csv_headers)

    for user in users:
        post_info = {'USER': user}
        url = 'https://www.instagram.com/' + user + '/'
        # for troubleshooting, un-comment the next two lines:
        # print(user)
        # print(url)
        try:
            r = requests.get(url)
            if r.status_code != 200:
                print(timestamp, ' user {0} not found or page unavailable! Skipping...'.format(user))
                continue
            soup = BeautifulSoup(r.content, "html.parser")
            scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
            stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
            j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
            timestamp = time.strftime("%d-%m-%Y %H:%M:%S", ts)
        except ValueError:
            print(timestamp, 'ValueError for username {0}...Skipping...'.format(user))
            continue
        except IndexError as error:
            # Output expected IndexErrors.
            print(timestamp, error)
            continue

        if j['graphql']['user']['edge_followed_by']['count'] <= 0:
            print(timestamp, 'user {0} has no followers! Skipping...'.format(user))
            continue
        if j['graphql']['user']['edge_owner_to_timeline_media']['count'] < 12:
            print(timestamp, 'user {0} has less than 12 posts! Skipping...'.format(user))
            continue
        if j['graphql']['user']['is_private'] is True:
            print(timestamp, 'user {0} has a private profile! Skipping...'.format(user))
            continue

        media_count = j['graphql']['user']['edge_owner_to_timeline_media']['count']
        accountname = j['graphql']['user']['username']
        followercount = j['graphql']['user']['edge_followed_by']['count']
        bio = j['graphql']['user']['biography']

        i = 0
        total_likes = 0
        total_comments = 0
        all_captiontext = ''
        while i <= 11:
            total_likes += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_liked_by']['count']
            total_comments += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_comment']['count']
            captions = j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_caption']
            caption_detail = captions['edges'][0]['node']['text']
            all_captiontext += caption_detail
            i += 1

        engagement_rate_percentage = '{0:.4f}'.format((((total_likes + total_comments) / followercount) / 12) * 100) + '%'
        engagement_rate = (((total_likes + total_comments) / followercount) / 12 * 100)

        # isolate and count hashtags
        hashtags = re.findall(r'#\w*', all_captiontext)
        hashtags_counted = Counter(hashtags)
        most_common = hashtags_counted.most_common(5)

        with open('users_with_er.csv', 'a', newline='', encoding='utf-8') as data_out:
            print(timestamp, 'Writing Data for user {0}...'.format(user))
            post_info["USER"] = accountname
            post_info["FOLLOWERCOUNT"] = followercount
            post_info["MEDIA COUNT"] = media_count
            post_info["TOTAL LIKES"] = total_likes
            post_info["TOTAL COMMENTS"] = total_comments
            post_info["ER"] = engagement_rate
            post_info["ER IN %"] = engagement_rate_percentage
            post_info["BIO"] = bio
            post_info["ALL CAPTION TEXT"] = all_captiontext
            post_info["HASHTAGS COUNTED"] = hashtags_counted
            csv_writer = csv.DictWriter(data_out, fieldnames=csv_headers)
            csv_writer.writerow(post_info)

    """ Done with the script """
    print('ALL DONE !!!! ')
The code that goes before this simply scrapes the webpage, and compiles all the captions from the last 12 posts into "all_captiontext".
Any help to solve this (probably simple) issue would be greatly appreciated as I've been struggling with this for days (again, I'm a noob :') ).
Replace line
post_info["MOST COMMON HASHTAGS"] = most_common
with:
for i, counter_tuple in enumerate(most_common):
    tag_name = counter_tuple[0].replace('#', '')
    label = "Top %d" % (i + 1)
    post_info[label] = tag_name
There's also a bit of code missing. For example, your code doesn't include the csv_headers variable, which I suppose would be
csv_headers = post_info.keys()
It also seems that you're opening a file to write just one row. I don't think that's intended, so what you probably want to do is collect the results into a list of dictionaries. A cleaner solution would be to use a pandas DataFrame, which you can write straight to a CSV file.
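A minimal sketch of that idea, assuming the per-user post_info dictionaries are collected into a list called rows (an illustration, not your exact code):

import pandas as pd

rows = []
# ... inside the per-user loop: rows.append(post_info) ...

# One DataFrame, one write; the columns come from the dictionary keys.
df = pd.DataFrame(rows)
df.to_csv('users_with_er.csv', index=False, encoding='utf-8')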
Since most_common is the output of the call to hashtags_counted.most_common, I had a look at the docs here: https://docs.python.org/2/library/collections.html#collections.Counter.most_common
The output is formatted as follows: [(key, value), (key, value), ...], ordered by decreasing number of occurrences.
Hence, to get only the name and not the number of occurrences, you should replace:
post_info["MOST COMMON HASHTAGS"] = most_common
by
post_info["MOST COMMON HASHTAGS"] = [x[0] for x in most_common]
You have a list of tuples. This statement builds, on the fly, the list of the first elements of each tuple, keeping the sort order.
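With the sample data from the question, that comprehension gives:

most_common = [('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron', 2),
               ('#togetherwecanchangetheworld', 1), ('#halloweenchronicles', 1)]
print([x[0] for x in most_common])
# ['#striveforgreatness', '#jamesgang', '#thekidfromakron',
#  '#togetherwecanchangetheworld', '#halloweenchronicles']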

Python 2.7, arcpy: import data from txt file into empty attribute table.

I have a shapefile (Fireincidents) that currently does not have any data. I want to populate the feature with data from a text file (north_america_fires). The text file contains the lat, long, and confidence of each fire. I've created a new insert cursor to insert new rows. I've also started a for loop to loop through each of the rows in the text file. I am having trouble identifying a method to populate the rows of the point class object. I believe I have to create a list that the data from the text file can be appended to, and then somehow insert that list of data into the attribute table.
My current code:
try:
    work = raw_input("Enter the full path of WildlandFires.mdb: ")
    arcpy.env.workspace = work
    arcpy.env.overwriteOutput = True
    iFile = raw_input("Enter the full path of wildfire text file: ")
    fields = ["SHAPE#", "CONFIDENCEVALUE"]
    cur = arcpy.da.InsertCursor("FireIncidents", fields)
    f = open(iFile, 'r')
    lstFires = f.readlines()
    cntr = 0
    for fire in lstFires:
        if 'Latitude' in fire:
            continue
        row = line.split(',')
        lstValues = []
        latitude = row[0].strip()
        longitude = row[1].strip()
        confid = row[2].strip()
        pnt = arcpy.CreateObject("Point")
        lstValues.append(pnt)
    f.close()
except Exception as e:
    print "Error: " + str(e)
    print arcpy.GetMessages()
    arcpy.AddError(e)
Any guidance would be appreciated.
I just identified the issue. It does have correct lat and long coordinates. Thank you for responding.
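For reference, a minimal sketch of how the geometry and confidence value might be written with the da insert cursor, assuming each text line is "latitude,longitude,confidence" and using the SHAPE@ geometry token (an illustration only, not the poster's final code):

# Hypothetical sketch (arcpy): build a point geometry per line and insert it.
fields = ["SHAPE@", "CONFIDENCEVALUE"]
cur = arcpy.da.InsertCursor("FireIncidents", fields)
with open(iFile, 'r') as f:
    for fire in f:
        if 'Latitude' in fire:            # skip the header line
            continue
        lat, lon, confid = [v.strip() for v in fire.split(',')]
        pnt = arcpy.Point(float(lon), float(lat))   # X = longitude, Y = latitude
        cur.insertRow([arcpy.PointGeometry(pnt), float(confid)])
del cur  # release the cursor and its lock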

having trouble converting dictionary elements (string) into int

Having a little trouble converting the two elements that are in a tuple inside of a dictionary into int values. The keys of the dictionary are country names, and the tuple of info is (the area, the population). This is what I have so far:
def _demo_fileopenbox():
    msg = "Pick A File!"
    msg2 = "Select a country to learn more about!"
    title = "Open files"
    default = "*.py"
    f = fileopenbox(msg, title, default=default)
    writeln("You chose to open file: %s" % f)
    countries = {}
    with open(f, 'r') as handle:
        reader = csv.reader(handle, delimiter='\t')
        for row in reader:
            countries[row[0]] = (row[1].replace(',', ''), row[2].replace(',', ''))
    for i in countries:
        int((countries[i])[0])
        int((countries[i])[1])
    #while 1:
    #    reply = choicebox(msg=msg2, choices=list(countries.keys()))
    #    writeln(reply + "-\tArea: " + (countries[reply])[0] + "\tPopulation: " + (countries[reply])[1])
but I keep getting this error:
int((countries[i])[0])
ValueError: invalid literal for int() with base 10: ''
Any ideas how to fix this, or a better way to do it?
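For what it's worth, a minimal sketch of one way to handle this, keeping the names from the question: int() on its own line does not change the tuple (tuples are immutable, and the result is never stored), so convert while building the dictionary and skip rows whose fields are empty, which is what the ValueError about '' is pointing at.

countries = {}
with open(f, 'r') as handle:
    reader = csv.reader(handle, delimiter='\t')
    for row in reader:
        area = row[1].replace(',', '').strip()
        population = row[2].replace(',', '').strip()
        if not area or not population:
            continue  # skip rows with missing numbers (the '' that broke int())
        countries[row[0]] = (int(area), int(population))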
