Need assistance with dictionaries, csv files, and lists - python

Alright, so I need a code that will take a csv file and reads the values in it (so far I've gotten that part down).
What I'm having trouble with is creating a list of those values, ordered from least frequently occurring to most frequently occurring. The list must not contain duplicate values.
Here's what I have:
import csv
# NOTE(review): indentation was lost when this snippet was pasted, so the
# nesting of the statements below cannot be recovered with certainty.
# Module-level accumulators: B collects every artist value read from the
# file; K collects "artist + occurrence count" strings.
B = []
K = []
# Read the CSV file `name`, print the artist/count strings, and return the
# `datasg` dict (which the active code never populates).
def readandprocess(name):
with open(name, newline='') as csvf:
freader = csv.reader(csvf,delimiter=',',quotechar='"')
datasg = {}
# `artists` is created but not used by any active line below.
artists = []
for row in freader:
# Column index 2 of each row is read as the artist.
artist = row[2]
B.append(artist)
for artist in B:
# B.count rescans the whole list for each entry, and an artist that appears
# k times in B is appended to K k times.
c = B.count(artist)
K.append(artist + str(c))
# The de-duplicated list built here is not assigned to anything, so K still
# holds its duplicates when printed.
list(set(K))
print(K)
#for row in freader:
#artist = row[2]
###song = row[1]
#if artist == 'Rolling Stones':
# print('Rolling Stones title: ',row[1])
#if artist not in datasg:
# datasg[artist] = [song]
#else:
#datasg[artist].append(song)
#for artist in datasg:
#print(artist, datasg[artist])
print( '--------------------------------------')
# `info` is only consumed by the commented-out sorting code below.
info = datasg.items()
# tosort = [(len(t[1]),t[0]) for t in info]
# info = sorted(tosort)
# print(info[-30:])
# print(info)
print(len(datasg)) # currently 0, populate at will #Number of keys in dictionary
return datasg
# Script entry point: process the bundled data file.
if __name__ == '__main__':
datasg = readandprocess('data/top1000.csv')

Try using Counter. Once you have all the items you need in a list, you can use a Counter, and then call most_common(n) to get the n most common elements.

Related

Total population of countries

Write a function named "total_population" that takes a string then a list as parameters where the string represents the name of a CSV file containing city data in the format "CountryCode, CityName, Region, Population, Latitude, Longitude" and the second parameter is a list where each element is itself a list containing 3 strings as elements representing the CountryCode, CityName, and Region in this order. Return the total population of all cities in the list. Note that the city must match the country, name, and region to ensure that the correct city is being read.
I have pretty much everything setup nicely(I think) but I have a problem trying to sum the population at the end. I tried 3 ways of adding +1 each time and adding everything at the end but I can't seem to get it right.
import csv
# Sum column 3 (population) of the rows in `filename` whose first three
# columns match an entry of `cityinfo`.
# NOTE(review): indentation was lost in this paste. According to the answer
# that follows, the `with` block sat outside the `for str3` loop, so only one
# city's values were ever compared against the file.
def total_population(filename, cityinfo): # have CSV file and list that is a line with the function
totalPop = 0
#count = 0
for str3 in cityinfo: # rep. the three catagorize
countryCode = str3[0]
cityName = str3[1]
region = (str3[2])
with open (filename, newline='') as f: # the list contains 3 strings(Country code, city name, region)
readCsv = csv.reader(f)
for line in readCsv:
# A row counts only when country code, city name and region all match.
if (line[0] == countryCode):
if (line[1] == cityName):
if ((line[2]) == region):
#count += 1
totalPop = totalPop + int(line[3])
#totalPop += int(line[3])
return totalPop
The error message that I kept getting when submitting my code.
returned: 19561
expected: 25187
You just need to put the with/as statement inside the for loop, so that the file is read again for every element of cityinfo. cityinfo holds multiple cities, not just one; with the file read only once, outside the loop, only a single city is ever matched against it.
import csv
def total_population(filename, cityinfo):
    """Return the summed population of the cities listed in *cityinfo*.

    Args:
        filename: Path of a CSV file whose rows hold country code, city
            name, region and population in columns 0-3.
        cityinfo: Iterable of 3-item sequences ``[country_code, city_name,
            region]`` identifying the cities to total.

    Returns:
        The sum of ``int(row[3])`` over every CSV row whose first three
        columns equal one of the requested cities (0 if none match).
    """
    totalPop = 0
    for str3 in cityinfo:
        wanted = [str3[0], str3[1], str3[2]]
        # The file is opened inside the loop on purpose: a csv reader can be
        # consumed only once, so every requested city needs a fresh pass.
        with open(filename, newline='') as f:
            for line in csv.reader(f):
                # Comparing a slice (rather than indexing) lets blank or
                # short rows fall through instead of raising IndexError.
                if line[:3] == wanted:
                    totalPop += int(line[3])
    return totalPop

Can't figure out how to properly output my data

I'm a relative novice at python but yet, somehow managed to build a scraper for Instagram. I now want to take this one step further and output the 5 most commonly used hashtags from an IG profile into my CSV output file.
Current output:
I've managed to isolate the 5 most commonly used hashtags, but I get this result in my csv:
[('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron',
2), ('#togetherwecanchangetheworld', 1), ('#halloweenchronicles', 1)]
Desired output:
What I'm looking to end up with in the end is having 5 columns at the end of my .CSV outputting the X-th most commonly used value.
So something in the lines of this:
I've Googled for a while and managed to isolate them separately, but I always end up with '('#thekidfromakron', 2)' as an output. I seem to be missing some part of the puzzle :(.
Here is what I'm working with at the moment:
import csv
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from collections import Counter
# UTC struct_time captured once when the module is loaded; the strftime call
# further down formats this same value on every iteration.
ts = time.gmtime()
def get_csv_header(top_numb):
    """Return the output CSV column names, in column order.

    Args:
        top_numb: Accepted for call compatibility; the returned header does
            not depend on it.

    Returns:
        A new list of column-name strings.
    """
    return [
        'USER',
        'MEDIA COUNT',
        'FOLLOWERCOUNT',
        'TOTAL LIKES',
        'TOTAL COMMENTS',
        'ER',
        'ER IN %',
        'BIO',
        'ALL CAPTION TEXT',
        'HASHTAGS COUNTED',
        'MOST COMMON HASHTAGS',
    ]
def write_csv_header(filename, headers):
    """Create (or truncate) *filename* and write *headers* as its header row.

    Args:
        filename: Path of the CSV file to (re)create.
        headers: Sequence of column names.

    Returns:
        None.
    """
    # utf-8 matches the encoding used when data rows are appended later;
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as f_out:
        csv.DictWriter(f_out, fieldnames=headers).writeheader()
def read_user_name(t_file):
    """Return the usernames listed one per line in *t_file*.

    Surrounding whitespace is stripped and blank lines are dropped, so an
    empty line in the file does not turn into an empty username.

    Args:
        t_file: Path of a text file with one username per line.

    Returns:
        List of non-empty username strings in file order.
    """
    with open(t_file) as f:
        stripped = (line.strip() for line in f.read().splitlines())
        return [name for name in stripped if name]
# Script entry point: for every username in USER_FILE, fetch the profile page,
# total likes/comments over 12 posts, count hashtags and append one CSV row.
# NOTE(review): indentation was lost when this snippet was pasted.
if __name__ == '__main__':
# HERE YOU CAN SPECIFY YOUR USERLIST FILE NAME,
# Which contains a list of usernames's BY DEFAULT <current working directory>/userlist.txt
USER_FILE = 'userlist.txt'
# HERE YOU CAN SPECIFY YOUR DATA FILE NAME, BY DEFAULT (data.csv)', Where your final result stays
DATA_FILE = 'users_with_er.csv'
MAX_POST = 12 # MAX POST
print('Starting the engagement calculations... Please wait until it finishes!')
users = read_user_name(USER_FILE)
""" Writing data to csv file """
# get_csv_header ignores its argument, so MAX_POST has no effect on the header.
csv_headers = get_csv_header(MAX_POST)
write_csv_header(DATA_FILE, csv_headers)
for user in users:
post_info = {'USER': user}
url = 'https://www.instagram.com/' + user + '/'
#for troubleshooting, un-comment the next two lines:
#print(user)
#print(url)
try:
r = requests.get(url)
if r.status_code != 200:
# NOTE(review): `timestamp` is first assigned at the end of this try block,
# so this print (and the ones in the except handlers) can run before the
# name exists when the very first user fails.
print(timestamp,' user {0} not found or page unavailable! Skipping...'.format(user))
continue
soup = BeautifulSoup(r.content, "html.parser")
# Find the <script> whose text contains the window._sharedData assignment.
scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
# Remove the assignment prefix and the final character so the rest can be
# passed to json.loads.
stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
timestamp = time.strftime("%d-%m-%Y %H:%M:%S", ts)
except ValueError:
print(timestamp,'ValueError for username {0}...Skipping...'.format(user))
continue
except IndexError as error:
# Output expected IndexErrors.
print(timestamp, error)
continue
# Skip profiles that have no followers, fewer than 12 posts, or are private.
if j['graphql']['user']['edge_followed_by']['count'] <=0:
print(timestamp,'user {0} has no followers! Skipping...'.format(user))
continue
if j['graphql']['user']['edge_owner_to_timeline_media']['count'] <12:
print(timestamp,'user {0} has less than 12 posts! Skipping...'.format(user))
continue
if j['graphql']['user']['is_private'] is True:
print(timestamp,'user {0} has a private profile! Skipping...'.format(user))
continue
media_count = j['graphql']['user']['edge_owner_to_timeline_media']['count']
accountname = j['graphql']['user']['username']
followercount = j['graphql']['user']['edge_followed_by']['count']
bio = j['graphql']['user']['biography']
i = 0
total_likes = 0
total_comments = 0
all_captiontext = ''
# Reads exactly 12 posts (indices 0-11); MAX_POST is not consulted here.
while i <= 11:
total_likes += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_liked_by']['count']
total_comments += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_comment']['count']
captions = j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_caption']
# NOTE(review): assumes every post has at least one caption edge - TODO confirm.
caption_detail = captions['edges'][0]['node']['text']
# Captions are concatenated with no separator between posts.
all_captiontext += caption_detail
i += 1
# Engagement rate: (likes + comments) / followers, averaged over 12 posts,
# expressed as a percentage (once as a formatted string, once as a float).
engagement_rate_percentage = '{0:.4f}'.format((((total_likes + total_comments) / followercount)/12)*100) + '%'
engagement_rate = (((total_likes + total_comments) / followercount)/12*100)
#isolate and count hashtags
hashtags = re.findall(r'#\w*', all_captiontext)
hashtags_counted = Counter(hashtags)
# `most_common` is computed but never stored in post_info below, so the
# 'MOST COMMON HASHTAGS' column receives no value from this code.
most_common = hashtags_counted.most_common(5)
# The output file is re-opened in append mode for every user.
with open('users_with_er.csv', 'a', newline='', encoding='utf-8') as data_out:
print(timestamp,'Writing Data for user {0}...'.format(user))
post_info["USER"] = accountname
post_info["FOLLOWERCOUNT"] = followercount
post_info["MEDIA COUNT"] = media_count
post_info["TOTAL LIKES"] = total_likes
post_info["TOTAL COMMENTS"] = total_comments
post_info["ER"] = engagement_rate
post_info["ER IN %"] = engagement_rate_percentage
post_info["BIO"] = bio
post_info["ALL CAPTION TEXT"] = all_captiontext
# The Counter object itself is stored here, not a formatted string.
post_info["HASHTAGS COUNTED"] = hashtags_counted
csv_writer = csv.DictWriter(data_out, fieldnames=csv_headers)
csv_writer.writerow(post_info)
""" Done with the script """
print('ALL DONE !!!! ')
The code that goes before this simply scrapes the webpage, and compiles all the captions from the last 12 posts into "all_captiontext".
Any help to solve this (probably simple) issue would be greatly appreciated as I've been struggling with this for days (again, I'm a noob :') ).
Replace line
post_info["MOST COMMON HASHTAGS"] = most_common
with:
# Spread the most common hashtags over separate "Top 1" .. "Top N" columns.
# Each entry of most_common is a (hashtag, count) pair; only the tag is kept.
# The "Top N" labels must also be listed in the DictWriter's fieldnames,
# otherwise writerow() rejects the unknown keys.
for rank, (tag, _count) in enumerate(most_common, start=1):
    # Drop the '#' so only the bare tag name lands in the cell.
    post_info["Top %d" % rank] = tag.replace('#', '')
There's also a bit of code missing. For example, your code doesn't include csv_headers variable, which I suppose would be
csv_headers = post_info.keys()
It also seems that you're opening a file to write just one row. I don't think that's intended, so what you would like to do is to collect the results into a list of dictionaries. A cleaner solution would be to use pandas' dataframe, which you can output straight into a csv file.
most_common being the output of the call to hashtags_counted.most_common, I had a look at the doc here: https://docs.python.org/2/library/collections.html#collections.Counter.most_common
The output is formatted as follows: [(key, value), (key, value), ...], ordered by decreasing number of occurrences.
Hence, to get only the name and not the number of occurence, you should replace:
post_info["MOST COMMON HASHTAGS"] = most_common
by
post_info["MOST COMMON HASHTAGS"] = [x[0] for x in most_common]
You have a list of tuple. This statement builds on the fly the list of the first element of each tuple, keeping the sorting order.

There's no value in my output file

my file contains "Name" and 5 eye movement values (TFF, TFD, TVD, FB, FC). I want to sum up each eye movement values if the rows under Name column are the same. It seems like the code is working, there's no error happened, but my output files stayed empty. Could anyone give me some pointers where went wrong? Here's the code:
import csv
# Sum the five eye-movement values per Name and write each column of the
# totals to its own output file.
# NOTE(review): indentation was lost in this paste. According to the answer
# that follows, `rownum += 1` sat outside the row loop, so every row was
# handled as the header and nothing was accumulated.
file = open("P01_All.csv", "r") #Open CSV File in Read Mode
reader = csv.reader(file) #Create reader object which iterates over lines
# One output file per column of the aggregated result.
outfile = open("Name.csv","w")
outfile2 = open("TFF.csv","w")
outfile3 = open("TFD.csv","w")
outfile4 = open("TVD.csv","w")
outfile5 = open("FB.csv","w")
outfile6 = open("FC.csv","w")
class Object: #Object to store unique data
def __init__(self, Name, TFF, TFD, TVD, FB, FC):
self.Name = Name
self.TFF = TFF
self.TFD = TFD
self.TVD = TVD
self.FB = FB
self.FC = FC
rownum = 0 #Row Number currently iterating over
# NOTE(review): this name shadows the built-in `list` type.
list = [] #List to store objects
# Add the values to the existing entry for Name, or append a new entry when
# no entry with that Name exists yet.
def checkList(Name, TFF, TFD, TVD, FB, FC):
# NOTE(review): the loop variable shadows the built-in `object`.
for object in list: #Iterate through list
if object.Name == Name:
object.TFF += float(TFF)
object.TFD += float(TFD)
object.TVD += float(TVD)
object.FB += float(FB)
object.FC += float(FC)
return
newObject = Object(Name, float(TFF),float(TFD), float(TVD), float(FB), float(FC)) #Create a new object with new eye and TFF
list.append(newObject) #Add to list and break out
for row in reader: #Iterate through all the rows
if rownum == 0: #Store header row seperately to not get confused
header = row
else:
Name = row[0]
TFF = row[1]
TFD = row[2]
TVD = row[3]
FB = row[4]
FC = row[5]
if len(list) == 0: #Default case if list = 0
newObject = Object(Name, float(TFF),float(TFD), float(TVD), float(FB), float(FC))
list.append(newObject)
else: #If not...
checkList(Name, TFF, TFD, TVD, FB, FC)
rownum += 1
for each in list: #Print out result
# print(each.Name, each.TFF, each.TFD, each.TVD, each.FB, each.FC)
outfile.write(each.Name + "\n" )
outfile2.write(str(each.TFF)+ "\n" )
outfile3.write(str(each.TFD)+ "\n" )
outfile4.write(str(each.TVD)+ "\n" )
outfile5.write(str(each.FB)+ "\n" )
outfile6.write(str(each.FC)+ "\n" )
file.close() #Close file
outfile.close()
outfile2.close()
outfile3.close()
outfile4.close()
outfile5.close()
outfile6.close()
As @zwer said, the reason you have nothing in your output files is that you don't increment rownum while iterating over the rows of your input file, so every row is treated as the header row. Indenting the line rownum += 1 puts it inside the loop that reads each row. With minimal modification, it would look like this:
import csv
# Sum the five eye-movement values per Name and write each column of the
# totals to its own output file.
file = open("P01_All.csv", "r")  # Open CSV file in read mode
reader = csv.reader(file)  # Reader object which iterates over lines

# One output file per column of the aggregated result.
outfile = open("Name.csv", "w")
outfile2 = open("TFF.csv", "w")
outfile3 = open("TFD.csv", "w")
outfile4 = open("TVD.csv", "w")
outfile5 = open("FB.csv", "w")
outfile6 = open("FC.csv", "w")


class Movement_value:
    """Running totals of the five eye-movement values for one Name."""

    def __init__(self, Name, TFF, TFD, TVD, FB, FC):
        self.Name = Name
        self.TFF = TFF
        self.TFD = TFD
        self.TVD = TVD
        self.FB = FB
        self.FC = FC


rownum = 0  # Row number currently iterating over
notebook = []  # One Movement_value per distinct Name, in first-seen order


def checkList(Name, TFF, TFD, TVD, FB, FC):
    """Add the values to the entry for *Name*, creating it if missing."""
    for value in notebook:
        if value.Name == Name:
            value.TFF += float(TFF)
            value.TFD += float(TFD)
            value.TVD += float(TVD)
            value.FB += float(FB)
            value.FC += float(FC)
            return
    # No entry with this Name yet: start a new one.
    newObject = Movement_value(Name, float(TFF), float(TFD), float(TVD), float(FB), float(FC))
    notebook.append(newObject)


for row in reader:  # Iterate through all the rows
    if rownum == 0:  # Keep the header row separate from the data
        header = row
    else:
        Name = row[0]
        TFF = row[1]
        TFD = row[2]
        TVD = row[3]
        FB = row[4]
        FC = row[5]
        if len(notebook) == 0:  # First data row: nothing to search yet
            newObject = Movement_value(Name, float(TFF), float(TFD), float(TVD), float(FB), float(FC))
            notebook.append(newObject)
        else:
            checkList(Name, TFF, TFD, TVD, FB, FC)
    # Must run once per row (inside the loop); otherwise rownum stays 0 and
    # every row is treated as the header.
    rownum += 1

for each in notebook:  # Write out the result, one value per line per file
    # print(each.Name, each.TFF, each.TFD, each.TVD, each.FB, each.FC)
    outfile.write(each.Name + "\n")
    outfile2.write(str(each.TFF) + "\n")
    outfile3.write(str(each.TFD) + "\n")
    outfile4.write(str(each.TVD) + "\n")
    outfile5.write(str(each.FB) + "\n")
    outfile6.write(str(each.FC) + "\n")

file.close()  # Close input file
outfile.close()
outfile2.close()
outfile3.close()
outfile4.close()
outfile5.close()
outfile6.close()
I have made some additional change: It's better that you don't use list or object as variable names because they are already used in Python and by doing so you'll override their meaning. You could have a bad surprise eventually.
But we can do more.
We don't need to create a class to hold the values
We can work with files using context managers to make sure that our file is not kept open for not relevant reasons.
Here's a version that is shorter than yours:
import csv
import pathlib
from contextlib import ExitStack

input_filepath = pathlib.Path("Input.csv")
output_filepath = pathlib.Path("")

# Where our data will be kept: name -> list of running totals, one per
# value column.
input_data = {}
with open(input_filepath, newline="") as input_file:
    csv_reader = csv.reader(input_file)
    # Skip the header line; the default keeps an empty file from raising
    # StopIteration.
    next(csv_reader, None)
    for row in csv_reader:
        if not row:
            # Blank line: nothing to unpack.
            continue
        name, *rest_of_data = row
        values = [float(x) for x in rest_of_data]
        if name in input_data:
            totals = input_data[name]
            for index, value in enumerate(values):
                totals[index] += value
        else:
            input_data[name] = values

# Each output row is [name, total_1, total_2, ...].
output_rows = ([name] + totals for (name, totals) in input_data.items())
output_filenames = [
    "Name.csv",
    "TFF.csv",
    "TFD.csv",
    "TVD.csv",
    "FB.csv",
    "FC.csv",
]

# ExitStack closes every file that was opened, even if a later open() fails.
with ExitStack() as stack:
    output_files = [
        stack.enter_context(open(output_filepath / filename, "w"))
        for filename in output_filenames
    ]
    for row in output_rows:
        # Column k of every row goes to the k-th output file.
        for (output_file, data) in zip(output_files, row):
            output_file.write("{}\n".format(data))

can not get proper output when i write in .csv file...the comma separated text is being written in multiple columns,i want only two columns

def classify(self):
    """Label every tweet and record per-group counts and results.

    For each key ``i`` of ``self.tweets`` the texts in ``self.tweets[i]``
    are classified one by one. ``self.pos_count[i]``, ``self.neg_count[i]``
    or ``self.neut_count[i]`` is incremented according to the label (any
    other label is recorded but not counted), and ``self.results[i]`` is set
    to a dict mapping the tweet's position to
    ``{'text', 'tweet', 'label'}``.
    """
    for i in self.tweets:
        res = {}
        for count, t in enumerate(self.tweets[i]):
            label = self.classifier.classify(self.helper.extract_features(t.split()))
            if label == 'positive':
                self.pos_count[i] += 1
            elif label == 'negative':
                self.neg_count[i] += 1
            elif label == 'neutral':
                self.neut_count[i] += 1
            # The original tweet is looked up at the same position as the
            # text that was classified.
            res[count] = {'text': t, 'tweet': self.origTweets[i][count], 'label': label}
        self.results[i] = res
def writeOutput(self, filename, writeOption='wb'):
    """Dump one ``"label , text\\n"`` string per classified tweet to a file.

    Args:
        filename: Path of the file to write.
        writeOption: Mode passed to ``open``; binary by default because the
            records are written with ``pickle.dump``.
    """
    # NOTE(review): pickle.dump stores pickled string objects, not plain CSV
    # text, and commas inside the tweet text are neither quoted nor escaped,
    # so a CSV reader will split such text over several columns.
    # The with-block closes the file, which was previously left open.
    with open(filename, writeOption) as fp:
        for i in self.results:
            res = self.results[i]
            for j in res:
                item = res[j]
                text = item['text'].strip()
                label = item['label']
                writeStr = label + " , " + text + "\n"
                pickle.dump(writeStr, fp)
I am writing 'label' and 'text' in csv "filename", but the text is getting divided in multiple columns which causing further error in my work.
You need to use Python's CSV library. This takes a list of values and writes it to a file, automatically adding suitable delimiters and quoting to make it a proper comma separated file. In your case, your text contains commas, as such these items will automatically have " correctly added around them. This can be done roughly as follows:
import csv
def writeOutput(self, filename, writeOption='wb'):
    """Write one ``label,text`` CSV row per classified tweet to *filename*.

    ``csv.writer`` quotes any text that contains commas, so each tweet
    stays in a single column.

    Args:
        filename: Path of the CSV file to write.
        writeOption: File mode. The binary default suits Python 2's csv
            module; on Python 3 the ``b`` flag is dropped and the file is
            opened in text mode with ``newline=''``, as csv requires there.
    """
    import sys

    if sys.version_info[0] >= 3:
        f_output = open(filename, writeOption.replace('b', ''), newline='')
    else:
        f_output = open(filename, writeOption)
    with f_output:
        csv_output = csv.writer(f_output)
        for i in self.results:
            res = self.results[i]
            for j in res:
                item = res[j]
                csv_output.writerow([item['label'], item['text'].strip()])

CSV not working if I put some extra space. Getting list index out of range error

I am new to Python and I am trying to read CSV data using Python code.
Everything works the first time, but when I edit my .csv file, an error occurs:
File "D:/wamp/www/optimizer_new/new_code/optimal_lineup.py", line 310, in get_player_list
if (int(row[4]) == -1):
IndexError: list index out of range
All I did was put an extra space inside my .csv file.
here is my sample code:
def get_player_list(possible_name):
    """Build the list of players to consider from a CSV file.

    Expected columns: Player Name, Pos, Salary, FP, Keep/exclude.

    Args:
        possible_name: Path of the CSV file; when empty,
            ``'basketball_data2.csv'`` is used instead.

    Returns:
        A list of ``player`` objects, one per data row whose Keep/exclude
        value is not -1.
    """
    file_name = possible_name if possible_name else 'basketball_data2.csv'
    player_list = []
    with open(file_name) as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        # Skip the header row. The built-in next() works on Python 2.6+ and
        # Python 3 (reader.next() exists only on Python 2); the default
        # keeps an empty file from raising StopIteration.
        next(reader, None)
        for row in reader:
            # Blank or truncated lines (e.g. stray whitespace left behind
            # while editing the file) have no Keep/exclude column and would
            # otherwise raise IndexError.
            if len(row) < 5:
                continue
            keep_flag = int(row[4])
            if keep_flag == -1:
                # print("Skipping %s" % (row[0]))
                continue
            name = row[0]
            pos_p = get_possible_positions(row[1])
            c = row[2]
            v = row[3]
            my_p = player(int(c) / 100, float(v), name, pos_p, keep_flag)
            player_list.append(my_p)
    return player_list
My CSV contain these columns:
Player Name,Pos,Salary,FP,Keep/exclude
Any suggestion?

Categories

Resources