I'm a relative novice at Python, but I've somehow managed to build a scraper for Instagram. I now want to take this one step further and output the 5 most commonly used hashtags from an IG profile into my CSV output file.
Current output:
I've managed to isolate the 5 most commonly used hashtags, but I get this result in my csv:
[('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron', 2), ('#togetherwecanchangetheworld', 1), ('#halloweenchronicles', 1)]
Desired output:
What I'd like to end up with is 5 columns at the end of my CSV, each holding the X-th most commonly used hashtag.
So something along the lines of this:
I've Googled for a while and managed to isolate them separately, but I always end up with ('#thekidfromakron', 2) as the output. I seem to be missing some part of the puzzle :(.
Here is what I'm working with at the moment:
import csv
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from collections import Counter
ts = time.gmtime()
def get_csv_header(top_numb):
    fieldnames = ['USER', 'MEDIA COUNT', 'FOLLOWERCOUNT', 'TOTAL LIKES', 'TOTAL COMMENTS', 'ER', 'ER IN %', 'BIO', 'ALL CAPTION TEXT', 'HASHTAGS COUNTED', 'MOST COMMON HASHTAGS']
    return fieldnames

def write_csv_header(filename, headers):
    with open(filename, 'w', newline='') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=headers)
        writer.writeheader()
    return

def read_user_name(t_file):
    with open(t_file) as f:
        user_list = f.read().splitlines()
    return user_list

if __name__ == '__main__':
    # HERE YOU CAN SPECIFY YOUR USERLIST FILE NAME,
    # which contains a list of usernames. BY DEFAULT: <current working directory>/userlist.txt
    USER_FILE = 'userlist.txt'
    # HERE YOU CAN SPECIFY YOUR DATA FILE NAME, BY DEFAULT (data.csv), where your final result stays
    DATA_FILE = 'users_with_er.csv'
    MAX_POST = 12  # MAX POST

    print('Starting the engagement calculations... Please wait until it finishes!')
    users = read_user_name(USER_FILE)

    """ Writing data to csv file """
    csv_headers = get_csv_header(MAX_POST)
    write_csv_header(DATA_FILE, csv_headers)

    for user in users:
        post_info = {'USER': user}
        url = 'https://www.instagram.com/' + user + '/'
        # timestamp is used in every log message below, so build it up front
        timestamp = time.strftime("%d-%m-%Y %H:%M:%S", ts)
        # for troubleshooting, un-comment the next two lines:
        # print(user)
        # print(url)
        try:
            r = requests.get(url)
            if r.status_code != 200:
                print(timestamp, ' user {0} not found or page unavailable! Skipping...'.format(user))
                continue
            soup = BeautifulSoup(r.content, "html.parser")
            scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
            stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
            j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
        except ValueError:
            print(timestamp, 'ValueError for username {0}...Skipping...'.format(user))
            continue
        except IndexError as error:
            # Output expected IndexErrors.
            print(timestamp, error)
            continue

        if j['graphql']['user']['edge_followed_by']['count'] <= 0:
            print(timestamp, 'user {0} has no followers! Skipping...'.format(user))
            continue
        if j['graphql']['user']['edge_owner_to_timeline_media']['count'] < 12:
            print(timestamp, 'user {0} has less than 12 posts! Skipping...'.format(user))
            continue
        if j['graphql']['user']['is_private'] is True:
            print(timestamp, 'user {0} has a private profile! Skipping...'.format(user))
            continue

        media_count = j['graphql']['user']['edge_owner_to_timeline_media']['count']
        accountname = j['graphql']['user']['username']
        followercount = j['graphql']['user']['edge_followed_by']['count']
        bio = j['graphql']['user']['biography']

        # totals over the last 12 posts
        i = 0
        total_likes = 0
        total_comments = 0
        all_captiontext = ''
        while i <= 11:
            total_likes += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_liked_by']['count']
            total_comments += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_comment']['count']
            captions = j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_caption']
            caption_detail = captions['edges'][0]['node']['text']
            all_captiontext += caption_detail
            i += 1

        engagement_rate_percentage = '{0:.4f}'.format((((total_likes + total_comments) / followercount) / 12) * 100) + '%'
        engagement_rate = (((total_likes + total_comments) / followercount) / 12 * 100)

        # isolate and count hashtags
        hashtags = re.findall(r'#\w*', all_captiontext)
        hashtags_counted = Counter(hashtags)
        most_common = hashtags_counted.most_common(5)

        with open('users_with_er.csv', 'a', newline='', encoding='utf-8') as data_out:
            print(timestamp, 'Writing Data for user {0}...'.format(user))
            post_info["USER"] = accountname
            post_info["FOLLOWERCOUNT"] = followercount
            post_info["MEDIA COUNT"] = media_count
            post_info["TOTAL LIKES"] = total_likes
            post_info["TOTAL COMMENTS"] = total_comments
            post_info["ER"] = engagement_rate
            post_info["ER IN %"] = engagement_rate_percentage
            post_info["BIO"] = bio
            post_info["ALL CAPTION TEXT"] = all_captiontext
            post_info["HASHTAGS COUNTED"] = hashtags_counted
            post_info["MOST COMMON HASHTAGS"] = most_common
            csv_writer = csv.DictWriter(data_out, fieldnames=csv_headers)
            csv_writer.writerow(post_info)
""" Done with the script """
print('ALL DONE !!!! ')
The code that goes before this simply scrapes the webpage, and compiles all the captions from the last 12 posts into "all_captiontext".
Any help to solve this (probably simple) issue would be greatly appreciated as I've been struggling with this for days (again, I'm a noob :') ).
Replace line
post_info["MOST COMMON HASHTAGS"] = most_common
with:
for i, counter_tuple in enumerate(most_common):
    tag_name = counter_tuple[0].replace('#', '')
    label = "Top %d" % (i + 1)
    post_info[label] = tag_name
There's also a bit of code missing. For example, your code doesn't include a csv_headers variable, which I suppose would be
csv_headers = post_info.keys()
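Note that whatever header you end up with has to contain the new 'Top 1' through 'Top 5' labels built by the loop above, otherwise DictWriter.writerow() will raise a ValueError for the extra keys. A minimal sketch, reusing the fieldnames from the question with 'MOST COMMON HASHTAGS' swapped out for one column per top hashtag:

# Sketch: the original header list, with one column per top hashtag at the end.
fieldnames = ['USER', 'MEDIA COUNT', 'FOLLOWERCOUNT', 'TOTAL LIKES',
              'TOTAL COMMENTS', 'ER', 'ER IN %', 'BIO', 'ALL CAPTION TEXT',
              'HASHTAGS COUNTED'] + ["Top %d" % (i + 1) for i in range(5)]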
It also seems that you're reopening the file to write just one row at a time. I don't think that's intended, so what you would like to do is collect the results into a list of dictionaries. A cleaner solution would be to use a pandas DataFrame, which you can output straight to a CSV file.
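A rough sketch of that cleaner approach (assuming you collect each user's post_info dict into a rows list inside the loop instead of writing it straight away):

import pandas as pd

rows = []
# ... inside the per-user loop, append instead of writing immediately:
#     rows.append(post_info)

# After the loop: one row per user, one column per key.
df = pd.DataFrame(rows)
df.to_csv('users_with_er.csv', index=False)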
Since most_common is the output of the call to hashtags_counted.most_common(), I had a look at the doc here: https://docs.python.org/2/library/collections.html#collections.Counter.most_common
The output is formatted as follows: [(key, value), (key, value), ...], ordered by decreasing number of occurrences.
Hence, to get only the name and not the number of occurrences, you should replace:
post_info["MOST COMMON HASHTAGS"] = most_common
by
post_info["MOST COMMON HASHTAGS"] = [x[0] for x in most_common]
You have a list of tuples. This statement builds, on the fly, the list of the first element of each tuple, keeping the sort order.
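For example, with the data from the question:

most_common = [('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron', 2),
               ('#togetherwecanchangetheworld', 1), ('#halloweenchronicles', 1)]
print([x[0] for x in most_common])
# ['#striveforgreatness', '#jamesgang', '#thekidfromakron', '#togetherwecanchangetheworld', '#halloweenchronicles']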
Related
I'm making a script that fills a text document with responses from an API. The API is being asked to convert usernames from a list to universally unique identifiers (UUIDs). I keep getting this error and can't find a way around it: "json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)"
Sample of accounts.txt
knapplace
Coppinator
tynow
Pman59
ButterMusty
FlyHighGuy13
Seyashi
fluzzygirl1
SquidMan55
leonrules9
BarthGimble
MTR_30
Darkshadow402
Deathmyster
Team_Everlook
Sheathok
KCFrost
mendog
Allfaal117
theLP25D
Zimyx
Blurrnis
redboy678
moose_breeder
kaser12345
import requests
import json
file1 = open('accounts.txt', 'r')
usernames = []
for line in file1:
    stripped_line = line.strip()
    usernames.append(stripped_line)
file1.close()

for x in usernames:
    username = str(x)
    url = ("https://api.mojang.com/users/profiles/minecraft/" + username + "?at=1462770000")
    y = requests.get(url)
    y_data = y.json()
    uuid = y_data['id']
    uuids = []
    uuids.append(uuid)

file2 = open('uuids.txt', 'w')
file2.writelines(uuids)
file2.close()

file2 = open('uuids.txt', 'r')
lines = file2.readlines()
Note: @Ali makes a great point about checking for an empty reply. With that fix it works like a champ for me, with a few other minor changes:
- Used the usernames provided by the OP instead of reading them in from a file.
- Moved the initialization of uuids out of the for loop to avoid it being reset for each username.
- Modified the file I/O to what I am more used to working with. ;^)
import requests
import json
usernames = [
    "knapplace",
    "Coppinator",
    "tynow",
]

uuids = []
for x in usernames:
    username = str(x)
    url = ("https://api.mojang.com/users/profiles/minecraft/" + username + "?at=1462770000")
    y = requests.get(url)
    if len(y.content) == 0:
        continue  # Skip processing this username
    y_data = y.json()
    uuid = y_data['id']
    uuids.append(uuid)

with open('uuids.txt', 'w') as f:
    for uuid in uuids:
        f.write(uuid + '\n')

with open('uuids.txt', 'r') as f:
    read_data = f.read()
print(read_data)
Output:
c9998bafea3146d5935f4e215b6b4351
5c321f81409847a0907c4b30c342217f
9f206def69bf407fbab6de7c9b70ff80
I checked the URL you pasted. If the user does not exist, the API does not return any content but still returns a successful status. That is what the error means — it expected there to be a JSON object starting at char 0.
Essentially, you need to handle the case when the response is empty before you try to execute a y.json() by checking y.content. If y.content is empty, skip processing the current username and go to the next one.
y = requests.get(url)
if len(y.content) == 0:
    continue  # Skip processing this username

# The rest of the code only runs if y.content is not empty.
y_data = y.json()
uuid = y_data['id']
input.csv has a list of URLs in one column to check. The script runs these checks perfectly and is then supposed to write out which sites are up and which are down. However, the last part of the code only seems to write the last site checked. How can I get it to write all the checks to output.csv?
import re
import requests
from itertools import product, cycle
import csv
from urllib.request import urlopen
with open("input.csv") as f:
for row in csv.reader(f):
s = 'Quantity' or 'Stock'
r = requests.get(row[0])
result = re.search(s, r.text)
if result == None:
print("OUT OF STOCK")
inis = "OUT OF STOCK"
else:
print("In Stock")
inis = "In Stock"
new_column = [(row[0]) + " " + inis]
print(row[0])
with open("output.csv", "w") as f:
for col in new_column:
f.write(col + "\n")
the script is supposed to write out which sites are up and which are down
I'm not sure where the 'stock' terminology comes from, but assuming the logic code is correct, why not try something like this...
Create an empty list first, then on each iteration of the loop append a row to it:
with open("input.csv") as f:
new_column = []
for row in csv.reader(f):
s = 'Quantity' or 'Stock'
r = requests.get(row[0])
result = re.search(s, r.text)
if result == None:
inis = "OUT OF STOCK"
else:
inis = "In Stock"
new_column.append(f"{row[0]},{inis}")
print(row[0], inis)
new_column is now a list of CSV-compatible rows, which can be written to output.csv with your existing code.
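In other words, the write loop you already have just moves after the whole input file has been processed:

with open("output.csv", "w") as f:
    for col in new_column:
        f.write(col + "\n")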
I am trying to extract values from JSON-LD to CSV, as they are in the file. There are a couple of issues I am facing.
1. The values being read for different fields are getting truncated in most cases. In the remaining cases, the value of one field appears in another field.
2. I am also getting an error, 'Additional data', after some 4,000 lines.
The file is quite big (half a GB). I am attaching a shortened version of my code. Please tell me where I am going wrong.
I have shortened the input file and put it here, since there was no way of including it in the post:
https://github.com/Architsi/json-ld-issue
I tried writing this script, and I tried multiple online converters too:
import csv, sys, math, operator, re, os, json, ijson
from pprint import pprint
filelist = []
for file in os.listdir("."):
    if file.endswith(".json"):
        filelist.append(file)

for input in filelist:
    newCsv = []
    splitlist = input.split(".")
    output = splitlist[0] + '.csv'
    newFile = open(output, 'w', newline='')  # wb for windows, else you'll see newlines added to csv

    # initialize csv writer
    writer = csv.writer(newFile)

    # Name of the columns
    header_row = ('Format', 'Description', 'Object', 'DataProvider')
    writer.writerow(header_row)

    with open(input, encoding="utf8") as json_file:
        data = ijson.items(json_file, 'item')
        # passing all the values through try except
        for s in data:
            source = s['_source']
            try:
                source_resource = source['sourceResource']
            except:
                print("Warning: No source resource in record ID: " + id)
            try:
                data_provider = source['dataProvider'].encode()
            except:
                data_provider = "N/A"
            try:
                _object = source['object'].encode()
            except:
                _object = "N/A"
            try:
                descriptions = source_resource['description']
                string = ""
                for item in descriptions:
                    if len(descriptions) > 1:
                        description = item.encode()  # + " | "
                    else:
                        description = item.encode()
                    string = string + description
                description = string.encode()
            except:
                description = "N/A"
            created = ""

            # writing it to csv
            write_tuple = ('format', description, _object, data_provider)
            writer.writerow(write_tuple)

    print("File written to " + output)
    newFile.close()
The error that I am getting is this: raise common.JSONError('Additional Data')
The expected result is a CSV file with all the columns and correct values.
Alright, so I need code that will take a CSV file and read the values in it (so far I've gotten that part down).
What I'm having trouble with is creating a list of those values and ordering them from least frequently occurring to most frequently occurring. There can be no duplicate values either.
Here's what I have:
import csv

B = []
K = []

def readandprocess(name):
    with open(name, newline='') as csvf:
        freader = csv.reader(csvf, delimiter=',', quotechar='"')
        datasg = {}
        artists = []
        for row in freader:
            artist = row[2]
            B.append(artist)
        for artist in B:
            c = B.count(artist)
            K.append(artist + str(c))
        list(set(K))
        print(K)

        # for row in freader:
        #     artist = row[2]
        #     song = row[1]
        #     if artist == 'Rolling Stones':
        #         print('Rolling Stones title: ', row[1])
        #     if artist not in datasg:
        #         datasg[artist] = [song]
        #     else:
        #         datasg[artist].append(song)
        # for artist in datasg:
        #     print(artist, datasg[artist])

        print('--------------------------------------')
        info = datasg.items()
        # tosort = [(len(t[1]), t[0]) for t in info]
        # info = sorted(tosort)
        # print(info[-30:])
        # print(info)
        print(len(datasg))  # currently 0, populate at will  # Number of keys in dictionary
    return datasg

if __name__ == '__main__':
    datasg = readandprocess('data/top1000.csv')
Try using Counter. Once you have all the items you need in a list, you can use a Counter, and then call most_common(n) to get the n most common elements.
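A minimal sketch (the artist names here are just placeholders):

from collections import Counter

artists = ['Rolling Stones', 'Beatles', 'Rolling Stones', 'Queen', 'Rolling Stones']
counts = Counter(artists)                      # keys are unique, so no duplicates
print(counts.most_common(2))                   # [('Rolling Stones', 3), ('Beatles', 1)]
print(list(reversed(counts.most_common())))    # least common first, as you described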
I'm writing a script where one of its functions is to read a CSV file that contains URLs in one of its columns. Unfortunately, the system that creates those CSVs doesn't put double quotes around values in the URL column, so when the URL contains commas it breaks all my CSV parsing.
This is the code I'm using:
with open(accesslog, 'r') as csvfile, open('results.csv', 'w') as enhancedcsv:
    reader = csv.DictReader(csvfile)
    for row in reader:
        self.uri = (row['URL'])
        self.OriCat = (row['Category'])
        self.query(self.uri)
        print self.URL + "," + self.ServerIP + "," + self.OriCat + "," + self.NewCat
This is a sample URL that is breaking the parsing; this URL comes in the column named "URL" (note the commas at the end):
ams1-ib.adnxs.com/ww=1238&wh=705&ft=2&sv=43&tv=view5-1&ua=chrome&pl=mac&x=1468251839064740641,439999,v,mac,webkit_chrome,view5-1,0,,2,
The field following the URL always comes with a numeric value in parentheses, e.g. (9999), so this could be used to determine where the URL with its commas ends.
How can I deal with a situation like this using the csv module?
You will have to do it a little more manually. Try this
def process(lines, delimiter=','):
    header = None
    url_index_from_start = None
    url_index_from_end = None
    for line in lines:
        if not header:
            header = [l.strip() for l in line.split(delimiter)]
            url_index_from_start = header.index('URL')
            url_index_from_end = len(header) - url_index_from_start
        else:
            data = [l.strip() for l in line.split(delimiter)]
            url_from_start = url_index_from_start
            url_from_end = len(data) - url_index_from_end
            values = data[:url_from_start] + data[url_from_end+1:] + [delimiter.join(data[url_from_start:url_from_end+1])]
            keys = header[:url_index_from_start] + header[url_index_from_end+1:] + [header[url_index_from_start]]
            yield dict(zip(keys, values))
Usage:
lines = ['Header1, Header2, URL, Header3',
'Content1, "Content2", abc,abc,,abc, Content3']
result = list(process(lines))
assert result[0]['Header1'] == 'Content1'
assert result[0]['Header2'] == '"Content2"'
assert result[0]['Header3'] == 'Content3'
assert result[0]['URL'] == 'abc,abc,,abc'
print(result)
Result:
>>> [{'URL': 'abc,abc,,abc', 'Header2': '"Content2"', 'Header3': 'Content3', 'Header1': 'Content1'}]
Have you considered using Pandas to read your data in?
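On its own, pd.read_csv will trip over the extra commas just like the csv module does, but recent pandas versions (1.4 or later, with the python engine) let you pass a function that repairs malformed rows. A rough sketch, assuming the URL is the third of four columns; the file name and column counts here are made up, so adjust them to your real layout:

import pandas as pd

EXPECTED_COLS = 4   # assumed number of columns in the access log
URL_INDEX = 2       # assumed 0-based position of the URL column

def mend_row(bad_fields):
    # Called only for rows with too many fields: glue the overflow back
    # into the URL column so the row comes out at the expected width.
    extra = len(bad_fields) - EXPECTED_COLS
    url = ",".join(bad_fields[URL_INDEX:URL_INDEX + extra + 1])
    return bad_fields[:URL_INDEX] + [url] + bad_fields[URL_INDEX + extra + 1:]

df = pd.read_csv("accesslog.csv", engine="python", on_bad_lines=mend_row)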
Another possible solution would be to use regular expressions to pre-process the data...
# read the file once so the regex can run over its contents
f = open(filein, 'r')
filedata = f.read()
f.close()

# make a list of everything you want to change
old = re.findall(regex, filedata)

# append quotes and create a new list
new = []
for url in old:
    url2 = "\"" + url + "\""
    new.append(url2)

# combine the lists
old_new = list(zip(old, new))

# then use the list to update the file, accumulating the replacements
for old, new in old_new:
    filedata = filedata.replace(old, new)

f = open(filein, 'w')
f.write(filedata)
f.close()
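The regex itself is left undefined above, and what it should be depends entirely on your data. As a purely hypothetical example, if every problem URL starts with the host from the sample in the question and is always followed by the "(9999)"-style field, something along these lines might work:

import re

# Hypothetical values: adjust the file name, host and pattern to your actual data.
filein = 'accesslog.csv'
regex = r'ams1-ib\.adnxs\.com/\S*?(?=,\(\d+\))'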