I`m trying to read and extract information of a large txt and to write it in another document, and I get this error:
Here is my code:
#Create list with PLZ, city and state
cepfinal = open("cepfinal.txt", "w") #file to be written
with open("ceptest2.txt", "r") as fp: #read file
while True:
line = fp.readline()
# print(str(line))
x = line.split("\t") #separate all that have double space
plz = x[0] #extract PLZ
# print(plz)
y = x[1]
mun = y.split("/") #separe city from state
# print(mun)
plzmun = [plz] + mun
# print(plzmun)
final = plzmun.pop(2) #remove state
plzmun = " ".join(plzmun) #create string
print(plzmun)
cepfinal.write(plzmun + "\n")
fp.close()
It is a 45 Gb file, so I suppose I have a memory issue. Can someone help me to make a lean code?
your problem is with encoding,
you can try this to solve your problem
with open("ceptest2.txt", "r", encoding="utf8") as fp:
I'm a relative novice at python but yet, somehow managed to build a scraper for Instagram. I now want to take this one step further and output the 5 most commonly used hashtags from an IG profile into my CSV output file.
Current output:
I've managed to isolate the 5 most commonly used hashtags, but I get this result in my csv:
[('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron',
2), ('#togetherwecanchangetheworld', 1), ('#halloweenchronicles', 1)]
Desired output:
What I'm looking to end up with in the end is having 5 columns at the end of my .CSV outputting the X-th most commonly used value.
So something in the lines of this:
I've Googled for a while and managed to isolate them separately, but I always end up with '('#thekidfromakron', 2)' as an output. I seem to be missing some part of the puzzle :(.
Here is what I'm working with at the moment:
import csv
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from collections import Counter
ts = time.gmtime()
def get_csv_header(top_numb):
fieldnames = ['USER','MEDIA COUNT','FOLLOWERCOUNT','TOTAL LIKES','TOTAL COMMENTS','ER','ER IN %', 'BIO', 'ALL CAPTION TEXT','HASHTAGS COUNTED','MOST COMMON HASHTAGS']
return fieldnames
def write_csv_header(filename, headers):
with open(filename, 'w', newline='') as f_out:
writer = csv.DictWriter(f_out, fieldnames=headers)
writer.writeheader()
return
def read_user_name(t_file):
with open(t_file) as f:
user_list = f.read().splitlines()
return user_list
if __name__ == '__main__':
# HERE YOU CAN SPECIFY YOUR USERLIST FILE NAME,
# Which contains a list of usernames's BY DEFAULT <current working directory>/userlist.txt
USER_FILE = 'userlist.txt'
# HERE YOU CAN SPECIFY YOUR DATA FILE NAME, BY DEFAULT (data.csv)', Where your final result stays
DATA_FILE = 'users_with_er.csv'
MAX_POST = 12 # MAX POST
print('Starting the engagement calculations... Please wait until it finishes!')
users = read_user_name(USER_FILE)
""" Writing data to csv file """
csv_headers = get_csv_header(MAX_POST)
write_csv_header(DATA_FILE, csv_headers)
for user in users:
post_info = {'USER': user}
url = 'https://www.instagram.com/' + user + '/'
#for troubleshooting, un-comment the next two lines:
#print(user)
#print(url)
try:
r = requests.get(url)
if r.status_code != 200:
print(timestamp,' user {0} not found or page unavailable! Skipping...'.format(user))
continue
soup = BeautifulSoup(r.content, "html.parser")
scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
timestamp = time.strftime("%d-%m-%Y %H:%M:%S", ts)
except ValueError:
print(timestamp,'ValueError for username {0}...Skipping...'.format(user))
continue
except IndexError as error:
# Output expected IndexErrors.
print(timestamp, error)
continue
if j['graphql']['user']['edge_followed_by']['count'] <=0:
print(timestamp,'user {0} has no followers! Skipping...'.format(user))
continue
if j['graphql']['user']['edge_owner_to_timeline_media']['count'] <12:
print(timestamp,'user {0} has less than 12 posts! Skipping...'.format(user))
continue
if j['graphql']['user']['is_private'] is True:
print(timestamp,'user {0} has a private profile! Skipping...'.format(user))
continue
media_count = j['graphql']['user']['edge_owner_to_timeline_media']['count']
accountname = j['graphql']['user']['username']
followercount = j['graphql']['user']['edge_followed_by']['count']
bio = j['graphql']['user']['biography']
i = 0
total_likes = 0
total_comments = 0
all_captiontext = ''
while i <= 11:
total_likes += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_liked_by']['count']
total_comments += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_comment']['count']
captions = j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_caption']
caption_detail = captions['edges'][0]['node']['text']
all_captiontext += caption_detail
i += 1
engagement_rate_percentage = '{0:.4f}'.format((((total_likes + total_comments) / followercount)/12)*100) + '%'
engagement_rate = (((total_likes + total_comments) / followercount)/12*100)
#isolate and count hashtags
hashtags = re.findall(r'#\w*', all_captiontext)
hashtags_counted = Counter(hashtags)
most_common = hashtags_counted.most_common(5)
with open('users_with_er.csv', 'a', newline='', encoding='utf-8') as data_out:
print(timestamp,'Writing Data for user {0}...'.format(user))
post_info["USER"] = accountname
post_info["FOLLOWERCOUNT"] = followercount
post_info["MEDIA COUNT"] = media_count
post_info["TOTAL LIKES"] = total_likes
post_info["TOTAL COMMENTS"] = total_comments
post_info["ER"] = engagement_rate
post_info["ER IN %"] = engagement_rate_percentage
post_info["BIO"] = bio
post_info["ALL CAPTION TEXT"] = all_captiontext
post_info["HASHTAGS COUNTED"] = hashtags_counted
csv_writer = csv.DictWriter(data_out, fieldnames=csv_headers)
csv_writer.writerow(post_info)
""" Done with the script """
print('ALL DONE !!!! ')
The code that goes before this simply scrapes the webpage, and compiles all the captions from the last 12 posts into "all_captiontext".
Any help to solve this (probably simple) issue would be greatly appreciated as I've been struggling with this for days (again, I'm a noob :') ).
Replace line
post_info["MOST COMMON HASHTAGS"] = most_common
with:
for i, counter_tuple in enumerate(most_common):
tag_name = counter_tuple[0].replace('#','')
label = "Top %d" % (i + 1)
post_info[label] = tag_name
There's also a bit of code missing. For example, your code doesn't include csv_headers variable, which I suppose would be
csv_headers = post_info.keys()
It also seems that you're opening a file to write just one row. I don't think that's intended, so what you would like to do is to collect the results into a list of dictionaries. A cleaner solution would be to use pandas' dataframe, which you can output straight into a csv file.
most_common being the output of the call to hashtags_counted.most_common, I had a look at the doc here: https://docs.python.org/2/library/collections.html#collections.Counter.most_common
Output if formatted the following : [(key, value), (key, value), ...] and ordered in decreasing importance of number of occurences.
Hence, to get only the name and not the number of occurence, you should replace:
post_info["MOST COMMON HASHTAGS"] = most_common
by
post_info["MOST COMMON HASHTAGS"] = [x[0] for x in most_common]
You have a list of tuple. This statement builds on the fly the list of the first element of each tuple, keeping the sorting order.
I am reading the google spreadsheet and want to write back to the same sheet in a specific cell. Below is the code I have written.
RANGE_NAME = 'Latest Records!A2:L'
result =
service.spreadsheets().values().get(spreadsheetId=SPREADSHEET_ID,
range=RANGE_NAME).execute()
values = result.get('values', [])
testurl = (row[1])
import requests
from HomebellAutomation.System.Config import myconfig
testurl= myconfig.URL_webpagetest
r=requests.get( myconfig.URL_webpagetest + "?url=" + testurl + "&f=xml&k=" + myconfig.apikey_webpagetest )
xmltxt=r.content
#print (xmltxt)
testId = XML (xmltxt).find("data").find('testId')
print (testId.text)
test_id = [testId.text]
I want to print the test_id back to the google spraedsheet , Please help me on this.
I am trying to get data from a website pdf with python.
The website is
https://wgfd.wyo.gov/WGFD/media/content/PDF/Hunting/Drawing%20Odds/Draw_Elk_RD_Res_2010.pdf
My code imports it all into a single string and removes the header information successfully. I am trying to build a Pandas data frame but some unusual/hidden characters are causing problems so I cant just split it into a list. I have figured out that there are 131 characters in a the first few rows but then a hidden control character disappears resulting in the row not being parsed correctly. I have tried removing the standard /n, /r, /t, and even /v with no luck. The mysterious character apparently spaces down two rows.
pdfFileObj = open(filename,'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
num_pages = pdfReader.numPages
text=''
#create string
for i in range(num_pages):
pageObj = pdfReader.getPage(i)
textplus = pageObj.extractText()
firsthunt = re.findall(r'(?<!\d)\d{3}(?!\d)',textplus)
numtoskip = textplus.find(str(firsthunt[0]))
text += textplus[numtoskip:]
#replace charaters
text = text.replace(' ','|')
text = text.replace('/r','!')
text = text.replace('/n','#')
text = text.replace('/t','^')
text = text.replace('/v','*')
# del file
pdfFileObj.__del__()
os.remove(filename)
rowlength = 131
for i in range(0,10):
print(text[i*rowlength:(1+i)*rowlength])
print('New Row')
Hope this makes some sense.
Thanks
I have a shapefile (Fireincidents) that currently does not have any data. I want to populate the feature with data from a text file (north_america_fires). The text file contains the lat, long, and confidence of each fire. I've created a new insert cursor to insert new roles. I've also started a for loop to loop through each of the rows in the text file. I am having trouble identifying a method to populate the rows in the point class object. I believe I have to create a list that the data from the text file can be appended to then somehow insert that list of data into the attribute table.
My current code:
try:
work = raw_input("Enter the full path of WildlandFires.mdb: ")
arcpy.env.workspace = work
arcpy.env.overwriteOutput = True
iFile = raw_input("Enter the full path of wildfire text file: ")
fields = ["SHAPE#", "CONFIDENCEVALUE"]
cur = arcpy.da.InsertCursor("FireIncidents", fields)
f = open(iFile, 'r')
lstFires = f.readlines()
cntr = 0
for fire in lstFires:
if 'Latitude' in fire:
continue
row = line.split(',')
lstValues = []
latitude = row[0].strip()
longitude = row[1].strip()
confid = row[2].strip()
pnt = arcpy.CreateObject("Point")
lstValues.append(pnt)
f.close()
except Exception as e:
print "Error: " + str(e)
print arcpy.GetMessages()
arcpy.AddError(e)
Any guidance would be appreciated.
I just identified the issue. It does have correct lat and long coordinates. Thank you for responding.