This script almost works, but the matching is broken: rows that should match are reported as 'no match', and when a 'match' is reported the values being compared are not the right ones. Example output:
no match
Lower 117, $331.50, F, 8, 193
Upper 218, $155.00, AA, 8, 195
match
Floor 6, $273.00, N, 2, 195
SECTION,PRICE,ROW,QTY,DYSLSTED
So I'm not sure why it isn't working. After all the values from the HTML file have been loaded the first time, the program should print 'match' for every listing, since they would all already be in the CSV. But when I run it in its current configuration, the opposite happens.
The HTML file eagles.html is here.
Here is my script:
import os
import sys
from bs4 import BeautifulSoup
import lxml.html as lh
import csv

soup = BeautifulSoup(open("eagles.html"), "lxml")
###################################################################
variable = 'test_csv_1' ########DELETE
dir_path = os.path.dirname(os.path.realpath(__file__))
file_path = (dir_path + '\Sheets')
try:
    os.makedirs(file_path)
except:
    pass
#######################
for mytable in soup.find_all('table'):
    for trs in mytable.find_all('tr'):
        tds = trs.find_all('td')
        row1 = [elem.text.strip() for elem in tds]
        row = str(row1)
        cool = row.replace("[", "")
        coolp = cool.replace("]", "")
        cool2 = coolp.replace("'", "")
        cool3 = cool2.replace(" , ", "")
        row = cool3
        rowtest = (row.split(','))
        if len(rowtest) != 5:
            rowtest = ['NULL', 'NULL', 'NULL', 'NULL', 'NULL']
        ###TABLE STUFF###
        rowtest0 = rowtest[:4]   # LISTING WITHOUT DAYS LISTED
        rowtest1 = rowtest[0:1]  # SECTION LOCATION
        rowtest2 = rowtest[1:2]  # TICKET PRICE
        rowtest3 = rowtest[2:3]  # ROW
        rowtest4 = rowtest[3:4]  # TICKET QTY
        rowtest5 = rowtest[4:5]  # DAYS LISTED
        ###TABLE STUFF#
        ###CREATE CSV HEADER###
        with open(file_path + '\\' + variable + '.csv', 'a+') as headercsv:
            if os.stat(file_path + '\\' + variable + '.csv').st_size == 0:
                writer = csv.writer(headercsv)
                writer.writerow(["SECTION", "PRICE", "ROW", "QTY", "DYSLSTED"])
                print('CREATED HEADERS FOR NEW FILE')
            else:
                pass
        ###WRITE TO CSV###
        with open(file_path + '\\' + variable + '.csv', 'r') as rowin:
            if rowtest == ['NULL', 'NULL', 'NULL', 'NULL', 'NULL']:
                continue
            else:
                pass
            for boogie in rowin:
                if row in boogie:
                    print(row)
                    print(boogie)
                    print('match')
                    break
                else:
                    print(row)
                    print(boogie)
                    print('no match')
                    with open(file_path + '\\' + variable + '.csv', 'a+') as ruts:
                        writer = csv.writer(ruts)
                        writer.writerow(rowtest)
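For what it's worth, here is a minimal sketch (not a tested fix) of how the duplicate check could be done on parsed field lists with csv.reader instead of substring-matching a stringified row; it reuses file_path, variable and rowtest from the script above:

import csv

# Rough sketch only: load the existing CSV rows as tuples once,
# then compare the parsed fields of the current listing directly.
csv_path = file_path + '\\' + variable + '.csv'
with open(csv_path, newline='') as f:
    existing = {tuple(r) for r in csv.reader(f)}

if tuple(rowtest) in existing:
    print('match')
else:
    print('no match')
    with open(csv_path, 'a', newline='') as f:
        csv.writer(f).writerow(rowtest)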
Related
I have a generated CSV file that has some extra information in its first line. I'm trying to skip that line, but it doesn't seem to work. I looked at several suggestions and examples.
I tried using skiprows.
I also looked at several other examples.
Pandas drop first columns after csv read
https://datascientyst.com/pandas-read-csv-file-read_csv-skiprows/
Nothing I tried worked the way I wanted; when I did get something to work, it deleted the entire row instead.
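For reference, a minimal sketch of the skiprows approach (assuming the extra information occupies only the first physical line of the file, before the real header):

import pandas as pd

# skiprows=1 tells pandas to ignore the first physical line of the file,
# so the header is read from the second line instead (sketch only).
dataframe = pd.read_csv('Csvfile.csv', encoding='latin1', skiprows=1)
print(dataframe.head())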
Here is a sample of the code
# Imports the Pandas Module. It must be installed to run this script.
import pandas as pd
# Gets source file link
source_file = 'Csvfile.csv'
# Gets csv file and encodes it into a format that is compatible.
dataframe = pd.read_csv(source_file, encoding='latin1')
df = pd.DataFrame({'User': dataframe.User, 'Pages': dataframe.Pages, 'Copies': dataframe.Copies,
                   'Color': dataframe.Grayscale, 'Duplex': dataframe.Duplex, 'Printer': dataframe.Printer})
# Formats data so that it can be used to count Duplex and Color pages.
df.loc[df["Duplex"] == "DUPLEX", "Duplex"] = dataframe.Pages
df.loc[df["Duplex"] == "NOT DUPLEX", "Duplex"] = 0
df.loc[df["Color"] == "NOT GRAYSCALE", "Color"] = dataframe.Pages
df.loc[df["Color"] == "GRAYSCALE", "Color"] = 0
df.sort_values(by=['User', 'Pages'])
file = df.to_csv('PrinterLogData.csv', index=False)
# Opens parsed CSV file.
output_source = "PrinterLogData.csv"
dataframe = pd.read_csv(output_source, encoding='latin1')
# Creates new DataFrame.
df = pd.DataFrame({'User': dataframe.User, 'Pages': dataframe.Pages, 'Copies': dataframe.Copies,
                   'Color': dataframe.Color, 'Duplex': dataframe.Duplex,
                   'Printer': dataframe.Printer})
# Groups data by Users and Printer Sums
Report1 = df.groupby(['User'], as_index=False).sum().sort_values('Pages', ascending=False)
Report2 = (df.groupby(['Printer'], as_index=False).sum()).sort_values('Pages', ascending=False)
Sample Data
Sample Output of what I'm looking for.
This is an early draft of what you appear to want for your program (based on the simulated print-log.csv):
import csv
import itertools
import operator
import pathlib
CSV_FILE = pathlib.Path('print-log.csv')
EXTRA_COLUMNS = ['Pages', 'Grayscale', 'Color', 'Not Duplex', 'Duplex']
def main():
    with CSV_FILE.open('rt', newline='') as file:
        iterator = iter(file)
        next(iterator)  # skip first line if needed
        reader = csv.DictReader(iterator)
        table = list(reader)
    create_report(table, 'Printer')
    create_report(table, 'User')


def create_report(table, column_name):
    key = operator.itemgetter(column_name)
    table.sort(key=key)
    field_names = [column_name] + EXTRA_COLUMNS
    with pathlib.Path(f'{column_name} Report').with_suffix('.csv').open(
        'wt', newline=''
    ) as file:
        writer = csv.DictWriter(file, field_names)
        writer.writeheader()
        report = []
        for key, group in itertools.groupby(table, key):
            report.append({column_name: key} | analyze_group(group))
        report.sort(key=operator.itemgetter('Pages'), reverse=True)
        writer.writerows(report)


def analyze_group(group):
    summary = dict.fromkeys(EXTRA_COLUMNS, 0)
    for row in group:
        pages = int(row['Pages']) * int(row['Copies'])
        summary['Pages'] += pages
        summary['Grayscale'] += pages if row['Grayscale'] == 'GRAYSCALE' else 0
        summary['Color'] += pages if row['Grayscale'] == 'NOT GRAYSCALE' else 0
        summary['Not Duplex'] += pages if row['Duplex'] == 'NOT DUPLEX' else 0
        summary['Duplex'] += pages if row['Duplex'] == 'DUPLEX' else 0
    return summary


if __name__ == '__main__':
    main()
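Note that {column_name: key} | analyze_group(group) uses the dict union operator, which requires Python 3.9 or newer; on older versions, {column_name: key, **analyze_group(group)} builds the same merged dictionary.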
I have a script which produces multiple .csv files and each .csv file has its own name which is a variable. I am trying to save these files to a specific path instead of saving them to the Python folder.
I have tried this tutorial (Specify path in write.csv function), but it gave me this error: NameError: name 'file' is not defined. I tried to find other people who had the same issue when using write.csv, but was unable to find any.
I am on macOS.
Here is the code:
path = '/Users/chris/Desktop/cd'
fcsv = csv.writer(open, file.path(f'{finalitem}.csv', 'w', newline=''))
fcsv.writerow(headers)
fcsv.writerows(datarows)
I have tried multiple examples of writing csv to file path and have had 0 success. If anyone has any ideas or suggestions I'd love to hear them.
Here is my full code:
import csv
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime
headers = []
datarows = []
# define 1-1-2020 as a datetime object
after_date = datetime(2020, 1, 1)
with requests.Session() as s:
    s.headers = {"User-Agent": "Safari/537.36"}
    r = s.get('https://bitinfocharts.com/top-100-richest-dogecoin-addresses-20.html')
    soup = bs(r.content, 'lxml')
    # select all tr elements (minus the first one, which is the header)
    table_elements = soup.select('tr')[1:]
    address_links = []
    for element in table_elements:
        children = element.contents  # get children of table element
        url = children[1].a['href']
        last_out_str = children[8].text
        # check to make sure the date field isn't empty
        if last_out_str != "":
            # load date into datetime object for comparison (second part is defining the layout of the date as years-months-days hour:minute:second timezone)
            last_out = datetime.strptime(last_out_str, "%Y-%m-%d %H:%M:%S %Z")
            # if check to see if the date is after 2020/1/1
            if last_out > after_date:
                address_links.append(url)
    for url in address_links:
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        table = soup.find(id="table_maina")
        # Get the Doge Address for the filename
        item = soup.find('h1').text
        newitem = item.replace('Dogecoin', '')
        finalitem = newitem.replace('Address', '')
        finalitem = finalitem.replace(' ', '')
        # Get the profit
        sections = soup.find_all(class_='table-striped')
        for section in sections:
            oldprofit = section.find_all('td')[11].text
            removetext = oldprofit.replace('USD', '')
            removetext = removetext.replace(' ', '')
            removetext = removetext.replace(',', '')
            profit = float(removetext)
        # Compare profit to goal
        goal = float(50000)
        if profit < goal:
            continue
        if table:
            for row in table.find_all('tr'):
                heads = row.find_all('th')
                if heads:
                    headers = [th.text for th in heads]
                else:
                    datarows.append([td.text for td in row.find_all('td')])
        path = '/Users/chris/Desktop/cd'
        fcsv = csv.writer(open(f'{finalitem}.csv', 'w', newline=''))
        fcsv.writerow(headers)
        fcsv.writerows(datarows)
You can have automatic file closing using a with statement:
with open(f'{finalitem}.csv', 'w', newline='') as csvfile:
    fcsv = csv.writer(csvfile)
    fcsv.writerow(headers)    # assuming headers is a list of column names
    fcsv.writerows(datarows)  # datarows is a list of row lists, so csv.writer fits better than DictWriter here
The error you are getting comes from wrapping file.path() around arguments that belong to the open() call; you most likely meant to use your path variable (rather than a file.path() function) when building the output path.
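As for saving into the specific folder, a possible sketch (assuming path holds the target directory and finalitem, headers and datarows are set as in the question's loop) is to join the directory and filename with os.path.join:

import csv
import os

# Sketch only: build the destination path from the folder and per-address
# filename, and let the with-statement close the file.
output_path = os.path.join(path, f'{finalitem}.csv')
with open(output_path, 'w', newline='') as csvfile:
    fcsv = csv.writer(csvfile)
    fcsv.writerow(headers)
    fcsv.writerows(datarows)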
I'm a relative novice at Python, but I've somehow managed to build a scraper for Instagram. I now want to take this one step further and output the 5 most commonly used hashtags from an IG profile into my CSV output file.
Current output:
I've managed to isolate the 5 most commonly used hashtags, but I get this result in my csv:
[('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron', 2), ('#togetherwecanchangetheworld', 1), ('#halloweenchronicles', 1)]
Desired output:
What I'm looking to end up with is 5 extra columns at the end of my .CSV, each outputting the X-th most commonly used hashtag.
So something along the lines of this:
I've Googled for a while and managed to isolate them separately, but I always end up with '('#thekidfromakron', 2)' as an output. I seem to be missing some part of the puzzle :(.
Here is what I'm working with at the moment:
import csv
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from collections import Counter
ts = time.gmtime()
def get_csv_header(top_numb):
    fieldnames = ['USER','MEDIA COUNT','FOLLOWERCOUNT','TOTAL LIKES','TOTAL COMMENTS','ER','ER IN %', 'BIO', 'ALL CAPTION TEXT','HASHTAGS COUNTED','MOST COMMON HASHTAGS']
    return fieldnames


def write_csv_header(filename, headers):
    with open(filename, 'w', newline='') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=headers)
        writer.writeheader()
    return


def read_user_name(t_file):
    with open(t_file) as f:
        user_list = f.read().splitlines()
    return user_list


if __name__ == '__main__':
    # HERE YOU CAN SPECIFY YOUR USERLIST FILE NAME,
    # Which contains a list of usernames's BY DEFAULT <current working directory>/userlist.txt
    USER_FILE = 'userlist.txt'
    # HERE YOU CAN SPECIFY YOUR DATA FILE NAME, BY DEFAULT (data.csv)', Where your final result stays
    DATA_FILE = 'users_with_er.csv'
    MAX_POST = 12  # MAX POST

    print('Starting the engagement calculations... Please wait until it finishes!')

    users = read_user_name(USER_FILE)
    """ Writing data to csv file """
    csv_headers = get_csv_header(MAX_POST)
    write_csv_header(DATA_FILE, csv_headers)

    for user in users:
        post_info = {'USER': user}
        url = 'https://www.instagram.com/' + user + '/'
        # for troubleshooting, un-comment the next two lines:
        # print(user)
        # print(url)
        try:
            r = requests.get(url)
            if r.status_code != 200:
                print(timestamp, ' user {0} not found or page unavailable! Skipping...'.format(user))
                continue
            soup = BeautifulSoup(r.content, "html.parser")
            scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
            stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
            j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
            timestamp = time.strftime("%d-%m-%Y %H:%M:%S", ts)
        except ValueError:
            print(timestamp, 'ValueError for username {0}...Skipping...'.format(user))
            continue
        except IndexError as error:
            # Output expected IndexErrors.
            print(timestamp, error)
            continue
        if j['graphql']['user']['edge_followed_by']['count'] <= 0:
            print(timestamp, 'user {0} has no followers! Skipping...'.format(user))
            continue
        if j['graphql']['user']['edge_owner_to_timeline_media']['count'] < 12:
            print(timestamp, 'user {0} has less than 12 posts! Skipping...'.format(user))
            continue
        if j['graphql']['user']['is_private'] is True:
            print(timestamp, 'user {0} has a private profile! Skipping...'.format(user))
            continue
        media_count = j['graphql']['user']['edge_owner_to_timeline_media']['count']
        accountname = j['graphql']['user']['username']
        followercount = j['graphql']['user']['edge_followed_by']['count']
        bio = j['graphql']['user']['biography']

        i = 0
        total_likes = 0
        total_comments = 0
        all_captiontext = ''
        while i <= 11:
            total_likes += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_liked_by']['count']
            total_comments += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_comment']['count']
            captions = j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_caption']
            caption_detail = captions['edges'][0]['node']['text']
            all_captiontext += caption_detail
            i += 1

        engagement_rate_percentage = '{0:.4f}'.format((((total_likes + total_comments) / followercount) / 12) * 100) + '%'
        engagement_rate = (((total_likes + total_comments) / followercount) / 12 * 100)

        # isolate and count hashtags
        hashtags = re.findall(r'#\w*', all_captiontext)
        hashtags_counted = Counter(hashtags)
        most_common = hashtags_counted.most_common(5)

        with open('users_with_er.csv', 'a', newline='', encoding='utf-8') as data_out:
            print(timestamp, 'Writing Data for user {0}...'.format(user))
            post_info["USER"] = accountname
            post_info["FOLLOWERCOUNT"] = followercount
            post_info["MEDIA COUNT"] = media_count
            post_info["TOTAL LIKES"] = total_likes
            post_info["TOTAL COMMENTS"] = total_comments
            post_info["ER"] = engagement_rate
            post_info["ER IN %"] = engagement_rate_percentage
            post_info["BIO"] = bio
            post_info["ALL CAPTION TEXT"] = all_captiontext
            post_info["HASHTAGS COUNTED"] = hashtags_counted
            csv_writer = csv.DictWriter(data_out, fieldnames=csv_headers)
            csv_writer.writerow(post_info)

""" Done with the script """
print('ALL DONE !!!! ')
The code that goes before this simply scrapes the webpage, and compiles all the captions from the last 12 posts into "all_captiontext".
Any help to solve this (probably simple) issue would be greatly appreciated as I've been struggling with this for days (again, I'm a noob :') ).
Replace line
post_info["MOST COMMON HASHTAGS"] = most_common
with:
for i, counter_tuple in enumerate(most_common):
    tag_name = counter_tuple[0].replace('#', '')
    label = "Top %d" % (i + 1)
    post_info[label] = tag_name
There's also a bit of code missing. For example, your code doesn't include csv_headers variable, which I suppose would be
csv_headers = post_info.keys()
It also seems that you're opening a file to write just one row. I don't think that's intended, so what you probably want to do is collect the results into a list of dictionaries. A cleaner solution would be to use a pandas DataFrame, which you can write straight to a CSV file.
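A minimal sketch of that list-of-dictionaries idea (the rows below are made up purely to show the shape, not real scraped data):

import pandas as pd

# Collect one dict per user inside the loop, then write everything at once.
all_rows = [
    {'USER': 'user_a', 'Top 1': 'striveforgreatness', 'Top 2': 'jamesgang'},
    {'USER': 'user_b', 'Top 1': 'halloweenchronicles', 'Top 2': 'thekidfromakron'},
]
pd.DataFrame(all_rows).to_csv('users_with_er.csv', index=False)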
Since most_common is the output of the call to hashtags_counted.most_common, I had a look at the doc here: https://docs.python.org/2/library/collections.html#collections.Counter.most_common
The output is formatted as follows: [(key, value), (key, value), ...], ordered by decreasing number of occurrences.
Hence, to get only the name and not the number of occurrences, you should replace:
post_info["MOST COMMON HASHTAGS"] = most_common
by
post_info["MOST COMMON HASHTAGS"] = [x[0] for x in most_common]
You have a list of tuples. This statement builds, on the fly, the list of the first element of each tuple, keeping the sort order.
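For example, with values shaped like the output shown in the question:

most_common = [('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron', 2)]
print([x[0] for x in most_common])
# ['#striveforgreatness', '#jamesgang', '#thekidfromakron']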
Here is my situation: My code parses out data from HTML tables that are within emails. The roadblock I'm running into is that some of these tables have blank, empty rows right in the middle of the table, as seen in the photo below. This blank space causes my code to fail (IndexError: list index out of range) when it attempts to extract text from the cells.
Is it possible to say to Python: "ok, if you run into this error that comes from these blank rows, just stop there and take the rows you have acquired text from so far and execute the rest of the code on those"...?
That might sound like a dumb solution to this problem but my project involves me taking data from only the most recent date in the table anyway, which is always amongst the first few rows, and always before these blank empty rows.
So if it is possible to say "if you hit this error, just ignore it and proceed" then I would like to learn how to do that. If it's not then I'll have to figure out another way around this. Thanks for any and all help.
The table with the gap:
My code:
from bs4 import BeautifulSoup, NavigableString, Tag
import pandas as pd
import numpy as np
import os
import re
import email
import cx_Oracle
dsnStr = cx_Oracle.makedsn("sole.nefsc.noaa.gov", "1526", "sole")
con = cx_Oracle.connect(user="user", password="password", dsn=dsnStr)
def celltext(cell):
    '''
    textlist = []
    for br in cell.findAll('br'):
        next = br.nextSibling
        if not (next and isinstance(next, NavigableString)):
            continue
        next2 = next.nextSibling
        if next2 and isinstance(next2, Tag) and next2.name == 'br':
            text = str(next).strip()
            if text:
                textlist.append(next)
    return (textlist)
    '''
    textlist = []
    y = cell.find('span')
    for a in y.childGenerator():
        if isinstance(a, NavigableString):
            textlist.append(str(a))
    return (textlist)


path = 'Z:\\blub_2'

for filename in os.listdir(path):
    file_path = os.path.join(path, filename)
    if os.path.isfile(file_path):
        html = open(file_path, 'r').read()
        soup = BeautifulSoup(html, 'lxml')  # Parse the HTML as a string
        table = soup.find_all('table')[1]   # Grab the second table

        df_Quota = pd.DataFrame()

        for row in table.find_all('tr'):
            columns = row.find_all('td')
            if columns[0].get_text().strip() != 'ID':  # skip header
                Quota = celltext(columns[1])
                Weight = celltext(columns[2])
                price = celltext(columns[3])
                print(Quota)
                Nrows = max([len(Quota), len(Weight), len(price)])  # get the max number of rows
                IDList = [columns[0].get_text()] * Nrows
                DateList = [columns[4].get_text()] * Nrows
                if price[0].strip() == 'Package':
                    price = [columns[3].get_text()] * Nrows
                if len(Quota) < len(Weight):  # if Quota has less itmes extend with NaN
                    lstnans = [np.nan] * (len(Weight) - len(Quota))
                    Quota.extend(lstnans)
                if len(price) < len(Quota):  # if price column has less items than quota column,
                    val = [columns[3].get_text()] * (len(Quota) - len(price))  # extend with
                    price.extend(val)  # whatever is in price column
                # if len(DateList) > len(Quota):  # if DateList is longer than Quota,
                #     print("it's longer than")
                #     value = [columns[4].get_text()] * (len(DateList) - len(Quota))
                #     DateList = value * Nrows
                if len(Quota) < len(DateList):  # if Quota is less than DateList (due to gap),
                    stu = [np.nan] * (len(DateList) - len(Quota))  # extend with NaN
                    Quota.extend(stu)
                if len(Weight) < len(DateList):
                    dru = [np.nan] * (len(DateList) - len(Weight))
                    Weight.extend(dru)

                FinalDataframe = pd.DataFrame(
                    {
                        'ID': IDList,
                        'AvailableQuota': Quota,
                        'LiveWeightPounds': Weight,
                        'price': price,
                        'DatePosted': DateList
                    })

                df_Quota = df_Quota.append(FinalDataframe, ignore_index=True)

        # df_Quota = df_Quota.loc[df_Quota['DatePosted']=='5/20']
        df_Q = df_Quota['DatePosted'].iloc[0]
        df_Quota = df_Quota[df_Quota['DatePosted'] == df_Q]
        print(df_Quota)

for filename in os.listdir(path):
    file_path = os.path.join(path, filename)
    if os.path.isfile(file_path):
        with open(file_path, 'r') as f:
            pattern = re.compile(r'Sent:.*?\b(\d{4})\b')
            email = f.read()
            dates = pattern.findall(email)
            if dates:
                print("Date:", ''.join(dates))

# cursor = con.cursor()
# exported_data = [tuple(x) for x in df_Quota.values]
# sql_query = ("INSERT INTO ROUGHTABLE(species, date_posted, stock_id, pounds, money, sector_name, ask)" "VALUES (:1, :2, :3, :4, :5, 'NEFS 2', '1')")
# cursor.executemany(sql_query, exported_data)
# con.commit()
# cursor.close()
# con.close()
continue is the keyword to use for skipping empty/problem rows. The IndexError comes from the attempt to access columns[0] on an empty columns list, so just skip to the next row when there is an exception.
for row in table.find_all('tr'):
    columns = row.find_all('td')
    try:
        if columns[0].get_text().strip() != 'ID':
            ...  # Rest as above in original code.
    except IndexError:
        continue
Use try: ... except: ...:
try:
    ...  # extract data from table
except IndexError:
    ...  # execute rest of program
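A self-contained illustration of the same pattern (the HTML below is made up; only the skip-on-IndexError behaviour matters):

from bs4 import BeautifulSoup

# Made-up HTML with a blank row in the middle, purely to illustrate the pattern.
html = """
<table>
  <tr><td>ID1</td><td>10</td></tr>
  <tr></tr>
  <tr><td>ID2</td><td>20</td></tr>
</table>
"""
soup = BeautifulSoup(html, 'lxml')
rows = []
for row in soup.find_all('tr'):
    columns = row.find_all('td')
    try:
        rows.append((columns[0].get_text(), columns[1].get_text()))
    except IndexError:
        continue  # blank row: no <td> cells, so skip it
print(rows)  # [('ID1', '10'), ('ID2', '20')]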
I would like to have the information captured from the web for a set of URLs (I have the codes in a list called "cod") written to a CSV file, row by row (for export to Excel).
I have tried it with just one link, but to do it for all the elements of the list I need to iterate, and that is where I'm having difficulty.
My code:
import urllib
from bs4 import BeautifulSoup
import csv
urlfixed = "http://www.fatm.com.es/Datos_Equipo.asp?"
cod = ["01GR0001","01GR0004","03GR0006","02GR0003","01GR0030","01GR0018","04GR0007","03GR0032","01AL0001","02AL0003"]
loong = len(cod)
i = 0
sock = urllib.urlopen(urlfixed + "Cod=" + cod[i])
htmlSource = sock.read()
sock.close()
soup = BeautifulSoup(htmlSource)
form = soup.find("form", {'id': "FORM1"})
valores = [item.get('value') for item in form.find_all('input')]
valores.remove('Imprimir')
valores.remove('Cerrar')
values = valores
out = open('tomate.csv', 'w')
w = csv.writer(out)
w.writerow([s.encode("utf-8") for s in values])
out.close()
So, one row with the info from one "cod", and that should make 10 lines in "tomate.csv".
Just use a for loop with the iterator iterating through the list cod. Also, you were opening the file in write mode when it should have been opened in append mode:
urlfixed = "http://www.fatm.com.es/Datos_Equipo.asp?"
cod = ["01GR0001","01GR0004","03GR0006","02GR0003","01GR0030","01GR0018","04GR0007","03GR0032","01AL0001","02AL0003"]
for i in cod:
    sock = urllib.urlopen(urlfixed + "Cod=" + i)
    htmlSource = sock.read()
    sock.close()
    soup = BeautifulSoup(htmlSource)
    form = soup.find("form", {'id': "FORM1"})
    valores = [item.get('value') for item in form.find_all('input')]
    valores.remove('Imprimir')
    valores.remove('Cerrar')
    values = valores
    out = open('tomate.csv', 'ab')
    w = csv.writer(out)
    w.writerow([s.encode("utf-8") for s in values])
    out.close()
# the loop ends here
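One side note: this answer is written in Python 2 style. If it ever needs to run on Python 3, urllib.urlopen becomes urllib.request.urlopen and the CSV file should be opened in text mode with newline='' (no manual .encode() needed); a rough sketch of just the changed lines, reusing urlfixed, i, and values from the loop above:

from urllib.request import urlopen
import csv

# Python 3 sketch: same logic, different I/O details.
sock = urlopen(urlfixed + "Cod=" + i)
htmlSource = sock.read()
sock.close()
# ... parse htmlSource into 'values' exactly as before ...
with open('tomate.csv', 'a', newline='') as out:
    csv.writer(out).writerow(values)  # Python 3 csv writes str directly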