scraping a table then writing to csv - python

I am pretty new to Python and Beautiful Soup, and this is my first 'real' project. I am trying to scrape some info from a website. So far I have been semi-successful: I have identified the table and got Python to print out the relevant information pretty nicely.
I am stuck on writing the information that Python prints to a usable CSV file.
Here is the code I have so far, which gets Python to print the info I need:
# Walk every row of the scraped table and print the seven fields of interest.
for row in table_1.find_all('tr'):
    tds = row.find_all('td')
    try:
        a = str(tds[0].get_text())
        b = str(tds[1].get_text())
        c = str(tds[2].get_text())
        d = str(tds[3].get_text())
        e = str(tds[4].get_text())
        f = str(tds[5].get_text())
        g = str(tds[7].get_text())  # cell 6 is not used
    except (IndexError, UnicodeEncodeError):
        # Rows with fewer than eight <td> cells (e.g. a header row), or text
        # that Python 2's str() cannot encode, are reported and skipped.
        print('bad string')
        continue
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('User Name:' + a)
    print('Source:' + b)
    print('Staff:' + c)
    print('Location:' + d)
    print('Attended On:' + e)
    print('Used:' + f)
    print('Date:' + g + '\n')

Here is a more succinct way to collect your data:
columns = ["User Name", "Source", "Staff", "Location", "Attended On", "Used", "Date"]
# Positions of the <td> cells that feed the columns above. Cell 6 is skipped,
# exactly as in the question's code, so that "Date" is read from cell 7.
cell_indexes = (0, 1, 2, 3, 4, 5, 7)
table = []
for row in table_1.find_all('tr'):
    tds = row.find_all('td')
    try:
        data = [tds[i].get_text() for i in cell_indexes]
    except IndexError:
        # Header rows (no <td>) and short rows cannot supply every field.
        print("Bad string value")
        continue
    for field, value in zip(columns, data):
        print("{}: {}".format(field, value))
    # Only complete rows are stored, so every CSV row matches the header.
    table.append(data)
and you can then write to csv as
import csv
import sys

# The csv module wants a binary-mode file on Python 2.x and a text-mode file
# opened with newline="" on Python 3.x, so pick the mode at run time.
if sys.version_info[0] < 3:
    outf = open("myfile.csv", "wb")
else:
    outf = open("myfile.csv", "w", newline="")
with outf:
    outcsv = csv.writer(outf)
    # header row
    outcsv.writerow(columns)
    # data
    outcsv.writerows(table)

On each iteration of the loop, you could append the values a through g as a list to an outer list (giving a list of lists), like this:
# Collect the fields a-g of every complete table row, one inner list per row.
my_list = []
for row in table_1.find_all('tr'):
    tds = row.find_all('td')
    if len(tds) < 8:
        # Header rows and short rows have no tds[7]; indexing them would
        # raise IndexError and abort the whole loop.
        continue
    # Cell 6 is not used, matching the question's code.
    a, b, c, d, e, f, g = [str(tds[i].get_text()) for i in (0, 1, 2, 3, 4, 5, 7)]
    my_list.append([a, b, c, d, e, f, g])
Then:
import csv
import sys

# Binary mode is what the csv module expects on Python 2; Python 3 needs a
# text-mode file opened with newline=''.
if sys.version_info[0] < 3:
    csvfile = open('output_table.csv', 'wb')
else:
    csvfile = open('output_table.csv', 'w', newline='')
with csvfile:
    wr = csv.writer(csvfile, lineterminator='\n')
    wr.writerows(my_list)

Related

Write to dictionary of lists to a CSV file

I want each key's values to be written vertically, in a column under that key. When I pass my dictionary's values to writerows (w.writerows(d.values())), each list is written horizontally as a row instead.
import csv

import requests  # used below by requests.get()
from bs4 import BeautifulSoup

r = requests.get('https://www.ufc.com/rankings')
s = BeautifulSoup(r.text, 'lxml')
fighters = s.find_all('div', 'view-grouping')
name = []
weightdivisions = []
for x in fighters:
    z = [names.string for names in x.find_all('a')]
    name.append(z)
    divisions = x.find('h4')
    dd = divisions.text
    weightdivisions.append(dd)
# Map each division heading to the list of link texts found in its grouping.
d = dict(zip(weightdivisions, name))
print(d)
with open('ufc.csv', 'w', newline='', encoding='utf-8') as f:
    w = csv.writer(f)
    w.writerow(d.keys())
    # Each value is one division's whole list, so writerows() emits every
    # division as a single horizontal row.
    w.writerows(d.values())
Try:
import csv
from itertools import zip_longest

import requests  # used below by requests.get()
from bs4 import BeautifulSoup

r = requests.get("https://www.ufc.com/rankings")
s = BeautifulSoup(r.text, "lxml")
fighters = s.find_all("div", "view-grouping")
name = []
weightdivisions = []
for x in fighters:
    z = [names.string for names in x.find_all("a")]
    name.append(z)
    divisions = x.find("h4")
    dd = divisions.text
    weightdivisions.append(dd)
# Map each division heading to the list of link texts found in its grouping.
d = dict(zip(weightdivisions, name))
with open("ufc.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(d.keys())
    # Transpose the per-division lists into rows so each division becomes a
    # column; shorter lists are padded with "". Unlike swapping the lists for
    # iterators, this leaves `d` intact and cannot stop early on a row that
    # happens to contain only empty strings.
    w.writerows(zip_longest(*d.values(), fillvalue=""))
This saves ufc.csv correctly (screenshot from LibreOffice):

Summing values from duplicate keys in a CSV file without panda

I have a large dataset that looks like the following
party,cp,qualifier,amount
ABC,DEF,GOOGLE_2,100
ABC,DEF,GOOGLE_2,200
GHI,JKL,FACEBOOK_1,500
GHI,JKL,FACEBOOK_1,-600
I would like to output :
ABC,DEF,GOOGLE,300
GHI,JKL,FACEBOOK,-100
Here is my python code so far:
# Asker's original attempt, shown with its block structure restored. The
# dictionary handling is left as posted because it is the subject of the
# question.
headers = ["valuation_date","party_group_name","type","party_name","cp_group_name","cp_name","qualifier","amount"]
data = {}
with open(t1file,'rb') as f:
    reader = csv.reader(f)
    headers = reader.next()
    for row in reader:
        party = row[headers.index('party')]
        cp = row[headers.index('cp')]
        qualifier = row[headers.index('qualifier')]
        amount = row[headers.index('amount')]
        if row[headers.index('type')] == "Equity":
            new_qualifier = qualifier.split("_")[0]
            if party in data.keys():
                if cp in data.keys():
                    if new_qualifier in data.keys():
                        data[party][cp][new_qualifier] += float(amount)
                    else:
                        data[party][cp][qualifier][amount] = data[party][cp][new_qualifier][amount]
                else:
                    data[cp] = cp
            else:
                data[party] = party
When I run the above code I get the following error:
data[party][cp][qualifier][amount] = data[party][cp][new_qualifier][amount]
TypeError: string indices must be integers, not str
I'm very rusty with Python, so apologies if it's glaringly obvious, but any insights as to what I'm doing wrong?
Thanks !
you can use pandas.drop_duplicates to drop duplicates of multiple columns and combine it with pandas.groupby() & sum to get the desired result
>>>import pandas as pd
>>>#read file using pandas.read_csv()
>>>df
party cp qualifier amount
0 ABC DEF GOOGLE_2 100
1 ABC DEF GOOGLE_2 200
2 GHI JKL FACEBOOK_1 500
3 GHI JKL FACEBOOK_1 -600
>>>df['Total'] = df.groupby(['party','cp','qualifier'])['amount'].transform('sum')
>>>print(df.drop_duplicates(subset=['party','cp','qualifier'], keep='last'))
party cp qualifier amount Total
1 ABC DEF GOOGLE_2 200 300
3 GHI JKL FACEBOOK_1 -600 -100
Below
import csv
from collections import defaultdict

PARTY_IDX = 0
CP_IDX = 1
QUALIFIER_IDX = 2
AMOUNT_IDX = 3

# Running total of "amount" for every "party,cp,qualifier" key.
data = defaultdict(int)
with open('del-me.csv') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row (no-op on an empty file)
    for fields in reader:
        if not fields:
            continue  # blank lines carry no data
        # Keep only the text before the first underscore. partition() leaves
        # a qualifier without an underscore intact, whereas slicing with
        # find() (which returns -1) would drop its last character.
        qualifier = fields[QUALIFIER_IDX].partition('_')[0]
        key = ','.join([fields[PARTY_IDX], fields[CP_IDX], qualifier])
        data[key] += int(fields[AMOUNT_IDX])
with open('out.csv', 'w') as f:
    for k, v in data.items():
        f.write('{},{}\n'.format(k, v))
del-me.csv
party,cp,qualifier,amount
ABC,DEF,GOOGLE_2,100
ABC,DEF,GOOGLE_2,200
GHI,JKL,FACEBOOK_1,500
GHI,JKL,FACEBOOK_1,-600
out.csv
ABC,DEF,GOOGLE,300
GHI,JKL,FACEBOOK,-100
You have already enough answers, but let me correct your own code to help you derive the answer and understand the original issue:
import csv

# Nested totals: data[party][cp][qualifier-without-suffix] -> summed amount.
data = {}
with open('test_data.csv', 'rt', encoding='utf-8') as f:
    reader = csv.reader(f)
    headers = next(reader)
    # Resolve the column positions once instead of on every row.
    party_idx = headers.index('party')
    cp_idx = headers.index('cp')
    qualifier_idx = headers.index('qualifier')
    amount_idx = headers.index('amount')
    # The sample data has no "type" column; only filter on it when present,
    # otherwise headers.index('type') would raise ValueError.
    type_idx = headers.index('type') if 'type' in headers else None
    for row in reader:
        if not row:
            continue  # blank line
        if type_idx is not None and row[type_idx] != "Equity":
            continue
        party = row[party_idx]
        cp = row[cp_idx]
        new_qualifier = row[qualifier_idx].split("_")[0]
        # setdefault() creates each missing level, so the first row for a
        # party/cp pair and every later row go through the same code path.
        amounts = data.setdefault(party, {}).setdefault(cp, {})
        amounts[new_qualifier] = amounts.get(new_qualifier, 0.0) + float(row[amount_idx])
print(data)
This gives you
{'ABC': {'DEF': {'GOOGLE': 300.0}}, 'GHI': {'JKL': {'FACEBOOK': -100.0}}}
The problem was how you were populating your dictionary and how you were accessing it.
In order to simplify things, you might use just one key for the dict which is composed out of the identifying parts of a given line.
You might have to extract values by the header names like you already did. The following is based on the specified input. rsplit is used to split the string once at the end in order to use the party,cp,qualifier combination as a key and extract the amount.
def sumUp(path=None):
    """Sum the trailing amount of each line, grouped by the rest of the line.

    Every data line is split once at its last comma: the text before it
    (the party,cp,qualifier combination, unchanged) is the key and the text
    after it is the integer amount.

    Args:
        path: CSV file to read. Defaults to the module-level ``t1file``.

    Returns:
        A dict mapping each key to the sum of its amounts.
    """
    if path is None:
        path = t1file
    d = {}
    # Text mode, so the str operations below work on Python 3 as well.
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('party,'):
                continue  # skip blank lines and the header
            key, value = line.rsplit(',', 1)  # split once at the end
            d[key] = d.get(key, 0) + int(value)
    return d
You can do it like this:
from csv import DictReader, DictWriter

# One output record per distinct (party, cp, qualifier) combination.
map_dic = dict()
# newline='' is what the Python 3 csv module asks for on both input and
# output files (it avoids blank lines between rows on Windows).
with open('test1.csv', 'r', newline='') as fr:
    csv_reader = DictReader(fr, delimiter=',')
    for line in csv_reader:
        # A tuple key cannot collide the way an underscore-joined string can
        # when the values themselves contain underscores.
        key = (line['party'], line['cp'], line['qualifier'])
        if key not in map_dic:
            map_dic[key] = {'party': line['party'], 'cp': line['cp'], 'qualifier': line['qualifier'], 'amount': int(line['amount'])}
        else:
            map_dic[key]['amount'] += int(line['amount'])
with open('test2.csv', 'w', newline='') as csvfile:
    writer = DictWriter(csvfile, fieldnames=['party', 'cp', 'qualifier', 'amount'])
    writer.writeheader()
    writer.writerows(map_dic.values())

Write variable output to a specific column in a CSV?

I'm working on a Python script that scrapes data from an Excel doc, then writes the output to a .csv.
I was able to grab the data and get it to write to the .csv, but all of the data goes into the first column.
I need the bar data to go into the 4th and the foo to go into the 5th column, so I tried to use csv.reader to select the row, and this runs without error but doesn't actually write to the .csv file.
Here's my code:
import xlrd
import csv
###Grab the data
def get_row_values(workSheet, row):
    """Return the values of every cell in one row of a worksheet.

    Args:
        workSheet: Sheet object exposing ``ncols`` and ``cell_value(row, col)``.
        row: Zero-based index of the row to read.

    Returns:
        A list with one value per column, in column order.
    """
    # Read from the sheet that was passed in rather than from a global, so
    # the function works for any worksheet.
    return [workSheet.cell_value(row, col) for col in range(workSheet.ncols)]
# Read 'Sheet1' of map_test.xlsx: row 0 supplies the column names and every
# later row is scanned for cells equal to 'x'.
# NOTE(review): the indentation of this snippet was lost, so the nesting of
# the statements below cannot be read from the text - confirm it against the
# original script.
file_path = 'map_test.xlsx'
# output holds [first cell of the row, column name] pairs; output_bar and
# output_foo hold the same two values as separate one-element rows.
output = []
output_bar = []
output_foo = []
myWorkbook = xlrd.open_workbook(file_path)
myWorksheet = myWorkbook.sheet_by_name('Sheet1')
num_rows = myWorksheet.nrows - 1
curr_row = 0
column_names = get_row_values(myWorksheet, curr_row)
print len(column_names)
while curr_row < num_rows:
curr_row += 1
row = myWorksheet.row(curr_row)
this_row = get_row_values(myWorksheet, curr_row)
x = 0
while x <len(this_row):
if this_row[x] == 'x':
output.append([this_row[0], column_names[x]])
# Each entry appended below is a one-element list, so writerows() puts it in
# the first column of its own CSV row.
output_bar.append([column_names[x]])
output_foo.append([this_row[0]])
print output
myData = [["number", "name", "version", "bar",
"foo"]]
##### Next section is the code in question, it
####doesn't error out, but won't write to the .csv######
# NOTE(review): opening test123.csv with mode "w" truncates it, so the reader
# opened on the same path just below presumably has no rows to iterate and
# the loop body never runs - confirm.
myFile = open("test123.csv", "w")
writer = csv.writer(myFile)
with open('test123.csv', 'r') as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for row in reader:
row[5] = myFile.readline()
writer.writerows(output_foo)
row[4] = myFile.readline()
# NOTE(review): `outpu_bar` is not defined anywhere in the visible code; the
# list populated above is named `output_bar`.
writer.writerows(outpu_bar)
#####This successfully writes to the csv, but
#####all data to first column#####
# myFile = open('test123.csv', 'w')
# with myFile:
# writer = csv.writer(myFile)
# writer.writerows(myData)
# #writer.writerows(output)
# writer.writerows(output_foo)
# writer.writerows(output_bar)
x += 1
print ("CSV Written")

How to iterate over URLs for writing CSV

I would like to have the information captured in the web by different URLs (I have them in a list called "cod") written to a CSV file, row by row (for export to Excel).
I have tried with just one link, but if I want to do it with all the elements of the List, I'd need to iterate, and am having difficulty.
My code:
import urllib
from bs4 import BeautifulSoup
import csv
# Fetch the page for a single code and save its form values as one CSV row.
urlfixed = "http://www.fatm.com.es/Datos_Equipo.asp?"
cod = ["01GR0001","01GR0004","03GR0006","02GR0003","01GR0030","01GR0018","04GR0007","03GR0032","01AL0001","02AL0003"]
loong = len(cod)
i = 0

# Download the page for the i-th code.
page_url = urlfixed + "Cod=" + cod[i]
sock = urllib.urlopen(page_url)
htmlSource = sock.read()
sock.close()

# Gather the value of every <input> inside the form with id FORM1.
soup = BeautifulSoup(htmlSource)
form = soup.find("form", {'id': "FORM1"})
valores = []
for item in form.find_all('input'):
    valores.append(item.get('value'))
# Drop the values of the two buttons.
for unwanted in ('Imprimir', 'Cerrar'):
    valores.remove(unwanted)
values = valores

# Write the remaining values as one UTF-8 encoded row.
out = open('tomate.csv', 'w')
w = csv.writer(out)
encoded_row = [s.encode("utf-8") for s in values]
w.writerow(encoded_row)
out.close()
So each "cod" should produce one row of info, which should make 10 lines in "tomate.csv".
Just use a for loop that iterates through the list cod. Also, you are opening the file for writing when it should be opened for appending:
urlfixed = "http://www.fatm.com.es/Datos_Equipo.asp?"
cod = ["01GR0001","01GR0004","03GR0006","02GR0003","01GR0030","01GR0018","04GR0007","03GR0032","01AL0001","02AL0003"]
# Open the output once, in append mode, and add one CSV row per code. The
# with-block closes the file even if a download or a parse fails.
with open('tomate.csv', 'ab') as out:
    w = csv.writer(out)
    for i in cod:
        sock = urllib.urlopen(urlfixed + "Cod=" + i)
        try:
            htmlSource = sock.read()
        finally:
            sock.close()
        soup = BeautifulSoup(htmlSource)
        form = soup.find("form", {'id': "FORM1"})
        valores = [item.get('value') for item in form.find_all('input')]
        # Drop the values of the two buttons.
        valores.remove('Imprimir')
        valores.remove('Cerrar')
        # item.get('value') is None for an <input> without a value
        # attribute; write such a cell as empty instead of crashing.
        w.writerow([(s or u"").encode("utf-8") for s in valores])
    # the loop ends here

Search for string in CSV Files using python and write the results

#!/usr/bin/python
import csv
import re
# Asker's original script, shown with its block structure restored: copy
# every row that has a column exactly equal to one of the three strings.
string_1 = ('OneTouch AT')
string_2 = ('LinkRunner AT')
string_3 = ('AirCheck')
#searched = ['OneTouch AT', 'LinkRunner AT', 'AirCheck']
print("hello Pythong! ")
#def does_match(string):
#    stringl = string.lower()
#    return any(s in stringl for s in searched)
inFile = open('data.csv', "rb")
reader = csv.reader(inFile)
outFile = open('data2.csv', "wb")
writer = csv.writer(outFile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_ALL)
for row in reader:
    # `found` makes sure a row is written once even if several of its
    # columns match.
    found = False
    for col in row:
        if col in [string_1, string_2, string_3] and not found:
            writer.writerow(row)
            found = True
#for row in reader:
#    if any(does_match(col) for col in row):
#        writer.writerow(row[:2]) # write only 2 first columns
inFile.close()
outFile.close()
I'm trying to figure out how to search a CSV file for 3 items. If those items exist print the row. Ideally I would like only Columns 1 and 3 to print to a new file.
Sample Data File
LinkRunner AT Video,10,20
Wireless Performance Video OneTouch AT,1,2
Wired OneTouch AT,200,300
LinkRunner AT,200,300
AirCheck,200,300
I'm trying to figure out how to search a CSV file for 3 items. If
those items exist print the row. Ideally I would like only Columns 1
and 3 to print to a new file.
Try this:
import csv

search_for = ['OneTouch AT', 'LinkRunner AT', 'AirCheck']
# newline='' is what the Python 3 csv module asks for on files it reads or
# writes.
with open('in.csv', newline='') as inf, open('out.csv', 'w', newline='') as outf:
    reader = csv.reader(inf)
    writer = csv.writer(outf, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        # csv.reader yields [] for a blank line; check the row first so that
        # row[0] cannot raise IndexError.
        if row and row[0] in search_for:
            print('Found: {}'.format(row))
            writer.writerow(row)
#!/usr/bin/python
import csv
import numpy as np
class search_csv(object):
    """Load a delimited file and write selected columns of matching rows.

    The input is read completely in ``__init__`` and kept both as a list of
    rows (``non_numpy_data``) and as a numpy array (``data``). Matching rows
    are written tab-delimited to ``outfile``.

    NOTE(review): the input is parsed as tab-separated, while the sample data
    in the question is comma-separated - confirm the delimiter before use.
    """

    def __init__(self, infile, outfile):
        """Read ``infile`` into memory and open ``outfile`` for writing."""
        # Binary modes are what Python 2's csv module expects; on Python 3
        # open in text mode with newline='' instead.
        with open(infile, 'rb') as source:
            read_infile = list(csv.reader(source, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL))
        self.non_numpy_data = read_infile
        self.data = np.array(read_infile, dtype=None)
        self.outfile = open(outfile, 'wb')
        self.writer_ = csv.writer(self.outfile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    def write_to(self, matched_values):
        """Write the given rows to the output file and return True."""
        self.writer_.writerows(matched_values)
        # Flush so the rows reach the file even when the caller never closes
        # it (non_numpy_search leaves the file open).
        self.outfile.flush()
        print(' Matched Values Written ')
        return True

    def searcher(self, items, return_cols=None):
        """Write ``return_cols`` of every row with a cell equal to an item.

        ``items`` should be passed as a list, e.g.
        ['OneTouch AT', 'LinkRunner AT', 'AirCheck']. ``return_cols``
        defaults to the first and third column. Closes the output file.
        """
        return_cols = [0, 2] if return_cols is None else list(return_cols)
        find_these = np.array(items, dtype=None)
        matching_y = np.in1d(self.data, find_these).reshape(self.data.shape).nonzero()[0]
        # A row that matches in several columns shows up once per matching
        # cell; keep each row index only once.
        matching_y = np.unique(matching_y)
        matching_data = self.data[matching_y][:, return_cols]
        self.write_to(matching_data)
        self.outfile.close()
        return True

    def non_numpy_search(self, items, return_cols=None):
        """Same as ``searcher`` but in pure Python; leaves the file open.

        A row matches when one of its cells is exactly equal to one of
        ``items``. ``return_cols`` defaults to the first and third column.
        """
        return_cols = [0, 2] if return_cols is None else list(return_cols)
        lst = [
            [row[idx] for idx in return_cols]
            for row in self.non_numpy_data
            if any(item in row for item in items)
        ]
        self.write_to(lst)
        return True
### now use the class ###
SEARCHING_FOR = ['OneTouch AT', 'LinkRunner AT', 'AirCheck']
IN_FILE = 'in_file.csv'
OUT_FILE = 'out_file.csv'
# Instantiate the class itself; non_numpy_search is one of its methods, not
# a module-level name.
search_csv(IN_FILE, OUT_FILE).non_numpy_search(SEARCHING_FOR)
By the phrasing of your question I'm assuming you just want to complete the task at hand and don't really care how. So copy and paste this in, use your data file as the 'IN_FILE' value and the file name you want to write to as the 'OUT_FILE' value, place the values you want to search for in the 'SEARCHING_FOR' list, and you're done.
Things to note....
SEARCHING_FOR should be a list.
The values in SEARCHING_FOR are matched EXACTLY, so 'A' will not match 'a'. If you want to use a regex or something more complex, let me know.
In function 'non_numpy_search' there is a 'return_cols' parameter. It defaults to the first and 3rd column.
If you don't have numpy let me know.
#!/usr/bin/python
import csv
import re
import sys
import gdata.docs.service
#string_1 = ('OneTouch AT')
#string_2 = ('LinkRunner AT')
#string_3 = ('AirCheck')
searched = ['aircheck', 'linkrunner at', 'onetouch at']
def find_group(row, terms=None):
    """Return the group index of a row.

    0 if the row contains terms[0]
    1 if the row contains terms[1]
    etc
    -1 if not found

    Columns are lower-cased before the substring test, so the terms are
    expected in lower case. Columns are checked in order, and within a
    column the terms are checked in order; the first hit wins.

    Args:
        row: Sequence of column strings.
        terms: Search terms to look for. Defaults to the module-level
            ``searched`` list.
    """
    if terms is None:
        terms = searched
    for col in row:
        lowered = col.lower()
        for j, s in enumerate(terms):
            if s in lowered:
                return j
    return -1
def does_match(string, terms=None):
    """Return True if the lower-cased string contains any search term.

    Args:
        string: Text to test.
        terms: Lower-case search terms. Defaults to the module-level
            ``searched`` list.
    """
    if terms is None:
        terms = searched
    stringl = string.lower()
    return any(s in stringl for s in terms)
# Opens the input file for reading and the output file for writing. The
# binary modes are what Python 2's csv module expects; the with-block closes
# both files even if an error occurs.
with open('data.csv', "rb") as inFile, open('data2.csv', "wb") as outFile:
    reader = csv.reader(inFile)
    writer = csv.writer(outFile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_ALL)
    # Build a list of items to sort. If row 12 contains 'LinkRunner AT'
    # (group 1), one stores a triple (1, 12, row). When the triples are
    # sorted later, all rows in group 0 will come first, then all rows in
    # group 1, etc., each group keeping its original row order.
    stored = []
    for i, row in enumerate(reader):
        g = find_group(row)
        if g >= 0:
            stored.append((g, i, row))
    stored.sort()
    for g, i, row in stored:
        # Output columns 1 and 3 (indexes 0 and 2). Rows with fewer than
        # three columns cannot supply both and are skipped.
        if len(row) > 2:
            writer.writerow((row[0], row[2]))

Categories

Resources