Write to dictionary of lists to a CSV file - python

I want multiple values to be associated with each key while displaying vertically. When I write the values of my dictionary with writerows (w.writerows(d.values())), the data goes horizontally, while I want it to go vertically.
from bs4 import BeautifulSoup
import csv
import requests  # fix: requests is called below but was never imported

# Fetch the UFC rankings page and collect, per weight division, the list
# of fighter names shown in that division's grouping block.
r = requests.get('https://www.ufc.com/rankings')
s = BeautifulSoup(r.text, 'lxml')
fighters = s.find_all('div', 'view-grouping')

name = []              # one list of fighter names per division
weightdivisions = []   # division headings, parallel to `name`
for x in fighters:
    z = [names.string for names in x.find_all('a')]
    name.append(z)
    divisions = x.find('h4')
    dd = divisions.text
    weightdivisions.append(dd)

# division heading -> list of fighter names in that division
d = dict(zip(weightdivisions, name))
print(d)

with open('ufc.csv', 'w', newline='', encoding='utf-8') as f:
    w = csv.writer(f)
    w.writerow(d.keys())
    # NOTE: writerows(d.values()) emits each division's fighters as one
    # ROW -- this is the horizontal layout the question complains about.
    w.writerows(d.values())

Try:
import csv
from itertools import zip_longest

import requests  # fix: requests is called below but was never imported
from bs4 import BeautifulSoup

# Scrape each weight division's heading and fighter list.
r = requests.get("https://www.ufc.com/rankings")
s = BeautifulSoup(r.text, "lxml")
fighters = s.find_all("div", "view-grouping")

name = []              # one list of fighter names per division
weightdivisions = []   # division headings, parallel to `name`
for x in fighters:
    z = [names.string for names in x.find_all("a")]
    name.append(z)
    divisions = x.find("h4")
    dd = divisions.text
    weightdivisions.append(dd)

# division heading -> list of fighter names
d = dict(zip(weightdivisions, name))

with open("ufc.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(d.keys())
    # Transpose so each division becomes a COLUMN.  zip_longest pads the
    # shorter fighter lists with "" and stops once the longest list is
    # exhausted -- the same end condition as the original manual
    # next()/all-empty loop, without the hand-rolled iterator plumbing.
    w.writerows(zip_longest(*d.values(), fillvalue=""))
This saves ufc.csv correctly (screenshot from LibreOffice):

Related

Loop while dynamically scraping - Python

I want to try to make realtime scrapes that use separate intervals. For example, if the last data I scrape has T = 1, it should loop once every 6 hours; with T = 2 it should loop once every hour; and with T = 3 it should loop once every minute.
But after I thought about the logic, I was confused about how to implement it.
where I make T as a reference, here is an example of the data.
[1]: https://i.stack.imgur.com/H427J.png
I will try to share the code snippet that I made.
headers = ["Year", "Month", "Day", "Hour", "Minute", "Second", "T", "Height"]

STATION_URL = 'https://www.ndbc.noaa.gov/station_page.php?station=52406'


def scrape_t():
    """Fetch the station page and return the T code of the newest data row.

    The <textarea> holds whitespace-separated numeric rows whose 7th
    column is T; the first data row (after two header lines) is the
    newest reading.  Returned as a string, e.g. '1'.
    """
    page = requests.get(STATION_URL)
    soup = BeautifulSoup(page.text, 'html.parser')
    datas = soup.find_all('textarea')[0].text.split('\n')[2:-1]
    # Equivalent to the original np.array + listToString round trip:
    # the 7th token of the first (newest) row.
    return datas[0].split()[6]


def scrape_to_csv_and_sql():
    """Re-scrape the station page, rewrite 52406.csv, and push into SQL."""
    page = requests.get(STATION_URL)
    soup = BeautifulSoup(page.text, 'html.parser')
    with open("52406.csv", "w") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(headers)
        for line in soup.select_one("#data").text.split("\n"):
            # keep only fixed-width numeric data rows with every column
            if re.fullmatch(r"[\d. ]{30}", line) and len(line.split()) == len(headers):
                writer.writerow(line.split())
    print('Data 1')
    addDate()    # assumed defined elsewhere in the project -- TODO confirm
    insertSQL()  # assumed defined elsewhere in the project -- TODO confirm


# T code -> polling interval in seconds, per the question's intent:
# T=1 every 6 hours, T=2 every hour, T=3 every minute.
# Fixes over the original: T is a *string*, so comparing it with the ints
# 1/2/3 was never true; all three (copy-pasted) branches slept 3600 s; and
# T was scraped only once, so the interval could never change.
INTERVALS = {'1': 6 * 3600, '2': 3600, '3': 60}

while True:
    t = scrape_t()  # refresh T every cycle so the interval can adapt
    scrape_to_csv_and_sql()
    time.sleep(INTERVALS.get(t, 3600))  # fall back to hourly for unknown T

Summing values from duplicate keys in a CSV file without panda

I have a large dataset that looks like the following
party,cp,qualifier,amount
ABC,DEF,GOOGLE_2,100
ABC,DEF,GOOGLE_2,200
GHI,JKL,FACEBOOK_1,500
GHI,JKL,FACEBOOK_1,-600
I would like to output :
ABC,DEF,GOOGLE,300
GHI,JKL,FACEBOOK,-100
Here is my python code so far:
headers = ["valuation_date","party_group_name","type","party_name","cp_group_name","cp_name","qualifier","amount"]

data = {}  # party -> cp -> qualifier prefix -> summed amount
# fix: 'rb' and reader.next() are Python 2 idioms; Python 3's csv needs
# text mode and next(reader).
with open(t1file, 'rt', encoding='utf-8') as f:
    reader = csv.reader(f)
    headers = next(reader)  # replace the guess above with the real header row
    for row in reader:
        party = row[headers.index('party')]
        cp = row[headers.index('cp')]
        qualifier = row[headers.index('qualifier')]
        amount = row[headers.index('amount')]
        if row[headers.index('type')] == "Equity":
            # "GOOGLE_2" -> "GOOGLE"
            new_qualifier = qualifier.split("_")[0]
            # fix: the original tested party/cp/new_qualifier against the
            # TOP-LEVEL dict and assigned plain strings into it, so the
            # later data[party][cp][...] indexing hit a *string* -- the
            # reported "string indices must be integers" TypeError.
            # setdefault builds the nesting correctly in one pass.
            bucket = data.setdefault(party, {}).setdefault(cp, {})
            bucket[new_qualifier] = bucket.get(new_qualifier, 0.0) + float(amount)
When I run the above code I get the following error:
data[party][cp][qualifier][amount] = data[party][cp][new_qualifier][amount]
TypeError: string indices must be integers, not str
Very rusty with Python — apologies if it's glaringly obvious, but any insights as to what I'm doing wrong?
Thanks !
you can use pandas.drop_duplicates to drop duplicates of multiple columns and combine it with pandas.groupby() & sum to get the desired result
>>>import pandas as pd
>>>#read file using pandas.read_csv()
>>>df
party cp qualifier amount
0 ABC DEF GOOGLE_2 100
1 ABC DEF GOOGLE_2 200
2 GHI JKL FACEBOOK_1 500
3 GHI JKL FACEBOOK_1 -600
>>>df['Total'] = df.groupby(['party','cp','qualifier'])['amount'].transform('sum')
>>>print(df.drop_duplicates(subset=['party','cp','qualifier'], keep='last'))
party cp qualifier amount Total
1 ABC DEF GOOGLE_2 200 300
3 GHI JKL FACEBOOK_1 -600 -100
Below
from collections import defaultdict

# Column positions in the input CSV.
PARTY_IDX = 0
CP_IDX = 1
QUALIFIER_IDX = 2
AMOUNT_IDX = 3

data = defaultdict(int)  # "party,cp,prefix" -> running amount total

with open('del-me.csv') as f:
    lines = [l.strip() for l in f.readlines()]

for line in lines[1:]:  # skip the header row (was enumerate + idx > 0)
    fields = line.split(',')
    party = fields[PARTY_IDX]
    cp = fields[CP_IDX]
    qualifier = fields[QUALIFIER_IDX]
    # fix: str.find returns -1 when there is no '_', so the original
    # qualifier[:qualifier.find('_')] silently chopped the LAST character
    # off underscore-less qualifiers.  split('_')[0] is safe either way.
    qualifier = qualifier.split('_')[0]
    key = ','.join([party, cp, qualifier])
    amount = int(fields[AMOUNT_IDX])
    data[key] += amount

with open('out.csv', 'w') as f:
    for k, v in data.items():
        f.write('{},{}\n'.format(k, v))
del-me.csv
party,cp,qualifier,amount
ABC,DEF,GOOGLE_2,100
ABC,DEF,GOOGLE_2,200
GHI,JKL,FACEBOOK_1,500
GHI,JKL,FACEBOOK_1,-600
out.csv
ABC,DEF,GOOGLE,300
GHI,JKL,FACEBOOK,-100
You have already enough answers, but let me correct your own code to help you derive the answer and understand the original issue:
import csv

headers = ["valuation_date","party_group_name","party_name","cp_group_name","cp_name","qualifier","amount"]

data = {}  # party -> cp -> qualifier prefix -> summed amount
with open('test_data.csv', 'rt', encoding='utf-8') as f:
    reader = csv.reader(f)
    headers = next(reader)  # real header row from the file
    for row in reader:
        party = row[headers.index('party')]
        cp = row[headers.index('cp')]
        qualifier = row[headers.index('qualifier')]
        amount = row[headers.index('amount')]
        if row[headers.index('type')] == "Equity":
            new_qualifier = qualifier.split("_")[0]  # "GOOGLE_2" -> "GOOGLE"
            # fix: the original if/else ladder's fallback branches wrote
            # to the wrong keys (data[party][cp][qualifier][amount] = {}
            # and data[cp] = {}), which would KeyError or corrupt the
            # structure as soon as they were hit.  setdefault creates the
            # missing levels and accumulates in one readable pass.
            qualifier_totals = data.setdefault(party, {}).setdefault(cp, {})
            qualifier_totals[new_qualifier] = (
                qualifier_totals.get(new_qualifier, 0.0) + float(amount)
            )

print(data)
This gives you
{'ABC': {'DEF': {'GOOGLE': 300.0}}, 'GHI': {'JKL': {'FACEBOOK': -100.0}}}
The problem was how you were populating your dictionary and how you were accessing it.
In order to simplify things, you might use just one key for the dict which is composed out of the identifying parts of a given line.
You might have to extract values by the header names like you already did. The following is based on the specified input. rsplit is used to split the string once at the end in order to use the party,cp,qualifier combination as a key and extract the amount.
def sumUp(path=None):
    """Sum the trailing amount column per unique "party,cp,qualifier" prefix.

    Each line is split once from the right, so everything before the last
    comma becomes the dict key and the last field is the amount.

    Fixes over the original: the file is opened in text mode ('rb' made
    `'party' in line` a str-vs-bytes TypeError on Python 3), blank lines
    are skipped, and the accumulated dict is actually returned.

    path: CSV file to read; defaults to the module-level t1file for
    backward compatibility with the original zero-argument call.
    """
    source = t1file if path is None else path
    d = {}
    with open(source, 'rt', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or 'party' in line:
                continue  # skip header (and empty) lines
            key, value = line.rsplit(',', 1)  # split once at the end
            d[key] = d[key] + int(value) if key in d else int(value)
    return d
You can do it like this:
from csv import DictReader, DictWriter

map_dic = dict()  # "party_cp_qualifier" -> aggregated output row

# fix: csv files should be opened with newline='' (per the csv module
# docs); without it the writer emits blank lines between rows on Windows.
with open('test1.csv', 'r', newline='') as fr:
    csv_reader = DictReader(fr, delimiter=',')
    for line in csv_reader:
        key = '{}_{}_{}'.format(line['party'], line['cp'], line['qualifier'])
        if key not in map_dic:  # `.keys()` call was redundant
            map_dic[key] = {'party': line['party'], 'cp': line['cp'], 'qualifier': line['qualifier'], 'amount': int(line['amount'])}
        else:
            map_dic[key]['amount'] += int(line['amount'])

with open('test2.csv', 'w', newline='') as csvfile:
    writer = DictWriter(csvfile, fieldnames=['party', 'cp', 'qualifier', 'amount'])
    writer.writeheader()
    for data in map_dic.values():  # the keys were unused in the loop
        writer.writerow(data)

Web Scraping through multiple urls

I have the code I would like for the content I need, however i would like to run through all the gameId's that have played so far instead of just the one in the URL. I would like to change 2017020001 and make it go through to 2017021272 or till the end of the season which is around 1272 i believe. How can that be done with the code below?
import csv
import requests
import os

# Pull the live feed for one game and flatten every play into a CSV row.
req = requests.get('https://statsapi.web.nhl.com/api/v1/game/2017020001/feed/live?site=en_nhl')
data = req.json()

my_data = []
pk = data['gameData']['game']['pk']  # game primary key, repeated on every row
for item in data['liveData']['plays']['allPlays']:
    players = item.get('players')
    if players:
        # up to two involved players; missing slots become None
        player_a = players[0]['player']['fullName'] if len(players) > 0 else None
        player_b = players[1]['player']['fullName'] if len(players) > 1 else None
    else:
        player_a, player_b = None, None
    event = item['result']['event']
    time = item['about']['periodTime']
    triCode = item.get('team', {}).get('triCode')
    coordinates_x, coordinates_y = item['coordinates'].get('x'), item['coordinates'].get('y')
    my_data.append([pk, player_a, player_b, event, time, triCode, coordinates_x, coordinates_y])

headers = ["pk", "player_a", "player_b", "event", "time", "triCode", "coordinates_x", "coordinates_y"]
with open("NHL_2017020001.csv", "a", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(my_data)
# fix: dropped the explicit f.close() -- the with-block already closed f.
If the game ids are numbered sequentially then it would be as simple as nesting all your code under a for loop that iterates through all the game ids and using str.format() add the necessary padding to the number in this case some parts would change:
import csv
import requests
import os

# Walk every regular-season game id (0001..1272), fetch its live feed,
# and write one CSV file per game.
for i in range(1, 1273):
    url = 'https://statsapi.web.nhl.com/api/v1/game/201702{:04d}/feed/live?site=en_nhl'.format(i)
    req = requests.get(url)
    req.raise_for_status()
    data = req.json()

    rows = []
    game_pk = data['gameData']['game']['pk']  # repeated on every row
    for play in data['liveData']['plays']['allPlays']:
        involved = play.get('players') or []
        first = involved[0]['player']['fullName'] if len(involved) > 0 else None
        second = involved[1]['player']['fullName'] if len(involved) > 1 else None
        coords = play['coordinates']
        rows.append([
            game_pk,
            first,
            second,
            play['result']['event'],
            play['about']['periodTime'],
            play.get('team', {}).get('triCode'),
            coords.get('x'),
            coords.get('y'),
        ])

    headers = ["pk", "player_a", "player_b", "event", "time", "triCode", "coordinates_x", "coordinates_y"]
    with open("NHL_201702{:04d}.csv".format(i), "a", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(rows)
One last correction: using with ... as means you don't need to close the file explicitly.
You can find additional information on using str.format() here
You should iterate over your code using a for-loop
Something like this should work:
import csv
import requests
import os

headers = ["pk", "player_a", "player_b", "event", "time", "triCode", "coordinates_x", "coordinates_y"]

# fix: the original reopened the same append-mode file inside the loop and
# rewrote the header row for every game, leaving 1272 header lines mixed
# into the data.  Open the file once, write the header once, then append
# each game's rows.
with open("NHL_2017020001.csv", "a", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    for x in range(2017020001, 2017021273):
        req = requests.get('https://statsapi.web.nhl.com/api/v1/game/%s/feed/live?site=en_nhl' % x)
        data = req.json()
        my_data = []
        pk = data['gameData']['game']['pk']  # game primary key
        for item in data['liveData']['plays']['allPlays']:
            players = item.get('players')
            if players:
                # up to two involved players; missing slots become None
                player_a = players[0]['player']['fullName'] if len(players) > 0 else None
                player_b = players[1]['player']['fullName'] if len(players) > 1 else None
            else:
                player_a, player_b = None, None
            event = item['result']['event']
            time = item['about']['periodTime']
            triCode = item.get('team', {}).get('triCode')
            coordinates_x, coordinates_y = item['coordinates'].get('x'), item['coordinates'].get('y')
            my_data.append([pk, player_a, player_b, event, time, triCode, coordinates_x, coordinates_y])
        writer.writerows(my_data)
# The with-block closes the file; no explicit f.close() is needed.

How to convert csv to json in python?

I'm very new to programming, have been learning python from past 3/4 weeks and
this is one of the assignments given.
Input
A, B, C, D
1, 2, 3, 4
5, 6, 7, 8
Output
{{A:"1", B:"2", C:"3", D:"4"}, {A:"5", B:"6", C:"7", D:"8"}}
I've been trying with the code as:
import csv
import json

# fix: the original (a) never closed either file, (b) passed lowercase
# fieldnames that did not match the file's "A, B, C, D" header, so the
# header row was consumed as data and surplus columns piled up under a
# None key (rendered as "null"), and (c) dumped each row as a separate
# JSON object instead of a single JSON array.
with open('test.csv') as csvfile, open('test.json', 'w') as jsonfile:
    reader = csv.DictReader(csvfile)  # use the file's own header row
    json.dump(list(reader), jsonfile)
The output for this code comes as below:
{"a": "1", "null": ["5", "6", "7", "8", "9"], "c": "3", "b": "2", "d": "4"}
Can anyone help me on this?
Dump after processing whole rows.
import csv
import json

# Materialize every CSV record first, then serialize the whole list as a
# single JSON array in one dump call.
with open('test.csv') as src:
    records = [dict(record) for record in csv.DictReader(src)]

with open('test.json', 'w') as dst:
    json.dump(records, dst)
For those who like one-liners:
import csv
import json

# fix: the original open() call leaked the file handle; a with-block
# closes it deterministically.  Result is unchanged: a list with one
# JSON *string* per CSV row.
with open('file.csv') as f:
    json_data = [json.dumps(d) for d in csv.DictReader(f)]
Checkout this fiddle for a working example:
https://pyfiddle.io/fiddle/5992b8f4-552f-4970-91b6-a52cdee16ebc/?i=true
import csv
import json

# Constants to make everything easier
CSV_PATH = './csv.csv'
JSON_PATH = './json'

# fix: the original used the Python-2-only file() builtin (a NameError on
# Python 3) and closed neither file handle; with-blocks solve both.
with open(CSV_PATH, 'r') as csv_file:
    # One dict per CSV row, keyed by the file's header line.
    json_list = list(csv.DictReader(csv_file))

with open(JSON_PATH, 'w') as json_file:
    json_file.write(json.dumps(json_list))
Convert CSV to Json Python
import csv
import io
from urllib.request import urlopen  # urllib2 is Python 2 only

url = '<YOURCSVURL>'
response = urlopen(url)
# csv.reader needs text; urlopen returns a byte stream, so wrap it.
cr = csv.reader(io.TextIOWrapper(response, encoding='utf-8'))

data = []
name = None  # header row, captured on the first iteration
for index, row in enumerate(cr):
    if index:
        # fix: the original reused a single `line` dict (appending
        # .copy() each time), so a short row silently inherited stale
        # values from the previous row.  Build a fresh dict per row.
        data.append({name[col_index]: col for col_index, col in enumerate(row)})
    else:
        name = row

print(data)  # print statement -> function for Python 3
You can attempt it using this code :
def inputfunction(lists, headers):
    """Map one row of values onto the header names, producing a dict.

    fix: the original referenced `headers` (never in scope here, since it
    was local to run) and `elementindex` (typo for element_index); the
    header list is now passed in explicitly.
    """
    tmpdict = {}
    for element_index in range(len(lists)):
        tmpdict[headers[element_index]] = lists[element_index]
    return tmpdict

def run(filename):
    """Parse a CSV file into a list of header-keyed dicts, one per data row.

    fix: the original opened `inputfile` (undefined) instead of the
    `filename` parameter, never closed the file, and left the trailing
    newline attached to the last field of every row.
    """
    with open(filename, 'r') as f:
        filelist = [eachline.rstrip('\n').split(',') for eachline in f]
    headers = filelist[0]
    values = filelist[1:]
    finallist = []
    for lists in values:
        finallist.append(inputfunction(lists, headers))
    return finallist

scraping a table then writing to csv

I am pretty new to Python and Beautiful Soup; this is my first "real" project. I am trying to scrape some info from a website. So far I have been semi-successful: I have identified the table and got Python to print out the relevant information pretty nicely.
I am stuck with writing that information python prints to a usable csv file.
here is what I have for my code. to get python to print the info I need.
for row in table_1.find_all('tr'):
tds = row.find_all('td')
try:
a = str(tds[0].get_text())
b = str(tds[1].get_text())
c = str(tds[2].get_text())
d = str(tds[3].get_text())
e = str(tds[4].get_text())
f = str(tds[5].get_text())
g = str(tds[7].get_text())
print 'User Name:' + a
print 'Source:' + b
print 'Staff:' + c
print 'Location:' + d
print 'Attended On:' + e
print 'Used:' + f
print 'Date:' + g + '\n'
except:
print 'bad string'
continue
Here is a more succinct way to collect your data:
columns = ["User Name", "Source", "Staff", "Location", "Attended On", "Used", "Date"]
table = []  # one list of cell texts per table row
for row in table_1.find_all('tr'):
    tds = row.find_all('td')
    try:
        data = [td.get_text() for td in tds]
        # Print each field label next to its value; zip stops at the
        # shorter of the two, so short rows print fewer lines.
        for field, value in zip(columns, data):
            print("{}: {}".format(field, value))
        table.append(data)
    except Exception:
        # fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt;
        # keep the best-effort behavior but let interrupts through.
        print("Bad string value")
and you can then write to csv as
import csv

# Dump the collected table to disk, header row first.
with open("myfile.csv", "wb") as outf:  # Python 2.x
# with open("myfile.csv", "w", newline="") as outf:  # Python 3.x
    outcsv = csv.writer(outf)
    # header followed by the data rows, written in a single call
    outcsv.writerows([columns] + table)
You could append a thru g to a list within a list for each iteration of the loop. Then use this:
my_list = []  # one [a..g] record per data row
for row in table_1.find_all('tr'):
    tds = row.find_all('td')
    # fix: header/short rows have fewer than 8 <td> cells, so tds[7]
    # raised IndexError and killed the loop; skip such rows instead.
    if len(tds) < 8:
        continue
    a = str(tds[0].get_text())
    b = str(tds[1].get_text())
    c = str(tds[2].get_text())
    d = str(tds[3].get_text())
    e = str(tds[4].get_text())
    f = str(tds[5].get_text())
    g = str(tds[7].get_text())  # index 6 intentionally skipped
    my_list.append([a, b, c, d, e, f, g])
Then:
import csv

# Write every collected row; lineterminator='\n' keeps the output
# single-spaced with the Python 2 'wb' mode used here.
with open('output_table.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile, lineterminator='\n')
    writer.writerows(my_list)

Categories

Resources