Related
I've been working on a lab project that takes in the TSV values of a "grade book", runs some calculations, and finally writes them to a new report.txt file. However, when I get to the end and print out report.txt, the format of the file doesn't match what I'd expect from a row-by-row TSV file; instead it is one giant list.
So my question is how can I convert my list into a proper line by line tsv like below?
Barrett Edan 70 45 59 F
Bradshaw Reagan 96 97 88 A
Charlton Caius 73 94 80 B
Mayo Tyrese 88 61 36 D
Stern Brenda 90 86 45 C
Averages: midterm1 83.40, midterm2 76.60, final 61.60
My input is commented out in the StudentInfo.tsv= below
My current output is listed below the code.
# Grade-book script: reads StudentInfo.tsv, prints each student's average,
# derives a letter grade per student, prints per-exam averages, writes
# everything to report.txt and finally reads report.txt back and prints it.
# NOTE(review): the indentation of this listing was lost; the nesting of the
# loops and `with` bodies has to be inferred from the statements themselves.
# TODO: Declare any necessary variables here.
import csv
studentgrades=[]
s_grades=[]
all_grades=[]
all_grades_computed=[]
# TODO: Read a file name from the user and read the tsv file here.
#StudentInfo.tsv= Barrett Edan 70 45 59
# Bradshaw Reagan 96 97 88
# Charlton Caius 73 94 80
# Mayo Tyrese 88 61 36
# Stern Brenda 90 86 45
# The file name is hard-coded here rather than read from the user.
with open('StudentInfo.tsv', 'r') as file:
studentgrades = csv.reader(file, delimiter='\t')
# Materialise the reader so the rows (lists of strings) can be reused below.
sgrades=list(studentgrades)
for row in sgrades:
avg2=0
rowstr=' '
# Columns from index 2 onward are the exam scores; sum them as integers.
for i in row[2:]:
avg2+=int(i)
#print(i)
#print(avg2)
# Mean of this student's exam scores.
studentavg=float(int(avg2)/len(row[2:]))
# Map the mean to a letter: >=90 A, [80,90) B, [70,80) C, [60,70) D, else F.
if studentavg >= 90:
s_grades.append('A')
elif 80<=studentavg<90:
s_grades.append('B')
elif 70<=studentavg<80:
s_grades.append('C')
elif 60<=studentavg<70:
s_grades.append('D')
else:
s_grades.append('F')
# Prints the first two columns joined by a space, then the mean with 2 decimals.
print('{} average: {:.2f}'.format(rowstr.join(row[0:2]), studentavg))
#print('test length',len(row[2:]))
#print(row[2:])
print()
print(s_grades)
print(sgrades)
# for i in s_grades:
# sgrades.append(i)
# print(sgrades)
for i in range(len(s_grades)):
# NOTE: str(sgrades[i]) is the text of the whole row *list* (brackets and
# quotes included) with the letter glued on, e.g. "['Barrett', ...]F".
all_grades.append(str(sgrades[i])+str(s_grades[i]))
print(all_grades)
print()
# TODO: Compute student grades and exam averages, then output results to a text file here.
# NOTE: student1..student5 use fixed indices, so this section assumes the
# input file has five rows.
student1=sgrades[0]
student2=sgrades[1]
student3=sgrades[2]
student4=sgrades[3]
student5=sgrades[4]
print('Averages:',end=' ')
# Per-exam mean: column 2 = midterm1, column 3 = midterm2, column 4 = final.
midterm1=(int(student1[2])+int(student2[2])+int(student3[2])+int(student4[2])+int(student5[2]))/len(sgrades)
#print(student1[2])
print('midterm1','{:.2f},'.format(midterm1),end=' ')
midterm2=(int(student1[3])+int(student2[3])+int(student3[3])+int(student4[3])+int(student5[3]))/len(sgrades)
#print(student1[3])
print('midterm2','{:.2f},'.format(midterm2),end=' ')
final=(int(student1[4])+int(student2[4])+int(student3[4])+int(student4[4])+int(student5[4]))/len(sgrades)
print('final','{:.2f}'.format(final))
# The summary line is stored as separate fields; the two midterm values keep
# their trailing commas.
all_grades_computed=['Averages:','midterm1','{:.2f},'.format(midterm1),'midterm2','{:.2f},'.format(midterm2), 'final','{:.2f}'.format(final)]
with open('report.txt', 'w+') as report:
csv_writer=csv.writer(report, delimiter='\t')
# NOTE: writerow() emits ONE record, so every element of all_grades becomes a
# tab-separated field on the same line instead of one line per student.
csv_writer.writerow(all_grades)
csv_writer.writerow(all_grades_computed)
# Read the report back and print the parsed rows as a list of lists.
with open('report.txt','r') as report:
reports=csv.reader(report, delimiter='\t')
reportlist=list(reports)
print(reportlist)
Barrett Edan average: 58.00
Bradshaw Reagan average: 93.67
Charlton Caius average: 82.33
Mayo Tyrese average: 61.67
Stern Brenda average: 73.67
['F', 'A', 'B', 'D', 'C']
[['Barrett', 'Edan', '70', '45', '59'], ['Bradshaw', 'Reagan', '96', '97', '88'], ['Charlton', 'Caius', '73', '94', '80'], ['Mayo', 'Tyrese', '88', '61', '36'], ['Stern', 'Brenda', '90', '86', '45']]
["['Barrett', 'Edan', '70', '45', '59']F", "['Bradshaw', 'Reagan', '96', '97', '88']A", "['Charlton', 'Caius', '73', '94', '80']B", "['Mayo', 'Tyrese', '88', '61', '36']D", "['Stern', 'Brenda', '90', '86', '45']C"]
Averages: midterm1 83.40, midterm2 76.60, final 61.60
[["['Barrett', 'Edan', '70', '45', '59']F", "['Bradshaw', 'Reagan', '96', '97', '88']A", "['Charlton', 'Caius', '73', '94', '80']B", "['Mayo', 'Tyrese', '88', '61', '36']D", "['Stern', 'Brenda', '90', '86', '45']C"], ['Averages:', 'midterm1', '83.40,', 'midterm2', '76.60,', 'final', '61.60']]
I am using a loop to collect target data from a JSON file into lists. Each list is one column of the final table and its values are already in order, so no manipulation or reorganization is required; the lists only need to be attached side by side.
#Selecting Data into List
# NOTE(review): the indentation of this listing was lost; `pathway`, `json`
# and `glom` are defined/imported outside the lines shown here.
i=1
target = f'{pathway}\calls_{i}.json'
with open(target,'r') as f: #Reading JSON file
data = json.load(f)
specsA=('PreviousDraws',['DrawNumber'])
draw=(glom(data,specsA)) #list type; glom is a package to access nested data in JSON file.
print(draw)
# One pass per winning-number position (j = 0..4).
for j in range(0,5):
specsB=('PreviousDraws',['WinningNumbers'],[f'{j}'],['Number'])
# NOTE: `number` is rebound on every pass, so once the loop has finished it
# only holds the list extracted for the last j.
number=(glom(data,specsB)) #list type; glom is a package to access nested data in JSON file.
print(number)
#Now assembling lists into a table using pandas
The resulting lists from the code above are as followed below:
#This is from variable draw
[10346, 10345, 10344, 10343, 10342, 10341, 10340, 10339, 10338, 10337, 10336, 10335, 10334, 10333, 10332, 10331, 10330, 10329, 10328, 10327]
#This is from variable number
['22', '9', '4', '1', '1', '14', '5', '3', '2', '8', '2', '1', '4', '9', '4', '4', '3', '13', '7', '14']
['28', '18', '16', '2', '3', '17', '16', '13', '11', '9', '8', '2', '9', '19', '7', '13', '7', '23', '21', '17']
['33', '24', '21', '4', '9', '20', '27', '19', '23', '19', '19', '7', '19', '30', '19', '27', '19', '32', '26', '21']
['35', '30', '28', '11', '21', '23', '33', '26', '35', '37', '27', '12', '20', '31', '22', '34', '22', '36', '27', '25']
['36', '32', '33', '19', '29', '38', '35', '27', '37', '38', '32', '30', '22', '36', '33', '39', '36', '38', '30', '27']
Expected Data Frame table after assembly:
Draw | Number[0] | Number[1] | Number[2] ...
10346 | 22 | 28 |
10345 | 9 | 18 |
10344 | 4 | 16 |
10343 | 1 | 2 |
10342 | 1 | 3 |
My attempt at assembling the table: Organize as dictionary with Series, below:
# Assemble the columns into one DataFrame, keyed by column label.
# NOTE: all five Index entries wrap the same `number` object, so every Index
# column receives identical values.
dct = {'DrawNumbers':pd.Series(draw),
'Index1':pd.Series(number),
'Index2':pd.Series(number),
'Index3':pd.Series(number),
'Index4':pd.Series(number),
'Index5':pd.Series(number)
}
df = pd.DataFrame(dct)
print(df)
Actual result: incorrect, because the last list's values are repeated in every Index column. Only the Index5 column is correct; all the other Index columns wrongly show Index5's values.
DrawNumbers Index1 Index2 Index3 Index4 Index5
0 10346 36 36 36 36 36
1 10345 32 32 32 32 32
2 10344 33 33 33 33 33
3 10343 19 19 19 19 19
4 10342 29 29 29 29 29
5 10341 38 38 38 38 38
6 10340 35 35 35 35 35
7 10339 27 27 27 27 27
8 10338 37 37 37 37 37
9 10337 38 38 38 38 38
... ... ... ... ... ... ...
I also tried to change the data type of the numbers from string to int, but I kept getting errors when I attempted that. Either way, I am stuck and would like to request assistance.
The problem is that you are overwriting the number variable in the loop, so each list is no longer available once the next iteration starts. Here is a solution that adds the corresponding Index column to the dataframe in each iteration.
# Build the table one column at a time so that no extracted list is lost.
# `pathway`, `json`, `glom` and `pd` are expected to be in scope already.

# create an empty dataframe
df = pd.DataFrame()

# Selecting Data into List
i = 1
# The backslash is doubled so it is an explicit path separator instead of the
# unrecognised escape "\c"; the resulting string is unchanged.
target = f'{pathway}\\calls_{i}.json'
with open(target, 'r') as f:  # Reading JSON file
    data = json.load(f)

specsA = ('PreviousDraws', ['DrawNumber'])
draw = glom(data, specsA)  # list type; glom is a package to access nested data in JSON file.
print(draw)

# insert the draw to the dataframe
df['DrawNumbers'] = draw

for j in range(5):
    specsB = ('PreviousDraws', ['WinningNumbers'], [f'{j}'], ['Number'])
    number = glom(data, specsB)  # list type; glom is a package to access nested data in JSON file.
    print(number)

    # insert each number to the dataframe
    # (done inside the loop because `number` is rebound on the next pass)
    df[f'Index{j}'] = number
Assuming that number is a nested list:
# Transpose the nested list so that each inner list now corresponds to one
# row of the desired df, then build the frame with the draws as its index.
number = [list(row_values) for row_values in zip(*number)]
pd.DataFrame(data=number, index=draw)
This will output the df in the desired format. Of course you can go ahead and label the columns as you like, etc.
I have the following merging problem:
I have a time series of industry related data: weekly Profit Margins for 60 different industries over multiple years, which looks like this:
# Sample of the industry time series: one column per industry label, one row
# per observation.
industry = pd.DataFrame({'Ind0': ['01', '02', '03', '04'],
'Ind1': ['11', '12', '13', '14'],
'Ind2': ['21', '22', '23', '24'],
'Ind3': ['31', '32', '33', '34']})
My 2nd dataframe consists of a few 1,000 stocks and their respective industries (each stock belongs to exactly one industry)
# Sample stock-to-industry mapping: a single row whose value under each stock
# column is the label of the industry that stock belongs to.
stocks = pd.DataFrame({'Stock0': ['Ind0'],
'Stock1': ['Ind1'],
'Stock2': ['Ind2'],
'Stock3': ['Ind3'],
'Stock4': ['Ind0'],
'Stock5': ['Ind1']})
I would like to create a new dataframe that contains the industry time series for each stock coming from the correct industry that the stock belongs to, i.e. something like this:
# Desired output: each stock column carries the series of the industry that
# the stock is mapped to.
result = pd.DataFrame({'Stock0': ['01', '02', '03', '04'],
'Stock1': ['11', '12', '13', '14'],
'Stock2': ['21', '22', '23', '24'],
'Stock3': ['31', '32', '33', '34'],
'Stock4': ['01', '02', '03', '04'],
'Stock5': ['11', '12', '13', '14']})
I have tried a number of merge/concatenate approaches without success. Any help is appreciated.
Is this what you want?
# Transpose both frames so stocks and industries become rows, join each
# stock's industry label (column 0 of stocks.T) to the matching row index of
# industry.T, drop the helper columns, and transpose back to one column per stock.
# NOTE(review): 'key_0', '0_x' and '0_y' look like labels generated by merge
# itself; confirm they match the installed pandas version.
stocks.T.merge(industry.T,left_on=0,right_index=True).drop(['key_0','0_x'],axis=1).rename(columns={'0_y':0}).T
Out[189]:
Stock0 Stock4 Stock1 Stock5 Stock2 Stock3
0 01 01 11 11 21 31
1 02 02 12 12 22 32
2 03 03 13 13 23 33
3 04 04 14 14 24 34
I'm trying to capture numbers from this kind of string:
"30098.904999 5 ABC Da d 8 06 01 20 00 80 11 C0 04"
First I collapse every run of whitespace into a single space:
test = ' '.join(test.split())
Then I try to apply this pattern:
pattern = r"(\d+.\d+) (\d+) ABC Da d 8 (\d\d) (\d\d) (\d\d) (\d\d) (\d\d) (\d\d) (\d\d) (\d\d)"
However, I still get no match:
result = re.search(pattern, s)
print ("result: " + str(result.groups(0)))
print ("result: " + str(result.groups(0)))
AttributeError: 'NoneType' object has no attribute 'groups'
If I change the first number to 50.309951, then it works.
The first number is a timestamp, and the number of digits in it can vary.
Any help highly welcomed! :)
thx in advance
j.
Why wouldn't you just split the string after removing spaces chars?
# split() with no argument splits on any run of whitespace and returns the
# list of fields directly, so no separate whitespace clean-up is needed.
test = test.split()
like this?
You will receive an array of items
['30098.904999', '5', 'ABC', 'Da', 'd', '8', '06', '01', '20', '00', '80', '11', 'C0', '04']
That's because of C0, which is not matched by \d\d. You can use \w\d for that part. But as a more general approach, you can use re.findall() to capture all numbers:
In [24]: test = "30098.904999 5 ABC Da d 8 06 01 20 00 80 11 C0 04"
In [27]: re.findall(r'\d+(?:\.\d+)?', test)
Out[27]: ['30098.904999', '5', '8', '06', '01', '20', '00', '80', '11', '0', '04']
# If you want C0 too:
In [28]: re.findall(r'\w?\d+(?:\.\d+)?', test)
Out[28]: ['30098.904999', '5', '8', '06', '01', '20', '00', '80', '11', 'C0', '04']
You don't need to use split as you can use \s+ to match 1 or more whitespace
Your regex also needs correction.
You can use this:
(\d+\.\d+)\s+(\d+)\s+ABC\s+Da\s+d\s+8\s+(\d{2})\s+(\d{2})\s+(\d{2})\s+(\d{2})\s+(\d{2})\s+(\d{2})\s+([A-Z]\d)\s+(\d{2})
RegEx Demo
I'm trying to scrape the content from this URL which contains multiple tables. The desired output would be:
NAME FG% FT% 3PM REB AST STL BLK TO PTS SCORE
Team Jackson (0-8) .4313 .7500 21 71 34 11 12 15 189 1-8-0
Team Keyrouze (4-4) .4441 .8090 31 130 71 18 13 45 373 8-1-0
Nutz Vs. Draymond Green (4-4) .4292 .8769 30 86 66 15 9 28 269 3-6-0
Team Pauls 2 da Wall (3-5) .4784 .8438 40 123 64 18 20 30 316 6-3-0
Team Noey (2-6) .4350 .7679 21 125 62 20 9 33 278 7-2-0
YOU REACH, I TEACH (2-5-1) .4810 .7432 20 114 56 30 7 50 277 2-7-0
Kris Kaman His Pants (5-3) .4328 .8000 20 74 59 20 5 27 238 3-6-0
Duke's Balls In Daniels Face (3-4-1) .5000 .7045 42 139 38 27 22 30 303 6-3-0
Knicks Tape (5-3) .5000 .8152 34 143 92 12 9 47 397 4-5-0
Suck MyDirk (5-3) .4734 .8814 29 106 86 22 17 40 435 5-4-0
In Porzingod We Trust (4-4) .4928 .7222 27 180 95 16 16 46 423 7-2-0
Team Aguilar (6-1-1) .4718 .7053 28 177 65 12 35 48 413 2-7-0
Team Li (7-0-1) .4714 .8118 35 134 74 17 17 47 368 6-3-0
Team Iannetta (4-4) .4527 .7302 22 125 90 20 13 44 288 3-6-0
If it's too difficult to format the tables like that, I'd like to know how I can scrape all the tables? My code to scrape all rows is like this:
# Look up a table with class "tableBody" and print the .string of each row.
# NOTE(review): per the Beautiful Soup documentation, find() returns only the
# first match and .string is None for a tag with more than one child; that is
# presumably why only "TEAM" is printed. Confirm against the page markup.
tableStats = soup.find('table', {'class': 'tableBody'})
rows = tableStats.findAll('tr')
for row in rows:
print(row.string)
But it only prints the value "TEAM" and nothing else... Why doesn't it contain all the rows in the table?
Thanks.
Instead of looking for the table tag, you should look for the rows directly with a more dependable class, such as linescoreTeamRow. This code snippet does the trick,
from bs4 import BeautifulSoup
import requests

# Download the scoreboard page and parse it with the lxml parser.
response = requests.get("http://games.espn.com/fba/scoreboard?leagueId=224165&seasonId=2017")
soup = BeautifulSoup(response.text, 'lxml')

# searching for the rows directly
rows = soup.findAll('tr', {'class': 'linescoreTeamRow'})

# you will need to isolate elements in the row for the table
for row in rows:
    # print() is called as a function; the statement form is a syntax error
    # on Python 3.
    print(row.text)
Found a way to exactly get the 2-D matrix I specified in the question. It's stored as the list teams.
Code:
from bs4 import BeautifulSoup
import requests

# Download the scoreboard page and parse it with the lxml parser.
source_code = requests.get("http://games.espn.com/fba/scoreboard?leagueId=224165&seasonId=2017")
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'lxml')

rows = soup.findAll('tr', {'class': 'linescoreTeamRow'})

# Creates a 2-D matrix: one inner list of cell texts per team row.
teams = []
for row in rows:
    # Collect the text of every <td> cell in this row.
    team_row = [column.getText() for column in row.findAll('td')]
    print(team_row)
    # Add each team to a teams matrix.
    teams.append(team_row)
Output:
['Team Jackson (0-10)', '', '.4510', '.8375', '41', '135', '101', '23', '11', '50', '384', '', '5-4-0']
['YOU REACH, I TEACH (3-6-1)', '', '.4684', '.7907', '22', '169', '103', '22', '10', '32', '342', '', '4-5-0']
['Nutz Vs. Draymond Green (4-6)', '', '.4552', '.8372', '30', '157', '68', '15', '16', '39', '356', '', '2-7-0']
["Jesse's Blue Balls (4-5-1)", '', '.4609', '.7576', '47', '158', '71', '30', '20', '38', '333', '', '7-2-0']
['Team Noey (4-6)', '', '.4763', '.8261', '42', '164', '70', '25', '29', '44', '480', '', '5-4-0']
['Suck MyDirk (6-3-1)', '', '.4733', '.8403', '54', '160', '132', '23', '11', '47', '544', '', '4-5-0']
['Kris Kaman His Pants (5-5)', '', '.4569', '.8732', '53', '138', '105', '27', '21', '53', '465', '', '6-3-0']
['Team Aguilar (6-3-1)', '', '.4433', '.7229', '40', '202', '68', '30', '22', '54', '452', '', '3-6-0']
['Knicks Tape (6-3-1)', '', '.4406', '.8824', '52', '172', '108', '24', '13', '49', '513', '', '6-3-0']
['Team Iannetta (4-6)', '', '.5321', '.6923', '24', '146', '94', '32', '16', '60', '428', '', '3-6-0']
['In Porzingod We Trust (6-4)', '', '.4694', '.6364', '37', '216', '133', '31', '21', '77', '468', '', '4-5-0']
['Team Keyrouze (6-4)', '', '.4705', '.8854', '51', '135', '108', '25', '17', '43', '550', '', '5-4-0']
['Team Li (8-1-1)', '', '.4369', '.8182', '57', '203', '130', '34', '22', '54', '525', '', '6-3-0']
['Team Pauls 2 da Wall (5-5)', '', '.4780', '.5970', '27', '141', '47', '19', '25', '28', '263', '', '3-6-0']