Text to column task in Python - python

;
;
ACHTUNG;Dies ist das Ergebnis einer Testversion. Alle Ergebnisse ohne Gewaehr.
;Bei Rueckfragen oder Unstimmigkeiten wenden Sie sich an aron.proebsting#mwtest.de;
;
;
;
PSD4_Status;|;
PSD5_Status;|;
mux;<-;PSD6_CAN;PSD6_Status;
PSD6_Status;|;
cycle_state;<-;PSD6_Status;PSD5_Status;PSD4_Status;
PsdEhr_out;<-;PsdEhr_ProcessMessageCycle();PSD6_CAN;PSD6_Status;PSD5_Status;cycle_state;PSD4_Status;
Entfernung_Abzweigung;<-;Aktuelle_Pos.inhibitTime;Aktuelle_Pos.id;Aktuelle_Pos.lane;Aktuelle_Pos.longitudinalError;Aktuelle_Pos.isLocationUnique;Aktuelle_Pos.length;Child_Segment.geometry.curvatureStart;Child_Segment.geometry.curvatureEnd;Child_Segment.geometry.branchAngle;Child_Segment.attributes.lanes;Child_Segment.attributes.streetClass;Child_Segment.attributes.ramp;Child_Segment.attributes.isMostProbablePath;Child_Segment.attributes.isStraightestPath;Child_Segment.attributes.isADASQuality;Child_Segment.attributes.isBuiltUpArea;Child_Segment.attributeIndex;Child_Segment.speedLimitIndex;Child_Segment.id;Child_Segment.parentId;Child_Segment.identity;Child_Segment.completeFlags;Child_Segment.childSegments[0];Child_Segment.childSegments[1];Child_Segment.childSegments[2];Child_Segment.childSegments[3];Child_Segment.childSegments[4];Get_Child_It.indexStart;Get_Child_It.indexCurrent;Get_Child_It.id;Aktuelles_Segment.geometry.curvatureStart;Aktuelles_Segment.geometry.curvatureEnd;Aktuelles_Segment.geometry.length;Aktuelles_Segment.geometry.branchAngle;Aktuelles_Segment.attributes.lanes;Aktuelles_Segment.attributes.streetClass;Aktuelles_Segment.attributes.ramp;Aktuelles_Segment.attributes.isMostProbablePath;Aktuelles_Segment.attributes.isStraightestPath;Aktuelles_Segment.attributes.isADASQuality;Aktuelles_Segment.attributes.isBuiltUpArea;Aktuelles_Segment.attributeIndex;Aktuelles_Segment.speedLimitIndex;Aktuelles_Segment.id;Aktuelles_Segment.parentId;Aktuelles_Segment.identity;Aktuelles_Segment.completeFlags;Aktuelles_Segment.childSegments[1];Aktuelles_Segment.childSegments[2];Aktuelles_Segment.childSegments[3];Aktuelles_Segment.childSegments[4];Child_from_Parent.geometry.curvatureStart;Child_from_Parent.geometry.curvatureEnd;Child_from_Parent.geometry.length;Child_from_Parent.geometry.branchAngle;Child_from_Parent.attributes.lanes;Child_from_Parent.attributes.ramp;Child_from_Parent.attributes.isMostProbablePath;Child_from_Parent.attributes.isStraightestPath;Child_from_Pa
rent.attributes.isADASQuality;Child_from_Parent.attributes.isBuiltUpArea;Child_from_Parent.attributeIndex;Child_from_Parent.speedLimitIndex;Child_from_Parent.id;Child_from_Parent.parentId;Child_from_Parent.identity;Child_from_Parent.completeFlags;Child_from_Parent.childSegments[0];Child_from_Parent.childSegments[1];Child_from_Parent.childSegments[2];Child_from_Parent.childSegments[3];Child_from_Parent.childSegments[4];Min_Strassenklasse;Aktuelles_Segment.childSegments[0];Child_Segment.geometry.length;Child_from_Parent.attributes.streetClass;PsdEhr_ProcessMessageCycle();PSD6_CAN;PSD6_Status;PSD5_Status;cycle_state;PSD4_Status;
Steigung_gueltig;<-;Aktuelle_Pos.length;Aktuelle_Pos.inhibitTime;Aktuelle_Pos.lane;Aktuelle_Pos.longitudinalError;Aktuelle_Pos.isLocationUnique;Aktuelles_Segment.geometry.curvatureStart;Aktuelles_Segment.geometry.curvatureEnd;Aktuelles_Segment.geometry.branchAngle;Aktuelles_Segment.attributes.lanes;Aktuelles_Segment.attributes.streetClass;Aktuelles_Segment.attributes.ramp;Aktuelles_Segment.attributes.isMostProbablePath;Aktuelles_Segment.attributes.isStraightestPath;Aktuelles_Segment.attributes.isADASQuality;Aktuelles_Segment.attributes.isBuiltUpArea;Aktuelles_Segment.attributeIndex;Aktuelles_Segment.speedLimitIndex;Aktuelles_Segment.id;Aktuelles_Segment.parentId;Aktuelles_Segment.identity;Aktuelles_Segment.completeFlags;Aktuelles_Segment.childSegments[0];Aktuelles_Segment.childSegments[1];Aktuelles_Segment.childSegments[2];Aktuelles_Segment.childSegments[3];Aktuelles_Segment.childSegments[4];Aktuelle_Pos.id;Suchweite;Steigung_innerhalb_Suchweite.distance;Steigung_innerhalb_Suchweite.attribute.nextAttribute;Steigung_innerhalb_Suchweite.attribute.offset;Steigung_innerhalb_Suchweite.attribute.type;Steigung_innerhalb_Suchweite.segmentId;Steigung_innerhalb_Suchweite_It.searchDistance;Steigung_innerhalb_Suchweite_It.currentIndex;Steigung_innerhalb_Suchweite_It.currentDistance;Steigung_innerhalb_Suchweite_It.searchType;Steigung_innerhalb_Suchweite_It.searchDirection;Steigung_innerhalb_Suchweite_It.currentId;Steigung_innerhalb_Suchweite_It.currentOffset;Steigung_innerhalb_Suchweite.attribute.value;Aktuelles_Segment.geometry.length;PsdEhr_ProcessMessageCycle();PSD6_CAN;PSD6_Status;PSD5_Status;cycle_state;PSD4_Status;
Kruemmung_gueltig;<-;Aktuelle_Pos.length;Aktuelle_Pos.inhibitTime;Aktuelle_Pos.lane;Aktuelle_Pos.longitudinalError;Aktuelle_Pos.isLocationUnique;Aktuelles_Segment.geometry.curvatureStart;Aktuelles_Segment.geometry.curvatureEnd;Aktuelles_Segment.geometry.length;Aktuelles_Segment.geometry.branchAngle;Aktuelles_Segment.attributes.lanes;Aktuelles_Segment.attributes.streetClass;Aktuelles_Segment.attributes.ramp;Aktuelles_Segment.attributes.isMostProbablePath;Aktuelles_Segment.attributes.isStraightestPath;Aktuelles_Segment.attributes.isADASQuality;Aktuelles_Segment.attributes.isBuiltUpArea;Aktuelles_Segment.attributeIndex;Aktuelles_Segment.speedLimitIndex;Aktuelles_Segment.id;Aktuelles_Segment.parentId;Aktuelles_Segment.identity;Aktuelles_Segment.completeFlags;Aktuelles_Segment.childSegments[0];Aktuelles_Segment.childSegments[1];Aktuelles_Segment.childSegments[2];Aktuelles_Segment.childSegments[3];Aktuelles_Segment.childSegments[4];Aktuelle_Pos.id;PsdEhr_ProcessMessageCycle();PSD6_CAN;PSD6_Status;PSD5_Status;cycle_state;PSD4_Status;
BuiltUpArea;<-;Aktuelle_Pos.length;Aktuelle_Pos.inhibitTime;Aktuelle_Pos.lane;Aktuelle_Pos.longitudinalError;Aktuelle_Pos.isLocationUnique;Aktuelles_Segment.geometry.curvatureStart;Aktuelles_Segment.geometry.curvatureEnd;Aktuelles_Segment.geometry.length;Aktuelles_Segment.geometry.branchAngle;Aktuelles_Segment.attributes.lanes;Aktuelles_Segment.attributes.streetClass;Aktuelles_Segment.attributes.ramp;Aktuelles_Segment.attributes.isMostProbablePath;Aktuelles_Segment.attributes.isStraightestPath;Aktuelles_Segment.attributes.isADASQuality;Aktuelles_Segment.attributeIndex;Aktuelles_Segment.speedLimitIndex;Aktuelles_Segment.id;Aktuelles_Segment.parentId;Aktuelles_Segment.identity;Aktuelles_Segment.completeFlags;Aktuelles_Segment.childSegments[0];Aktuelles_Segment.childSegments[1];Aktuelles_Segment.childSegments[2];Aktuelles_Segment.childSegments[3];Aktuelles_Segment.childSegments[4];Aktuelles_Segment.attributes.isBuiltUpArea;Aktuelle_Pos.id;PsdEhr_ProcessMessageCycle();PSD6_CAN;PSD6_Status;PSD5_Status;cycle_state;PSD4_Status;
ADASQuality;<-;Aktuelle_Pos.length;Aktuelle_Pos.inhibitTime;Aktuelle_Pos.lane;Aktuelle_Pos.longitudinalError;Aktuelle_Pos.isLocationUnique;Aktuelles_Segment.geometry.curvatureStart;Aktuelles_Segment.geometry.curvatureEnd;Aktuelles_Segment.geometry.length;Aktuelles_Segment.geometry.branchAngle;Aktuelles_Segment.attributes.lanes;Aktuelles_Segment.attributes.streetClass;Aktuelles_Segment.attributes.ramp;Aktuelles_Segment.attributes.isMostProbablePath;Aktuelles_Segment.attributes.isStraightestPath;Aktuelles_Segment.attributes.isBuiltUpArea;Aktuelles_Segment.attributeIndex;Aktuelles_Segment.speedLimitIndex;Aktuelles_Segment.id;Aktuelles_Segment.parentId;Aktuelles_Segment.identity;Aktuelles_Segment.completeFlags;Aktuelles_Segment.childSegments[0];Aktuelles_Segment.childSegments[1];Aktuelles_Segment.childSegments[2];Aktuelles_Segment.childSegments[3];Aktuelles_Segment.childSegments[4];Aktuelles_Segment.attributes.isADASQuality;Aktuelle_Pos.id;PsdEhr_ProcessMessageCycle();PSD6_CAN;PSD6_Status;PSD5_Status;cycle_state;PSD4_Status;
NumberOfChilds;<-;Aktuelle_Pos.length;Aktuelle_Pos.inhibitTime;Aktuelle_Pos.lane;Aktuelle_Pos.longitudinalError;Aktuelle_Pos.isLocationUnique;Aktuelles_Segment.geometry.curvatureStart;Aktuelles_Segment.geometry.curvatureEnd;Aktuelles_Segment.geometry.length;Aktuelles_Segment.geometry.branchAngle;Aktuelles_Segment.attributes.lanes;Aktuelles_Segment.attributes.streetClass;Aktuelles_Segment.attributes.ramp;Aktuelles_Segment.attributes.isMostProbablePath;Aktuelles_Segment.attributes.isStraightestPath;Aktuelles_Segment.attributes.isADASQuality;Aktuelles_Segment.attributes.isBuiltUpArea;Aktuelles_Segment.attributeIndex;Aktuelles_Segment.speedLimitIndex;Aktuelles_Segment.id;Aktuelles_Segment.parentId;Aktuelles_Segment.identity;Aktuelles_Segment.completeFlags;Aktuelles_Segment.childSegments[1];Aktuelles_Segment.childSegments[2];Aktuelles_Segment.childSegments[3];Aktuelles_Segment.childSegments[4];Child_from_Parent.geometry.curvatureStart;Child_from_Parent.geometry.curvatureEnd;Child_from_Parent.geometry.length;Child_from_Parent.geometry.branchAngle;Child_from_Parent.attributes.lanes;Child_from_Parent.attributes.ramp;Child_from_Parent.attributes.isMostProbablePath;Child_from_Parent.attributes.isStraightestPath;Child_from_Parent.attributes.isADASQuality;Child_from_Parent.attributes.isBuiltUpArea;Child_from_Parent.attributeIndex;Child_from_Parent.speedLimitIndex;Child_from_Parent.id;Child_from_Parent.parentId;Child_from_Parent.identity;Child_from_Parent.completeFlags;Child_from_Parent.childSegments[0];Child_from_Parent.childSegments[1];Child_from_Parent.childSegments[2];Child_from_Parent.childSegments[3];Child_from_Parent.childSegments[4];Min_Strassenklasse;Aktuelles_Segment.childSegments[0];Child_Segment.geometry.curvatureStart;Child_Segment.geometry.curvatureEnd;Child_Segment.geometry.length;Child_Segment.geometry.branchAngle;Child_Segment.attributes.lanes;Child_Segment.attributes.streetClass;Child_Segment.attributes.ramp;Child_Segment.attributes.isMostProbablePath;Child_Seg
ment.attributes.isStraightestPath;Child_Segment.attributes.isADASQuality;Child_Segment.attributes.isBuiltUpArea;Child_Segment.attributeIndex;Child_Segment.speedLimitIndex;Child_Segment.parentId;Child_Segment.identity;Child_Segment.completeFlags;Child_Segment.childSegments[0];Child_Segment.childSegments[1];Child_Segment.childSegments[2];Child_Segment.childSegments[3];Child_Segment.childSegments[4];Get_Child_It.indexStart;Get_Child_It.indexCurrent;Get_Child_It.id;Child_Segment.id;Child_from_Parent.attributes.streetClass;Aktuelle_Pos.id;PsdEhr_ProcessMessageCycle();PSD6_CAN;PSD6_Status;PSD5_Status;cycle_state;PSD4_Status;
This is how my csv file currently looks. I want to process the file like we do in Excel's Text to Columns with the ; separator. I cannot do it in Excel because I want to automate this process — there are many files like this. I am new to Python so I am not sure how to proceed. Any suggestion would be really helpful.

Do you have to keep every row in your csv file? This will be a slight problem because you do not have enough delimiters per row to account for each column. This code will open your file, check how many delimiters each row needs, add the appropriate number of delimiters, save the new csv file with those delimiters, then open the new csv file using Pandas read_csv:
import pandas as pd


def pad_rows(lines):
    """Pad every newline-terminated row with ';' so all rows have the
    same number of delimiters (mimics Excel's Text to Columns on a
    ragged ';'-separated file).

    lines: list of str, each row as read from the file (with '\n').
    Returns a new list; rows already at the maximum count are unchanged.
    """
    # Maximum number of ';' found in any row — this defines the column count.
    max_delims = max(line.count(';') for line in lines)
    # Append the missing delimiters just before the newline of each row.
    return [
        line.replace("\n", ";" * (max_delims - line.count(';')) + "\n")
        for line in lines
    ]


if __name__ == "__main__":
    path = "Text.csv"
    # 'with' closes the file handle even on error (the original leaked it).
    with open(path) as f:
        text = f.readlines()

    # Equalize the delimiter count per row.
    # NOTE: the original used np.max without importing numpy (NameError);
    # the builtin max() is all that is needed here.
    text = pad_rows(text)

    # Save the padded rows to a new csv file.
    new_file = "Save.csv"
    with open(new_file, "w") as f:
        f.writelines(text)

    # Read the file back in as a pandas dataframe.
    df = pd.read_csv(new_file, sep=";")
    print(df)

You can try to read your file using pandas (pandas.read_csv), such as:
import pandas as pd
# sep=';' splits each row on semicolons — the equivalent of Excel's
# "Text to Columns" with a ';' separator.
pd.read_csv('pathofyourfile', sep=';')

You can use read_csv with the sep and skiprows parameters:
import pandas as pd
# skiprows=13 drops the banner lines at the top of the file (rows 1-13
# in the sample); sep=';' splits the remaining rows into columns.
df = pd.read_csv('test.csv', sep=';', skiprows=13)
print(df)
Output:
Entfernung_Abzweigung <- Aktuelle_Pos.inhibitTime \
0 Steigung_gueltig <- Aktuelle_Pos.length
1 Kruemmung_gueltig <- Aktuelle_Pos.length
2 BuiltUpArea <- Aktuelle_Pos.length
3 ADASQuality <- Aktuelle_Pos.length
4 NumberOfChilds <- Aktuelle_Pos.length
Aktuelle_Pos.id Aktuelle_Pos.lane \
0 Aktuelle_Pos.inhibitTime Aktuelle_Pos.lane
1 Aktuelle_Pos.inhibitTime Aktuelle_Pos.lane
2 Aktuelle_Pos.inhibitTime Aktuelle_Pos.lane
3 Aktuelle_Pos.inhibitTime Aktuelle_Pos.lane
4 Aktuelle_Pos.inhibitTime Aktuelle_Pos.lane
Aktuelle_Pos.longitudinalError Aktuelle_Pos.isLocationUnique \
0 Aktuelle_Pos.longitudinalError Aktuelle_Pos.isLocationUnique
1 Aktuelle_Pos.longitudinalError Aktuelle_Pos.isLocationUnique
2 Aktuelle_Pos.longitudinalError Aktuelle_Pos.isLocationUnique
3 Aktuelle_Pos.longitudinalError Aktuelle_Pos.isLocationUnique
4 Aktuelle_Pos.longitudinalError Aktuelle_Pos.isLocationUnique
Aktuelle_Pos.length \
0 Aktuelles_Segment.geometry.curvatureStart
1 Aktuelles_Segment.geometry.curvatureStart
2 Aktuelles_Segment.geometry.curvatureStart
3 Aktuelles_Segment.geometry.curvatureStart
4 Aktuelles_Segment.geometry.curvatureStart
Child_Segment.geometry.curvatureStart \
0 Aktuelles_Segment.geometry.curvatureEnd
1 Aktuelles_Segment.geometry.curvatureEnd
2 Aktuelles_Segment.geometry.curvatureEnd
3 Aktuelles_Segment.geometry.curvatureEnd
4 Aktuelles_Segment.geometry.curvatureEnd
Child_Segment.geometry.curvatureEnd ... \
0 Aktuelles_Segment.geometry.branchAngle ...
1 Aktuelles_Segment.geometry.length ...
2 Aktuelles_Segment.geometry.length ...
3 Aktuelles_Segment.geometry.length ...
4 Aktuelles_Segment.geometry.length ...
Aktuelles_Segment.childSegments[0] \
0 NaN
1 NaN
2 NaN
3 NaN
4 Child_Segment.id
Child_Segment.geometry.length \
0 NaN
1 NaN
2 NaN
3 NaN
4 Child_from_Parent.attributes.streetClass
Child_from_Parent.attributes.streetClass PsdEhr_ProcessMessageCycle() \
0 NaN NaN
1 NaN NaN
2 NaN NaN
3 NaN NaN
4 Aktuelle_Pos.id PsdEhr_ProcessMessageCycle()
PSD6_CAN PSD6_Status PSD5_Status cycle_state PSD4_Status Unnamed: 84
0 NaN NaN NaN NaN NaN NaN
1 NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN
4 PSD6_CAN PSD6_Status PSD5_Status cycle_state PSD4_Status NaN

Related

How to multiply different columns in different dataframes using Pandas

I have 2 dataframes that I want to multiply. I want to multiply multiple columns from dataframe 1 with one column in dataframe 2
raw_material_LCI = dataframe1[["climate change","ozone depletion",
"ionising radiation, hh","photochemical ozone formation, hh",
"particulate matter","human toxicity, non-cancer",
"human toxicity, cancer","acidification",
"eutrophication, freshwater","eutrophication, marine",
"eutrophication, terrestrial","ecotoxicity, freshwater",
"land use", "resource use, fossils","resource use, minerals and metals",
"water scarcity"]] * dataframe2["mass_frac"]
The above code returns a dataframe where all the values are NaN. The names of the columns all are fields with numeric values in them.
I decided to try multiply dataframe1 with just a single value to see if it worked e.g. example below
raw_material_LCI = dataframe1[["climate change","ozone depletion",
"ionising radiation, hh","photochemical ozone formation, hh",
"particulate matter","human toxicity, non-cancer",
"human toxicity, cancer","acidification",
"eutrophication, freshwater","eutrophication, marine",
"eutrophication, terrestrial","ecotoxicity, freshwater",
"land use", "resource use, fossils","resource use, minerals and metals",
"water scarcity"]] * 0.7
The example with the single value returns a dataframe with numbers, so it works. Does anyone know why the multiplication in the first instance does not work? I have looked at multiple articles on multiplying columns in different dataframes in Python and cannot find a solution.
You have to align both row and column indexes when you multiply two dataframes and align row index when you multiply a DataFrame by a Series:
>>> df
A B C D E
0 0.787081 0.350508 0.058542 0.492340 0.489379
1 0.512436 0.501375 0.108115 0.960808 0.841969
2 0.055247 0.305830 0.976043 0.016188 0.006424
3 0.303570 0.914876 0.157100 0.767454 0.340381
4 0.446077 0.595001 0.307799 0.115410 0.568281
5 0.226516 0.636902 0.086790 0.079260 0.402414
6 0.451920 0.526025 0.012470 0.931610 0.267155
7 0.472778 0.137005 0.227569 0.941355 0.584782
8 0.944396 0.769115 0.497214 0.531419 0.570797
9 0.788023 0.310288 0.336480 0.585466 0.432246
>>> sr
0 0.920878
1 0.445332
2 0.894407
3 0.613317
4 0.242270
5 0.299121
6 0.843052
7 0.279014
8 0.526778
9 0.249538
dtype: float64
So, this produces nan values:
>>> df * sr
A B C D E
0 0.724805 0.322775 0.053910 0.453385 0.450658
1 0.228204 0.223279 0.048147 0.427878 0.374956
2 0.049413 0.273536 0.872980 0.014479 0.005745
3 0.186185 0.561109 0.096352 0.470693 0.208762
4 0.108071 0.144151 0.074571 0.027961 0.137678
5 0.067756 0.190511 0.025961 0.023708 0.120371
6 0.380992 0.443466 0.010513 0.785396 0.225226
7 0.131912 0.038226 0.063495 0.262651 0.163162
8 0.497487 0.405153 0.261921 0.279940 0.300683
9 0.196642 0.077429 0.083965 0.146096 0.107862
but using mul along index axis works as expected:
>>> df.mul(sr, axis=0) # but not df.mul(sr) (same as df*sr)
A B C D E
0 0.724805 0.322775 0.053910 0.453385 0.450658
1 0.228204 0.223279 0.048147 0.427878 0.374956
2 0.049413 0.273536 0.872980 0.014479 0.005745
3 0.186185 0.561109 0.096352 0.470693 0.208762
4 0.108071 0.144151 0.074571 0.027961 0.137678
5 0.067756 0.190511 0.025961 0.023708 0.120371
6 0.380992 0.443466 0.010513 0.785396 0.225226
7 0.131912 0.038226 0.063495 0.262651 0.163162
8 0.497487 0.405153 0.261921 0.279940 0.300683
9 0.196642 0.077429 0.083965 0.146096 0.107862
If your series and dataframe have not the same length, you get a partial result:
>>> df.mul(sr.iloc[:5], axis=0)
A B C D E
0 0.724805 0.322775 0.053910 0.453385 0.450658
1 0.228204 0.223279 0.048147 0.427878 0.374956
2 0.049413 0.273536 0.872980 0.014479 0.005745
3 0.186185 0.561109 0.096352 0.470693 0.208762
4 0.108071 0.144151 0.074571 0.027961 0.137678
5 NaN NaN NaN NaN NaN
6 NaN NaN NaN NaN NaN
7 NaN NaN NaN NaN NaN
8 NaN NaN NaN NaN NaN
9 NaN NaN NaN NaN NaN
>>> df.mul(sr.iloc[5:], axis=0)
A B C D E
0 NaN NaN NaN NaN NaN
1 NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN NaN
5 0.067756 0.190511 0.025961 0.023708 0.120371
6 0.380992 0.443466 0.010513 0.785396 0.225226
7 0.131912 0.038226 0.063495 0.262651 0.163162
8 0.497487 0.405153 0.261921 0.279940 0.300683
9 0.196642 0.077429 0.083965 0.146096 0.107862
Take care to have the same index between instances.

Pandas dataframe merge row by addition

I want to create a dataframe from census data. I want to calculate the number of people that returned a tax return for each specific earnings group.
For now, I wrote this
census_df = pd.read_csv('../zip code data/19zpallagi.csv')
sub_census_df = census_df[['zipcode', 'agi_stub', 'N02650', 'A02650', 'ELDERLY', 'A07180']].copy()
num_of_returns = ['Number_of_returns_1_25000', 'Number_of_returns_25000_50000', 'Number_of_returns_50000_75000',
'Number_of_returns_75000_100000', 'Number_of_returns_100000_200000', 'Number_of_returns_200000_more']
for i, column_name in zip(range(1, 7), num_of_returns):
sub_census_df[column_name] = sub_census_df[sub_census_df['agi_stub'] == i]['N02650']
I have 6 groups attached to a specific zip code. I want to get one row, with the number of returns for a specific zip code appearing just once as a column. I already tried to change NaNs to 0 and to use groupby('zipcode').sum(), but I get 50 million rows summed for zip code 0, where it seems that only around 800k should exist.
Here is the dataframe that I currently get:
zipcode agi_stub N02650 A02650 ELDERLY A07180 Number_of_returns_1_25000 Number_of_returns_25000_50000 Number_of_returns_50000_75000 Number_of_returns_75000_100000 Number_of_returns_100000_200000 Number_of_returns_200000_more Amount_1_25000 Amount_25000_50000 Amount_50000_75000 Amount_75000_100000 Amount_100000_200000 Amount_200000_more
0 0 1 778140.0 10311099.0 144610.0 2076.0 778140.0 NaN NaN NaN NaN NaN 10311099.0 NaN NaN NaN NaN NaN
1 0 2 525940.0 19145621.0 113810.0 17784.0 NaN 525940.0 NaN NaN NaN NaN NaN 19145621.0 NaN NaN NaN NaN
2 0 3 285700.0 17690402.0 82410.0 9521.0 NaN NaN 285700.0 NaN NaN NaN NaN NaN 17690402.0 NaN NaN NaN
3 0 4 179070.0 15670456.0 57970.0 8072.0 NaN NaN NaN 179070.0 NaN NaN NaN NaN NaN 15670456.0 NaN NaN
4 0 5 257010.0 35286228.0 85030.0 14872.0 NaN NaN NaN NaN 257010.0 NaN NaN NaN NaN NaN 35286228.0 NaN
And here is what I want to get:
zipcode Number_of_returns_1_25000 Number_of_returns_25000_50000 Number_of_returns_50000_75000 Number_of_returns_75000_100000 Number_of_returns_100000_200000 Number_of_returns_200000_more
0 0 778140.0 525940.0 285700.0 179070.0 257010.0 850.0
here is one way to do it using groupby and sum the desired columns
# Columns to aggregate: one "number of returns" column per income bracket.
num_of_returns = ['Number_of_returns_1_25000', 'Number_of_returns_25000_50000', 'Number_of_returns_50000_75000',
'Number_of_returns_75000_100000', 'Number_of_returns_100000_200000', 'Number_of_returns_200000_more']
# Collapse the per-agi_stub rows of each zipcode into a single row by
# summing (NaNs are treated as 0 by sum); as_index=False keeps 'zipcode'
# as an ordinary column instead of the index.
df.groupby('zipcode', as_index=False)[num_of_returns].sum()
zipcode Number_of_returns_1_25000 Number_of_returns_25000_50000 Number_of_returns_50000_75000 Number_of_returns_75000_100000 Number_of_returns_100000_200000 Number_of_returns_200000_more
0 0 778140.0 525940.0 285700.0 179070.0 257010.0 0.0
This question needs more information to actually give a proper answer. For example you leave out what is meant by certain columns in your data frame:
- `N1: Number of returns`
- `agi_stub: Size of adjusted gross income`
According to IRS this has the following levels.
Size of adjusted gross income "0 = No AGI Stub
1 = ‘Under $1’
2 = '$1 under $10,000'
3 = '$10,000 under $25,000'
4 = '$25,000 under $50,000'
5 = '$50,000 under $75,000'
6 = '$75,000 under $100,000'
7 = '$100,000 under $200,000'
8 = ‘$200,000 under $500,000’
9 = ‘$500,000 under $1,000,000’
10 = ‘$1,000,000 or more’"
I got the above from https://www.irs.gov/pub/irs-soi/16incmdocguide.doc
With this information, I think what you want to find is the number of
people who filed a tax return for each of the income levels of agi_stub.
If that is what you mean then, this can be achieved by:
import pandas as pd
data = pd.read_csv("./data/19zpallagi.csv")
## select only the desired columns
data = data[['zipcode', 'agi_stub', 'N1']]
## solution to your problem?
# One row per zipcode, one column per agi_stub level, each cell the
# summed number of returns (N1) for that zipcode/level combination.
df = data.pivot_table(
index='zipcode',
values='N1',
columns='agi_stub',
aggfunc=['sum']
)
## bit of cleaning up.
# pivot_table with a list aggfunc yields ('sum', level) MultiIndex
# columns; flatten them into plain strings like 'agi_stub_level_1'.
PREFIX = 'agi_stub_level_'
df.columns = [PREFIX + level for level in df.columns.get_level_values(1).astype(str)]
Here's the output.
In [77]: df
Out[77]:
agi_stub_level_1 agi_stub_level_2 ... agi_stub_level_5 agi_stub_level_6
zipcode ...
0 50061850.0 37566510.0 ... 21938920.0 8859370.0
1001 2550.0 2230.0 ... 1420.0 230.0
1002 2850.0 1830.0 ... 1840.0 990.0
1005 650.0 570.0 ... 450.0 60.0
1007 1980.0 1530.0 ... 1830.0 460.0
... ... ... ... ... ...
99827 470.0 360.0 ... 170.0 40.0
99833 550.0 380.0 ... 290.0 80.0
99835 1250.0 1130.0 ... 730.0 190.0
99901 1960.0 1520.0 ... 1030.0 290.0
99999 868450.0 644160.0 ... 319880.0 142960.0
[27595 rows x 6 columns]

Is there a possibility to use a bigger list in Python?

For school I have to make a project about wifisignals and I am trying put the data in a dataframe.
There are 208.000 rows of data.
And when it comes to the code below, the code does not complete. The code is like it is stuck in an infinite loop.
But when I use only 1000 rows my program works. So I think that my lists are too small, if that is possible.
Do bigger lists exist in Python? Or is it because of bad coding on my part?
Thanks in advance.
edit 1:
(data is the original dataframe and wifiinfo is a column of that)
i have this format:
df = pd.DataFrame(columns=['Sender','Time','Date','Place','X','Y','Bezetting','SSID','BSSID','Signal'])
And i am trying to fill SSID, BSSID and Signal from the Column WifiInfo for this i have to split the data.
this is how 1 WifiInfo looks like:
ODISEE#88-1d-fc-41-dc-50:-83,ODISEE#88-1d-fc-2c-c0-00:-72,ODISEE#88-1d-fc-41-d2-d0:-82,CiscoC5976#58-6d-8f-19-14-38:-78,CiscoC5959#58-6d-8f-19-13-f4:-93,SNB#c8-d7-19-6f-be-b7:-99,ODISEE#88-1d-fc-2c-c5-70:-94,HackingDemo#58-6d-8f-19-11-48:-156,ODISEE#88-1d-fc-30-d4-40:-85,ODISEE#88-1d-fc-41-ac-50:-100
My current approach looks like:
for index, row in data.iterrows():
bezettingList = list()
ssidList = list()
bssidList = list()
signalList = list()
#WifiInfo splitting
wifis = row.WifiInfo.split(',')
for wifi in wifis:
#split wifi and add to List
ssid, bssid = wifi.split('#')
bssid, signal = bssid.split(':')
ssidList.append(ssid)
bssidList.append(bssid)
signalList.append(int(signal))
#add bezettingen to List
bezettingen = row.Bezetting.split(',')
for bezetting in bezettingen:
bezettingList.append(bezetting)
#add list to dataframe
df.loc[index,'SSID'] = ssidList
df.loc[index,'BSSID'] = bssidList
df.loc[index,'Signal'] = signalList
df.loc[index,'Bezetting'] = bezettingList
df.head()
IIUC, you need to first explode the row by commas so that this:
SSID BSSID Signal WifiInfo
0 NaN NaN NaN ODISEE#88-1d-fc-41-dc-50:-83,ODISEE#88- ...
becomes this:
SSID BSSID Signal WifiInfo
0 NaN NaN NaN ODISEE#88-1d-fc-41-dc-50:-83
1 NaN NaN NaN ODISEE#88-1d-fc-2c-c0-00:-72
2 NaN NaN NaN ODISEE#88-1d-fc-41-d2-d0:-82
3 NaN NaN NaN CiscoC5976#58-6d-8f-19-14-38:-78
4 NaN NaN NaN CiscoC5959#58-6d-8f-19-13-f4:-93
5 NaN NaN NaN SNB#c8-d7-19-6f-be-b7:-99
6 NaN NaN NaN ODISEE#88-1d-fc-2c-c5-70:-94
7 NaN NaN NaN HackingDemo#58-6d-8f-19-11-48:-156
8 NaN NaN NaN ODISEE#88-1d-fc-30-d4-40:-85
9 NaN NaN NaN ODISEE#88-1d-fc-41-ac-50:-100
# use `.explode`
# Split the comma-joined WifiInfo string into a list, then explode so
# each list element becomes its own row (other columns are duplicated).
data = data.assign(WifiInfo=data.WifiInfo.str.split(',')).explode('WifiInfo')
Now you could use .str.extract:
# Each entry looks like SSID#BSSID:Signal; pull out each part with a
# capture group anchored on the '#' and ':' separators.
data['SSID'] = data['WifiInfo'].str.extract(r'(.*)#')
data['BSSID'] = data['WifiInfo'].str.extract(r'#(.*):')
data['Signal'] = data['WifiInfo'].str.extract(r':(.*)')
SSID BSSID Signal WifiInfo
0 ODISEE 88-1d-fc-41-dc-50 -83 ODISEE#88-1d-fc-41-dc-50:-83
1 ODISEE 88-1d-fc-2c-c0-00 -72 ODISEE#88-1d-fc-2c-c0-00:-72
2 ODISEE 88-1d-fc-41-d2-d0 -82 ODISEE#88-1d-fc-41-d2-d0:-82
3 CiscoC5976 58-6d-8f-19-14-38 -78 CiscoC5976#58-6d-8f-19-14-38:-78
4 CiscoC5959 58-6d-8f-19-13-f4 -93 CiscoC5959#58-6d-8f-19-13-f4:-93
5 SNB c8-d7-19-6f-be-b7 -99 SNB#c8-d7-19-6f-be-b7:-99
6 ODISEE 88-1d-fc-2c-c5-70 -94 ODISEE#88-1d-fc-2c-c5-70:-94
7 HackingDemo 58-6d-8f-19-11-48 -156 HackingDemo#58-6d-8f-19-11-48:-156
8 ODISEE 88-1d-fc-30-d4-40 -85 ODISEE#88-1d-fc-30-d4-40:-85
9 ODISEE 88-1d-fc-41-ac-50 -100 ODISEE#88-1d-fc-41-ac-50:-100
If you want to keep data grouped after column explosion, I'd assign an ID for each group of entries first:
data['Group'] = pd.factorize(data['WifiInfo'])[0]+1
SSID BSSID Signal WifiInfo Group
0 NaN NaN NaN ODISEE#88-1d-fc-41-dc-50:-83,ODISEE#88- ... 1
1 NaN NaN NaN ASD#22-1d-fc-41-dc-50:-83,QWERTY#88- ... 2
# after you explode the column
SSID BSSID Signal WifiInfo Group
ODISEE 88-1d-fc-41-dc-50 -83 ODISEE#88-1d-fc-41-dc-50:-83 1
ODISEE 88-1d-fc-2c-c0-00 -72 ODISEE#88-1d-fc-2c-c0-00:-72 1
...
...
ASD 22-1d-fc-41-dc-50 -83 ASD#88-1d-fc-41-dc-50:-83 2
QWERTY 88-1d-fc-2c-c0-00 -72 QWERTY#88-1d-fc-2c-c0-00:-72 2

How to keep rows by string in .gff file in python

I have a gff file and I extract only the 'attributes' information.
ID=id0;Dbxref=taxon:471472;Is_circular=true;Name=ANONYMOUS;gbkey=Src;genome=chromosome;mol_type=genomic DNA;serotype=L2;strain=434/Bu
ID=gene0;Dbxref=GeneID:5858769;Name=hemB;gbkey=Gene;gene=hemB;gene_biotype=protein_coding;locus_tag=CTL0001
"ID=cds0;Parent=gene0;Dbxref=Genbank:YP_001654092.1,GeneID:5858769;Name=YP_001654092.1;Note=catalyzes the formation of porphobilinogen from 5-aminolevulinate;gbkey=CDS;gene=hemB;product=delta-aminolevulinic acid dehydratase;protein_id=YP_001654092.1;transl_table=11"
ID=id1;Dbxref=GeneID:5858769;Note=PS00169 Delta-aminolevulinic acid dehydratase active site.;gbkey=misc_feature;gene=hemB;inference=protein motif:Prosite:PS00169
ID=gene1;Dbxref=GeneID:5857942;Name=nqrA;gbkey=Gene;gene=nqrA;gene_biotype=protein_coding;locus_tag=CTL0002
"ID=cds1;Parent=gene1;Dbxref=Genbank:YP_001654093.1,GeneID:5857942;Name=YP_001654093.1;Note=uses the energy from reduction of ubiquinone-1 to ubiquinol to move Na(+) ions from the cytoplasm to the periplasm;gbkey=CDS;gene=nqrA;product=Na(+)-translocating NADH-quinone reductase subunit A;protein_id=YP_001654093.1;transl_table=11"
ID=gene2;Dbxref=GeneID:5858572;Name=CTL0003;gbkey=Gene;gene_biotype=protein_coding;locus_tag=CTL0003
"ID=cds2;Parent=gene2;Dbxref=Genbank:YP_001654094.1,GeneID:5858572;Name=YP_001654094.1;gbkey=CDS;product=hypothetical protein;protein_id=YP_001654094.1;transl_table=11"
I convert it into a csv file to handle it with dataframe in python
fn = pd.read_table("D:/J/gff.csv",sep=';',
names=["a", "b", "c", "d","e","f","g","h","i"])
df = pd.DataFrame(fn)
a b \
0 ID=id0 Dbxref=taxon:471472
1 ID=gene0 Dbxref=GeneID:5858769
2 ID=cds0;Parent=gene0;Dbxref=Genbank:YP_0016540... NaN
3 ID=id1 Dbxref=GeneID:5858769
4 ID=gene1 Dbxref=GeneID:5857942
c d \
0 Is_circular=true Name=ANONYMOUS
1 Name=hemB gbkey=Gene
2 NaN NaN
3 Note=PS00169 Delta-aminolevulinic acid dehydra... gbkey=misc_feature
4 Name=nqrA gbkey=Gene
e f g \
0 gbkey=Src genome=chromosome mol_type=genomic DNA
1 gene=hemB gene_biotype=protein_coding locus_tag=CTL0001
2 NaN NaN NaN
3 gene=hemB inference=protein motif:Prosite:PS00169 NaN
4 gene=nqrA gene_biotype=protein_coding locus_tag=CTL0002
h i
0 serotype=L2 strain=434/Bu
1 NaN NaN
2 NaN NaN
3 NaN NaN
4 NaN NaN
Now I want to extract only the rows whose ID contains 'geneX' (where X could be different numbers). I tried to use
df = df[df['a'].str.contains(['ID=gene'])]
But it gives an error
TypeError: unhashable type: 'list'
I checked dtypes of all columns are object. And I want to select those rows by string pattern with 'ID=geneX'.
So is it possible to have a dataframe like this,
ID Name locus_tag ..
gene0 hemB CTL0001
gene1 nqrA CTL0002
..

Pandas : How to calculate PCT Change for all columns dynamically?

I got the following pandas df by using the following command, how to get PCT Change for all the columns dynamically for AAL , AAN ... 100 more
price['AABA_PCT_CHG'] = price.AABA.pct_change()
AABA AAL AAN AABA_PCT_CHG
0 16.120001 9.635592 18.836105 NaN
1 16.400000 8.363149 23.105881 0.017370
2 16.680000 8.460282 24.892321 0.017073
3 17.700001 8.829385 28.275263 0.061151
4 16.549999 8.839100 27.705627 -0.064972
5 15.040000 8.654548 27.754738 -0.091239
Apply on dataframe like
In [424]: price.pct_change().add_suffix('_PCT_CHG')
Out[424]:
AABA_PCT_CHG AAL_PCT_CHG AAN_PCT_CHG
0 NaN NaN NaN
1 0.017370 -0.132057 0.226680
2 0.017073 0.011614 0.077315
3 0.061151 0.043628 0.135903
4 -0.064972 0.001100 -0.020146
5 -0.091239 -0.020879 0.001773
In [425]: price.join(price.pct_change().add_suffix('_PCT_CHG'))
Out[425]:
AABA AAL AAN AABA_PCT_CHG AAL_PCT_CHG AAN_PCT_CHG
0 16.120001 9.635592 18.836105 NaN NaN NaN
1 16.400000 8.363149 23.105881 0.017370 -0.132057 0.226680
2 16.680000 8.460282 24.892321 0.017073 0.011614 0.077315
3 17.700001 8.829385 28.275263 0.061151 0.043628 0.135903
4 16.549999 8.839100 27.705627 -0.064972 0.001100 -0.020146
5 15.040000 8.654548 27.754738 -0.091239 -0.020879 0.001773

Categories

Resources