The output of my script is a word followed by year/count pairs, i.e. how many times that word occurred in articles in each year:
abcd
2013
118
2014
23
xyz
2013
1
2014
45
I want each year added as a new column to my existing dataframe, which currently contains only the words.
Expected output:
Terms 2013 2014 2015
abc 118 76 90
xyz 23 0 36
The input to my script was a csv file:
Terms
xyz
abc
efg
The script I wrote is:
import urllib.request
import xml.etree.ElementTree as ET
import pandas as pd

df = pd.read_csv('a.csv', header=None)
for row in df.itertuples():
    term = str(row[1])
    u = "http: term=%s&mindate=%d/01/01&maxdate=%d/12/31"
    print(term)
    startYear = 2013
    endYear = 2018
    for year in range(startYear, endYear + 1):
        url = u % (term.replace(" ", "+"), year, year)
        page = urllib.request.urlopen(url).read()
        doc = ET.XML(page)
        count = doc.find("Count").text
        print(year)
        print(count)
The output of df.head() is:
0
0 1,2,3-triazole
1 16s rrna gene amplicons
Any help will be greatly appreciated, thanks in advance!
I would read the csv with numpy into an array, reshape it with numpy as well, and then convert the resulting matrix/2D array to a DataFrame (a numpy sketch follows the output below). Something like this should do it:
#!/usr/bin/env python
def mkdf(filename):
    # yield one record per term: {"term": ..., year: count, ...}
    def combine(term, l):
        d = {"term": term}
        # pair up the alternating year/count lines
        d.update(dict(zip(l[::2], l[1::2])))
        return d
    term = None
    other = []
    with open(filename) as fh:
        n = 0
        for line in fh:
            line = line.strip()
            try:
                int(line)
            except ValueError:
                # not an int, so this line is a term
                if term:  # if we already have one, emit the record
                    yield combine(term, other)
                term = line
                other = []
                n = 0
            else:
                if n > 0:
                    other.append(line)
            n += 1
    # and the last one
    yield combine(term, other)

if __name__ == "__main__":
    import pandas as pd
    import sys
    df = pd.DataFrame([r for r in mkdf(sys.argv[1])])
    print(df)
Usage: python scriptname.py /tmp/IN (or another file with your data)
Output:
2013 2014 term
0 118 23 abcd
1 1 45 xyz
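For the record, here is what the numpy route mentioned at the top could look like. This is only a sketch: it assumes every term is followed by exactly two year/count pairs, as in the sample, so the flat file reshapes into fixed-width rows (the file path and the hard-coded year labels are placeholders):
import numpy as np
import pandas as pd

# one token per line: term, year, count, year, count, ...
flat = np.loadtxt("/tmp/IN", dtype=str)
rows = flat.reshape(-1, 5)  # 5 lines per record when there are 2 year/count pairs
# keep the term and the two counts; the years are assumed fixed
df = pd.DataFrame(rows[:, [0, 2, 4]], columns=["term", "2013", "2014"])
print(df)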
Hi, I have the following txt file:
December
line: 285 - event ID: 67511
line: 296 - event ID: 67512
November
line: 305 - event ID: 67515
line: 300 - event ID: 67517
I want to transform it into the following data frame:
df1 = pd.DataFrame(
    {
        "index": ["December", "December", "November", "November"],
        "index1": ["285", "296", "305", "300"],
        "eventid": ["67511", "67512", "67515", "67517"]})
      index index1 eventid
0  December    285   67511
1  December    296   67512
2  November    305   67515
3  November    300   67517
Any ideas?
I have used pattern matching to achieve what you need:
import re
import pandas as pd

res = []
month_pattern = re.compile(r"^\w+$")
line_pattern = re.compile(r"\d+")
current_month = ""
with open("FILE_PATH_TO_YOUR_DATA", "r") as f:
    for line in f:
        m = month_pattern.findall(line)
        if len(m) > 0:
            current_month = m[0]
        m = line_pattern.findall(line)
        if len(m) > 0:
            res.append([current_month] + m)
df = pd.DataFrame(res, columns=["index", "index1", "eventid"])
print(df)
OUTPUT
index index1 eventid
0 December 285 67511
1 December 296 67512
2 November 305 67515
3 November 300 67517
I'm fairly new to python and very new to pandas, so any help would be appreciated!
I have a dataframe where the data is structured like below:
Batch_Name  Tag 1  Tag 2
2019-01         1      3
2019-02         2      3
I want to iterate through the dataframe and pull the following into a new dataframe:
The max value for each tag (there are 5 in my full data frame)
The batch name at that max value
The min value for that tag
The batch name at that min value
The average for that tag
The std for that tag
I have had a lot of trouble mentally structuring this, and I run into errors even trying to create the dataframe with the summary statistics. Below is my first attempt at a method that builds the stats; I wasn't sure how to pull the batch names at all.
def tag_stats(df):
    min_col = {}
    min_col_batch = {}
    max_col = {}
    max_col_batch = {}
    std_col = {}
    avg_col = {}
    for col in range(df.shape[3:]):
        max_col[col] = df[col].max()
        min_col[col] = df[col].min()
        std_col[col] = df[col].std()
        avg_col[col] = df[col].avg()
    result = pd.DataFrame([min_col, max_col, std_col, avg_col], index=['min', 'max', 'std', 'avg'])
    return result
Here is an answer based on your code!
import pandas as pd
import numpy as np

# Slightly modified your function
def tag_stats(df, tag_list):
    df = df.set_index('Batch_Name')
    data = {
        'tag': [],
        'min': [],
        'max': [],
        'min_batch': [],
        'max_batch': [],
        'std': [],
        'mean': [],
    }
    for tag in tag_list:
        values = df[tag]
        data['tag'].append(tag)
        data['min'].append(values.min())
        data['max'].append(values.max())
        data['min_batch'].append(values.idxmin())
        data['max_batch'].append(values.idxmax())
        data['std'].append(values.std())
        data['mean'].append(values.mean())
    result = pd.DataFrame(data)
    return result

# Create a df using some random data
np.random.seed(1)
num_batches = 10
df = pd.DataFrame({
    'Batch_Name': ['batch_{}'.format(i) for i in range(num_batches)],
    'Tag 1': np.random.randint(1, 100, num_batches),
    'Tag 2': np.random.randint(1, 100, num_batches),
    'Tag 3': np.random.randint(1, 100, num_batches),
    'Tag 4': np.random.randint(1, 100, num_batches),
    'Tag 5': np.random.randint(1, 100, num_batches),
})

# Apply your function
cols = ['Tag 1', 'Tag 2', 'Tag 3', 'Tag 4', 'Tag 5']
summary_df = tag_stats(df, cols)
print(summary_df)
Output
tag min max min_batch max_batch std mean
0 Tag 1 2 80 batch_9 batch_6 32.200759 38.0
1 Tag 2 7 85 batch_2 batch_7 28.926919 39.9
2 Tag 3 14 97 batch_9 batch_7 33.297314 63.4
3 Tag 4 1 82 batch_7 batch_9 31.060693 37.1
4 Tag 5 4 89 batch_7 batch_1 31.212711 43.3
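The trick for pulling the batch names is set_index('Batch_Name') combined with idxmin()/idxmax(): those return the index label (here the batch name) at which the minimum or maximum occurs, rather than the value itself.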
The comment from @It_is_Chris is great too; here is an answer based on it:
import pandas as pd
import numpy as np

# Create a df using some random data
np.random.seed(1)
num_batches = 10
df = pd.DataFrame({
    'Batch_Name': ['batch_{}'.format(i) for i in range(num_batches)],
    'Tag 1': np.random.randint(1, 100, num_batches),
    'Tag 2': np.random.randint(1, 100, num_batches),
    'Tag 3': np.random.randint(1, 100, num_batches),
    'Tag 4': np.random.randint(1, 100, num_batches),
    'Tag 5': np.random.randint(1, 100, num_batches),
})

# Convert to a long df indexed by Batch_Name:
# index   | tag   | tag_value
# --------|-------|----------
# batch_0 | Tag 1 | 38
# batch_1 | Tag 1 | 13
# batch_2 | Tag 1 | 73
long_df = df.melt(
    id_vars='Batch_Name',
    var_name='tag',
    value_name='tag_value',
).set_index('Batch_Name')

# Group by tag and aggregate to get the columns of interest
summary_df = long_df.groupby('tag').agg(
    max_value=('tag_value', 'max'),
    max_batch=('tag_value', 'idxmax'),
    min_value=('tag_value', 'min'),
    min_batch=('tag_value', 'idxmin'),
    mean_value=('tag_value', 'mean'),
    std_value=('tag_value', 'std'),
).reset_index()
print(summary_df)
Output:
tag max_value max_batch min_value min_batch mean_value std_value
0 Tag 1 80 batch_6 2 batch_9 38.0 32.200759
1 Tag 2 85 batch_7 7 batch_2 39.9 28.926919
2 Tag 3 97 batch_7 14 batch_9 63.4 33.297314
3 Tag 4 82 batch_9 1 batch_7 37.1 31.060693
4 Tag 5 89 batch_1 4 batch_7 43.3 31.212711
I have a large string looking like this: '2002 | 09| 90|NUMBER|SALE|CLIENT \n 2002 | 39| 96|4958|100|James ...', split by "|" and "\n". The size of each line is the same. What's the best way to turn this into a dataframe looking like this:
2002 09 90 NUMBER SALE CLIENT
2002 39 96 4958 100 James
.....
You can pass a StringIO object to the pandas.read_csv method to get the desired result. Try this:
from io import StringIO
import pandas as pd

data = StringIO("""2002 | 09| 90|NUMBER|SALE|CLIENT \n 2002 | 39| 96|4958|100|James """)
df = pd.read_csv(data, sep=r"\s*\|\s*", engine="python")
print(df)
Output:
2002 09 90 NUMBER SALE CLIENT
0 2002 39 96 4958 100 James
UPDATE (As per your requirements discussed in comments):
from io import StringIO
import pandas as pd
import re

data = StringIO("""Start Date str_date B N C \n Calculation notional cal_nt C B R\n price of today price_of_today N B R \n""")
lines = []
for line in data:
    line = line.strip()
    line = re.search(r"^(.*)\s(.*?\s.*?\s.*?\s.*?)$", line)
    grp1 = line.group(1)
    grp2 = line.group(2)
    line = "|".join([grp1, "|".join(grp2.split(" "))])
    lines.append(line)
data = StringIO("\n".join(lines))
columns = ["header1", "header2", "header3", "header4", "header5"]
df = pd.read_csv(data, names=columns, sep=r"\s*\|\s*", engine="python")
print(df)
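For what it's worth, the regex works by letting the greedy first group keep everything except the last four whitespace-separated fields, so a multi-word label such as "price of today" stays together and becomes a single first field before the pipes are inserted.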
I need to compare two values MC and JT from 2 tables:
EID MolIdx TEStart TEEnd TE TZone TBulkBE TBulkAE MC JT zavg vabs vzavg xyd.x xyd.y xydist nnbw vabsprev midhb
0 370 36700 36800 110 20 36150 37090 0 0 -8.25705 0.219113 -0.000800014 20.8926 41.4347 5.75852 0 4.13067 0
1 423 17950 18150 210 180 17400 18430 1 0 -4.26426 0.586578 -0.053 77.22 85.2104 22.0534 0 3.551 0
2 468 41790 42020 240 50 41360 42380 0 0 7.82681 0.181248 -0.00269566 90.0646 92.7698 5.0841 0 4.19304 0
and
EID MolIdx TEStart TEEnd TE TZone TBulkBE TBulkAE MC JT zavg vabs vzavg xyd.x xyd.y xydist nnbw vabsprev midhb
0 370 36700 36800 110 20 36150 37090 0 0 -0.846655 0.0218695 2.59898e-05 2.0724 4.1259 0.583259 10 0.412513 0
1 423 17950 18150 210 180 17400 18780 1 0 -0.453311 0.058732 -0.00526783 7.7403 8.52544 2.19627 0 0.354126 0
2 468 41790 42020 240 70 41360 42380 0 0 0.743716 0.0181613 -0.000256186 9.08777 9.21395 0.502506 0 0.419265 0
I need to do it using the csv module. I know how to do it with pandas and xlrd, but not with csv.
Desired output:
Number_of_strings MC JT
and print the rows where the values differ.
import csv
old = csv.reader(open('old.csv', 'rb'), delimiter=',')
row1 = old.next()
new = csv.reader(open('new.csv', 'rb'), delimiter=',')
row2 = new.next()
if (row1[8] == row2[8]) and (row1[9] == row2[9]):
    continue
else:
    print row1[0] + ':' + row1[8] + '!=' + row2[8]
You can try something like the following:
old = list(csv.reader(open('old.csv', 'rb'), delimiter=','))
new = list(csv.reader(open('new.csv', 'rb'), delimiter=','))
old = zip(*old)
new = zip(*new)
print ['%s-%s-%s'%(str(a), str(b), str(c)) for a, b, c in zip(old[0], new[8], old[8]) if b != c]
First, we get a list of lists. zip(*x) will transpose a list of lists. The rest should be easy to decipher ...
You can actually put whatever you want within the string ...
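If the transpose step is unclear, here is a minimal sketch of the zip(*x) trick on its own (written for Python 3, where zip returns an iterator and therefore needs a list() before indexing):
rows = [['EID', 'MC', 'JT'],
        ['0', '0', '0'],
        ['1', '1', '0']]
cols = list(zip(*rows))  # transpose: one tuple per column
print(cols[0])  # ('EID', '0', '1') -- the first column
print(cols[1])  # ('MC', '0', '1')  -- the MC column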
I have a csv file with 2 columns, representing a distribution of items per year which looks like this:
A B
1900 10
1901 2
1903 5
1908 8
1910 25
1925 3
1926 4
1928 1
1950 10
etc., about 15,000 lines.
When making a distribution diagram based on this data, there are too many points along the axis and it is not very pretty. I want to group the rows into blocks of 25 years, so that I end up with fewer points on the axis.
So, for example, from 1900 to 1925 I would have the sum of produced items, as one row in column A and one row in column B:
1925 53
1950 15
So far I have only figured out how to convert the data in the csv file to int:
o = open('/dates_dist.csv', 'rU')
mydata = csv.reader(o)
def int_wrapper(mydata):
    for v in reader:
        yield map(int, v)
reader = int_wrapper(mydata)
I can't figure out how to take it further...
You could use itertools.groupby:
import itertools as IT
import csv

def int_wrapper(mydata):
    for v in mydata:
        yield map(int, v)

with open('data', 'rU') as o:
    mydata = csv.reader(o)
    header = next(mydata)
    reader = int_wrapper(mydata)
    for key, group in IT.groupby(reader, lambda row: (row[0]-1)//25+1):
        year = key*25
        total = sum(row[1] for row in group)
        print(year, total)
yields
(1900, 10)
(1925, 43)
(1950, 15)
Note that 1900 to 1925 (inclusive) spans 26 years, not 25. So
if you want to group 25 years, given the way you are reporting the totals, you probably want the half-open interval (1900, 1925].
The expression row[0]//25 takes the year and integer divides by 25.
This number will be the same for all numbers in the range [1900, 1925).
To make the range half-open on the left, subtract and add 1: (row[0]-1)//25+1.
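A quick check of that bucket arithmetic (a standalone sketch, not part of the answer's script):
for year in (1900, 1901, 1924, 1925, 1926, 1950):
    key = (year - 1) // 25 + 1
    print(year, '->', key * 25)
# 1900 -> 1900, 1901 -> 1925, 1924 -> 1925,
# 1925 -> 1925, 1926 -> 1950, 1950 -> 1950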
Here is my approach. It's definitely not the most elegant Python code, but it could be a way to achieve the desired output.
if __name__ == '__main__':
    o = open('dates_dist.csv', 'rU')
    lines = o.read().split("\n")  # Create a list holding each line of the file
    out_dict = {}
    curr_date = 0
    curr_count = 0
    chunk_sz = 25  # years
    if len(lines) > 0:
        line_split = lines[0].split(",")
        start_year = int(line_split[0])
        curr_count = 0
    # Iterate over each line of the file
    for line in lines:
        # Split at the comma to get the year and the count:
        # line_split[0] will be the year and line_split[1] will be the count.
        line_split = line.split(",")
        curr_year = int(line_split[0])
        time_delta = curr_year - start_year
        if time_delta < chunk_sz or time_delta == chunk_sz:
            curr_count = curr_count + int(line_split[1])
        else:
            out_dict[start_year + chunk_sz] = curr_count
            start_year = start_year + chunk_sz
            curr_count = int(line_split[1])
        # print curr_year, curr_count
    out_dict[start_year + chunk_sz] = curr_count
    print out_dict
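Assuming the file has no header row and no trailing newline, this prints {1925: 53, 1950: 15} for the sample data, which matches the totals asked for in the question (1925 lands in the first block because the delta check also accepts time_delta == chunk_sz).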
You could create a dummy column and group by it after doing some integer division:
df['temp'] = df['A'] // 25
>>> df
A B temp
0 1900 10 76
1 1901 2 76
2 1903 5 76
3 1908 8 76
4 1910 25 76
5 1925 3 77
6 1926 4 77
7 1928 1 77
8 1950 10 78
>>> df.groupby('temp').sum()
A B
temp
76 9522 50
77 5779 8
78 1950 10
My numbers are slightly different from yours since I am technically grouping from 1900-1924, 1925-1949, and 1950-1974, but the idea is the same.
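If you want the same (1900, 1925] blocks as the itertools.groupby answer above, only the bucket arithmetic changes; here is a sketch based on the same sample data:
df['temp'] = (df['A'] - 1) // 25 + 1  # half-open (start, end] buckets
grouped = df.groupby('temp')['B'].sum()
grouped.index = grouped.index * 25    # label each bucket with its end year
print(grouped)
# 1900    10
# 1925    43
# 1950    15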