I have an Excel file containing data like in the picture below.
"doc_id" refers to the document ID where the text comes from. In our example, we have 4 documents (doc_id from 0 to 3).
I want to get the values of "text" in the first 5 pages of each document OR before the Table of Contents, whichever comes first.
With our example, it should return:
"A0","A1","B1","A3"
(Note that we don't want B0, C0, D0, or C1 because they occur after the Table of Contents of their document, and we don't want A2 or B3 because they have page_id >= 5.)
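In case the picture doesn't load, here is a small frame consistent with the description (the exact page_id values are made up, but they respect the constraints above):

import pandas as pd

# Hypothetical reconstruction of the screenshot: the TOC rows sit mid-document,
# and A2 and B3 are the rows with page_id >= 5.
df = pd.DataFrame({
    'doc_id':  [0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 3, 3],
    'page_id': [0, 1, 2, 3, 4, 0, 1, 2, 3, 5, 0, 6],
    'text':    ['A0', 'Table of Contents', 'B0', 'C0', 'D0',
                'A1', 'B1', 'Table of Contents', 'C1',
                'A2', 'A3', 'B3'],
})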
I don't understand how to create a condition that "breaks" the iteration within each doc_id once we find the Table of Contents or reach page_id == 5, and then moves on to the next doc_id.
I tried the following and I'm stuck:
import pandas as pd

data = pd.read_csv('book2.csv')
test_data = data['text']
doc_id = data['doc_id']
page_id = data['page_id']

def TOC(text):
    return 'content' in text

def new_doc():
    if i == 0:
        return False
    elif doc_id[i] != doc_id[i-1]:
        return True

i = 0
while i < len(test_data):
    stop = 0
    while stop == 0 and not new_doc():
        if TOC(test_data[i]):
            print('toc')
            stop = 1
        else:
            print(doc_id[i], test_data[i])
        i += 1
Appreciate your help. Thanks!
See if this helps
a = df[df.page_id < 5]

def tex(x):
    # Return the index of the first row in this group whose text contains 'Table';
    # if no such row exists, return one past the group's last index so every row is kept.
    try:
        if x.any():
            i = x.index[x.str.contains('Table')][0]
    except IndexError:
        i = x.index[-1] + 1
    return i

a[a.index < a.groupby('doc_id')['text'].transform(tex)]['text'].to_list()
Output
['A0', 'A1', 'B1', 'A3']
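An equivalent way to write this without transform, in case it reads more clearly: flag every row at or after the first 'Table' hit with a per-document cumulative sum, then keep the unflagged rows (a sketch, assuming the same df as above):

a = df[df.page_id < 5]
# cumulative count of 'Table' hits per doc_id; 0 means "still before the TOC"
after_toc = a['text'].str.contains('Table').groupby(a['doc_id']).cumsum()
a.loc[after_toc == 0, 'text'].to_list()
# ['A0', 'A1', 'B1', 'A3']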
You have to iterate through the whole document:
import pandas as pd

data = pd.read_csv('book2.csv')[['page_id', 'doc_id', 'text']]

curr_doc_id = -1
before_toc = False
for i, row in data.iterrows():
    if curr_doc_id < row.doc_id:
        curr_doc_id = row.doc_id
        before_toc = True
    if row.text == "Table of Contents":
        before_toc = False
    if before_toc and row.page_id < 5:
        print(row)
*code wasn't tested
Related
Hello, I'm trying to create a program that filters my data from more than 20 Excel files, and I want to create a new Excel file to export the values from my loop. My code is a bit complicated and I'm new to Python.
for index, row in data.iterrows():
    def tarih_aralik(Depo):
        try:
            deneme5 = data.loc[(data['Tarih'] >= data_time) & (data['Tarih'] <= data_time2) & (data['Depo'] == Depo)]
            right = deneme5.groupby(['Tarih', deneme5['Adet'] > 0])['Adet'].sum().unstack()
            right = right.rename(columns={True: 'Positive', False: 'Negative'})
            deneme5 = deneme5.join(right, on=None, how='right')
            deneme5 = deneme5['Positive'].sum()
            deneme5 = int(deneme5)
            print(deneme5)
        except:
            print("0")
        return result

    k101 = data.loc[data['Depo'] == 'K101', 'Adet'].sum()
    k104 = data.loc[data['Depo'] == 'K104', 'Adet'].sum()
    a = print("-->", row.T_kod, "-", row.Açıklama, "<--", "\n", k101, "adt K101 toplam", ",", k104, "adt K104 toplam", "\n", data_time, "--", data_time2)
I'm having some performance issues with the code below, mostly because of the apply function I'm using on a huge DataFrame. I want to update the semi_dict dictionary with some other data that I'm calculating with some functions. Is there any way to improve this?
def my_function_1(semi_dict, row):
    # do some calculation/other stuff based on the row data and append it to the dictionary
    random_dict = dict(data=some_data, more_data=more_data)
    semi_dict["data"].append(random_dict)

def my_function_2(semi_dict, row):
    # do some calculation/other stuff based on the row data and append it to the dictionary
    random_dict = dict(data=some_data, more_data=more_data)
    semi_dict["data2"].append(random_dict)

dictionary_list = []
for v in values:
    df_1_rows = df_1_rows[(df_1_rows.values == v)]
    df_2_rows = df_2_rows[(df_2_rows.values == v)]
    semi_dict = dict(value=v, data=[], data2=[])
    function = partial(my_function_1, semi_dict)
    function_2 = partial(my_function_2, semi_dict)
    df_1_rows.apply(lambda row: function(row), axis=1)
    df_2_rows.apply(lambda row: function_2(row), axis=1)
    dictionary_list.append(semi_dict)
This answer uses dictionary merge from How to merge dictionaries of dictionaries?, but depending on your use case, you might not need it in the end:
import pandas as pd
import random

len_df = 10
row_values = list("ABCD")
extra_col_values = list("12345")

df_1 = pd.DataFrame([[random.choice(row_values), random.choice(extra_col_values)] for _ in range(len_df)], columns=['col1', 'extra1'])
df_2 = pd.DataFrame([[random.choice(row_values), random.choice(extra_col_values)] for _ in range(len_df)], columns=['col2', 'extra2'])

def make_dict(df):
    # some calculations on the df
    return {
        'data': df.head(1).values.tolist(),
    }

def make_dict_2(df):
    # some calculations on the df
    return {
        'data_2': df.head(1).values.tolist(),
    }

def merge(a, b, path=None):
    "merges b into a, taken from https://stackoverflow.com/questions/7204805/how-to-merge-dictionaries-of-dictionaries"
    if path is None: path = []
    for key in b:
        if key in a:
            if isinstance(a[key], dict) and isinstance(b[key], dict):
                merge(a[key], b[key], path + [str(key)])
            elif a[key] == b[key]:
                pass  # same leaf value
            else:
                raise Exception('Conflict at %s' % '.'.join(path + [str(key)]))
        else:
            a[key] = b[key]
    return a

dict1 = df_1.groupby('col1').apply(make_dict).to_dict()
dict2 = df_2.groupby('col2').apply(make_dict_2).to_dict()
result = merge(dict1, dict2)
result
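The performance win over the row-wise apply in the question comes from doing the work once per group instead of once per row: each make_dict call receives the whole sub-DataFrame for one group value. You can peek at a single group's merged entry like this (whether a given key exists depends on the random data above):

result.get('A')  # e.g. {'data': [[...]], 'data_2': [[...]]} when 'A' occurs in both frames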
I am trying to read an XML file and convert it to a pandas DataFrame. However, it returns an empty DataFrame.
This is a sample of the XML structure:
<Instance ID="1">
<MetaInfo StudentID ="DTSU040" TaskID="LP03_PR09.bLK.sh" DataSource="DeepTutorSummer2014"/>
<ProblemDescription>A car windshield collides with a mosquito, squashing it.</ProblemDescription>
<Question>How does this work tion?</Question>
<Answer>tthis is my best </Answer>
<Annotation Label="correct(0)|correct_but_incomplete(1)|contradictory(0)|incorrect(0)">
<AdditionalAnnotation ContextRequired="0" ExtraInfoInAnswer="0"/>
<Comments Watch="1"> The student forgot to tell the opposite force. Opposite means opposite direction, which is important here. However, one can argue that the opposite is implied. See the reference answers.</Comments>
</Annotation>
<ReferenceAnswers>
1: Since the windshield exerts a force on the mosquito, which we can call action, the mosquito exerts an equal and opposite force on the windshield, called the reaction.
</ReferenceAnswers>
</Instance>
I have tried the code below; however, it's not working on my side. It returns an empty DataFrame.
import pandas as pd
import xml.etree.ElementTree as et

xtree = et.parse("grade_data.xml")
xroot = xtree.getroot()

df_cols = ["ID", "TaskID", "DataSource", "ProblemDescription", "Question", "Answer",
           "ContextRequired", "ExtraInfoInAnswer", "Comments", "Watch", "ReferenceAnswers"]
rows = []

for node in xroot:
    s_name = node.attrib.get("ID")
    s_student = node.find("StudentID")
    s_task = node.find("TaskID")
    s_source = node.find("DataSource")
    s_desc = node.find("ProblemDescription")
    s_question = node.find("Question")
    s_ans = node.find("Answer")
    s_label = node.find("Label")
    s_contextrequired = node.find("ContextRequired")
    s_extraInfoinAnswer = node.find("ExtraInfoInAnswer")
    s_comments = node.find("Comments")
    s_watch = node.find("Watch")
    s_referenceAnswers = node.find("ReferenceAnswers")

    rows.append({"ID": s_name, "StudentID": s_student, "TaskID": s_task,
                 "DataSource": s_source, "ProblemDescription": s_desc,
                 "Question": s_question, "Answer": s_ans, "Label": s_label,
                 "s_contextrequired": s_contextrequired, "ExtraInfoInAnswer": s_extraInfoinAnswer,
                 "Comments": s_comments, "Watch": s_watch, "ReferenceAnswers": s_referenceAnswers,
                 })

out_df = pd.DataFrame(rows, columns=df_cols)
The problem in your solution is that the element data extraction is not done properly. The XML you mentioned in the question is nested in several layers, and that is why we need to read and extract the data recursively. The following solution should give you what you need in this case, although I would encourage you to look at this article and the Python documentation for more clarity.
Method: 1
import numpy as np
import pandas as pd
#import os
import xml.etree.ElementTree as ET

def xml2df(xml_source, df_cols, source_is_file=False, show_progress=True):
    """Parse the input XML source and store the result in a pandas
    DataFrame with the given columns.

    For xml_source = xml_file, set: source_is_file = True
    For xml_source = xml_string, set: source_is_file = False

    <element attribute_key1=attribute_value1, attribute_key2=attribute_value2>
        <child1>Child 1 Text</child1>
        <child2>Child 2 Text</child2>
        <child3>Child 3 Text</child3>
    </element>

    Note that for an xml structure as shown above, the children of an
    element can be accessed with list(element) and its attributes with
    element.items(). Any text associated with the <element> tag can be
    accessed as element.text, and the name of the tag itself can be
    accessed with element.tag.
    """
    if source_is_file:
        xtree = ET.parse(xml_source)  # xml_source = xml_file
        xroot = xtree.getroot()
    else:
        xroot = ET.fromstring(xml_source)  # xml_source = xml_string

    consolidator_dict = dict()
    default_instance_dict = {label: None for label in df_cols}

    def get_children_info(children, instance_dict):
        # We avoid using element.getchildren() as it is deprecated.
        # Instead use list(element) to get the list of child elements.
        for child in children:
            #print(child)
            #print(child.tag)
            #print(child.items())
            #print(list(child))
            if len(list(child)) > 0:
                instance_dict = get_children_info(list(child),
                                                  instance_dict)
            if len(list(child.keys())) > 0:
                items = child.items()
                instance_dict.update({key: value for (key, value) in items})
            #print(child.keys())
            instance_dict.update({child.tag: child.text})
        return instance_dict

    # Loop over all instances
    for instance in list(xroot):
        instance_dict = default_instance_dict.copy()
        ikey, ivalue = list(instance.items())[0]  # The first attribute is "ID"
        instance_dict.update({ikey: ivalue})
        if show_progress:
            print('{}: {}={}'.format(instance.tag, ikey, ivalue))
        # Loop inside every instance
        instance_dict = get_children_info(list(instance),
                                          instance_dict)
        #consolidator_dict.update({ivalue: instance_dict.copy()})
        consolidator_dict[ivalue] = instance_dict.copy()

    df = pd.DataFrame(consolidator_dict).T
    df = df[df_cols]
    return df
Run the following to generate the desired output.
xml_source = r'grade_data.xml'
df_cols = ["ID", "TaskID", "DataSource", "ProblemDescription", "Question", "Answer",
           "ContextRequired", "ExtraInfoInAnswer", "Comments", "Watch", "ReferenceAnswers"]

df = xml2df(xml_source, df_cols, source_is_file=True)
df
Method: 2
Given you have the xml_string, you could convert xml >> dict >> dataframe. Run the following to get the desired output.
Note: You will need to install xmltodict to use Method 2. This method is inspired by the solution suggested by @martin-blech at How to convert XML to JSON in Python? [duplicate]. Kudos to @martin-blech for making it.
pip install -U xmltodict
Solution
def read_recursively(x, instance_dict):
    #print(x)
    txt = ''
    for key in x.keys():
        # xmltodict prefixes attribute names with '@'; strip it for matching against df_cols
        k = key.replace("@", "")
        if k in df_cols:
            if isinstance(x.get(key), dict):
                instance_dict, txt = read_recursively(x.get(key), instance_dict)
            #else:
            instance_dict.update({k: x.get(key)})
            #print('{}: {}'.format(k, x.get(key)))
        else:
            #print('else: {}: {}'.format(k, x.get(key)))
            # dig deeper if value is another dict
            if isinstance(x.get(key), dict):
                instance_dict, txt = read_recursively(x.get(key), instance_dict)
            # add simple text associated with element
            if k == '#text':
                txt = x.get(key)
            # update text to corresponding parent element
            if (k != '#text') and (txt != ''):
                instance_dict.update({k: txt})
    return (instance_dict, txt)
You will need the function read_recursively() given above. Now run the following.
import xmltodict, json

o = xmltodict.parse(xml_string)  # INPUT: XML_STRING
#print(json.dumps(o))  # uncomment to see the xml converted to a json string

consolidated_dict = dict()
oi = o['Instances']['Instance']

for x in oi:
    instance_dict = dict()
    instance_dict, _ = read_recursively(x, instance_dict)
    consolidated_dict.update({x.get("@ID"): instance_dict.copy()})

df = pd.DataFrame(consolidated_dict).T
df = df[df_cols]
df
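Note that Method 2 parses a string, not a file; if your XML lives in grade_data.xml as in the question, read it in first:

with open('grade_data.xml', 'r', encoding='utf-8') as f:
    xml_string = f.read()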
Several issues:

- Calling .find on the loop variable, node, expects a child node to exist: current_node.find('child_of_current_node'). However, since all the nodes are children of the root, they do not maintain their own children, so no loop is required.
- Not checking for the NoneType that can result from missing nodes with find(); a None has no .tag or .text attributes to retrieve.
- Not retrieving node content with .text; otherwise the <Element ...> object is returned.

Consider this adjustment using the ternary condition expression, a if condition else b, to ensure each variable has a value regardless:
rows = []

s_name = xroot.attrib.get("ID")
s_student = xroot.find("StudentID").text if xroot.find("StudentID") is not None else None
s_task = xroot.find("TaskID").text if xroot.find("TaskID") is not None else None
s_source = xroot.find("DataSource").text if xroot.find("DataSource") is not None else None
s_desc = xroot.find("ProblemDescription").text if xroot.find("ProblemDescription") is not None else None
s_question = xroot.find("Question").text if xroot.find("Question") is not None else None
s_ans = xroot.find("Answer").text if xroot.find("Answer") is not None else None
s_label = xroot.find("Label").text if xroot.find("Label") is not None else None
s_contextrequired = xroot.find("ContextRequired").text if xroot.find("ContextRequired") is not None else None
s_extraInfoinAnswer = xroot.find("ExtraInfoInAnswer").text if xroot.find("ExtraInfoInAnswer") is not None else None
s_comments = xroot.find("Comments").text if xroot.find("Comments") is not None else None
s_watch = xroot.find("Watch").text if xroot.find("Watch") is not None else None
s_referenceAnswers = xroot.find("ReferenceAnswers").text if xroot.find("ReferenceAnswers") is not None else None

rows.append({"ID": s_name, "StudentID": s_student, "TaskID": s_task,
             "DataSource": s_source, "ProblemDescription": s_desc,
             "Question": s_question, "Answer": s_ans, "Label": s_label,
             "ContextRequired": s_contextrequired, "ExtraInfoInAnswer": s_extraInfoinAnswer,
             "Comments": s_comments, "Watch": s_watch, "ReferenceAnswers": s_referenceAnswers
             })

out_df = pd.DataFrame(rows, columns=df_cols)
Alternatively, run a more dynamic version assigning to an inner dictionary using the iterator variable:
rows = []
for node in xroot:
    inner = {}
    inner[node.tag] = node.text
    rows.append(inner)

out_df = pd.DataFrame(rows, columns=df_cols)
Or list/dict comprehension:
rows = [{node.tag: node.text} for node in xroot]
out_df = pd.DataFrame(rows, columns = df_cols)
I have a list of lists containing company objects:
companies_list = [companies1, companies2]
I have the following function:
def get_fund_amount_by_year(companies_list):
    companies_length = len(companies_list)
    for idx, companies in enumerate(companies_list):
        companies1 = companies.values_list('id', flat=True)
        funding_rounds = FundingRound.objects.filter(company_id__in=companies1).order_by('announced_on')
        amount_per_year_list = []
        for fr in funding_rounds:
            fr_year = fr.announced_on.year
            fr_amount = fr.raised_amount_usd
            if not any(d['year'] == fr_year for d in amount_per_year_list):
                year_amount = {}
                year_amount['year'] = fr_year
                for companies_idx in range(companies_length):
                    year_amount['amount'+str(companies_idx)] = 0
                    if companies_idx == idx:
                        year_amount['amount'+str(companies_idx)] = fr_amount
                amount_per_year_list.append(year_amount)
            else:
                for year_amount in amount_per_year_list:
                    if year_amount['year'] == fr_year:
                        year_amount['amount'+str(idx)] += fr_amount
    return amount_per_year_list
The problem is the resulting list of dictionaries has only one amount attribute updated.
As you can see "amount0" contains all "0" amounts:
[{'amount1': 12100000L, 'amount0': 0, 'year': 1999}, {'amount1': 8900000L, 'amount0': 0, 'year': 2000}]
What am I doing wrong?
I had put the list of dictionaries being built inside the loop, so each iteration overwrote the previous input. I changed it to look like this:
def get_fund_amount_by_year(companies_list):
    companies_length = len(companies_list)
    amount_per_year_list = []   # <-- moved out of the loop
    for idx, companies in enumerate(companies_list):
        companies1 = companies.values_list('id', flat=True)
        funding_rounds = FundingRound.objects.filter(company_id__in=companies1).order_by('announced_on')
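The general pitfall: any accumulator initialized inside a loop is reset on every pass, so only the last iteration's contributions survive. A minimal standalone illustration:

results = []              # initialize once, outside the loop
for batch in [[1, 2], [3, 4]]:
    # results = []        # the bug: this would throw away the earlier batch
    for item in batch:
        results.append(item)
print(results)            # [1, 2, 3, 4]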
I have two functions, one which creates a DataFrame from a csv and another which manipulates that DataFrame. There is no problem the first time I pass the raw data through the lsc_age(import_data()) functions. However, I get the above-referenced error (TypeError: 'DataFrame' object is not callable) on the second and subsequent attempts. Any ideas for how to solve the problem?
def import_data(csv, date1, date2):
    global data
    data = pd.read_csv(csv, header=1)
    data = data.iloc[:, [0, 1, 4, 6, 7, 8, 9, 11]]
    data = data.dropna(how='all')
    data = data.rename(columns={"National: For Dates 9//1//"+date1+" - 8//31//"+date2: 'event',
                                'Unnamed: 1': 'time', 'Unnamed: 4': 'points',
                                'Unnamed: 6': 'name', 'Unnamed: 7': 'age', 'Unnamed: 8': 'lsc',
                                'Unnamed: 9': 'club', 'Unnamed: 11': 'date'})
    data = data.reset_index().drop('index', axis=1)
    data = data[data.time != 'Time']
    data = data[data.points != 'Power ']
    data = data[data['event'] != "National: For Dates 9//1//"+date1+" - 8//31//"+date2]
    data = data[data['event'] != 'USA Swimming, Inc.']
    data = data.reset_index().drop('index', axis=1)
    for i in range(len(data)):
        if len(str(data['event'][i])) <= 3:
            data['event'][i] = data['event'][i-1]
        else:
            data['event'][i] = data['event'][i]
    data = data.dropna()
    age = []
    event = []
    gender = []
    for row in data.event:
        gender.append(row.split(' ')[0])
        if row[:9] == 'Female 10':
            n = 4
            groups = row.split(' ')
            age.append(' '.join(groups[1:n]))
            event.append(' '.join(groups[n:]))
        elif row[:7] == 'Male 10':
            n = 4
            groups = row.split(' ')
            age.append(' '.join(groups[1:n]))
            event.append(' '.join(groups[n:]))
        else:
            n = 2
            groups = row.split(' ')
            event.append(' '.join(groups[n:]))
            groups = row.split(' ')
            age.append(groups[1])
    data['age_group'] = age
    data['event_simp'] = event
    data['gender'] = gender
    data['year'] = date2
    return data

def lsc_age(data_two):
    global lsc, lsc_age, top, all_performers
    lsc = pd.DataFrame(data_two['event'].groupby(data_two['lsc']).count()).reset_index().sort_values(by='event', ascending=False)
    lsc_age = data_two.groupby(['year', 'age_group', 'lsc'])['event'].count().reset_index().sort_values(by=['age_group', 'event'], ascending=False)
    top = pd.concat([lsc_age[lsc_age.age_group == '10 & under'].head(), lsc_age[lsc_age.age_group == '11-12'].head(),
                     lsc_age[lsc_age.age_group == '13-14'].head(), lsc_age[lsc_age.age_group == '15-16'].head(),
                     lsc_age[lsc_age.age_group == '17-18'].head()], ignore_index=True)
    all_performers = pd.concat([lsc_age[lsc_age.age_group == '10 & under'], lsc_age[lsc_age.age_group == '11-12'],
                                lsc_age[lsc_age.age_group == '13-14'], lsc_age[lsc_age.age_group == '15-16'],
                                lsc_age[lsc_age.age_group == '17-18']], ignore_index=True)
    all_performers = all_performers.rename(columns={'event': 'no. top 100'})
    all_performers['age_year_lsc'] = all_performers.age_group + ' ' + all_performers.year.astype(str) + ' ' + all_performers.lsc
    return all_performers

years = [i for i in range(2008, 2018)]
for i in range(len(years)-1):
    lsc_age(import_data(str(years[i+1]) + "national100.csv",
                        str(years[i]), str(years[i+1])))
During the first call to your function lsc_age(), in the line

lsc_age = data_two.groupby(['year','age_group','lsc'])['event'].count().reset_index().sort_values(by=['age_group','event'],ascending=False)

you are overwriting your function object with a DataFrame. This happens because the statement

global lsc, lsc_age, top, all_performers

makes that assignment rebind the module-level name lsc_age, which until then referred to the function itself.
Functions in Python are objects. Please see more information about this here.
To solve your problem, try to avoid the globals; they do not seem to be necessary. Pass your data around through the arguments of the function instead.
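A minimal illustration of the same shadowing, independent of pandas:

def lsc_age():
    global lsc_age
    lsc_age = 42      # rebinds the module-level name from the function to an int

lsc_age()             # the first call works
lsc_age()             # TypeError: 'int' object is not callable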