I'm trying to translate part of the SQuAD 1.1 dataset into Sinhalese. I don't know whether I can feed the JSON file straight into translation.
What I tried so far is building a small dataframe from the SQuAD dataset and translating that as a demo for myself, but I got various errors. Below is the error I'm getting now. Can you help me fix this error, or tell me a better way to complete my task using Python?
```
import googletrans
from googletrans import Translator
import os
from google.cloud import translate_v2 as translate

os.environ['GOOGLE_APPLICATION_CREDENTIALS']=r"C:\Users\Sathsara\Documents\Python Learning\Translation test\translationAPI\flash-medley-278816-b2012b874797.json"

# create a translator object
translator = Translator()

# use translate method to translate a string - by default, the destination language is english
translated = translator.translate('I am Sathsara Rasantha',dest='si')
# the translate method returns an object
print(translated)
# obtain translated string by using attribute .text
translated.text

import pandas as pd

translate_example = pd.read_json("example2.json")
translate_example

contexts = []
questions = []
answers_text = []
answers_start = []
for i in range(translate_example.shape[0]):
    topic = translate_example.iloc[i,0]['paragraphs']
    for sub_para in topic:
        for q_a in sub_para['qas']:
            questions.append(q_a['question'])
            answers_start.append(q_a['answers'][0]['answer_start'])
            answers_text.append(q_a['answers'][0]['text'])
            contexts.append(sub_para['context'])

df = pd.DataFrame({"context":contexts, "question": questions, "answer_start": answers_start, "text": answers_text})
df

df=df.loc[0:2,:]
df

# make a deep copy of the data frame
df_si = df.copy()

# translate columns' name using rename function
df_si.rename(columns=lambda x: translator.translate(x).text, inplace=True)
df_si.columns

translations = {}
for column in df_si.columns:
    # unique elements of the column
    unique_elements = df_si[column].unique()
    for element in unique_elements:
        # add translation to the dictionary
        translations[element] = translator.translate(element,dest='si').text

print(translations)

# modify all the terms of the data frame by using the previously created dictionary
df_si.replace(translations, inplace = True)

# check translation
df_si.head()
```
This is the error I get:
> --------------------------------------------------------------------------- TypeError Traceback (most recent call
> last) <ipython-input-24-f55a5ca59c36> in <module>
> 5 for element in unique_elements:
> 6 # add translation to the dictionary
> ----> 7 translations[element] = translator.translate(element,dest='si').text
> 8
> 9 print(translations)
>
> ~\Anaconda3\lib\site-packages\googletrans\client.py in translate(self,
> text, dest, src)
> 170
> 171 origin = text
> --> 172 data = self._translate(text, dest, src)
> 173
> 174 # this code will be updated when the format is changed.
>
> ~\Anaconda3\lib\site-packages\googletrans\client.py in
> _translate(self, text, dest, src)
> 73 text = text.decode('utf-8')
> 74
> ---> 75 token = self.token_acquirer.do(text)
> 76 params = utils.build_params(query=text, src=src, dest=dest,
> 77 token=token)
>
> ~\Anaconda3\lib\site-packages\googletrans\gtoken.py in do(self, text)
> 199 def do(self, text):
> 200 self._update()
> --> 201 tk = self.acquire(text)
> 202 return tk
>
> ~\Anaconda3\lib\site-packages\googletrans\gtoken.py in acquire(self,
> text)
> 144 a = []
> 145 # Convert text to ints
> --> 146 for i in text:
> 147 val = ord(i)
> 148 if val < 0x10000:
>
> TypeError: 'numpy.int64' object is not iterable
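In case it helps, the traceback points at the input type rather than the translation itself: googletrans' translate() iterates over the characters of its argument, and the answer_start column holds numpy.int64 values. Below is a minimal sketch of the last loop, reusing df_si and translator from the code above; the isinstance guard is my assumption about the intended fix (numeric offsets arguably shouldn't be translated at all):
```
translations = {}
for column in df_si.columns:
    for element in df_si[column].unique():
        # translate() expects a string; answer_start values are numpy.int64,
        # which is what triggers "'numpy.int64' object is not iterable"
        if isinstance(element, str):
            translations[element] = translator.translate(element, dest='si').text
        else:
            translations[element] = element  # leave numeric values untouched

df_si.replace(translations, inplace=True)
```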
I'm working with nested JSON data in Pandas, but I have a problem once I extract the dataframe from the nested data.
The data looks like:
[{"export_id":"COL-EXP-1894","origin_office":"EXAMPLE","destination_office":"","incoterms":"","shipment_date":"","export_date":"2023-01-01","origin_port":"Buenaventura","destination_port":"New York/New Jersey","bl_number":null,"shipping_line":null,"shipping_mode":null,"vessel_name":null,"voyage_number":null,"reservation_number":null,"container_number":null,"seal_number":null,"eta":null,"etd":null,"export_status":"in_progress","ico_list":\[\]}\]
Reading it like that, all is good, but some records have an ico_list like:
[{"export_id":"COL-EXP-1894","origin_office":"EXAMPLE","destination_office":"","incoterms":"","shipment_date":"","export_date":"2023-01-01","origin_port":"Buenaventura","destination_port":"New York/New Jersey","bl_number":null,"shipping_line":null,"shipping_mode":null,"vessel_name":null,"voyage_number":null,"reservation_number":null,"container_number":null,"seal_number":null,"eta":null,"etd":null,"export_status":"in_progress","ico_list":[{"ico_id":"03-0178-436-23","contract_id":"CI-1046","customer":null,"origin_office":"example","destination_office":"example","incoterm":"CIF","quality":"ML","mark":"example","packaging_type":"Nitrogen-Flushed Vac-Packed Boxes - 35KG","packaging_capacity":35.0,"units":1,"quantity":35.0,"certification":null}]}]
And not just one entry like the example, there can be more, so I implemented this:
if response.status_code == 200:
    data_str = response.text
    try:
        atlas_api_data = json.loads(data_str)
        df_atlas = pd.json_normalize(atlas_api_data)
        #print(df_atlas)
    except:
        print('ErrorOccured While Parsing JSON ATLAS API TO Dataframe')

df_atlas2 = pd.json_normalize(df_atlas['ico_list'].loc[95])

for i, row in df_atlas.iterrows():
    export_id = row['export_id']
    origin_office = row['origin_office']
    destination_office = row['destination_office']
    export_date = row['export_date']
    origin_port = row['origin_port']
    destination_port = row['destination_port']
    bl_number = row['bl_number']
    shipping_line = row['shipping_line']
    shipping_mode = row['shipping_mode']
    vessel_name = row['vessel_name']
    voyage_number = row['voyage_number']
    reservation_number = row['reservation_number']
    container_number = row['container_number']
    seal_number = row['seal_number']
    export_status = row['export_status']
    values = [export_id,origin_office,destination_office,export_date,origin_port,destination_port,
              bl_number,shipping_line,shipping_mode,vessel_name,voyage_number,reservation_number,container_number,
              seal_number,export_status]
    data_list.append(values)
    df_atlas2 = pd.json_normalize(df_atlas['ico_list'].loc[i])
    if df_atlas2.empty:
        print('Empty DF')
    else:
        for row_ico, j in df_atlas2.iterrows():
            ico_id = row_ico['ico_id']
            contract_id = row_ico['contract_id']
            customer = row_ico['customer']
            incoterm = row_ico['incoterm']
            quality = row_ico['quality']
            mark = row_ico['mark']
            packaging_type = row_ico['packaging_type']
            packaging_capacity = row_ico['packaging_capacity']
            units = row_ico['units']
            quantity = row_ico['quantity']
            certification = row_ico['certification']
            ico_values = [export_id,ico_id,contract_id,customer,incoterm,quality,mark,packaging_type,packaging_capacity,units,quantity,certification]
            data_ico_list.append(ico_values)
In this way I extract only the data that I need, and it worked for the first level, but when I get to the second iterrows() it says:
TypeError Traceback (most recent call last)
Cell In [4], line 43
41 else:
42 for row_ico, j in df_atlas2.iterrows():
---> 43 ico_id = row_ico['ico_id']
44 contract_id = row_ico['contract_id']
45 customer = row_ico['customer']
TypeError: 'int' object is not subscriptable
When printing df_atlas2 it looks normal, like this:
variable: df_atlas2 before it goes into iterrows()
I tried using df_atlas2['ico_id'].astype(str) on all the columns, and ico_id = str(row_ico['ico_id']), but I still get the message.
If you know how to solve this, a hundred thanks!
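For what it's worth, DataFrame.iterrows() yields (index, row) pairs, so in the inner loop row_ico is actually the integer index, which is why subscripting it raises "'int' object is not subscriptable". A small self-contained sketch of the unpacking order, using made-up data shaped like the question's ico_list:
```
import pandas as pd

# hypothetical one-entry ico_list, shaped like the question's data
df_demo = pd.json_normalize([{"ico_id": "03-0178-436-23", "contract_id": "CI-1046"}])

# iterrows() yields (index, Series); the question's loop unpacked the pair
# in reverse, so row_ico ended up being the integer index
for idx, row_ico in df_demo.iterrows():
    print(row_ico['ico_id'], row_ico['contract_id'])
```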
I want to use stanza for tokenizing, POS tagging and parsing some text I have, but it keeps giving me this error. I've tried changing the way I call it, but nothing changes. Any ideas?
My code (here I iterate through a list of lists of text and apply stanza to each one):
t = time()

data_stanza = []
for text in data:
    stz = apply_stanza(text[0])
    data_stanza.append(stz)

print('Time to run: {} mins'.format(round((time() - t) / 60, 2)))
This is the function I use to apply stanza to each text:
nlp = stanza.Pipeline('pt')

def apply_stanza(text):
    doc = nlp(text)
    All = []
    for sent in doc.sentences:
        for word in sent.words:
            All.append((word.id,word.text,word.lemma,word.upos,word.feats,word.head,word.deprel))
    return All
The error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-17-7ac303eec8e8> in <module>
3 data_staza = []
4 for text in data:
----> 5 stz = apply_stanza(text[0])
6 data_stanza.append(stz)
7
<ipython-input-16-364c3ac30f32> in apply_stanza(text)
2
3 def apply_stanza(text):
----> 4 doc = nlp(text)
5 All = []
6 for sent in doc.sentences:
~\anaconda3\lib\site-packages\stanza\pipeline\core.py in __call__(self, doc)
174 assert any([isinstance(doc, str), isinstance(doc, list),
175 isinstance(doc, Document)]), 'input should be either str, list or Document'
--> 176 doc = self.process(doc)
177 return doc
178
~\anaconda3\lib\site-packages\stanza\pipeline\core.py in process(self, doc)
168 for processor_name in PIPELINE_NAMES:
169 if self.processors.get(processor_name):
--> 170 doc = self.processors[processor_name].process(doc)
171 return doc
172
~\anaconda3\lib\site-packages\stanza\pipeline\mwt_processor.py in process(self, document)
31 preds = []
32 for i, b in enumerate(batch):
---> 33 preds += self.trainer.predict(b)
34
35 if self.config.get('ensemble_dict', False):
~\anaconda3\lib\site-packages\stanza\models\mwt\trainer.py in predict(self, batch, unsort)
77 self.model.eval()
78 batch_size = src.size(0)
---> 79 preds, _ = self.model.predict(src, src_mask, self.args['beam_size'])
80 pred_seqs = [self.vocab.unmap(ids) for ids in preds] # unmap to tokens
81 pred_seqs = utils.prune_decoded_seqs(pred_seqs)
~\anaconda3\lib\site-packages\stanza\models\common\seq2seq_model.py in predict(self, src, src_mask, pos, beam_size)
259 done = []
260 for b in range(batch_size):
--> 261 is_done = beam[b].advance(log_probs.data[b])
262 if is_done:
263 done += [b]
~\anaconda3\lib\site-packages\stanza\models\common\beam.py in advance(self, wordLk, copy_indices)
82 # bestScoresId is flattened beam x word array, so calculate which
83 # word and beam each score came from
---> 84 prevK = bestScoresId / numWords
85 self.prevKs.append(prevK)
86 self.nextYs.append(bestScoresId - prevK * numWords)
RuntimeError: Integer division of tensors using div or / is no longer supported, and in a future release div will perform
true division as in Python 3. Use true_divide or floor_divide (// in Python) instead.
ATT: It turns out after all that it was an error with the mwt module of the stanza pipeline, so I just specified not to use it.
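For anyone who hits the same thing, here is a sketch of what skipping the module can look like, assuming the standard processors argument of stanza.Pipeline (the exact processor list is a guess at what the task above needs):
```
import stanza

# hypothetical: build the Portuguese pipeline without the mwt processor
# that raised the error; the remaining processor names are an assumption
nlp = stanza.Pipeline('pt', processors='tokenize,pos,lemma,depparse')
```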
Use // for division instead of /.
Try to edit your code as follows:
print('Time to run: {} mins'.format(round((time() - t) // 60, 2)))
Using floor division (//) will round the result down to the nearest integer.
Use torch.true_divide(Dividend, Divisor) or numpy.true_divide(Dividend, Divisor) instead.
For example: 3/4 = torch.true_divide(3, 4)
https://pytorch.org/docs/stable/generated/torch.true_divide.html
https://numpy.org/doc/stable/reference/generated/numpy.true_divide.html
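To make the difference concrete, here is a small sketch of the two behaviours the answers describe (it assumes PyTorch is installed; the variable names only stand in for the ones in the traceback):
```
import torch

scores = torch.tensor([7])   # stand-in for bestScoresId
words = 2                    # stand-in for numWords

print(scores // words)                    # tensor([3])      floor division
print(torch.floor_divide(scores, words))  # tensor([3])      same thing, explicit
print(torch.true_divide(scores, words))   # tensor([3.5000]) Python-3-style '/'
```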
Hello, I'm trying to export a gmaps HTML file using ipywidgets in a Jupyter notebook, but I'm encountering the following error: error: bad escape \u at position 0.
I'm new to programming and could use help fixing whatever is causing this error. If there is an easier way to go about exporting the HTML file, I'm happy to change approaches.
Thanks
Here is a snippet of the code; I can add the entire thing if it's helpful.
import pandas as pd
import gmaps
from ipywidgets.embed import embed_minimal_html
from ipywidgets import IntSlider
gmaps.configure(api_key='XXXX')
pd.options.mode.chained_assignment = None # default='warn'
file2 = '005 lat:long.csv'
state2 = pd.read_csv(file2)
state2 = state2.rename(columns={'Address1': 'address', 'City':'city',
                                'State':'state', 'Zip': 'zip'})
storenumbs = state2['Store'].str.split('#', expand=True)
state2 = state2.join(storenumbs)
state2 = state2.drop(['Store', 0], axis=1)
state2 = state2.rename(columns={1: 'store_#'})
state2['store_#'] = state2['store_#'].astype(int)
fig = gmaps.figure(center=(42.5, -71.4), map_type='TERRAIN', zoom_level=9.8)
scale = 4
one_layer = (gmaps.symbol_layer(low_points_lat_long, fill_color='red', stroke_color='red', scale= scale))
two_layer = (gmaps.symbol_layer(low_med_points_lat_long, fill_color='red', stroke_color='yellow', scale= scale))
three_layer = (gmaps.symbol_layer(med_high_points_lat_long, fill_color='yellow', stroke_color='green', scale= scale))
four_layer = (gmaps.symbol_layer(high_points_lat_long, fill_color='green', stroke_color='green', scale= scale))
fig.add_layer(one_layer)
fig.add_layer(two_layer)
fig.add_layer(three_layer)
fig.add_layer(four_layer)
fig
embed_minimal_html('export.html', views=[fig])
Long form error below:
KeyError Traceback (most recent call last)
~/miniconda3/lib/python3.7/sre_parse.py in parse_template(source, pattern)
1020 try:
-> 1021 this = chr(ESCAPES[this][1])
1022 except KeyError:
KeyError: '\\u'
During handling of the above exception, another exception occurred:
error Traceback (most recent call last)
<ipython-input-7-c096ac365396> in <module>
20
21 slider = IntSlider(value=40)
---> 22 embed_minimal_html('export.html', views=[slider], title='Widgets export')
~/miniconda3/lib/python3.7/site-packages/ipywidgets/embed.py in embed_minimal_html(fp, views, title, template, **kwargs)
300 {embed_kwargs}
301 """
--> 302 snippet = embed_snippet(views, **kwargs)
303
304 values = {
~/miniconda3/lib/python3.7/site-packages/ipywidgets/embed.py in embed_snippet(views, drop_defaults, state, indent, embed_url, requirejs, cors)
266 widget_views = u'\n'.join(
267 widget_view_template.format(view_spec=escape_script(json.dumps(view_spec)))
--> 268 for view_spec in data['view_specs']
269 )
270
~/miniconda3/lib/python3.7/site-packages/ipywidgets/embed.py in <genexpr>(.0)
266 widget_views = u'\n'.join(
267 widget_view_template.format(view_spec=escape_script(json.dumps(view_spec)))
--> 268 for view_spec in data['view_specs']
269 )
270
~/miniconda3/lib/python3.7/site-packages/ipywidgets/embed.py in escape_script(s)
239 involving `<` is readable.
240 """
--> 241 return script_escape_re.sub(r'\u003c\1', s)
242
243 #doc_subst(_doc_snippets)
~/miniconda3/lib/python3.7/re.py in _subx(pattern, template)
307 def _subx(pattern, template):
308 # internal: Pattern.sub/subn implementation helper
--> 309 template = _compile_repl(template, pattern)
310 if not template[0] and len(template[1]) == 1:
311 # literal replacement
~/miniconda3/lib/python3.7/re.py in _compile_repl(repl, pattern)
298 def _compile_repl(repl, pattern):
299 # internal: compile replacement pattern
--> 300 return sre_parse.parse_template(repl, pattern)
301
302 def _expand(pattern, match, template):
~/miniconda3/lib/python3.7/sre_parse.py in parse_template(source, pattern)
1022 except KeyError:
1023 if c in ASCIILETTERS:
-> 1024 raise s.error('bad escape %s' % this, len(this))
1025 lappend(this)
1026 else:
error: bad escape \u at position 0
This is an error in Python 3.7, and an issue with Python 3.6 (but it is OK with Python 2.7).
If you use a raw string (prefixed by "r") for the replacement in the re.sub function, then the \u is not interpreted as a Unicode escape. For instance, r'\u003c\1' is the same as '\\u003c\\1': a literal '\u', followed by '003c' and \1.
The solution is to write:
return script_escape_re.sub('\u003c\\1', s)
Quoting the documentation:
Changed in version 3.7: Unknown escapes in repl consisting of '\' and an ASCII letter now are errors.
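A minimal reproduction of the difference, in case you want to check it locally on Python 3.7+ (the pattern and input string here are only illustrative):
```
import re

s = '<script>alert(1)</script>'

# raw-string replacement: the '\u' reaches re.sub unprocessed and is rejected
try:
    re.sub(r'<(script)', r'\u003c\1', s)
except re.error as exc:
    print(exc)                                 # bad escape \u at position 0

# normal string literal: '\u003c' is resolved to '<' before re.sub sees it
print(re.sub(r'<(script)', '\u003c\\1', s))    # <script>alert(1)</script>
```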
I was facing a similar issue while trying to escape Unicode characters that have the pattern \uXXXX. Let's take a string containing Unicode characters:
>>> text = "The \u201c\u3010\u3011\u201d in this template are used to mark the variables"
>>> text
'The “【】” in this template are used to mark the variables'
Escape the Unicode characters:
>>> text = text.encode('unicode_escape').decode('ascii')
>>> text
'The \\u201c\\u3010\\u3011\\u201d in this template are used to mark the variables'
And then replace them using re.sub(r'\\u(.){4}', '', text):
>>> import re
>>> re.sub(r'\\u(.){4}', '', text)
'The in this template are used to mark the variables'
I had the same issue with
[m.start() for m in re.finditer('Valuation Date")', 'dummytext')]
*** sre_constants.error: unbalanced parenthesis at position 15
But it was solved with the help of re.escape:
[m.start() for m in re.finditer(re.escape('Valuation Date")'), 'dummytext')]
Enjoy.
It seems this may be related to a Japanese-language problem, so I also asked on the Japanese StackOverflow.
When I use just a string object, it works fine.
I tried encoding, but I couldn't find the reason for this error.
Could you please give me some advice?
MeCab is an open source text segmentation library for text written in the Japanese language, originally developed by the Nara Institute of Science and Technology and currently maintained by Taku Kudou (工藤拓) as part of his work on the Google Japanese Input project.
https://en.wikipedia.org/wiki/MeCab
sample.csv
0,今日も夜まで働きました。
1,オフィスには誰もいませんが、エラーと格闘中
2,デバッグばかりしていますが、どうにもなりません。
This is the Pandas / Python 3 code:
import pandas as pd
import MeCab
# https://en.wikipedia.org/wiki/MeCab
from tqdm import tqdm_notebook as tqdm

# This is working...
df = pd.read_csv('sample.csv', encoding='utf-8')
m = MeCab.Tagger ("-Ochasen")
text = "りんごを食べました、そして、みかんも食べました"
a = m.parse(text)
print(a)# working!

# But I want to use Pandas's Series
def extractKeyword(text):
    """Morphological analysis of text and returning a list of only nouns"""
    tagger = MeCab.Tagger('-Ochasen')
    node = tagger.parseToNode(text)
    keywords = []
    while node:
        if node.feature.split(",")[0] == u"名詞": # this means noun
            keywords.append(node.surface)
        node = node.next
    return keywords

aa = extractKeyword(text) #working!!
me = df.apply(lambda x: extractKeyword(x))
#TypeError: ("in method 'Tagger_parseToNode', argument 2 of type 'char const *'", 'occurred at index 0')
This is the traceback:
りんご リンゴ りんご 名詞-一般
を ヲ を 助詞-格助詞-一般
食べ タベ 食べる 動詞-自立 一段 連用形
まし マシ ます 助動詞 特殊・マス 連用形
た タ た 助動詞 特殊・タ 基本形
、 、 、 記号-読点
そして ソシテ そして 接続詞
、 、 、 記号-読点
みかん ミカン みかん 名詞-一般
も モ も 助詞-係助詞
食べ タベ 食べる 動詞-自立 一段 連用形
まし マシ ます 助動詞 特殊・マス 連用形
た タ た 助動詞 特殊・タ 基本形
EOS
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-174-81a0d5d62dc4> in <module>()
32 aa = extractKeyword(text) #working!!
33
---> 34 me = df.apply(lambda x: extractKeyword(x))
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
4260 f, axis,
4261 reduce=reduce,
-> 4262 ignore_failures=ignore_failures)
4263 else:
4264 return self._apply_broadcast(f, axis)
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
4356 try:
4357 for i, v in enumerate(series_gen):
-> 4358 results[i] = func(v)
4359 keys.append(v.name)
4360 except Exception as e:
<ipython-input-174-81a0d5d62dc4> in <lambda>(x)
32 aa = extractKeyword(text) #working!!
33
---> 34 me = df.apply(lambda x: extractKeyword(x))
<ipython-input-174-81a0d5d62dc4> in extractKeyword(text)
20 """Morphological analysis of text and returning a list of only nouns"""
21 tagger = MeCab.Tagger('-Ochasen')
---> 22 node = tagger.parseToNode(text)
23 keywords = []
24 while node:
~/anaconda3/lib/python3.6/site-packages/MeCab.py in parseToNode(self, *args)
280 __repr__ = _swig_repr
281 def parse(self, *args): return _MeCab.Tagger_parse(self, *args)
--> 282 def parseToNode(self, *args): return _MeCab.Tagger_parseToNode(self, *args)
283 def parseNBest(self, *args): return _MeCab.Tagger_parseNBest(self, *args)
284 def parseNBestInit(self, *args): return _MeCab.Tagger_parseNBestInit(self, *args)
TypeError: ("in method 'Tagger_parseToNode', argument 2 of type 'char const *'", 'occurred at index 0')w
I see you got some help on the Japanese StackOverflow, but here's an answer in English:
The first thing to fix is that read_csv was treating the first line of your sample.csv as the header. To fix that, use the names argument in read_csv.
Next, df.apply will by default apply the function to columns of the dataframe. You would need to do something like df.apply(lambda x: extractKeyword(x['String']), axis=1), but this won't work because each sentence will have a different number of nouns and Pandas will complain it cannot stack a 1x2 array on top of a 1x5 array. The simplest way is to apply the function to the String Series directly.
The final problem is that there's a bug in the MeCab Python3 bindings: see https://github.com/SamuraiT/mecab-python3/issues/3 You found a workaround by running parseToNode twice; you can also call parse before parseToNode.
Putting all these three things together:
import pandas as pd
import MeCab

df = pd.read_csv('sample.csv', encoding='utf-8', names=['Number', 'String'])

def extractKeyword(text):
    """Morphological analysis of text and returning a list of only nouns"""
    tagger = MeCab.Tagger('-Ochasen')
    tagger.parse(text)
    node = tagger.parseToNode(text)
    keywords = []
    while node:
        if node.feature.split(",")[0] == u"名詞": # this means noun
            keywords.append(node.surface)
        node = node.next
    return keywords

me = df['String'].apply(extractKeyword)
print(me)
When you run this script with the sample.csv you provide:
➜ python3 demo.py
0 [今日, 夜]
1 [オフィス, 誰, エラー, 格闘, 中]
2 [デバッグ]
Name: String, dtype: object
parseToNode failed every time, so I needed to put this code
tagger.parseToNode('dummy')
before
node = tagger.parseToNode(text)
and it worked!
But I don't know the reason; maybe the parseToNode method has a bug...
def extractKeyword(text):
    """Morphological analysis of text and returning a list of only nouns"""
    tagger = MeCab.Tagger('-Ochasen')
    tagger.parseToNode('ダミー')
    node = tagger.parseToNode(text)
    keywords = []
    while node:
        if node.feature.split(",")[0] == u"名詞": # this means noun
            keywords.append(node.surface)
        node = node.next
    return keywords
I have to get the pure text out of an XML node and its child nodes (or whatever these strange inner tags are):
Example nodes:
<BookTitle>
<Emphasis Type="Italic">Z</Emphasis>
= 63 - 100
</BookTitle>
or:
<BookTitle>
Mtn
<Emphasis Type="Italic">Z</Emphasis>
= 74 - 210
</BookTitle>
I have to get:
Z = 63 - 100
Mtn Z = 74 - 210
Remember, this is just an example! There could be any kind of "child node" inside the BookTitle node, and all I need is the pure text inside BookTitle.
I tried:
tagtext = root.find('.//BookTitle').text
print tagtext
but .text can't deal with these strange XML nodes and gives me a "NoneType" back.
Regards & Thanks!
That's not the text of the BookTitle node, it's the tail of the Emphasis node. So you should do something like:
def parse(el):
    text = el.text.strip() + ' ' if el.text.strip() else ''
    for child in el.getchildren():
        text += '{0} {1}\n'.format(child.text.strip(), child.tail.strip())
    return text
Which gives you:
>>> root = et.fromstring('''
<BookTitle>
<Emphasis Type="Italic">Z</Emphasis>
= 63 - 100
</BookTitle>''')
>>> print parse(root)
Z = 63 - 100
And for:
>>> root = et.fromstring('''
<BookTitle>
Mtn
<Emphasis Type="Italic">Z</Emphasis>
= 74 - 210
</BookTitle>''')
>>> print parse(root)
Mtn Z = 74 - 210
Which should give you a basic idea what to do.
Update: Fixed the whitespace...
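As a side note, if all you ever need is the flattened text of the element, ElementTree's itertext() walks the .text and .tail of every descendant for you; here is a sketch using the question's second example:
```
import xml.etree.ElementTree as et

root = et.fromstring('''
<BookTitle>
Mtn
<Emphasis Type="Italic">Z</Emphasis>
= 74 - 210
</BookTitle>''')

# itertext() yields the text and tail fragments in document order;
# split()/join() collapses the surrounding newlines and indentation
print(' '.join(''.join(root.itertext()).split()))   # Mtn Z = 74 - 210
```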
You can use the minidom parser. Here is an example:
from xml.dom import minidom

def strip_tags(node):
    text = ""
    for child in node.childNodes:
        if child.nodeType == doc.TEXT_NODE:
            text += child.toxml()
        else:
            text += strip_tags(child)
    return text

doc = minidom.parse("<your-xml-file>")
text = strip_tags(doc)
The recursive strip_tags function walks the XML tree and extracts the text in order.
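For example, applied to the question's second snippet (using parseString here, since the XML is in a string rather than a file, and reusing the strip_tags function above):
```
from xml.dom import minidom

doc = minidom.parseString('''
<BookTitle>
Mtn
<Emphasis Type="Italic">Z</Emphasis>
= 74 - 210
</BookTitle>''')

# strip_tags() concatenates the text nodes; split()/join() tidies the whitespace
text = strip_tags(doc)
print(' '.join(text.split()))   # Mtn Z = 74 - 210
```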