Writing in file with specific format - python

The method sauvegarder_canaux(self, nom_fichier: str) is giving me a problem: when it saves the file, it only writes each channel in this format:
5 - TQS (Télévision Quatres-saisons, 0.0 $ extra)
I need it to be like this:
5 : TQS : Télévision Quatres-saisons : 0.0 $ extra
This is the code that I have for now:
from canal import Canal
from forfait_tv import ForfaitTV
from abonne import Abonne
#============= Classe ===========================
class Distributeur:
    """
    Description:
    ============
    Manages the list of channels and the list of TV packages (subscribers
    are planned for later).

    Private data members:
    =====================
    __canaux    # [Canal] list of existing channels
    __forfaits  # [ForfaitTV] list of available packages
    """

    # Field separator used by the channel file, one channel per line:
    # "poste : sigle : nom : extra".
    _SEPARATEUR = " : "

    #----------- Constructor ------------------------------
    def __init__(self):
        self.__canaux = []
        self.__forfaits = []

    #----------- Accessors/Mutators -----------------------
    def ajouter_canal(self, un_canal: "Canal"):
        """Append ``un_canal`` to the list of channels."""
        self.__canaux.append(un_canal)

    def chercher_canal(self, p_poste: int):
        """Print and return the channel whose ``get_poste()`` equals ``p_poste``.

        When several channels share the number, the last one added wins.
        Returns None (and prints ``None``) when no channel matches.
        """
        poste_trouve = None
        for canal in self.__canaux:
            if canal.get_poste() == p_poste:
                poste_trouve = canal
        # The print is kept for existing callers; the match is now also
        # returned (the original returned the result of print(), i.e. None).
        print(poste_trouve)
        return poste_trouve

    def telecharger_canaux(self, nom_fichier: str):
        """Load channels from ``nom_fichier`` and append them to the list.

        Each non-empty line must hold at least four fields separated by
        " : ". Returns the last channel created, or None if the file holds
        no channel. Raises ValueError on a line with fewer than four fields.
        """
        canal = None
        with open(nom_fichier, "r") as fichier_canaux:
            for numero, line in enumerate(fichier_canaux, start=1):
                line = line.strip()
                if not line:
                    continue
                ele_canal = [champ.strip() for champ in line.split(self._SEPARATEUR)]
                if len(ele_canal) < 4:
                    raise ValueError(
                        "%s, line %d: expected 4 fields separated by ' : ', got %r"
                        % (nom_fichier, numero, line)
                    )
                # NOTE(review): the fields are passed as strings; confirm
                # whether Canal expects the channel number as an int.
                canal = Canal(ele_canal[0], ele_canal[1], ele_canal[2], ele_canal[3])
                self.__canaux.append(canal)
        return canal

    def sauvegarder_canaux(self, nom_fichier: str):
        """Write every channel to ``nom_fichier``, one per line, in the
        " : "-separated format that ``telecharger_canaux`` reads back."""
        with open(nom_fichier, "w") as fich_canaux:
            for canal in self.__canaux:
                fich_canaux.write(self._formater_canal(canal) + "\n")

    @staticmethod
    def _formater_canal(canal):
        """Turn "5 - TQS (Nom, 0.0 $ extra)" into "5 : TQS : Nom : 0.0 $ extra".

        Any text that does not follow that layout is returned unchanged.
        """
        texte = str(canal)
        poste, tiret, reste = texte.partition(" - ")
        sigle, parenthese, reste = reste.partition(" (")
        # rpartition: the name may itself contain ", "; the extra never does.
        nom, virgule, extra = reste.rpartition(", ")
        if not (tiret and parenthese and virgule and extra.endswith(")")):
            return texte
        return Distributeur._SEPARATEUR.join((poste, sigle, nom, extra[:-1]))

You need only to edit the string before you write it. The string.replace command is your friend. Perhaps ...
# Fragment meant to replace the loop inside sauvegarder_canaux: it relies on
# `self` and on the already-open `fichCanaux` file object of that method.
for i in self.__canaux:
    out_line = str(i)
    # Replace only the three structural separators, once each, so that
    # hyphens or commas inside the channel name are left untouched.
    out_line = out_line.replace(" - ", " : ", 1)
    out_line = out_line.replace(" (", " : ", 1)
    head, comma, tail = out_line.rpartition(", ")
    if comma:
        out_line = head + " : " + tail
    # Drop the closing parenthesis left over from "(name, extra)".
    if out_line.endswith(")"):
        out_line = out_line[:-1]
    fichCanaux.write(out_line + "\n")

If removing the accents is okay, you can normalize the text to NFD with unicodedata, then find the segments of interest, modify them with the desired formatting, and replace them with the formatted segments using regex:
import unicodedata
import re
def format_string(test_str):
    """Convert "5 - TQS (Name, 0.0 $ extra)" to "5 : TQS : Name : 0.0 $ extra".

    Accents are removed: the text is normalized to NFD and every non-ASCII
    code point (including the combining marks) is dropped.

    Parameters:
        test_str: the channel description, as text or UTF-8 encoded bytes.

    Returns:
        The four fields (number, acronym, name, price) joined by " : ".

    Raises:
        ValueError: if the text does not follow the expected layout.
    """
    if isinstance(test_str, bytes):
        test_str = test_str.decode("UTF-8")
    # normalize accents (decode again so the result is text, not bytes)
    ascii_str = (
        unicodedata.normalize("NFD", test_str)
        .encode("ascii", "ignore")
        .decode("ascii")
    )
    record_ptn = re.compile(
        r"""^\s*
        (\d+)          # natural number
        \s*-\s*        # dash
        (\w+)          # acronym
        \s*\(\s*       # open parenthesis
        (.*?)          # words of the name (may contain dashes or commas)
        \s*,\s*        # last comma
        ([^,()]*?)     # price, e.g. "0.0 $ extra"
        \s*\)\s*$      # close parenthesis
        """,
        re.VERBOSE,
    )
    match = record_ptn.match(ascii_str)
    if match is None:
        raise ValueError("unexpected channel format: %r" % (ascii_str,))
    return " : ".join(match.groups())
Then you can call this function within sauvegarder_canaux
def sauvegarder_canaux(self, nom_fichier:str):
    """Write each channel of self.__canaux to nom_fichier, one per line,
    after reformatting its str() form with format_string.

    Meant to be pasted into the Distributeur class; format_string must be
    defined in the same module.
    """
    # The with-statement closes the file even if a write fails.
    with open(nom_fichier, "w") as fichCanaux:
        for i in self.__canaux:
            fichCanaux.write(format_string(str(i)) + "\n")
You can also add format_string as a method within your Distributeur class.
Example input:
5 - TQS (Télévision Quatres-saisons, 0.0 $ extra)
Example output:
5 : TQS : Television Quatres-saisons : 0.0 $ extra

Related

Copy value in var from structured data

I have bulk data in the variable 'bulk_data'. I need to find parts of each line and copy them into separate variables as described below. How can I do this with Python?
bulk_data = """F0142514RM/JRSE1420 Mod/4758
F0144758RM/JRSE935 Mod/23
F014GS4RM/JRSE10 Mod/445
"""
typeA1 = <start with RM/>"JRSE1420"<until space> in 1st line
typeA2 = <start with RM/>"JRSE935"<until space> in 2nd line
typeA3 = <start with RM/>"JRSE10"<until space> in 3rd line
typeB1 = <start with space after typeA1>"Mod/4758"<until end of the line> in 1st line
typeB2 = <start with space after typeA2>"Mod/23"<until end of the line> in 2nd line
typeB3 = <start with space after typeA3>"Mod/445"<until end of the line> in 3rd line
Overall result would be:
typeA1 = 'JRSE1420'
typeA2 = 'JRSE935'
typeA3 = 'JRSE10'
typeB1 = 'Mod/4758'
typeB2 = 'Mod/23'
typeB3 = 'Mod/445'
And also is there any study manual to deal with such type of data manipulation ?
You can use the re module
import re
# Sample input: one record per line, "<id>RM/<typeA> <typeB>".
bulk_data = '''F0142514RM/JRSE1420 Mod/4758
F0144758RM/JRSE935 Mod/23
F014GS4RM/JRSE10 Mod/445
'''

# Both patterns are written in verbose form and applied line by line.
line_flags = re.VERBOSE | re.MULTILINE

# Captures the word that follows "RM/" on each line.
ptrn1 = re.compile(r'''
^ #matches the start of the string
.* #matches 0 or more of anything
RM\/ #matches "RM" followed by "/"
(\w+) #matches one or more alphanumeric character and the undescore
\b #matches empty string
.* #matches anything
$ #matches the end of string
''', line_flags)

# Captures "Mod..." up to the end of each line.
ptrn2 = re.compile(r'''
^ #matches the start of the string
.* #matches 0 or more of anything
\s #matches a space character
(Mod.*) #matches "Mod" follow by 0 or more of anything
$ #matches the end of string
''', line_flags)

# findall returns one captured group per matching line; the sample has three.
type_a_values = ptrn1.findall(bulk_data)
type_b_values = ptrn2.findall(bulk_data)
typeA1, typeA2, typeA3 = type_a_values
typeB1, typeB2, typeB3 = type_b_values
Why re? Looks like everything is already properly separated by different characters.
# Plain string splitting: a space separates the two fields of a line and
# "/" separates the record id from the typeA value.
lines = bulk_data.splitlines()
first_line_fields = lines[0].split(' ')
typeA1_, typeB1 = first_line_fields
typeA1 = typeA1_.split('/')[1]
...
# Read the file 'data' (two whitespace-separated fields per line) and print
# "TypeA<n> = ..." / "TypeB<n> = ..." entries in sorted order.
li = []
count = 1
with open('data') as f:
    for raw_line in f:
        fields = raw_line.split()
        if not fields:
            # blank line: nothing to record, numbering is not advanced
            continue
        a, b = fields
        # keep only what follows the first "/"
        a = a[a.index('/') + 1:]
        li.extend((
            "TypeA{} = {} ".format(count, a),
            "TypeB{} = {} ".format(count, b),
        ))
        count += 1
for el in sorted(li):
    print(el)
TypeA1 = JRSE1420
TypeA2 = JRSE935
TypeA3 = JRSE10
TypeB1 = Mod/4758
TypeB2 = Mod/23
TypeB3 = Mod/445

how to parse a file which has more than one top level element using python

I have written python script for parsing a file.
python script :
from xml.dom.minidom import parse
import xml.dom.minidom
DOMTree = xml.dom.minidom.parse("details.xml")
CallDetailRecord = DOMTree.documentElement
def getText(data):
    """Return the word "false" or "true" found in str(data), else None.

    The word must be preceded by a whitespace character and is matched
    case-insensitively; it is returned exactly as it appears in the text.
    "false" is looked for first, so it wins when both words are present.
    """
    detail = str(data)
    for pattern in (r'(.*\s)(false).*', r'(.*\s)(true).*'):
        found = re.search(pattern, detail, re.IGNORECASE)
        if found:
            return found.group(2)
    return None
def _read_address(record):
    """Return the (ton, npi, pid, msisdn) text values of one address element."""
    return tuple(
        record.getElementsByTagName(tag)[0].childNodes[0].data
        for tag in ("ton", "npi", "pid", "msisdn")
    )


# The four address elements share the same layout, so they are read through
# the same helper. If an element occurs several times, the last one wins.
org_addr = CallDetailRecord.getElementsByTagName("origAddress")
for record in org_addr:
    org_ton, org_npi, org_pid, org_msdn = _read_address(record)

recp_addr = CallDetailRecord.getElementsByTagName("recipAddress")
for record in recp_addr:
    rec_ton, rec_npi, rec_pid, rec_msdn = _read_address(record)

dgti_addr = CallDetailRecord.getElementsByTagName("dgtiAddress")
for record in dgti_addr:
    dgti_ton, dgti_npi, dgti_pid, dgti_msdn = _read_address(record)

calling_line_id = CallDetailRecord.getElementsByTagName("callingLineId")
for record in calling_line_id:
    clid_ton, clid_npi, clid_pid, clid_msdn = _read_address(record)

untransl_OrigAddress = CallDetailRecord.getElementsByTagName("untranslOrigAddress")

# Scalar fields. Each lookup takes the first matching element and raises
# IndexError if the element is absent from the record.
# print(...) with one argument behaves the same on Python 2 and Python 3.
sub_time = CallDetailRecord.getElementsByTagName("submitTime")[0]
if sub_time:
    sub_time_value = sub_time.childNodes[0].data
    print(" \n SUBMIT TIME: %s \n" % sub_time_value)

sub_date = CallDetailRecord.getElementsByTagName("submitDate")[0]
if sub_date:
    sub_date_value = sub_date.childNodes[0].data
    # Fixed: this line used to print sub_time_value.
    print(" \n SUBMIT DATE: %s\n" % sub_date_value)

termin_time = CallDetailRecord.getElementsByTagName("terminTime")[0]
if termin_time:
    termin_time_value = termin_time.childNodes[0].data
    print(" \n TERMIN TIME: %s \n" % termin_time_value)

termin_date = CallDetailRecord.getElementsByTagName("terminDate")[0]
if termin_date:
    termin_date_value = termin_date.childNodes[0].data
    # Fixed: this line used to print termin_time_value.
    print(" \n TERMIN DATE: %s\n" % termin_date_value)

status = CallDetailRecord.getElementsByTagName("status")[0]
if status:
    status_value = status.childNodes[0].data
    print(" \n STATUS: %s\n" % status_value)

msglength = CallDetailRecord.getElementsByTagName("lengthOfMessage")[0]
if msglength:
    msglength_value = msglength.childNodes[0].data
    print(" \n MESSAGE LENGTH: %s\n" % msglength_value)

prioIndicator = CallDetailRecord.getElementsByTagName("prioIndicator")[0]
if prioIndicator:
    # The value is an empty child element (<false/> or <true/>), not text,
    # so getText extracts the word from the child node's string form.
    prioIndicator_value = getText(prioIndicator.childNodes[0])
    print(" \n PRIO INDICATOR: %s\n" % prioIndicator_value)
To reduce the Size, I'm not posting my entire script.
INPUT FILE:
<CallDetailRecord>
<origAddress>
<ton>international</ton>
<npi>telephone</npi>
<pid>plmn</pid>
<msisdn>32410000</msisdn>
</origAddress>
<recipAddress>
<ton>international</ton>
<npi>telephone</npi>
<pid>plmn</pid>
<msisdn>918337807718</msisdn>
</recipAddress>
<submitDate>14-08-20</submitDate>
<submitTime>19:36:29</submitTime>
<status>deleted</status>
<terminDate>14-08-23</terminDate>
<terminTime>19:51:52</terminTime>
<lengthOfMessage>38</lengthOfMessage>
<prioIndicator><false/></prioIndicator>
<deferIndicator><true/></deferIndicator>
<notifIndicator><false/></notifIndicator>
<recipIntlMobileSubId>26204487</recipIntlMobileSubId>
<callingLineId>
<ton>international</ton>
<npi>telephone</npi>
<pid>plmn</pid>
<msisdn>32410000</msisdn>
</callingLineId>
<smsContentDcs>0</smsContentDcs>
<messageReference>13</messageReference>
<deliveryAttempts>151</deliveryAttempts>
<untranslOrigAddress>
<ton>international</ton>
<npi>telephone</npi>
<pid>plmn</pid>
<msisdn>32410000</msisdn>
</untranslOrigAddress>
<tpDCS>0</tpDCS>
<genericUrgencyLevel>bulk</genericUrgencyLevel>
<teleserviceId>4098</teleserviceId>
<recipNetworkType>gsm</recipNetworkType>
<rbdlFlags1>
10000000000000000000000000000000
</rbdlFlags1>
</CallDetailRecord>
The script works fine for this file. But suppose the file contains more than one
<CallDetailRecord> element; how can I parse that file?
EXAMPLE:
<CallDetailRecord>
.
.
.
</CallDetailRecord>
<CallDetailRecord>
.
.
.
</CallDetailRecord>
<CallDetailRecord>
.
.
.
</CallDetailRecord>
Hopefully waiting for some good results :)!!!
Use a wrapper element to parse this file. Wrap the content of your file, which contains multiple top-level elements, in a single root element like this:
<wrapper>
#your file
</wrapper>
and then parse the result starting from the root element. The parser will construct a document whose root element, wrapper, contains all the elements that were in the file.

How to replace text in curly brackets with another text based on comparisons using Python Regex

I am quite new to regular expressions. I have a string that looks like this:
str = "abc/def/([default], [testing])"
and a dictionary
dict = {'abc/def/[default]' : '2.7', 'abc/def/[testing]' : '2.1'}
and using Python RE, I want str in this form, after comparisons of each element in dict to str:
str = "abc/def/(2.7, 2.1)"
Any help how to do it using Python RE?
P.S. its not the part of any assignment, instead it is the part of my project at work and I have spent many hours to figure out solution but in vain.
import re

st = "abc/def/([default], [testing], [something])"
dic = {'abc/def/[default]' : '2.7',
       'abc/def/[testing]' : '2.1',
       'bcd/xed/[something]' : '3.1'}

# Raw strings keep the backslashes for the regex engine and avoid
# invalid-escape warnings on recent Python versions.
prefix_regex = r"^[\w*/]*"   # leading path, e.g. "abc/def/"
tag_regex = r"\[\w*\]"       # one bracketed tag, e.g. "[default]"

prefix = re.findall(prefix_regex, st)[0]
tags = re.findall(tag_regex, st)

for key in dic:
    key_tags = re.findall(tag_regex, key)
    if not key_tags:
        # a key without any [tag] cannot match anything in the string
        continue
    key_prefix = re.findall(prefix_regex, key)[0]
    key_tag = key_tags[0]
    # Replace a tag only when the key has the same prefix as the string;
    # tags whose key belongs to another prefix are left as they are.
    if prefix == key_prefix and key_tag in tags:
        st = st.replace(key_tag, dic[key])

# print(...) with one argument behaves the same on Python 2 and Python 3.
print(st)
OUTPUT:
abc/def/(2.7, 2.1, [something])
Here is a solution using re module.
Hypotheses :
there is a dictionary whose keys are composed of a prefix and a variable part, the variable part is enclosed in brackets ([])
the values are strings by which the variable parts are to be replaced in the string
the string is composed by a prefix, a (, a list of variable parts and a )
the variable parts in the string are enclosed in []
the variable parts in the string are separated by a comma followed by optional spaces
Python code :
import re
class splitter:
    """Replace the bracketed items of "prefix([a], [b])" using a dictionary.

    The dictionary ``d`` maps "prefix[item]" keys to replacement strings.
    """

    # Raw strings keep the backslashes for the regex engine.
    pref = re.compile(r"[^(]+")       # everything before the first "("
    iden = re.compile(r"\[[^]]*\]")   # one "[...]" item

    def __init__(self, d):
        self.d = d

    def split(self, s):
        """Return (prefix, list of "[...]" items), or None when ``s`` has no
        prefix (it is empty or starts with "(")."""
        m = self.pref.match(s)
        if m is None:
            return None
        # Only look for items after the end of the prefix.
        return m.group(0), self.iden.findall(s, m.end())

    def convert(self, s):
        """Return ``s`` with every "[item]" replaced by ``d[prefix + "[item]"]``.

        Raises ValueError when ``s`` has no prefix, and KeyError when an item
        has no entry in the dictionary.
        """
        parts = self.split(s)
        if parts is None:
            raise ValueError("no prefix found before '(' in %r" % (s,))
        p, elts = parts
        return p + "(" + ", ".join(self.d[p + elt] for elt in elts) + ")"
Usage :
s = "abc/def/([default], [testing])"
d = {'abc/def/[default]' : '2.7', 'abc/def/[testing]' : '2.1'}
sp = splitter(d)
print(sp.convert(s))
output :
abc/def/(2.7, 2.1)
Regex is probably not required here. Hope this helps
# `str` and `dict` are the question's own variables (they shadow the
# builtins there): the input string and the lookup dictionary.
lhs, rhs = str.split("/(")
lhs += "/"
# Works for any number of ", "-separated tags, not only two.
tags = rhs.strip(")").split(", ")
# print(...) with one argument behaves the same on Python 2 and Python 3.
print("{0}({1})".format(lhs, ",".join(dict[lhs + tag] for tag in tags)))
output
abc/def/(2.7,2.1)

Copying text changing some variables

I want to copy a text like this one (it is stored in a txt file, say template.txt) :
//
// Pyrolysis of PMMA sample exposed to a 18 kW/m² cone irradiances
//
MODULE PEL_Application
$DS_external_flux= 16.5e03 // 18 kW/m² en prenant en compte le transfert convectif.
$DS_DX = 0.005
$DS_activation_energy = 1.15e+5
$DS_arrhenius_constant = 7.0e+6
$DS_Kappa_s = 1000.
$DS_initial_fuel_mass = 1.50272
$DS_heat_of_pyrolysis = 2.0e+6
$DS_T_initial = 304.75
$DS_gas_refractive_index = 1.0
$DS_medium_refractive_index = 1.0
END MODULE PEL_Application
#include ( join( "..", "ref.pel" ) )
MODULE PEL_Application
MODULE PDE_DomainAndFields
MODULE macro_boundary_conditions
MODULE BC#inlet1
MODULE BC#temperature
type = "volumetric_pyrolysis#semi_transparent_media"
END MODULE BC#temperature
END MODULE BC#inlet1
END MODULE macro_boundary_conditions
END MODULE PDE_DomainAndFields
END MODULE PEL_Application
And I want to replace the values of
$DS_activation_energy = 1.15e+5
$DS_arrhenius_constant = 7.0e+6
with
$DS_activation_energy = new_value_1
$DS_arrhenius_constant = new_value_2
and copy the whole text in a new file called data.txt with the new values.
How can this be done in Python please ?
Thank you in advance
This should do it.
new_value_1 = "Whatever"
new_value_2 = "Whatever else"

# Variable name at the start of a line -> value to write after its "=".
replacements = (
    ("$DS_activation_energy", new_value_1),
    ("$DS_arrhenius_constant", new_value_2),
)

# The with-statement closes both files even if an error occurs.
with open("template.txt", "r") as f, open("data.txt", "w") as g:
    for each in f:
        stripped = each.strip()
        for name, value in replacements:
            if stripped.startswith(name):
                # Keep everything left of the first "=" (indentation
                # included) and replace everything to its right.
                left_side = each.split("=", 1)[0]
                g.write(left_side + "= " + value + "\n")
                break
        else:
            # not one of the variables to change: copy the line unchanged
            g.write(each)
I hope this will work for you.
import fileinput
import sys
# With inplace enabled, whatever is written to standard output replaces the
# content of the file, so every line must be written back, changed or not.
for line in fileinput.input('File.txt', inplace=1):
    # str.replace returns the line unchanged when the text is absent
    sys.stdout.write(line.replace('string_to_replace', 'new_string'))
You can do it like this:
import re
# The file is handled as bytes, so the pattern is a bytes pattern too: this
# works on Python 2 and 3 and leaves the file's encoding and line endings
# untouched.
rgx = re.compile(br'(\A.+?\$DS_activation_energy[ \t]*=[ \t]*)'
                 br'[^ \t\r\n]+'
                 br'(.+?\$DS_arrhenius_constant[ \t]*=[ \t]*)'
                 br'[^ \t\r\n]+'
                 br'(.+\Z)',
                 re.DOTALL)

with open('template.txt', 'rb') as f:
    cont = f.read()

new_value_1, new_value_2 = 10, 892.12


def _insert_values(match):
    """Rebuild the text around the two old values with the new ones."""
    # Built by joining rather than with "%" formatting on the whole text,
    # which would fail as soon as the template contains a "%" character.
    return b''.join((
        match.group(1), str(new_value_1).encode('ascii'),
        match.group(2), str(new_value_2).encode('ascii'),
        match.group(3),
    ))


with open('data.txt', 'wb') as f:
    f.write(rgx.sub(_insert_values, cont))
Here is a solution which makes use of 'placeholders'.
template.txt
//
// Pyrolysis of PMMA sample exposed to a 18 kW/m² cone irradiances
//
MODULE PEL_Application
$DS_external_flux= 16.5e03 // 18 kW/m² en prenant en compte le transfert convectif.
$DS_DX = 0.005
$DS_activation_energy = <activation_energy>
$DS_arrhenius_constant = <arrhenius_constant>
$DS_Kappa_s = 1000.
$DS_initial_fuel_mass = 1.50272
$DS_heat_of_pyrolysis = 2.0e+6
$DS_T_initial = 304.75
$DS_gas_refractive_index = 1.0
$DS_medium_refractive_index = 1.0
END MODULE PEL_Application
#include ( join( "..", "ref.pel" ) )
MODULE PEL_Application
MODULE PDE_DomainAndFields
MODULE macro_boundary_conditions
MODULE BC#inlet1
MODULE BC#temperature
type = "volumetric_pyrolysis#semi_transparent_media"
END MODULE BC#temperature
END MODULE BC#inlet1
END MODULE macro_boundary_conditions
END MODULE PDE_DomainAndFields
END MODULE PEL_Application
Now, read 'template.txt' and replace values:
with open('template.txt') as input_file:
    template_text = input_file.read()

# Example values (the ones from the original template); put your own here.
new_activation_energy = '1.15e+5'
new_arrhenius_constant = '7.0e+6'

new_text = template_text
# The placeholder names must match those written in template.txt exactly;
# str() lets numeric values be used as well.
new_text = new_text.replace('<activation_energy>', str(new_activation_energy))
new_text = new_text.replace('<arrhenius_constant>', str(new_arrhenius_constant))

with open('new_file.txt', 'w') as output_file:
    output_file.write(new_text)

Parsing srt subtitles

I want to parse srt subtitles:
1
00:00:12,815 --> 00:00:14,509
Chlapi, jak to jde s
těma pracovníma světlama?.
2
00:00:14,815 --> 00:00:16,498
Trochu je zesilujeme.
3
00:00:16,934 --> 00:00:17,814
Jo, sleduj.
I want to parse every item into a structure, using one of these regexes:
A:
RE_ITEM = re.compile(r'(?P<index>\d+).'
r'(?P<start>\d{2}:\d{2}:\d{2},\d{3}) --> '
r'(?P<end>\d{2}:\d{2}:\d{2},\d{3}).'
r'(?P<text>.*?)', re.DOTALL)
B:
RE_ITEM = re.compile(r'(?P<index>\d+).'
r'(?P<start>\d{2}:\d{2}:\d{2},\d{3}) --> '
r'(?P<end>\d{2}:\d{2}:\d{2},\d{3}).'
r'(?P<text>.*)', re.DOTALL)
And this code:
for i in Subtitles.RE_ITEM.finditer(text):
result.append((i.group('index'), i.group('start'),
i.group('end'), i.group('text')))
With regex B I get only one item in the array (because of the greedy .*), and with regex A I get an empty 'text' (because of the non-greedy .*?).
How to cure this?
Thanks
Why not use pysrt?
I became quite frustrated with srt libraries available for Python (often because they were heavyweight and eschewed language-standard types in favour of custom classes), so I've spent the last year or so working on my own srt library. You can get it at https://github.com/cdown/srt.
I tried to keep it simple and light on classes (except for the core Subtitle class, which more or less just stores the SRT block data). It can read and write SRT files, and turn noncompliant SRT files into compliant ones.
Here's a usage example with your sample input:
>>> import srt, pprint
>>> gen = srt.parse('''\
... 1
... 00:00:12,815 --> 00:00:14,509
... Chlapi, jak to jde s
... těma pracovníma světlama?.
...
... 2
... 00:00:14,815 --> 00:00:16,498
... Trochu je zesilujeme.
...
... 3
... 00:00:16,934 --> 00:00:17,814
... Jo, sleduj.
...
... ''')
>>> pprint.pprint(list(gen))
[Subtitle(start=datetime.timedelta(0, 12, 815000), end=datetime.timedelta(0, 14, 509000), index=1, proprietary='', content='Chlapi, jak to jde s\ntěma pracovníma světlama?.'),
Subtitle(start=datetime.timedelta(0, 14, 815000), end=datetime.timedelta(0, 16, 498000), index=2, proprietary='', content='Trochu je zesilujeme.'),
Subtitle(start=datetime.timedelta(0, 16, 934000), end=datetime.timedelta(0, 17, 814000), index=3, proprietary='', content='Jo, sleduj.')]
The text is followed by an empty line, or the end of file. So you can use:
r' .... (?P<text>.*?)(\n\n|$)'
Here's some code I had lying around to parse SRT files:
from __future__ import division
import datetime
class Srt_entry(object):
    """One SRT block: index, start/end as timedelta, and the text lines."""

    def __init__(self, lines):
        """Build the entry from the lines of one block.

        lines[0] is the index, lines[1] is "start --> end" and the
        remaining lines are the text. Raises ValueError when the separator
        between the two times is not "-->".
        """
        def parsetime(string):
            # "HH:MM:SS,mmm" -> timedelta; the comma is the decimal mark
            hh, mm, ss = string.split(u':')
            return datetime.timedelta(hours=int(hh),
                                      minutes=int(mm),
                                      seconds=float(ss.replace(u',', u'.')))

        self.index = int(lines[0])
        begin, arrow, finish = lines[1].split()
        self.start = parsetime(begin)
        if arrow != u"-->":
            raise ValueError
        self.end = parsetime(finish)
        self.lines = lines[2:]
        # drop one trailing empty string, if there is one
        if not self.lines[-1]:
            self.lines.pop()

    def __unicode__(self):
        """Return the entry as SRT text (index, time range, text lines)."""
        def delta_to_string(d):
            # timedelta -> "HH:MM:SS,mmm"; whole days are folded into hours
            whole_minutes, whole_seconds = divmod(d.seconds, 60)
            hours = d.days * 24 + whole_minutes // 60
            minutes = whole_minutes % 60
            seconds = whole_seconds + d.microseconds / 1000000
            stamp = u"%02d:%02d:%06.3f" % (hours, minutes, seconds)
            return stamp.replace(u'.', u',')

        header = (unicode(self.index) + u'\n'
                  + delta_to_string(self.start)
                  + ' --> '
                  + delta_to_string(self.end) + u'\n')
        return header + u''.join(self.lines)
entries = []
entry = []
# The with-statement closes the file even if a block fails to parse.
with open("foo.srt") as srt_file:
    for line in srt_file:
        # `options` comes from the surrounding script (not shown here);
        # presumably options.decode names the file's encoding - TODO confirm.
        if options.decode:
            line = line.decode(options.decode)
        if not line.strip():
            # A blank line ends the current block. Several blank lines in a
            # row must not create empty entries.
            if entry:
                entries.append(Srt_entry(entry))
            entry = []
        else:
            entry.append(line)
# Keep the last block when the file does not end with a blank line.
if entry:
    entries.append(Srt_entry(entry))
# Cut the text into blocks at blank lines, dropping empty blocks.
splits = [s.strip() for s in re.split(r'\n\s*\n', text) if s.strip()]
regex = re.compile(r'''(?P<index>\d+).*?(?P<start>\d{2}:\d{2}:\d{2},\d{3}) --> (?P<end>\d{2}:\d{2}:\d{2},\d{3})\s*.*?\s*(?P<text>.*)''', re.DOTALL)
for s in splits:
    r = regex.search(s)
    if r is None:
        # not a subtitle block (no index or time range): skip it
        continue
    # print(...) with one argument behaves the same on Python 2 and Python 3.
    print(r.groups())
Here's a snippet I wrote which converts SRT files into dictionaries:
import re
def srt_time_to_seconds(time):
    """Convert an SRT timestamp "HH:MM:SS,mmm" to a number of seconds.

    Returns a float. Raises IndexError or ValueError on a malformed
    timestamp.
    """
    split_time = time.split(',')
    major, minor = (split_time[0].split(':'), split_time[1])
    # One hour is 3600 seconds (the original multiplied hours by 1440).
    return int(major[0])*3600 + int(major[1])*60 + int(major[2]) + float(minor)/1000
def srt_to_dict(srtText):
    """Return one dict per subtitle block found in ``srtText``.

    Each dict has 'start' and 'end' (as returned by srt_time_to_seconds)
    and 'text' (the text lines joined with '<br />'). Blocks are separated
    by an empty line; blocks with fewer than three lines are ignored.
    """
    subs = []
    # work on "\n" line endings only
    unix_text = re.sub('\r\n', '\n', srtText)
    for block in unix_text.split('\n\n'):
        rows = block.split('\n')
        if len(rows) < 3:
            continue
        # rows[0] is the index, rows[1] the time range, the rest is text
        timing = rows[1].split(' --> ')
        subs.append({
            'start': srt_time_to_seconds(timing[0].strip()),
            'end': srt_time_to_seconds(timing[1].strip()),
            'text': '<br />'.join(rows[2:]),
        })
    return subs
Usage:
# Import the function, not just the module: a module object is not callable.
# Assumes the snippet was saved as srt_to_dict.py - adjust the module name
# if the file is named differently.
from srt_to_dict import srt_to_dict

with open('test.srt', "r") as f:
    srtText = f.read()
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(srt_to_dict(srtText))

Categories

Resources