Aligning and spacing columns in Python 3

What can I do to align all the columns in this code? Is what I have correct, or is there a better way?
import urllib.request
from re import findall

def determinarLlegadas(numero):
    llegadas = urllib.request.urlopen("http://...")
    llegadas = str(llegadas.read())
    llegadas = findall('<font color="Black" size="3">(.+?)</font>', llegadas)
    print('\t', 'VUELO ', '\t', 'AEROLINEA', '\t', 'PROCEDENCIA', '\t', 'FECHA ', '\t', ' HORA ', '\t', 'ESTADO', '\t', 'PUERTA')
    a = 0
    numero = numero * 7
    while numero > a:
        print('\t', llegadas[a+0], '\t', llegadas[a+1], '\t', llegadas[a+3], '\t', llegadas[a+3], '\t', llegadas[a+4], '\t', llegadas[a+5], '\t', llegadas[a+6])
        a = a + 7

Don't use tabs, use string formatting.
...
print("{:12}{:12}{:12}{:12}{:12}{:12}{:12}".format(
"VUELO","AEROLINEA","PROCEDENCIA","FECHA","HORA","ESTADO","PUERTA"))
print("{:12}{:12}{:12}{:12}{:12}{:12}{:12}".format(*llegadas))
Change the 12 to the maximum field size for each column, and you're golden.
In fact, though it's less readable:
COLSIZE = 12
# Maybe COLSIZE = max(map(len,llegadas))+1
NUMCOLS = 7
formatstring = "{}{}{}".format("{:",COLSIZE,"}")*NUMCOLS
# {:COLSIZE}*NUMCOLS
headers = ["VUELO","AEROLINEA","PROCEDENCIA","FECHA","HORA","ESTADO","PUERTA"]
print(formatstring.format(*headers))
print(formatstring.format(*llegadas))
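For illustration, here is a self-contained sketch of the same idea with made-up flight data standing in for the scraped llegadas list, computing the column width from the data as the comment above suggests:
llegadas = [
    "AV123", "AVIANCA", "BOGOTA", "2024-01-01", "10:30", "A TIEMPO", "A1",
    "IB456", "IBERIA", "MADRID", "2024-01-01", "11:45", "RETRASADO", "B2",
]
headers = ["VUELO", "AEROLINEA", "PROCEDENCIA", "FECHA", "HORA", "ESTADO", "PUERTA"]

# One column per header; the width is the longest cell (or header) plus one space.
COLSIZE = max(map(len, llegadas + headers)) + 1
NUMCOLS = len(headers)
formatstring = ("{:" + str(COLSIZE) + "}") * NUMCOLS

print(formatstring.format(*headers))
for fila in range(0, len(llegadas), NUMCOLS):   # one flight per row
    print(formatstring.format(*llegadas[fila:fila + NUMCOLS]))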


How to get rid of the rest of the text after getting the results I want?

import urllib.request
import json
from collections import Counter

def count_coauthors(author_id):
    coauthors_dict = {}
    url_str = ('https://api.semanticscholar.org/graph/v1/author/47490276?fields=name,papers.authors')
    respons = urllib.request.urlopen(url_str)
    text = respons.read().decode()
    for line in respons:
        print(line.decode().rstrip())
    data = json.loads(text)
    print(type(data))
    print(list(data.keys()))
    print(data["name"])
    print(data["authorId"])
    name = []
    for lines in data["papers"]:
        for authors in lines["authors"]:
            name.append(authors.get("name"))
    print(name)
    count = dict()
    names = name
    for i in names:
        if i not in count:
            count[i] = 1
        else:
            count[i] += 1
    print(count)
    c = Counter(count)
    top = c.most_common(10)
    print(top)
    return coauthors_dict

author_id = '47490276'
cc = count_coauthors(author_id)
top_coauthors = sorted(cc.items(), key=lambda item: item[1], reverse=True)
for co_author in top_coauthors[:10]:
    print(co_author)
This is how my code looks so far; there are no errors. I need to get rid of the rest of the text when I run it, so the output should look like this:
('Diego Calvanese', 47)
('D. Lanti', 28)
('Martín Rezk', 21)
('Elem Güzel Kalayci', 18)
('B. Cogrel', 17)
('E. Botoeva', 16)
('E. Kharlamov', 16)
('I. Horrocks', 12)
('S. Brandt', 11)
('V. Ryzhikov', 11)
I have tried using rstrip and split on my 'c' variable, but it doesn't work. I'm only allowed to import what I have already imported, and I must use the link that is included.
Tips on simplifying or improving the code are also appreciated!
("Extend the program below so that it prints the names of the top-10 coauthors together with the numbers of the coauthored publications")
From what I understand you are not quite sure where your successful output originates from. It is not the 5 lines at the end.
Your result is printed by the print(top) call near the end of the function. That top variable is what you want to return from the function, as the coauthors_dict you are currently returning never actually gets any data written to it.
You will also have to slightly adjust your sorted(...) as you now have a list and not a dictionary, but you should then get the correct result.
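For reference, Counter.most_common() returns a list of (name, count) tuples already sorted in descending order, so once you return top you can loop over it directly, without .items() or sorted(). A tiny standalone example, with counts taken from the expected output above:
from collections import Counter

count = {'Diego Calvanese': 47, 'D. Lanti': 28, 'V. Ryzhikov': 11}
top = Counter(count).most_common(10)
print(top)             # [('Diego Calvanese', 47), ('D. Lanti', 28), ('V. Ryzhikov', 11)]
for co_author in top:  # top is a list of (name, count) tuples, not a dict, so no .items()
    print(co_author)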
If I understand correctly, you want this function to return a count of each distinct co-author (excluding the author), which it seems like you already have in your count variable, which you don't return. The variable you DO return is empty.
Instead consider:
import urllib.request
import json
from collections import Counter

def count_coauthors(author_id):
    url_str = (f'https://api.semanticscholar.org/graph/v1/author/{author_id}?fields=name,papers.authors')
    response = urllib.request.urlopen(url_str)
    text = response.read().decode()
    data = json.loads(text)
    names = [a.get("name") for l in data["papers"] for a in l["authors"] if a['authorId'] != author_id]
    # The statement above can be written long-hand like:
    # names = []
    # for l in data["papers"]:
    #     for a in l["authors"]:
    #         if a['authorId'] != author_id:
    #             names.append(a.get("name"))
    return list(Counter(names).items())

author_id = '47490276'
cc = count_coauthors(author_id)
top_coauthors = sorted(cc, key=lambda item: item[1], reverse=True)
for co_author in top_coauthors[:10]:
    print(co_author)
('Diego Calvanese', 47)
('D. Lanti', 28)
('Martín Rezk', 21)
('Elem Güzel Kalayci', 18)
('B. Cogrel', 17)
('E. Botoeva', 16)
('E. Kharlamov', 16)
('I. Horrocks', 12)
('S. Brandt', 11)
('V. Ryzhikov', 11)
You might also consider moving the top-N logic into the function as an optional parameter:
import urllib.request
import json
from collections import Counter

def count_coauthors(author_id, top=0):
    url_str = (f'https://api.semanticscholar.org/graph/v1/author/{author_id}?fields=name,papers.authors')
    response = urllib.request.urlopen(url_str)
    text = response.read().decode()
    data = json.loads(text)
    names = [a.get("name") for l in data["papers"] for a in l["authors"] if a['authorId'] != author_id]
    name_count = list(Counter(names).items())
    top = top if top != 0 else len(name_count)
    return sorted(name_count, key=lambda x: x[1], reverse=True)[:top]

author_id = '47490276'
for auth in count_coauthors(author_id, top=10):
    print(auth)
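One more minor simplification, if you like: json.load() can parse the JSON straight from the HTTP response object, so the intermediate text variable is not needed. A sketch of just the download step, using the same URL as above:
import json
import urllib.request

author_id = '47490276'
url_str = f'https://api.semanticscholar.org/graph/v1/author/{author_id}?fields=name,papers.authors'
with urllib.request.urlopen(url_str) as response:
    data = json.load(response)   # parses the JSON directly from the response
print(data["name"])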

Attempting to traverse a search API and collect results in Python 3.9

I am attempting to yield all results from a search API.
There are 3 key cases:
Case 1: There are 0 results yielded
Case 2: There are 1-9 results yielded
Case 3: There are 10 results yielded
In both cases 1 and 2, we can conclude that there are no results deeper. I.e. if we searched for "iiiiia" and it yields 0, then there are no results for "iiiiia" and no results deeper than "iiiiia". If "iiiiia" yields 1-9 results, then we can conclude that "iiiiia" yields those 1-9 results and there are no results deeper.
In case 3, if we search for "iiiiia" and it yields 10, we can conclude that "iiiiia" has 10 results, and there may or may not be results deeper, since 10 means 10+ results are possible. In that case we would have to move one layer down the chain and traverse through those results, i.e. "iiiiia0"-"iiiiiaz", and if any of those yield 10 we would also have to go one layer deeper.
an example of how this may look:
a
a0
a00
a000
a0000
a0001
...
a000z
a001
a002
a003
a0030
...
a0034
a00340
...
a0034z
a0035...
Here is my attempted code:
import json
from time import sleep
import urllib3
import requests
import re
import random
import sys
import numpy

# Saving the reference of the standard output
original_stdout = sys.stdout
header = {}
link = '/search?query='
http = requests.session()
fileName = '/output.txt'
hasCompleted = 0
number = '11'

def search(target):
    global targetList
    global link
    global header
    global http
    resp = http.request('GET', link+target, headers=header)
    match = re.findall('"key":"(.+?)","', resp.text)
    if len(match) == 10:
        return False
    if len(match) == 0:
        return " "
    elif len(match) < 10:
        return resp.text

def treeSearch():
    global fileName
    global number
    global hasCompleted
    if hasCompleted == 1:
        new = (int(int(number, 36) / 36) + 1)
        new = numpy.base_repr(new, 36)
        number = new
        hasCompleted = 0
    if hasCompleted == 0:
        x = 0
        new = int(int(number, 36)*36)
        new = numpy.base_repr(new, 36)
        number = new
        while x < 37:
            new = int(int(number, 36) + 1)
            new = numpy.base_repr(new, 36)
            number = new
            result = search(number)
            print(number)
            if result:
                with open(fileName, 'a+') as f:
                    sys.stdout = f
                    print(result)
                    sys.stdout = original_stdout
                x = x + 1
            else:
                treeSearch()
        temp = number
        number = (int(int(number, 36) / 36) + 1)  # maybe not + 1
        print(number)
        number = numpy.base_repr(number, 36)
        print(number)
        result = search(number)
        if not result == " ":
            new = int(int(number, 36)*36)
            new = numpy.base_repr(new, 36)
            number = new
            hasCompleted = 1

treeSearch()
Here is my output:
111
1111
11111
111111
1111111
11111111
111111111
1111111111
11111111111
111111111111
1111111111111
11111111111111
111111111111111
1111111111111111
1111111111111112
1111111111111113
1111111111111114
1111111111111115
1111111111111116
1111111111111117
1111111111111118
1111111111111119
111111111111111A
111111111111111B
111111111111111C
111111111111111D
111111111111111E
111111111111111F
111111111111111G
111111111111111H
111111111111111I
111111111111111J
111111111111111K
111111111111111L
111111111111111M
111111111111111N
111111111111111O
111111111111111P
111111111111111Q
111111111111111R
111111111111111S
111111111111111T
111111111111111U
111111111111111V
111111111111111W
111111111111111X
111111111111111Y
111111111111111Z
1111111111111120
1111111111111121
6316397706306666889217
11111111110QR5T
11111111110QR5U
11111111110QR5V
11111111110QR5W
11111111110QR5X
11111111110QR5Y
11111111110QR5Z
11111111110QR60
11111111110QR61
11111111110QR62
11111111110QR63
11111111110QR64
11111111110QR65
11111111110QR66
11111111110QR67
11111111110QR68
11111111110QR69
11111111110QR6A
11111111110QR6B
11111111110QR6C
11111111110QR6D
11111111110QR6E
11111111110QR6F
11111111110QR6G
11111111110QR6H
11111111110QR6I
11111111110QR6J
11111111110QR6K
11111111110QR6L
11111111110QR6M
11111111110QR6N
11111111110QR6O
11111111110QR6P
11111111110QR6Q
11111111110QR6R
11111111110QR6S
11111111110QR6T
11111111110QR6U
175455491841851850753
11111111110L4X
11111111110L4Y
11111111110L4Z
11111111110L50
11111111110L51
11111111110L52
11111111110L53
11111111110L54
11111111110L55
11111111110L56
11111111110L57
11111111110L58
11111111110L59
11111111110L5A
11111111110L5B
11111111110L5C
11111111110L5D
11111111110L5E
11111111110L5F
11111111110L5G
11111111110L5H
11111111110L5I
11111111110L5J
11111111110L5K
11111111110L5L
11111111110L5M
11111111110L5N
11111111110L5O
11111111110L5P
11111111110L5Q
11111111110L5R
11111111110L5S
11111111110L5T
11111111110L5U
11111111110L5V
11111111110L5W
11111111110L5X
11111111110L5Y
4873763662273662977
11111111110XT
11111111110XU
11111111110XV
11111111110XW
11111111110XX
11111111110XY
11111111110XZ
11111111110Y0
11111111110Y1
11111111110Y2
11111111110Y3
11111111110Y4
11111111110Y5
11111111110Y6
11111111110Y7
11111111110Y8
11111111110Y9
11111111110YA
11111111110YB
11111111110YC
11111111110YD
11111111110YE
11111111110YF
11111111110YG
11111111110YH
11111111110YI
11111111110YJ
11111111110YK
11111111110YL
11111111110YM
11111111110YN
11111111110YO
11111111110YP
11111111110YQ
11111111110YR
11111111110YS
11111111110YT
11111111110YU
135382323952046193
11111111110X
11111111110Y
11111111110Z
111111111110
111111111111
111111111121
111111111122
111111111123
111111111124
111111111125
111111111126
111111111127
111111111128
111111111129
11111111112A
11111111112B
11111111112C
11111111112D
11111111112E
11111111112F
11111111112G
11111111112H
11111111112I
11111111112J
11111111112K
11111111112L
11111111112M
11111111112N
11111111112O
11111111112P
11111111112Q
11111111112R
11111111112S
11111111112T
11111111112U
11111111112V
11111111112W
11111111112X
11111111112Y
11111111112Z
111111111130
111111111131
3760620109779064
11111111114
111111111141
111111111142
111111111143
111111111144
111111111145
111111111146
111111111147
111111111148
111111111149
11111111114A
11111111114B
11111111114C
11111111114D
11111111114E
11111111114F
11111111114G
11111111114H
11111111114I
11111111114J
11111111114K
11111111114L
11111111114M
11111111114N
11111111114O
11111111114P
11111111114Q
11111111114R
11111111114S
11111111114T
11111111114U
11111111114V
11111111114W
11111111114X
11111111114Y
3760620109779066
11111111116
11111111117
11111111118
11111111119
1111111111A
1111111111B
1111111111C
1111111111D
1111111111E
1111111111F
1111111111G
1111111111H
1111111111I
1111111111J
1111111111K
1111111111L
1111111111M
1111111111N
1111111111O
1111111111P
1111111111Q
1111111111R
1111111111S
1111111111T
1111111111U
1111111111V
1111111111W
1111111111X
1111111111Y
1111111111Z
11111111120
11111111121
11111111122
11111111123
11111111124
11111111125
11111111126
11111111127
104461669716087
1111111113
1111111114
1111111115
1111111116
1111111117
1111111118
1111111119
111111111A
111111111B
111111111C
111111111D
111111111E
111111111F
111111111G
111111111H
111111111I
111111111J
111111111K
111111111L
111111111M
111111111N
111111111O
111111111P
111111111Q
111111111R
111111111S
111111111T
111111111U
111111111V
111111111W
111111111X
111111111Y
111111111Z
1111111120
1111111121
1111111122
1111111123
1111111124
2901713047671
111111113
111111114
111111115
111111116
111111117
111111118
111111119
11111111A
11111111B
11111111C
11111111D
11111111E
11111111F
11111111G
11111111H
11111111I
11111111J
11111111K
11111111L
11111111M
11111111N
11111111O
11111111P
11111111Q
11111111R
11111111S
11111111T
11111111U
11111111V
11111111W
11111111X
11111111Y
11111111Z
111111120
111111121
111111122
111111123
111111124
80603140215
11111113
11111114
11111115
11111116
11111117
11111118
11111119
1111111A
1111111B
1111111C
1111111D
1111111E
1111111F
1111111G
1111111H
1111111I
1111111J
1111111K
1111111L
1111111M
1111111N
1111111O
1111111P
1111111Q
1111111R
1111111S
1111111T
1111111U
1111111V
1111111W
1111111X
1111111Y
1111111Z
11111120
11111121
11111122
11111123
11111124
2238976119
1111113
11111131
11111132
11111133
11111134
11111135
11111136
11111137
11111138
11111139
1111113A
1111113B
1111113C
1111113D
1111113E
1111113F
1111113G
1111113H
1111113I
1111113J
1111113K
1111113L
1111113M
1111113N
1111113O
1111113P
1111113Q
1111113R
1111113S
1111113T
1111113U
1111113V
1111113W
1111113X
1111113Y
1111113Z
11111140
11111141
2238976121
1111115
11111151
11111152
11111153
11111154
11111155
11111156
11111157
11111158
11111159
1111115A
1111115B
1111115C
1111115D
1111115E
1111115F
1111115G
1111115H
1111115I
1111115J
1111115K
1111115L
1111115M
1111115N
1111115O
1111115P
1111115Q
1111115R
1111115S
1111115T
1111115U
1111115V
1111115W
1111115X
1111115Y
1111115Z
11111160
11111161
2238976123
1111117
11111171
11111172
11111173
11111174
11111175
11111176
11111177
11111178
11111179
1111117A
1111117B
1111117C
1111117D
1111117E
1111117F
1111117G
1111117H
1111117I
1111117J
1111117K
1111117L
1111117M
1111117N
1111117O
1111117P
1111117Q
1111117R
1111117S
1111117T
1111117U
1111117V
1111117W
1111117X
1111117Y
1111117Z
11111180
11111181
2238976125
1111119
111111A
111111B
111111C
111111D
111111E
111111F
111111G
111111H
111111I
111111J
111111K
111111L
111111M
111111N
111111O
111111P
111111Q
111111R
111111S
111111T
111111U
111111V
111111W
111111X
111111Y
111111Z
1111120
1111121
1111122
1111123
1111124
1111125
1111126
1111127
1111128
1111129
111112A
62193783
111113
1111131
1111132
1111133
1111134
1111135
1111136
1111137
1111138
1111139
111113A
111113B
111113C
111113D
111113E
111113F
111113G
111113H
111113I
111113J
111113K
111113L
111113M
111113N
111113O
111113P
111113Q
111113R
111113S
111113T
111113U
111113V
111113W
111113X
111113Y
111113Z
1111140
1111141
62193785
111115
My code only traverses one level deeper and then comes back out. I will keep working on my code; however, I am hoping to find an easier solution or possibly a library that can perform this style of search. Thanks!
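For what it's worth, here is a minimal sketch of the recursive idea described above, with the HTTP call stubbed out; the search() function and the alphabet are placeholders for the real API, not working code against it:
import string

ALPHABET = string.digits + string.ascii_lowercase   # '0'-'9' then 'a'-'z'

def search(prefix):
    """Placeholder: return the list of keys the API yields for this prefix (at most 10)."""
    raise NotImplementedError

def collect(prefix, results):
    keys = search(prefix)
    results.update(keys)
    # Fewer than 10 hits means nothing deeper exists under this prefix (cases 1 and 2).
    if len(keys) < 10:
        return
    # Exactly 10 hits means there may be more (case 3): recurse one character deeper.
    for ch in ALPHABET:
        collect(prefix + ch, results)

# Usage sketch:
# results = set()
# collect("a", results)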

Writing in file with specific format

def sauvegarder_canaux(self, nom_fichier:str) is the method giving me a problem. When the file is saved, it only writes in this format:
5 - TQS (Télévision Quatres-saisons, 0.0 $ extra)
I need it to be like this:
5 : TQS : Télévision Quatres-saisons : 0.0 $ extra
This is the code that I have for now:
from canal import Canal
from forfait_tv import ForfaitTV
from abonne import Abonne

#============= Class ===========================
class Distributeur:
    """
    Description:
    ===========
    This class manages the lists of channels and packages (and, later,
    of subscribers).

    Private data members:
    ======================
    __canaux    # [Canal]     List of existing channels
    __forfaits  # [ForfaitTV] List of available packages
    """
    #----------- Constructor -----------------------------
    def __init__(self):
        self.__canaux = None
        self.__forfaits = None
        # code
        self.__canaux = []    # list
        self.__forfaits = []  # list

    #----------- Accessors/Mutators ----------------------
    def ajouter_canal(self, un_canal: Canal):
        self.__canaux.append(un_canal)

    def chercher_canal(self, p_poste: int):
        i = 0
        postex = None
        poste_trouve = None
        for i in range(0, len(self.__canaux), 1):
            postex = self.__canaux[i]
            if postex.get_poste() == p_poste:
                poste_trouve = postex
        return print(poste_trouve)

    def telecharger_canaux(self, nom_fichier: str):
        fichierCanaux = open(nom_fichier, "r")
        for line in fichierCanaux:
            eleCanal = line.strip(" : ")
            canal = Canal(eleCanal[0], eleCanal[1], eleCanal[2], eleCanal[3])
            self.__canaux.append(canal)
        return canal

    def sauvegarder_canaux(self, nom_fichier: str):
        fichCanaux = open(nom_fichier, "w")
        for i in self.__canaux:
            fichCanaux.write(str(i) + "\n")
        fichCanaux.close()
You only need to edit the string before you write it. The str.replace method is your friend. Perhaps:
for i in self.__canaux:
    out_line = str(i)
    for char in "-(,":
        out_line = out_line.replace(char, ':')
    fichCanaux.write(out_line + "\n")
If removing the accents is okay, you can normalize the text to NFD with unicodedata, then find the segments of interest, modify them with the desired formatting, and replace them with the formatted segments using regex:
import unicodedata
import re

def format_string(test_str):
    # normalize accents (NFD plus an ASCII encode drops the combining marks)
    test_str = unicodedata.normalize('NFD', test_str).encode('ascii', 'ignore').decode('ascii')
    # segment patterns
    segment_1_ptn = re.compile(r"""[0-9]*(\s)*        # natural number
                                   [-](\s)*           # dash
                                   (\w)*(\s)*         # acronym
                               """,
                               re.VERBOSE)
    segment_2_ptn = re.compile(r"""(\w)*(\s)*         # acronym
                                   (\()               # open parenthesis
                                   ((\w*[-]*)*(\s)*)* # words
                               """,
                               re.VERBOSE)
    segment_3_ptn = re.compile(r"""((\w*[-]*)*(\s)*)* # words
                                   (,)(\s)*           # comma
                                   [0-9]*(.)[0-9]*(\s)*(\$)(\s)  # real number
                               """,
                               re.VERBOSE)
    # format data
    segment_1_match = re.search(segment_1_ptn, test_str).group()
    test_str = test_str.replace(segment_1_match, " : ".join(segment_1_match.split("-")))
    segment_2_match = re.search(segment_2_ptn, test_str).group()
    test_str = test_str.replace(segment_2_match, " : ".join(segment_2_match.split("(")))
    segment_3_match = re.search(segment_3_ptn, test_str).group()
    test_str = test_str.replace(segment_3_match, " : ".join(segment_3_match.split(",")))[:-1]
    test_str = " : ".join([txt.strip() for txt in test_str.split(":")])
    return test_str
Then you can call this function within sauvegarder_canaux:
def sauvegarder_canaux(self, nom_fichier: str):
    with open(nom_fichier, "w") as fichCanaux:
        for i in self.__canaux:
            fichCanaux.write(format_string(str(i)) + "\n")
You can also add format_string as a method within your Distributeur class.
Example input:
5 - TQS (Télévision Quatres-saisons, 0.0 $ extra)
Example output:
5 : TQS : Television Quatres-saisons : 0.0 $ extra
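Another option, if your Canal class exposes accessors for its fields, is to build the line from the fields directly instead of reworking str(i). Only get_poste() appears in the code above; the other getter names here are hypothetical, so adjust them to whatever Canal actually provides:
def sauvegarder_canaux(self, nom_fichier: str):
    with open(nom_fichier, "w") as fichCanaux:
        for canal in self.__canaux:
            # get_nom(), get_description() and get_prix() are hypothetical accessor names
            ligne = f"{canal.get_poste()} : {canal.get_nom()} : {canal.get_description()} : {canal.get_prix()} $ extra"
            fichCanaux.write(ligne + "\n")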

Displaying prices

I'm getting prices in different currencies and want to display them in Brazilian R$.
My formatting doesn't work and the display looks like this:
Price: 1.15..000.,00 R$
For good flexibility I've stored the price as a string: price=db.StringProperty(verbose_name="price")
I tried to implement my own filter and it didn't work:
{{ ad.price|separate }} R$
def separate(n, sep='.'):
    ln = list(str(n))
    ln.reverse()
    newn = []
    while len(ln) > 3:
        newn.extend(ln[:3])
        newn.append(sep)
        ln = ln[3:]
    newn.extend(ln)
    newn.reverse()
    return "".join(newn)
Can you help me? Should I just remove the filter? Should I enforce some regex on the input instead? A link to my site is http://www.koolbusiness.com/servead/4252196
UPDATE: I'm considering using something like one of these filters:
import locale
locale.setlocale(locale.LC_ALL, '')

def currency(value):  # doesn't work
    locale.setlocale(locale.LC_ALL, '')
    return locale.currency(value, grouping=True)
register.filter(currency)

def currencyWithoutUsingLocale(value):  # needs adjustment
    value = float(value)
    symbol = '$'
    thousand_sep = ''
    decimal_sep = ''
    # try to use settings if set
    try:
        symbol = settings.CURRENCY_SYMBOL
    except AttributeError:
        pass
    try:
        thousand_sep = settings.THOUSAND_SEPARATOR
        decimal_sep = settings.DECIMAL_SEPARATOR
    except AttributeError:
        thousand_sep = ','
        decimal_sep = '.'
    intstr = str(int(value))
    f = lambda x, n, acc=[]: f(x[:-n], n, [(x[-n:])] + acc) if x else acc
    intpart = thousand_sep.join(f(intstr, 3))
    return "%s%s%s%s" % (symbol, intpart, decimal_sep, ("%0.2f" % value)[-2:])
register.filter(currencyWithoutUsingLocale)
Storing the price as a string is the first problem; it should be a Decimal. If you look at the Python standard library documentation for Decimal, you will find the recipes section: http://docs.python.org/library/decimal.html#recipes
The moneyfmt recipe there should do what you want.
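For illustration, here is a minimal sketch of the idea without pulling in the full moneyfmt recipe (my own example, not the recipe itself): format the Decimal with two decimal places, then swap the separators around for Brazilian conventions.
from decimal import Decimal

def format_brl(value):
    # '1,234,567.89' -> '1.234.567,89' (Brazilian grouping and decimal comma)
    grouped = "{:,.2f}".format(Decimal(value))
    return grouped.replace(",", "_").replace(".", ",").replace("_", ".") + " R$"

print(format_brl("1234567.89"))   # 1.234.567,89 R$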
Currency formatting is a feature of locale.
http://docs.python.org/library/locale.html#locale.currency
You can use the locale._override_localeconv dict to set your own overrides:
>>> import locale
>>> locale.setlocale(locale.LC_MONETARY, 'pt_BR.UTF-8')
>>> locale.currency(1234.56)
'1234,56 R$'
>>> locale._override_localeconv.update({'p_cs_precedes': 1, 'n_cs_precedes': 1})
>>> locale.currency(1234.56)
'R$ 1234,56'
You can put the locale._override_localeconv.update({'p_cs_precedes': 1, 'n_cs_precedes': 1}) line right after import locale, if you want.

Parsing srt subtitles

I want to parse srt subtitles:
1
00:00:12,815 --> 00:00:14,509
Chlapi, jak to jde s
těma pracovníma světlama?.
2
00:00:14,815 --> 00:00:16,498
Trochu je zesilujeme.
3
00:00:16,934 --> 00:00:17,814
Jo, sleduj.
I want to parse every item into a structure, using these regexes:
A:
RE_ITEM = re.compile(r'(?P<index>\d+).'
                     r'(?P<start>\d{2}:\d{2}:\d{2},\d{3}) --> '
                     r'(?P<end>\d{2}:\d{2}:\d{2},\d{3}).'
                     r'(?P<text>.*?)', re.DOTALL)
B:
RE_ITEM = re.compile(r'(?P<index>\d+).'
                     r'(?P<start>\d{2}:\d{2}:\d{2},\d{3}) --> '
                     r'(?P<end>\d{2}:\d{2}:\d{2},\d{3}).'
                     r'(?P<text>.*)', re.DOTALL)
And this code:
for i in Subtitles.RE_ITEM.finditer(text):
    result.append((i.group('index'), i.group('start'),
                   i.group('end'), i.group('text')))
With regex B I get only one item in the array (because of the greedy .*), and with regex A the 'text' group is empty (because of the non-greedy .*?).
How can I fix this?
Thanks
Why not use pysrt?
I became quite frustrated with srt libraries available for Python (often because they were heavyweight and eschewed language-standard types in favour of custom classes), so I've spent the last year or so working on my own srt library. You can get it at https://github.com/cdown/srt.
I tried to keep it simple and light on classes (except for the core Subtitle class, which more or less just stores the SRT block data). It can read and write SRT files, and turn noncompliant SRT files into compliant ones.
Here's a usage example with your sample input:
>>> import srt, pprint
>>> gen = srt.parse('''\
... 1
... 00:00:12,815 --> 00:00:14,509
... Chlapi, jak to jde s
... těma pracovníma světlama?.
...
... 2
... 00:00:14,815 --> 00:00:16,498
... Trochu je zesilujeme.
...
... 3
... 00:00:16,934 --> 00:00:17,814
... Jo, sleduj.
...
... ''')
>>> pprint.pprint(list(gen))
[Subtitle(start=datetime.timedelta(0, 12, 815000), end=datetime.timedelta(0, 14, 509000), index=1, proprietary='', content='Chlapi, jak to jde s\ntěma pracovníma světlama?.'),
Subtitle(start=datetime.timedelta(0, 14, 815000), end=datetime.timedelta(0, 16, 498000), index=2, proprietary='', content='Trochu je zesilujeme.'),
Subtitle(start=datetime.timedelta(0, 16, 934000), end=datetime.timedelta(0, 17, 814000), index=3, proprietary='', content='Jo, sleduj.')]
The text is followed by an empty line, or the end of file. So you can use:
r' .... (?P<text>.*?)(\n\n|$)'
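Applied to the sample above, regex A with that blank-line/end-of-file anchor appended picks up all three items (a quick sketch using the same group names):
import re

RE_ITEM = re.compile(r'(?P<index>\d+).'
                     r'(?P<start>\d{2}:\d{2}:\d{2},\d{3}) --> '
                     r'(?P<end>\d{2}:\d{2}:\d{2},\d{3}).'
                     r'(?P<text>.*?)(?:\n\n|$)', re.DOTALL)

text = '''1
00:00:12,815 --> 00:00:14,509
Chlapi, jak to jde s
těma pracovníma světlama?.

2
00:00:14,815 --> 00:00:16,498
Trochu je zesilujeme.

3
00:00:16,934 --> 00:00:17,814
Jo, sleduj.
'''

for m in RE_ITEM.finditer(text):
    print(m.group('index'), m.group('start'), m.group('end'), repr(m.group('text')))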
Here's some code I had lying around to parse SRT files:
from __future__ import division
import datetime

class Srt_entry(object):
    def __init__(self, lines):
        def parsetime(string):
            hours, minutes, seconds = string.split(u':')
            hours = int(hours)
            minutes = int(minutes)
            seconds = float(u'.'.join(seconds.split(u',')))
            return datetime.timedelta(0, seconds, 0, 0, minutes, hours)
        self.index = int(lines[0])
        start, arrow, end = lines[1].split()
        self.start = parsetime(start)
        if arrow != u"-->":
            raise ValueError
        self.end = parsetime(end)
        self.lines = lines[2:]
        if not self.lines[-1]:
            del self.lines[-1]

    def __unicode__(self):
        def delta_to_string(d):
            hours = (d.days * 24) \
                    + (d.seconds // (60 * 60))
            minutes = (d.seconds // 60) % 60
            seconds = d.seconds % 60 + d.microseconds / 1000000
            return u','.join((u"%02d:%02d:%06.3f"
                              % (hours, minutes, seconds)).split(u'.'))
        return (unicode(self.index) + u'\n'
                + delta_to_string(self.start)
                + ' --> '
                + delta_to_string(self.end) + u'\n'
                + u''.join(self.lines))

srt_file = open("foo.srt")
entries = []
entry = []
for line in srt_file:
    if options.decode:  # 'options' comes from this script's own option parsing
        line = line.decode(options.decode)
    if line == u'\n':
        entries.append(Srt_entry(entry))
        entry = []
    else:
        entry.append(line)
srt_file.close()
import re

splits = [s.strip() for s in re.split(r'\n\s*\n', text) if s.strip()]
regex = re.compile(r'''(?P<index>\d+).*?(?P<start>\d{2}:\d{2}:\d{2},\d{3}) --> (?P<end>\d{2}:\d{2}:\d{2},\d{3})\s*.*?\s*(?P<text>.*)''', re.DOTALL)
for s in splits:
    r = regex.search(s)
    print(r.groups())
Here's a snippet I wrote which converts SRT files into dictionaries:
import re

def srt_time_to_seconds(time):
    split_time = time.split(',')
    major, minor = (split_time[0].split(':'), split_time[1])
    return int(major[0])*3600 + int(major[1])*60 + int(major[2]) + float(minor)/1000

def srt_to_dict(srtText):
    subs = []
    for s in re.sub('\r\n', '\n', srtText).split('\n\n'):
        st = s.split('\n')
        if len(st) >= 3:
            split = st[1].split(' --> ')
            subs.append({'start': srt_time_to_seconds(split[0].strip()),
                         'end': srt_time_to_seconds(split[1].strip()),
                         'text': '<br />'.join(j for j in st[2:len(st)])
                         })
    return subs
Usage:
import srt_to_dict
with open('test.srt', "r") as f:
srtText = f.read()
print srt_to_dict(srtText)
