For-loop does not iterate to next element in a file - python

I have a problem with my two for-loops in this code:
def batchm():
searchFile = sys.argv[1]
namesFile = sys.argv[2]
writeFile = sys.argv[3]
countDict = {}
with open(searchFile, "r") as nlcfile:
with open(namesFile, "r") as namesList:
with open(writeFile, "a") as wfile:
for name in namesList:
for line in nlcfile:
if name in line:
res = line.split("\t")
countValue = res[0]
countKey = res[-1]
countDict[countKey] = countValue
countDictMax = sorted(countDict, key = lambda x: x[1], reverse = True)
print(countDictMax)
The loop is iterating over this:
namesList:
Greene
Donald
Donald Duck
MacDonald
.
.
.
nlcfile:
123 1999–2000 Northampton Town F.C. season Northampton Town
5 John Simpson Kirkpatrick
167 File talk:NewYorkRangers1940s.png talk
234 Parshu Ram Sharma(Raj Comics) Parshuram Sharma
.
.
.
What I get looks like this:
['Lyn Greene\n', 'Rydbergia grandiflora (Torrey & A. Gray in A. Gray) E. Greene\n', 'Tyler Greene\n', 'Ty Greene\n' ..... ]
and this list appears 48 times, which also happens to be the number of lines in namesList.
Desired output:
("string from namesList" -> "record with highest number in nlcfile")
Greene -> Ly Greene
Donald -> Donald Duck
.
.
.
I think that the two for-loops don't iterate the right way. But I have no clue, why.
Can anyone see, where the problem is?
Thank you very much!

Related

Remove item from list Python

I have a document containg a list. When I write the document it prints
['γύρισε στις φυλακές δομοκού κουφοντίνας huffpost greece']
['australia']
[]
['brasil']
[]
['canada']
[]
['españa']
[]
What I want is to remove the [] characters. So far I've done the following.
for file_name in list_of_files:
with open(file_name, 'r', encoding="utf-8") as inf:
lst = []
for line in inf:
#special characters removal
line = line.lower()
line = re.sub('\W+',' ', line )
line = word_tokenize(line)
#stopwords removal
line = ' '.join([word for word in line if word not in stopwords_dict])
line = line.split('\n')
line = list(filter(None, line))
lst.append(line)
inf.close()
Which removes some '' from inside the empty [], which seems reasonable. I have tried several approaches such as strip, remove() and [x for x in strings if x] without success. I am rather inexperienced, what am I missing?
update:
the initial text looks like this
Εκτέλεσαν τον δημοσιογράφο Γιώργο Καραϊβάζ στον Άλιμο | HuffPost Greece
Australia
Brasil
Canada
España
France
Ελλάδα (Greece)
India
Italia
日本 (Japan)
한국 (Korea)
Québec (en français)
United Kingdom
United States
Ελλάδα (Greece)
Update:
And I am writing the list to a file like this
for line in lst:
outf.write("%s\n" % line)
outf.close()
It looks like you're writing the lists itself after you're appending all of the items in the file. If you want to print the items of a list in python without the surrounding '[' and ']', then just loop over each item and print like so:
for item in list:
outf.write("%s\n" % item)
and for a list of lists
for list in lists:
for item in list:
outf.write("%s\n" % item)
if your output line always contains just one of each '[' and ']' then you can get what's in between with something like
for line in lst:
open_split = line.split('[')
after_open = open_split[1] if len(open_split) > 0 else ""
closed_split = after_open.split(']')
in_between_brackets = closed_split[0]
outf.write("%s\n" % in_between_brackets)
A short hand fragile version of the above split method can be done like so:
for line in lst:
outf.write("%s\n" % line.split('[')[1].split(']')[0])
If the expected result from the clean is like this
γύρισε στις φυλακές δομοκού κουφοντίνας huffpost greece
australia
brasil
canada
españa
Then this code below would help you.
import re
with open('original.txt') as f:
data = f.read()
with open('cleaned.txt', 'w') as f:
# Remove chars like [, ] and '
result = re.sub("\[|\]|'", '', data)
# Remove the extra lines (replace 2 \n by 1).
result = re.sub('\\n\\n', '\\n', result)
f.write(result)
After you remove the stop words you likely don't want to:
line = line.split('\n')
line = list(filter(None, line))
You likely want to inspect what is left and just continue if it is "nothing"
import re # mocking for NLTK
stopwords_dict = {
"huffpost": True
}
text_in = '''
Εκτέλεσαν τον δημοσιογράφο Γιώργο Καραϊβάζ στον Άλιμο | HuffPost Greece
Australia
Brasil
Canada
España
France
Ελλάδα (Greece)
India
Italia
日本 (Japan)
한국 (Korea)
Québec (en français)
United Kingdom
United States
Ελλάδα (Greece)
'''
'''
This emulates NLTK.word_tokenize
'''
def word_tokenize(text):
return re.sub(r'[^\w\s]', '', text).split()
lst = []
for line in text_in.splitlines():
line = line.lower()
line = re.sub('\W+',' ', line )
line_tokens = word_tokenize(line)
line_tokens = [token for token in line_tokens if token not in stopwords_dict]
# after cleaning a line, if there is nothing left skip
if not line_tokens:
continue
line = ' '.join(line_tokens)
lst.append(line)
with open("file_out.txt", "w", encoding='utf-8') as file_out:
for line in lst:
file_out.write("%s\n" % line)
This is give you a file with the contents of:
εκτέλεσαν τον δημοσιογράφο γιώργο καραϊβάζ στον άλιμο greece
australia
brasil
canada
españa
france
ελλάδα greece
india
italia
日本 japan
한국 korea
québec en français
united kingdom
united states
ελλάδα greece
Which is what you are hoping for (I think).
I managed to remove the empty lines by channging:
line = list(filter(None, line))
lst.append(line)
to
lst.append(line)
lst = list(filter(None, lst))
Apologies, it was a trivial mistake, thank you for your answers.

Join whole word by its Tag Python

let say i have this sentences:
His/O name/O is/O Petter/Name Jack/Name and/O his/O brother/O name/O is/O
Jonas/Name Van/Name Dame/Name
How can i get result like this:
Petter Jack, Jonas Van Dame.
So far i've already tried this, but still its just join 2 word :
import re
pattern = re.compile(r"\w+\/Name)
sent = sentence.split()
for i , w in sent:
if pattern.match(sent[i]) != None:
if pattern.match(sent[i+1]) != None:
#....
#join sent[i] and sent[i+1] element
#....
Try something like this
pattern = re.compile(r"((\w+\/Name\s*)+)")
names = pattern.findall(your_string)
for name in names:
print(''.join(name[0].split('/Name')))
I'm thinking about a two-phase solution
r = re.compile(r'\w+\/Name(?:\ \w+\/Name)*')
result = r.findall(s)
# -> ['Petter/Name Jack/Name', 'Jonas/Name Van/Name Dame/Name']
for r in result:
print(r.replace('/Name', ''))
# -> Petter Jack
# -> Jonas Van Dame

Data Analysis using Python

I have 2 CSV files. One with city name, population and humidity. In second cities are mapped to states. I want to get state-wise total population and average humidity. Can someone help? Here is the example:
CSV 1:
CityName,population,humidity
Austin,1000,20
Sanjose,2200,10
Sacramento,500,5
CSV 2:
State,city name
Ca,Sanjose
Ca,Sacramento
Texas,Austin
Would like to get output(sum population and average humidity for state):
Ca,2700,7.5
Texas,1000,20
The above solution doesn't work because dictionary will contain one one key value. i gave up and finally used a loop. below code is working, mentioned input too
csv1
state_name,city_name
CA,sacramento
utah,saltlake
CA,san jose
Utah,provo
CA,sanfrancisco
TX,austin
TX,dallas
OR,portland
CSV2
city_name population humidity
sacramento 1000 1
saltlake 300 5
san jose 500 2
provo 100 7
sanfrancisco 700 3
austin 2000 4
dallas 2500 5
portland 300 6
def mapping_within_dataframe(self, file1,file2,file3):
self.csv1 = file1
self.csv2 = file2
self.outcsv = file3
one_state_data = 0
outfile = csv.writer(open('self.outcsv', 'w'), delimiter=',')
state_city = read_csv(self.csv1)
city_data = read_csv(self.csv2)
all_state = list(set(state_city.state_name))
for one_state in all_state:
one_state_cities = list(state_city.loc[state_city.state_name == one_state, "city_name"])
one_state_data = 0
for one_city in one_state_cities:
one_city_data = city_data.loc[city_data.city_name == one_city, "population"].sum()
one_state_data = one_state_data + one_city_data
print one_state, one_state_data
outfile.writerows(whatever)
def output(file1, file2):
f = lambda x: x.strip() #strips newline and white space characters
with open(file1) as cities:
with open(file2) as states:
states_dict = {}
cities_dict = {}
for line in states:
line = line.split(',')
states_dict[f(line[0])] = f(line[1])
for line in cities:
line = line.split(',')
cities_dict[f(line[0])] = (int(f(line[1])) , int(f(line[2])))
for state , city in states_dict.iteritems():
try:
print state, cities_dict[city]
except KeyError:
pass
output(CSV1,CSV2) #these are the names of the files
This gives the output you wanted. Just make sure the names of cities in both files are the same in terms of capitalization.

html scraping using python topboxoffice list from imdb website

URL: http://www.imdb.com/chart/?ref_=nv_ch_cht_2
I want you to print top box office list from above site (all the movies' rank, title, weekend, gross and weeks movies in the order)
Example output:
Rank:1
title: godzilla
weekend:$93.2M
Gross:$93.2M
Weeks: 1
Rank: 2
title: Neighbours
This is just a simple way to extract those entities by BeautifulSoup
from bs4 import BeautifulSoup
import urllib2
url = "http://www.imdb.com/chart/?ref_=nv_ch_cht_2"
data = urllib2.urlopen(url).read()
page = BeautifulSoup(data, 'html.parser')
rows = page.findAll("tr", {'class': ['odd', 'even']})
for tr in rows:
for data in tr.findAll("td", {'class': ['titleColumn', 'weeksColumn','ratingColumn']}):
print data.get_text()
P.S.-Arrange according to your will.
There is no need to scrape anything. See the answer I gave here.
How to scrape data from imdb business page?
The below Python script will give you, 1) List of Top Box Office movies from IMDb 2) And also the List of Cast for each of them.
from lxml.html import parse
def imdb_bo(no_of_movies=5):
bo_url = 'http://www.imdb.com/chart/'
bo_page = parse(bo_url).getroot()
bo_table = bo_page.cssselect('table.chart')
bo_total = len(bo_table[0][2])
if no_of_movies <= bo_total:
count = no_of_movies
else:
count = bo_total
movies = {}
for i in range(0, count):
mo = {}
mo['url'] = 'http://www.imdb.com'+bo_page.cssselect('td.titleColumn')[i][0].get('href')
mo['title'] = bo_page.cssselect('td.titleColumn')[i][0].text_content().strip()
mo['year'] = bo_page.cssselect('td.titleColumn')[i][1].text_content().strip(" ()")
mo['weekend'] = bo_page.cssselect('td.ratingColumn')[i*2].text_content().strip()
mo['gross'] = bo_page.cssselect('td.ratingColumn')[(i*2)+1][0].text_content().strip()
mo['weeks'] = bo_page.cssselect('td.weeksColumn')[i].text_content().strip()
m_page = parse(mo['url']).getroot()
m_casttable = m_page.cssselect('table.cast_list')
flag = 0
mo['cast'] = []
for cast in m_casttable[0]:
if flag == 0:
flag = 1
else:
m_starname = cast[1][0][0].text_content().strip()
mo['cast'].append(m_starname)
movies[i] = mo
return movies
if __name__ == '__main__':
no_of_movies = raw_input("Enter no. of Box office movies to display:")
bo_movies = imdb_bo(int(no_of_movies))
for k,v in bo_movies.iteritems():
print '#'+str(k+1)+' '+v['title']+' ('+v['year']+')'
print 'URL: '+v['url']
print 'Weekend: '+v['weekend']
print 'Gross: '+v['gross']
print 'Weeks: '+v['weeks']
print 'Cast: '+', '.join(v['cast'])
print '\n'
Output (run in terminal):
parag#parag-innovate:~/python$ python imdb_bo_scraper.py
Enter no. of Box office movies to display:3
#1 Cinderella (2015)
URL: http://www.imdb.com/title/tt1661199?ref_=cht_bo_1
Weekend: $67.88M
Gross: $67.88M
Weeks: 1
Cast: Cate Blanchett, Lily James, Richard Madden, Helena Bonham Carter, Nonso Anozie, Stellan Skarsgård, Sophie McShera, Holliday Grainger, Derek Jacobi, Ben Chaplin, Hayley Atwell, Rob Brydon, Jana Perez, Alex Macqueen, Tom Edden
#2 Run All Night (2015)
URL: http://www.imdb.com/title/tt2199571?ref_=cht_bo_2
Weekend: $11.01M
Gross: $11.01M
Weeks: 1
Cast: Liam Neeson, Ed Harris, Joel Kinnaman, Boyd Holbrook, Bruce McGill, Genesis Rodriguez, Vincent D'Onofrio, Lois Smith, Common, Beau Knapp, Patricia Kalember, Daniel Stewart Sherman, James Martinez, Radivoje Bukvic, Tony Naumovski
#3 Kingsman: The Secret Service (2014)
URL: http://www.imdb.com/title/tt2802144?ref_=cht_bo_3
Weekend: $6.21M
Gross: $107.39M
Weeks: 5
Cast: Adrian Quinton, Colin Firth, Mark Strong, Jonno Davies, Jack Davenport, Alex Nikolov, Samantha Womack, Mark Hamill, Velibor Topic, Sofia Boutella, Samuel L. Jackson, Michael Caine, Taron Egerton, Geoff Bell, Jordan Long

Handling word index in text files

wordlist A: book jesus christ son david son abraham jacob judah his brothers perez amminadab
wordlist B: akwụkwọ jizọs kraịst nwa devid nwa ebreham jekọb juda ya ụmụnne pirez aminadab
file.txt A:
the book of the history of jesus christ , son of david , son of abraham :
abraham became father to isaac ; isaac became father to jacob ; jacob became father to judah and his brothers ;
file.txt B:
akwụkwọ nke kọrọ akụkọ banyere jizọs kraịst , nwa devid , nwa ebreham :
ebreham mụrụ aịzik ; aịzik amụọ jekọb ; jekọb amụọ juda na ụmụnne ya ndị ikom ;
I have 2 above word-lists (say A & B) of 2 diff. languages. Both contain word translation of each other in order. My task is to run these word-lists through 2 separate files.txt of both languages like word-list A through file.txt A and vice versa, then return a line for both txt files, each will contain the index numbers of both word-list where they were found on each line of the txt paired like:
2:1 7:6 8:7 10:9 12:10 14:12 16:13 [ 2:1 = 2 index of book in txt.file A and 1-akwụkwọ in txt.file B and so on]
1:1 11:6 13:8 17:10 19:12 20:13 [ 1:1 = 1 index of abraham in txt.file A and 1- ebreham in txt.file B and so on].
see codes below:
import sys
def wordlist(filename):
wordlist = []
with open(filename, 'rb') as f:
for line in f:
wordlist.append(line)
return wordlist
eng = []
for lines in open('eng_try.txt', 'rb'):
line = lines.strip()
eng.append(line)
igb = []
for lines in open('igb_try.txt', 'rb'):
line = lines.strip()
igb.append(line)
i = 0
while i < len(eng):
eng_igb_verse_pair = eng[i] + " " + igb[i]
line = eng_igb_verse_pair.strip().split()
for n in range(0, len(wordlist('eng_wordlist.txt'))):
eng_word = wordlist('eng_wordlist.txt').pop(n)
igb_word = wordlist('igb_wordlist.txt').pop(n)
if eng_word in line and igb_word in line:
print '{0} {1}:{2}'.format(i, line.index[eng_word], line.index[igb_word])
i += 1
This actually prints empty. I know my problem is in the last segment of the program. Can someone help. I am not that experienced python programmer. Apologies if I didn't construct my explanation well.
You mean something like this:
import sys
def checkLine(line_eng, line_igb):
eng_words = line_eng.split()
igb_words = line_igb.split()
for word in eng_words:
if word in eng:
igb_word = igb[eng.index(word)]
print "%d:%d" % ( eng_words.index(word)+1, igb_words.index(igb_word)+1),
def linelist(filename):
lineslist = []
for line in open(filename, 'rb'):
lineslist.append(line)
return lineslist
eng = []
for lines in open('eng_try.txt', 'rb'):
line = lines.strip()
for w in line.split():
eng.append(w)
igb = []
for lines in open('igb_try.txt', 'rb'):
line = lines.strip()
for w in line.split():
igb.append(w)
eng_lines = linelist("eng_wordlist.txt")
igb_lines = linelist("igb_wordlist.txt")
for n in range(0, len(eng_lines)):
print "%d. " % (n+1),
checkLine(eng_lines[n],igb_lines[n])
print
For your files i got result:
1. 2:1 7:6 8:7 10:9 12:10 10:9 16:13
2. 1:1 11:7 11:7 17:11 19:14 20:13
BR
Parasit Hendersson

Categories

Resources