Python, for loop - python

I have the following code:
def get_asset_info(asset_list):
import datetime
today = datetime.datetime.today()
day = today.strftime("%d")
for i in range( len( asset_list )):
raw_info = get_OHLC( asset_list[i], 15, get_server_time() )
info = raw_info['result'][asset_list[i]]
head = "time,open,high,low,close,vwap,volume,count"
formatted_info = ""
for i in range(len(info[0])):
formatted_info = formatted_info + info[0][i] + ","
file = open(asset_list[i]+"_"+day, "a")
file.write(head + "\n")
file.write(formatted_info)
file.close()
It is supposed to get some values, convert it into a string and write it to a file, dynamically generated.
It's not working like this and all the values are put in the same file.
If I change the last part of the code like the following, the files are generated:
formatted_info = str(info[0][0]) + "," + str(info[0][1]) + "," + str(info[0][2]) + "," + str(info[0][3]) + "," + str(info[0][4]) + "," + str(info[0][5]) + "," + str(info[0][6]) + "," + str(info[0][7])
file = open(asset_list[i]+"_"+day, "a")
file.write(head + "\n")
file.write(formatted_info)
file.close()
So the problem, as I can see, is in the for loop I create to generate my string, but there's no sense since the code that generates the file is not in the same loop.
Any ideas?

for i in range( len( asset_list )):
...
for i in range(len(info[0])):
...
# now what do you think i is now?
file = open(asset_list[i]+"_"+day, "a")
Changing second i to j should do the trick:
for i in range( len( asset_list )):
...
for j in range(len(info[0])):
formatted_info = formatted_info + info[0][j] + ","
file = open(asset_list[i]+"_"+day, "a")
or even better:
for i in range( len( asset_list )):
...
for piece in info[0]:
formatted_info = formatted_info + str(piece) + ","
file = open(asset_list[i]+"_"+day, "a")
or finally better:
for i in range( len( asset_list )):
...
formatted_info = ','.join(str(obj) for obj in info[0]) + ','
file = open(asset_list[i]+"_"+day, "a")

Related

Encountering "List Index out of Range" Exception while Web Scraping via Selenium

I'm scraping data for a data science project using Selenium, and I don't know why I get Index errors on the write-to-csv portion. When I print out the data as-is, the output looks normal.
Code below:
'''
driver = webdriver.Firefox(executable_path="/filepath/geckodriver.exe")
url = 'https://website.com'
driver.get(url)
with open('file.csv', 'w') as f:
f.write('Column1', 'Column2', 'Column3', '\n')
ids = driver.find_elements_by_xpath('//*[#class="id-name"]')
id_list = []
for i in range(50):
id_list.append(ids[i].text)
print(len(ids))
print(len(id_list))
print(id_list[0:50])
# Break up into batches to save memory
new_id_list = [id_list[i:i+5] for i in range(0,len(id_list),5)]
#time.sleep(1200)
for i in range(len(new_id_list)):
for j in range(len(new_id_list[i])):
url = 'http://www.website.com?id=' + str(id_list[j])
driver.get(url)
col1 = driver.find_elements_by_xpath('//*[#id="field-value-col_1"]/span/span')
col2 = driver.find_elements_by_xpath('//h1[#id="field-value-col_2"]')
col3 = driver.find_elements_by_xpath('//*[#id="field-value-col_3"]')
print(id_list[i][j] + ',' + col1[0].text + ',' + col2[0].text + ',' + col3[0].text, '\n')
# This is where I get the error usually.
with open('bugzilla.csv', 'w') as f:
f.write(id_list[i][j] + ',' + col1[0].text + ',' + col2[0].text + ',' + col3[0].text, '\n')
print('Batch of 5')
f.close()
'''
Here
print(id_list[i][j] + ',' + col1[0].text + ',' + col2[0].text + ',' + col3[0].text, '\n')
you work with your id_list as with two-dimensional array while earlier you define it as
id_list = []
for i in range(50):
id_list.append(ids[i].text)
You probably meant: print(new_id_list[i][j] + ',' + col1[0].text + ',' + col2[0].text + ',' + col3[0].text, '\n')

Appending to array using pool

I am trying to scrape data from soccerway.com and checking whether the page is a completed game/game to be played with each instance being written to a seperate csv file. I am running through 10,000 pages and so have written it using Pools. However, I am getting empty lists from the append function and cannot write anything to the csv files.
I tried writing straight to the file instead of list appending however this gave incomplete files
import requests
from bs4 import BeautifulSoup
import time
import numpy as np
import uuid
import time
from multiprocessing import Pool
import sys, os
fixturesA = []
linksA = []
statsA = []
def parse(url):
try:
#print(url)
delays = [0.25,0.5,0.75,1]
delay = np.random.choice(delays)
#time.sleep(delay)
#r = requests.get(url)
r = requests.get(url, timeout = 10)
soup = BeautifulSoup(r.content, "html.parser")
teams = soup.findAll('h3', attrs = {'class' : 'thick'})
homeTeam = teams[0].text.strip()
awayTeam = teams[2].text.strip()
middle = teams[1].text.strip()
dds = soup.findAll('dd')
date = dds[1].text.strip()
gameWeek = dds[2].text.strip()
if ':' not in middle:
middle = middle.split(" - ")
homeGoals = 0
awayGoals = 0
homeGoals = middle[0]
try:
awayGoals = middle[1]
except Exception as e:
homeGoals = "-1"
awayGoals = "-1"
matchGoals = int(homeGoals) + int(awayGoals)
if(matchGoals >= 0):
if(int(homeGoals) > 0 and int(awayGoals) > 0):
btts = "y"
else:
btts = "n"
halfTimeScore = dds[4].text.strip().split(" - ")
firstHalfHomeGoals = halfTimeScore[0]
firstHalfAwayConc = halfTimeScore[0]
firstHalfAwayGoals = halfTimeScore[1]
firstHalfHomeConc = halfTimeScore[1]
firstHalfTotalGoals = int(firstHalfHomeGoals) + int(firstHalfAwayGoals)
secondHalfHomeGoals = int(homeGoals) - int(firstHalfHomeGoals)
secondHalfAwayConc = int(homeGoals) - int(firstHalfHomeGoals)
secondHalfAwayGoals = int(awayGoals) - int(firstHalfAwayGoals)
secondHalfHomeConc = int(awayGoals) - int(firstHalfAwayGoals)
secondHalfTotalGoals = matchGoals - firstHalfTotalGoals
homeTeamContainers = soup.findAll('div', attrs = {'class' : 'container left'})
homeTeamStarting = homeTeamContainers[2]
homeTeamBench = homeTeamContainers[3]
homeTeamYellows = len(homeTeamStarting.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/700/img/events/YC.png' })) + len(homeTeamBench.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/699/img/events/YC.png' }))
homeTeamReds = len(homeTeamStarting.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/700/img/events/RC.png' })) + len(homeTeamBench.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/699/img/events/RC.png' }))
homeTeamCards = homeTeamYellows + homeTeamReds
awayTeamContainers = soup.findAll('div', attrs = {'class' : 'container right'})
awayTeamStarting = awayTeamContainers[2]
awayTeamBench = awayTeamContainers[3]
awayTeamYellows = len(awayTeamStarting.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/700/img/events/YC.png' })) + len(awayTeamBench.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/699/img/events/YC.png' }))
awayTeamReds = len(awayTeamStarting.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/700/img/events/RC.png' })) + len(awayTeamBench.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/699/img/events/RC.png' }))
awayTeamCards = awayTeamYellows + awayTeamReds
matchCards = homeTeamCards + awayTeamCards
try:
iframe = soup.findAll('iframe')
iframeSrc = iframe[1]['src']
url = 'https://us.soccerway.com/' + iframeSrc
c = requests.get(url,timeout = 10)
soupC = BeautifulSoup(c.content, "html.parser")
cornerContainer = soupC.findAll('td', attrs = {'class' : 'legend left value'})
homeCorners = cornerContainer[0].text.strip()
awayCornersConc = homeCorners
cornerContainer = soupC.findAll('td', attrs = {'class' : 'legend right value'})
awayCorners = cornerContainer[0].text.strip()
homeCornersConc = awayCorners
matchCorners = int(homeCorners) + int(awayCorners)
print("Got Score . " + homeTeam + " vs " + awayTeam+" . " + gameWeek )
statsA.append(homeTeam + "," + awayTeam + "," + gameWeek + "," + homeGoals + "," + awayGoals + "," + str(matchGoals) + "," + btts + "," + firstHalfHomeGoals + "," + firstHalfHomeConc + "," + firstHalfAwayGoals + "," + firstHalfAwayConc + "," + str(firstHalfTotalGoals) + "," + str(secondHalfHomeGoals) + "," + str(secondHalfHomeConc) + "," + str(secondHalfAwayGoals) + "," + str(secondHalfAwayConc) + "," + str(secondHalfTotalGoals) + "," + str(homeTeamCards) + "," + str(awayTeamCards) + "," + str(matchCards) + "," + homeCorners + "," + awayCorners + "," + homeCornersConc + "," + awayCornersConc + "," + str(matchCorners)+","+dds[0].text.strip() + "\n")
return None
except Exception as e:
print("Got Score no corners. " + homeTeam + " vs " + awayTeam+" . " + gameWeek + " NO FRAME")
statsA.append(homeTeam + "," + awayTeam + "," + gameWeek + "," + homeGoals + "," + awayGoals + "," + str(matchGoals) + "," + btts + "," + firstHalfHomeGoals + "," + firstHalfHomeConc + "," + firstHalfAwayGoals + "," + firstHalfAwayConc + "," + str(firstHalfTotalGoals) + "," + str(secondHalfHomeGoals) + "," + str(secondHalfHomeConc) + "," + str(secondHalfAwayGoals) + "," + str(secondHalfAwayConc) + "," + str(secondHalfTotalGoals) + "," + str(homeTeamCards) + "," + str(awayTeamCards) + "," + str(matchCards) + "," + "" + "," + "" + "," + "" + "," + "" + "," + ""+","+dds[0].text.strip() + "\n")
return None
else:
fixturesA.append(homeTeam + "," + awayTeam + "," + gameWeek + "," + date + "\n")
linksA.append(url + "\n")
print(homeTeam + " vs " + awayTeam + " at " + middle + " GW:" + gameWeek)
return None
except Exception as e:
exc_type, exc_obj, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print(exc_type, fname, exc_tb.tb_lineno)
linksA.append(url + "\n")
print(url)
return None
stats = open('Statsv2.csv','a',encoding='utf-8')
fixtures = open('fixturesv2.csv','w',encoding='utf-8')
with open('links.txt') as f:
content = f.readlines()
content = [x.strip() for x in content]
links = open('links.txt','w')
if __name__ == '__main__':
start_time = time.time()
p = Pool(20) # Pool tells how many at a time
records = p.map(parse, content)
p.terminate()
p.join()
print("--- %s seconds ---" % (time.time() - start_time))
I assume you are running Windows? Then the answer is that multi-processing in Windows creates copies instead of forks. So you have your main process with the lists and you get your working processes (from pool) with their own separate set of lists.
The workers most likely fill their list correctly, but the lists in the main-process don't get any data and so are staying empty. And the workers do not return anything. So, as you write your files in the main-process, you get empty files.
An easy way to solve this is creating pipes or queues between the main process and the workers to allow communication between the threads. You could also use shared arrays like they are provided by the multiprocessing class, but than you would need to know the length during creation.
see documentation: Multiprocessing
as pointed out by #RaJa you're not actually doing anything the parent/controlling process can see. the easiest is just to return values from the mapped function
for example, parse() could return tuple at the end like:
def parse(url):
# do work
return url, homeTeam, awayTeam, gameWeek, homeGoals, awayGoals # ...
then the parent process can receive the values and do useful things like saving them to a CSV file:
import csv
with Pool(20) as pool:
records = pool.map(parse, content)
with open('stats.csv', 'w') as fd:
out = csv.writer(fd)
out.writerow([
'url', 'hometeam', 'awayteam',
# and the remaining column names for the header
])
out.writerows(records)

Why are Lists causing problems

So I am working on a certain code to modify a text file. When I use this function individually, it works perfectly
TextRotation.rotTextC("cv.txt")
But when I use it in batch as a list like this
def files_LTXT(pathF):
return glob.glob(pathF + "*" + ".txt")
for i in range (len(listFileTXT)):
TextRotation.rotTextC(listFileTXT[i])
IT gives the following error:
File "C:\Users\Administrator\PycharmProjects\openCV\TextRotation.py", line
9, in rotLineC
0
valueObj = int(lineStr[c1])
0.472917 0.713281 0.845833 0.376563
IndexError: string index out of range
Function rotLineC is as follows:
def rotLineC(lineStr, c1):
if len(lineStr) > 2:
valueObj = int(lineStr[c1])
print(valueObj)
valueXC = float(lineStr[(c1+2):(c1+10)])
valueYC = float(lineStr[(c1+11):(c1+19)])
valueW = float(lineStr[(c1+20):(c1+28)])
valueH = float(lineStr[(c1+29):(c1+37)])
# print(valueXC)
# print(valueYC)
# print(valueW)
# print(valueH)
nValueXC = round(1 - valueYC, 6)
nValueYC = round(valueXC, 6)
nValueW = round(valueH, 6)
nValueH = round(valueW, 6)
rotString = str(int(valueObj)) + " " + str(nValueXC) + " " + \
str(nValueYC) + " " + str(nValueW) + " " + str(nValueH)
print(str(nValueXC) + " " + str(nValueYC) + " " + str(nValueW) + " " + str(nValueH))
print(rotString)
return rotString
This function works fine!
for i in range (len(listFileJPG)):
ImageRotation.rotImage(listFileJPG[i])
Mind to include the / to the end of the path! (I am assuming a UNIX environment here)
If the path is 'dev/my_pat', for example, your function will fail. The path must end with a /. You can it to your function:
...
if pathF[-1] != '/':
return glob.glob(pathF + "/*.txt")
...
Also, do not iterate using indices, use the pythonic way!
for file in listFileTXT(my_path):
TextRotation.rotTextC(file)

Script Loop through files in directory

I have the following code which creates the txt file I require from a shp.file with the data I need. I have a folder called profiles containing a few number of shape files named (profil1.shp, profil2.shp, profil3.shp etc.). I was wondering how to create a loop so that the script creates for each file a txt file with the same name (eg. for profil1.shp create profil1.txt, profil2.shp create profil2.txt and so on).
import ogr, os, sys, osr
os.chdir('..\profiles')
file = open('profil1.txt', 'w')
driver = ogr.GetDriverByName('ESRI Shapefile')
datasource = driver.Open('profil1.shp', 0)
if datasource is None:
print 'Could not open file'
sys.exit(1)
layer = datasource.GetLayer()
feature = layer.GetNextFeature()
while feature:
id = feature.GetFieldAsString('ID')
Distanta = feature.GetFieldAsString('DIST')
Z = feature.GetFieldAsString('Z')
geom = feature.GetGeometryRef()
x = str(geom.GetX())
y = str(geom.GetY())
file.write(id + " " + Distanta + " " + "[X]:" + " " + x + ' ' + '[Y]:' + " " + y + " " + " " + "[Z]" + Z + " " + "\n")
feature.Destroy()
feature = layer.GetNextFeature()
datasource.Destroy()
file.close()
edit: the code is returning a Could not open file.Photo of the folder containing the files and their respective names. Safe to assume I am doing something wrong.
import ogr, os, sys, osr,os.path
os.chdir = ('C:\Users\Andrei\Desktop\profil3')
l = os.listdir('C:\Users\Andrei\Desktop\profil3')
for i in l:
if i.endswith('.shp'):
s1 = s.split('.')[0] + '.txt'
file = open(s1, 'w')
driver = ogr.GetDriverByName('ESRI Shapefile')
datasource = driver.Open(i, 0)
if datasource is None:
print 'Could not open file'
sys.exit(1)
layer = datasource.GetLayer()
feature = layer.GetNextFeature()
while feature:
id = feature.GetFieldAsString('ID')
Distanta = feature.GetFieldAsString('DIST')
Z = feature.GetFieldAsString('Z')
geom = feature.GetGeometryRef()
x = str(geom.GetX())
y = str(geom.GetY())
file.write(id + " " + Distanta + " " + "[X]:" + " " + x + ' ' + '[Y]:' + " " + y + " " + " " + "[Z]" + Z + " " + "\n")
feature.Destroy()
feature = layer.GetNextFeature()
datasource.Destroy()
file.close()
You can use os.listdir() to list the files and folders in the current directory.
This returns a list of all files in the current directory (or the directory given to it as parameter , if no parameter is specified it checks the current directory) .
Then you can check for files with the name ending with .shp using string.endswith() function and then use that to create your new files.
Example of a small portion -
import os , os.path
l = os.listdir()
for i in l:
if i.endswith('.shp'):
s1 = s.split('.')[0] + '.txt'
At the end s1 would contain the file with extension as .txt .
Then you can do your logic on this file, and keep on doing like this.
Full code would look something like -
import ogr, os, sys, osr,os.path
os.chdir('..\profiles')
l = os.listdir()
for i in l:
if i.endswith('.shp'):
s1 = s.split('.')[0] + '.txt'
file = open(s1, 'w')
driver = ogr.GetDriverByName('ESRI Shapefile')
datasource = driver.Open(i, 0)
if datasource is None:
print 'Could not open file'
sys.exit(1)
layer = datasource.GetLayer()
feature = layer.GetNextFeature()
while feature:
id = feature.GetFieldAsString('ID')
Distanta = feature.GetFieldAsString('DIST')
Z = feature.GetFieldAsString('Z')
geom = feature.GetGeometryRef()
x = str(geom.GetX())
y = str(geom.GetY())
file.write(id + " " + Distanta + " " + "[X]:" + " " + x + ' ' + '[Y]:' + " " + y + " " + " " + "[Z]" + Z + " " + "\n")
feature.Destroy()
feature = layer.GetNextFeature()
datasource.Destroy()
file.close()
A better way of openning files, etc is using with statement. Look up its tutorial here.

Python - key error when using "if in dict"

I am receiving the following error when running a script to parse contents of an XML file.
if iteration.findtext("Iteration_query-def") in ecdict:
KeyError: 'XLOC_000434'
I was under the impression that using "if in dict" would mean that if the key is not found in the dictionary, the script will continue past the if statement and proceed with the rest of the code. Below is the problematic section of the code I am using. I realise this is quite a basic question, but I am unsure what else I can say, and I don't understand why I am receiving this error.
import xml.etree.ElementTree as ET
tree = ET.parse('507.FINAL_14.2.14_2_nr.out_fmt5.out')
blast_iteration = tree.find("BlastOutput_iterations")
for iteration in blast_iteration.findall("Iteration"):
query = iteration.findtext("Iteration_query-def").strip().strip("\n")
if query in score:
continue
if iteration.findtext("Iteration_message") == "No hits found":
if iteration.findtext("Iteration_query-def") in tair:
tairid = tair[iteration.findtext("Iteration_query-def")][0]
tairdes = tair[iteration.findtext("Iteration_query-def")][1]
else:
tairid = "-"
tairdes = "-"
goterms = ""
ecterms = ""
if iteration.findtext("Iteration_query-def") in godict:
for x in godict[iteration.findtext("Iteration_query-def")][:-1]:
goterms = goterms + x + ";"
goterms = goterms + godict[iteration.findtext("Iteration_query-def")][-1]
else:
goterms = "-"
if iteration.findtext("Iteration_query-def") in ecdict:
for x in ecdict[iteration.findtext("Iteration_query-def")][:-1]:
ecterms = ecterms + x + ";"
ecterms = ecterms + ecdict[iteration.findtext("Iteration_query-def")][-1]
else:
ecterms = "-"
if iteration.findtext("Iteration_query-def") in godescr:
desc = godescr[iteration.findtext("Iteration_query-def")]
else:
desc = "-"
n += 1
p = "PvOAK_up"+str(n) + "\t" + tranlen[iteration.findtext("Iteration_query-def")] + "\t" + orflen[iteration.findtext("Iteration_query-def")] + "\t" + "-" + "\t" + "-" + "\t" + tairid + "\t" + tairdes + "\t" + goterms + "\t" + ecterms + "\t" + desc + "\t" + str(flower[query][2]) + "\t" + str('{0:.2e}'.format(float(flower[query][1]))) + "\t" + str('{0:.2f}'.format(float(flower[query][0]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][2]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][1]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][0])))
print p
Hope you can help,
Thanks.
edit: I should say that godict and ecdict were previously created as follows - I can submit the entire code if needs be:
godict = {}
ecdict = {}
godescr = {}
f = open("507.FINAL_14.2.14_2_nr.out_fmt5.out.annot")
for line in f:
line = line.split("\t")
if len(line) > 2:
godescr[line[0]] = line[2]
line[1] = line[1].strip("\n")
if line[1].startswith("EC"):
if line[0] in ecdict:
a = ecdict[line[0]]
a.append(line[1])
ecdict[line[0]] = a
else:
ecdict[line[0]] = [line[1]]
else:
if line[0] in godict:
a = godict[line[0]]
a.append(line[1])
godict[line[0]] = a
else:
godict[line[0]] = [line[1]]
Traceback:
Traceback (most recent call last):
File "2d.test.py", line 170, in <module>
p = "PvOAK_up"+str(n) + "\t" + tranlen[iteration.findtext("Iteration_query-def")] + "\t" + orflen[iteration.findtext("Iteration_query-def")] + "\t" + "-" + "\t" + "-" + "\t" + tairid + "\t" + tairdes + "\t" + goterms + "\t" + ecterms + "\t" + desc + "\t" + str(flower[query][2]) + "\t" + str('{0:.2e}'.format(float(flower[query][1]))) + "\t" + str('{0:.2f}'.format(float(flower[query][0]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][2]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][1]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][0])))
KeyError: 'XLOC_000434'

Categories

Resources