xml libxml2 parsing

xml libxml2 parsing - python

In the code below, my problem is that it's writing output to all folders based on only one input file. Can some one give me a hint and check if my code is looping properly?
import libxml2
import os.path
from numpy import *
from cfs_utils import *
np=[1,2,3,4,5,6,7,8]
n=[20,30,40,60,80,100,130]
solver=["CG_iluk", "CG_saamg", "CG_ssor", "BiCGSTABL_iluk", "BiCGSTABL_saamg", "BiCGSTABL_ssor", "cholmod", "ilu" ]
file_list=["eval_CG_iluk_default","eval_CG_saamg_default", "eval_CG_ssor_default", "eval_BiCGSTABL_iluk", "eval_BiCGSTABL_saamg", "eval_BiCGSTABL_ssor","simp_cholmod_solver_3D_evaluate ", "simp_ilu_solver_3D_evaluate" ]
for sol in solver:
i=0
for cnt_np in np:
#open write_file= "Graphs/" + "Np"+ cnt_np + "/CG_iluk.dat"
#"Graphs/Np1/CG_iluk.dat"
write_file = open("Graphs/"+ "Np"+ str(cnt_np) + "/" + sol + ".dat", "w")
#loop through different unknowns
for cnt_n in n:
#open file "cfs_calculations_" + cnt_n +"np"+ cnt_np+ "/" + file_list(i) + "_default.info.xml"
read_file = "cfs_calculations_" +str(cnt_n) +"np"+ str(cnt_np) + "/" + file_list[i] + ".info.xml"
#read wall and cpu time and write
if os.path.exists(read_file):
doc = libxml2.parseFile(read_file)
xml = doc.xpathNewContext()
walltime = xpath(xml, "//cfsInfo/sequenceStep/OLAS/mechanic/solver/summary/setup/timer/#wall")
cputime = xpath(xml, "//cfsInfo/sequenceStep/OLAS/mechanic/solver/summary/setup/timer/#cpu")
unknowns = 3*cnt_n*cnt_n*cnt_n
write_file.write(str(unknowns) + "\t" + walltime + "\t" + cputime + "\n")
doc.freeDoc()
write_file.close()
i=i+1

Problem solved, I = o, was outside the loop

Related

How to add (merge) multiple images in one directory

I have two types (A and B) of images in one directory and I want to add them together (not concatenate) them like this:
A1.jpeg + B1.jpeg = Merged1.jpeg
A2.jpeg + B2.jpeg = Merged2.jpeg
      ...
AN.jpeg + BN.jpeg = MergedN.jpeg
I don't know how to customize my code so it would work for the whole directory:
import cv2
import os
for i,filenames in os.walk('.'):
A1 = cv2.imread('A1.jpeg',0)
B1 = cv2.imread('B1.jpeg',0)
image = cv2.add([A1,B1])
filename = ('Merged' + {i} + '.jpeg')
cv2.imwrite(filename, image)
Any ideas? Thanks
EDIT:
I added counter in for loop, because you cannot define for loop in the way I did before.
import cv2
import os
i=0
for filenames in os.walk('.'):
i = i + 1
A = "A" + str(i) + ".jpeg"
B = "B" + str(i) + ".jpeg"
Ai = cv2.imread(A,0)
Bi = cv2.imread(B,0)
image = cv2.add([Ai,Bi])
filename = ('Merged' + str(i) + '.jpeg')
cv2.imwrite(filename, image)
But it only adds A1 and B1. Is this a wrong way to count in for loop?

Just change the filename in imread with a custom variable which you can use i in.
A = "A" + str(i) + ".jpeg"
B = "B" + str(i) + ".jpeg"
Ai = cv2.imread(A,0)
Bi = cv2.imread(B,0)
I assumed that i is a number.

You can use glob for that it will be easier i think
import glob
for i, a_file in enumerate(glob.glob('./A*')):
A = cv2.imread(a_file,0)
B = cv2.imread(a_file.replace('A', 'B'),0)
image = cv2.add([A,B])
filename = ('Merged' + {i} + '.jpeg')
cv2.imwrite(filename, image)

I solved it like this:
import glob
import cv2
number_of_images = 750
for a_file in glob.glob('./A*'):
for i in range(1,number_of_images):
A = "A" + str(i) + ".jpeg"
B = "B" + str(i) + ".jpeg"
A1 = cv2.imread(A,0)
B1 = cv2.imread(B,0)
image = cv2.add([A1,B1])
filename = ('Merged' + str(i) + '.jpeg')
cv2.imwrite(filename, image)
print(i)
if i==number_of_images-1:
break
Thanks for your advices!

how to compare two strings in pandas large dataframe (python3.x)?

I have two DFs from 2 excel files.
1st file(awcProjectMaster)(1500 records)
projectCode projectName
100101 kupwara
100102 kalaroos
100103 tangdar
2nd file(village master)(more than 10 million records)
villageCode villageName
425638 wara
783651 tangdur
986321 kalaroo
I need to compare the projectName and villageName along with the percentage match.
The following code works fine but it is slow. How can I do the same thing in a more efficient way.
import pandas as pd
from datetime import datetime
df = pd.read_excel("C:\\Users\\Desktop\\awcProjectMaster.xlsx")
df1 = pd.read_excel("C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.xlsx")
def compare(prjCode, prjName, stCode, stName, dCode, dName, sdCode, sdName, vCode, vName):
with open(r"C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.txt", "a") as f:
percentMatch = 0
vLen = len(vName)
prjLen = len(prjName)
if vLen > prjLen:
if vName.find(prjName) != -1:
percentMatch = (prjLen / vLen) * 100
f.write(prjCode + "," + prjName + "," + vCode + "," + vName + "," + str(round(percentMatch)) + "," + stCode + "," + stName + "," + dCode + "," + dName + sdCode + "," + sdName + "\n")
else:
res = 0
# print(res)
elif prjLen >= vLen:
if prjName.find(vName) != -1:
percentMatch = (vLen / prjLen) * 100
f.write(prjCode + "," + prjName + "," + vCode + "," + vName + "," + str(round(percentMatch)) + "," + stCode + "," + stName + "," + dCode + "," + dName + sdCode + "," + sdName + "\n")
else:
res = 0
# print(res)
f.close()
for idx, row in df.iterrows():
for idxv, r in df1.iterrows():
compare(
str(row["ProjectCode"]),
row["ProjectName"].lower(),
str(r["StateCensusCode"]),
r["StateName"],
str(r["DistrictCode"]),
r["DistrictName"],
str(r["SubDistrictCode"]),
r["SubDistrictNameInEnglish"],
str(r["VillageCode"]),
r["VillageNameInEnglish"].lower(),
)

Your distance metric for the strings isn't too accurate, but if it works for you, fine. (You may want to look into other options like the builtin difflib, or the Python-Levenshtein module, though.)
If you really do need to compare 1,500 x 10,000,000 records pairwise, things are bound to take some time, but there are a couple things that we can do pretty easily to speed things up:
open the log file only once; there's overhead, sometimes significant, in that
refactor your comparison function into a separate unit, then apply the lru_cache() memoization decorator to make sure each pair is compared only once, and the subsequent result is cached in memory. (In addition, see how we sort the vName/prjName pair – since the actual order of the two strings doesn't matter, we end up with half the cache size.)
Then for general cleanliness,
use the csv module for streaming CSV into a file (the output format is slightly different than with your code, but you can change this with the dialect parameter to csv.writer()).
Hope this helps!
import pandas as pd
from datetime import datetime
from functools import lru_cache
import csv
df = pd.read_excel("C:\\Users\\Desktop\\awcProjectMaster.xlsx")
df1 = pd.read_excel("C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.xlsx")
log_file = open(r"C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.txt", "a")
log_writer = csv.writer(log_file)
#lru_cache()
def compare_vname_prjname(vName, prjName):
vLen = len(vName)
prjLen = len(prjName)
if vLen > prjLen:
if vName.find(prjName) != -1:
return (prjLen / vLen) * 100
elif prjLen >= vLen:
if prjName.find(vName) != -1:
return (vLen / prjLen) * 100
return None
def compare(prjCode, prjName, stCode, stName, dCode, dName, sdCode, sdName, vCode, vName):
# help the cache decorator out by halving the number of possible pairs:
vName, prjName = sorted([vName, prjName])
percent_match = compare_vname_prjname(vName, prjName)
if percent_match is None: # No match
return False
log_writer.writerow(
[
prjCode,
prjName,
vCode,
vName,
round(percent_match),
stCode,
stName,
dCode,
dName + sdCode,
sdName,
]
)
return True
for idx, row in df.iterrows():
for idxv, r in df1.iterrows():
compare(
str(row["ProjectCode"]),
row["ProjectName"].lower(),
str(r["StateCensusCode"]),
r["StateName"],
str(r["DistrictCode"]),
r["DistrictName"],
str(r["SubDistrictCode"]),
r["SubDistrictNameInEnglish"],
str(r["VillageCode"]),
r["VillageNameInEnglish"].lower(),
)

syntax error for writing a variable in a script

dump.pbd='pdb' + pdbFile + '_' + 'res' + residMin + '_' residMax + '.pdb'
the program keep giving me syntax error when I run it.
import re
import sys
import os
import time
from sys import argv
import xmltodict
if len(sys.argv) < 3:
message = '\n Get protein file in the form of pdf file from pdb website. \n\n Usage: '+sys.argv[0] + ' [4-letter PDBid] [resid range] \n' + ' Example: ' + sys.argv[0] + ' 2rh1 53-71\n' + ' Output File: pdb2rh1_res53-71.pdb'
print (message)
exit()
pdbID=sys.argv[1]
residRange=sys.argv[2]
residData=residRange.split('-')
residMin=int(residData[0])
residMax=int(residData[1])
twoletter=pdbID[1:3]
xmlfile=pdbID + '.xml'
pdbgzfile=pdbID + '.pdb.gz'
pdbFile=pdbID+'.pdb'
dump.pbd='pdb' + pdbFile + '_' + 'res' + residMin + '_' residMax + '.pdb'
wgetcom='wget https://files.rcsb.org/view/'+pdbFile+' -O '+pdbFile
print(wgetcom)
os.system(wgetcom)
f = open (pdbFile,'r')
k = 0
rc = 0
data = f.readlines()
g = open (dump.pdb, 'w')
for linedata in data:
line=linedata.strip()
words = line.split()
if(words[0] == 'ATOM'):
k=k+1
words[5]=int(line[22:26].strip())
if(words[5] in range(residMin,residMax+1)):
g.write(linedata)
for i in words:
if(i=='CA'):
rc = rc+1
print(rc)
the code is not working because it is giving me a syntax error for line number 22 that states dump.pbd='pdb' + pdbFile + '' + 'res' + residMin + '' residMax + '.pdb'. so can you please help me with that?
Thanks so much on advance!

You've forgotten to add a + sign.
This line should work: dump.pbd='pdb' + pdbFile + '' + 'res' + residMin + '' + residMax + '.pdb'
There must be a + sign between '_' and residMax as this is the Python concatenating strings syntax.

Import equation from text file in Python

I have a complex equation which is generated into a .txt file. I would like to import this equation (which is all the text in the .txt file) and make a function from it, which can be subsequently fit.
Does anybody know how I might go about this? The equation to be fitted is at the the very bottom. My feeble attempt to import is below...
myfile1= open("dummyfile.txt", 'r')
def fcn(J1,J2,T,k,g):
return myfile1.read()
"dummyfile.txt" contents:
B**2*N*(12.0*g**2*sp.exp(2.0*J2/(T*k)) + 60.0*g**2*sp.exp(6.0*J2/(T*k)) + 168.0*g**2*sp.exp(12.0*J2/(T*k)) + 360.0*g**2*sp.exp(20.0*J2/(T*k)) + 30.0*g**2*sp.exp((2.0*J1 + 4.0*J2)/(T*k)) + 168.0*g**2*sp.exp((4.0*J1 + 8.0*J2)/(T*k)) + 360.0*g**2*sp.exp((6.0*J1 + 14.0*J2)/(T*k)) + 180.0*g**2*sp.exp((8.0*J1 + 12.0*J2)/(T*k)) + 660.0*g**2*sp.exp((8.0*J1 + 22.0*J2)/(T*k)) + 660.0*g**2*sp.exp((12.0*J1 + 18.0*J2)/(T*k)) + 1092.0*g**2*sp.exp((16.0*J1 + 26.0*J2)/(T*k)) + 546.0*g**2*sp.exp((18.0*J1 + 24.0*J2)/(T*k)) + 1680.0*g**2*sp.exp((24.0*J1 + 32.0*J2)/(T*k)) + 1224.0*g**2*sp.exp((32.0*J1 + 40.0*J2)/(T*k)))/(3*T*k*(6*sp.exp(2.0*J2/(T*k)) + 10*sp.exp(6.0*J2/(T*k)) + 14*sp.exp(12.0*J2/(T*k)) + 18*sp.exp(20.0*J2/(T*k)) + 5*sp.exp((2.0*J1 + 4.0*J2)/(T*k)) + 14*sp.exp((4.0*J1 + 8.0*J2)/(T*k)) + 18*sp.exp((6.0*J1 + 14.0*J2)/(T*k)) + 9*sp.exp((8.0*J1 + 12.0*J2)/(T*k)) + 22*sp.exp((8.0*J1 + 22.0*J2)/(T*k)) + 22*sp.exp((12.0*J1 + 18.0*J2)/(T*k)) + 26*sp.exp((16.0*J1 + 26.0*J2)/(T*k)) + 13*sp.exp((18.0*J1 + 24.0*J2)/(T*k)) + 30*sp.exp((24.0*J1 + 32.0*J2)/(T*k)) + 17*sp.exp((32.0*J1 + 40.0*J2)/(T*k)) + 1))

You can do that with exec().
Code:
def build_function(filename):
with open(filename, 'rU') as f:
eqn = f.read().strip()
exec("def fcn(J1, J2, T, k, g):\n return ({})".format(eqn))
return locals()['fcn']
Test Code:
fcn = build_function('file1')
print(fcn(1, 2, 3, 4, 5))
File1:
J2 + T*k
Results:
14

Script Loop through files in directory

I have the following code which creates the txt file I require from a shp.file with the data I need. I have a folder called profiles containing a few number of shape files named (profil1.shp, profil2.shp, profil3.shp etc.). I was wondering how to create a loop so that the script creates for each file a txt file with the same name (eg. for profil1.shp create profil1.txt, profil2.shp create profil2.txt and so on).
import ogr, os, sys, osr
os.chdir('..\profiles')
file = open('profil1.txt', 'w')
driver = ogr.GetDriverByName('ESRI Shapefile')
datasource = driver.Open('profil1.shp', 0)
if datasource is None:
print 'Could not open file'
sys.exit(1)
layer = datasource.GetLayer()
feature = layer.GetNextFeature()
while feature:
id = feature.GetFieldAsString('ID')
Distanta = feature.GetFieldAsString('DIST')
Z = feature.GetFieldAsString('Z')
geom = feature.GetGeometryRef()
x = str(geom.GetX())
y = str(geom.GetY())
file.write(id + " " + Distanta + " " + "[X]:" + " " + x + ' ' + '[Y]:' + " " + y + " " + " " + "[Z]" + Z + " " + "\n")
feature.Destroy()
feature = layer.GetNextFeature()
datasource.Destroy()
file.close()
edit: the code is returning a Could not open file.Photo of the folder containing the files and their respective names. Safe to assume I am doing something wrong.
import ogr, os, sys, osr,os.path
os.chdir = ('C:\Users\Andrei\Desktop\profil3')
l = os.listdir('C:\Users\Andrei\Desktop\profil3')
for i in l:
if i.endswith('.shp'):
s1 = s.split('.')[0] + '.txt'
file = open(s1, 'w')
driver = ogr.GetDriverByName('ESRI Shapefile')
datasource = driver.Open(i, 0)
if datasource is None:
print 'Could not open file'
sys.exit(1)
layer = datasource.GetLayer()
feature = layer.GetNextFeature()
while feature:
id = feature.GetFieldAsString('ID')
Distanta = feature.GetFieldAsString('DIST')
Z = feature.GetFieldAsString('Z')
geom = feature.GetGeometryRef()
x = str(geom.GetX())
y = str(geom.GetY())
file.write(id + " " + Distanta + " " + "[X]:" + " " + x + ' ' + '[Y]:' + " " + y + " " + " " + "[Z]" + Z + " " + "\n")
feature.Destroy()
feature = layer.GetNextFeature()
datasource.Destroy()
file.close()

You can use os.listdir() to list the files and folders in the current directory.
This returns a list of all files in the current directory (or the directory given to it as parameter , if no parameter is specified it checks the current directory) .
Then you can check for files with the name ending with .shp using string.endswith() function and then use that to create your new files.
Example of a small portion -
import os , os.path
l = os.listdir()
for i in l:
if i.endswith('.shp'):
s1 = s.split('.')[0] + '.txt'
At the end s1 would contain the file with extension as .txt .
Then you can do your logic on this file, and keep on doing like this.
Full code would look something like -
import ogr, os, sys, osr,os.path
os.chdir('..\profiles')
l = os.listdir()
for i in l:
if i.endswith('.shp'):
s1 = s.split('.')[0] + '.txt'
file = open(s1, 'w')
driver = ogr.GetDriverByName('ESRI Shapefile')
datasource = driver.Open(i, 0)
if datasource is None:
print 'Could not open file'
sys.exit(1)
layer = datasource.GetLayer()
feature = layer.GetNextFeature()
while feature:
id = feature.GetFieldAsString('ID')
Distanta = feature.GetFieldAsString('DIST')
Z = feature.GetFieldAsString('Z')
geom = feature.GetGeometryRef()
x = str(geom.GetX())
y = str(geom.GetY())
file.write(id + " " + Distanta + " " + "[X]:" + " " + x + ' ' + '[Y]:' + " " + y + " " + " " + "[Z]" + Z + " " + "\n")
feature.Destroy()
feature = layer.GetNextFeature()
datasource.Destroy()
file.close()
A better way of openning files, etc is using with statement. Look up its tutorial here.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

xml libxml2 parsing - python

Problem solved, I = o, was outside the loop

Related

How to add (merge) multiple images in one directory

how to compare two strings in pandas large dataframe (python3.x)?

syntax error for writing a variable in a script

Import equation from text file in Python

Script Loop through files in directory

Categories

Resources