I am still new to Python and have been using it for work and for a few side projects automating my Plex media management tasks.
I am trying to write a Python script that takes a set list of domains from a CSV file and matches each one to its DNS name server. For example, querying plex.tv for 'NS' records would return jeremy.ns.cloudflare.com.
My main goal is to read in the list of domains from a CSV,
run my code to match those domains to a DNS resolver name,
and write those to a new CSV file, then zip the two together, which is what I have in my code.
I am having a few problems along the way.
Visual Studio Code doesn't recognize import dns.resolver (not a huge issue, but if you know the fix for that it would save me from having to run the script from the command line)
Matching Domains to their DNS resolver is throwing the error "AttributeError: 'list' object has no attribute 'is_absolute'"
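For reference, each row that csv.reader yields is a list, so the code below ends up handing a list (rather than a single domain string) to dns.resolver.resolve, which is what produces the is_absolute error. A minimal standalone call on one domain string would look roughly like this (assuming dnspython is installed; plex.tv is just the example domain from above):
import dns.resolver
# Resolve the NS records for a single domain string, not a whole CSV row (which is a list)
answers = dns.resolver.resolve('plex.tv', 'NS')
for rdata in answers:
    print(rdata.target)  # e.g. jeremy.ns.cloudflare.com.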
import csv
import socket
import dns.resolver
import os
from os.path import dirname, abspath
# Setting Variables
current_path = dirname(abspath(__file__))
domainFName = '{0}/domains.csv'.format(current_path)
outputFile = '{0}/output.csv'.format(current_path)
dnsList = '{0}/list2.csv'.format(current_path)
case_list = []
fields = ['Domains', 'DNS Resolvers']
caseList = []
dnsResolve = []
# Read in all domains from csv into list
with open(domainFName, 'r') as file:
    for line in csv.reader(file):
        case_list.append(line)
print(case_list)
# Match domains to the DNS Resolver Name
for domains in case_list:
    answer = dns.resolver.resolve(domains, 'NS')
    server = answer.target
    dnsResolve.append(server)
# Write the dns Resolver names into a new csv file
with open(dnsList,'w', newline="") as r:
    writers = csv.writer(r)
    writers.writerows(caseList)
# Write the domains and dns resolvers to new output csv
with open(outputFile,'w', newline="") as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    writer.writerow(zip(case_list,caseList))
exit()
Thanks for any help
After a discussion with a co-worker, I was able to resolve my issue. Just for the sake of it, in case anyone wants to use this code for a similar need (we use it for DMARC), I will post the whole code:
import dns.resolver
import csv
import os
from os.path import dirname, abspath
# Setting Variables
current_path = dirname(abspath(__file__))
domainFName = '{0}/domains.csv'.format(current_path)
outputFile = '{0}/output.csv'.format(current_path)
dnsList = '{0}/dnslist.csv'.format(current_path)
backupCSV = '{0}/backup-output.csv'.format(current_path)
case_list = []
dns_list = []
fields = ['Domains', 'DNS Resolvers']
csv_output = zip(case_list, dns_list)
domainAmount = 0
rd = 00
dnresolve = 00
part = 0
percentL = []
percents = [10,20,30,40,50,60,70,80,90,95,96,97,98,99]
percentList = []
floatingList = []
floatPart = []
x = 00
keyAzure = 'azure'
keyCSC = 'csc'
while x < .99:
    x += .01
    floatingList.append(x)
# THIS IS THE CODE FOR WRITING CSV FILES INTO LISTS - LABELED AS #1
print("FILES MUST BE CSV, WILL RETURN AN ERROR IF NOT. LEAVE OFF .CSV")
# Here we will gather the input of which csv file to use. If none is entered, it will use domains.csv
print("Enter your input file name (if blank will use default):")
UserFile = str(input("Enter your filename: ") or "domains")
fullFile = UserFile + '.csv'
domainFName = '{0}/{1}'.format(current_path, fullFile)  # keep the file next to the script, as above
# Here we will specify the output file name. If the file does not exist, it will be created
# If the user enters no data, the default will be used, output.csv
print("Enter your output file name (if blank will use default):")
UserOutput = str(input("Enter your filename: ") or "output")
fullOutput = UserOutput + '.csv'
outputFile = '{0}/{1}'.format(current_path, fullOutput)
# Read in all domains from csv into list
with open(domainFName, 'r') as file:
    for line in csv.reader(file):
        case_list.append(line)
        domainAmount += 1
print("Starting the resolver:")
print("You have " + str(domainAmount) + " Domains to resolve:")
# THIS IS THE END OF THE CODE FOR WRITING CSV FILES INTO LISTS - LABELED AS #1
# THE CODE BELOW IS WORKING FOR FINDING THE DNS RESOLVERS - LABELED AS #2
# Function for matching domains to DNS resolvers
def dnsResolver(domain):
    try:
        answers = dns.resolver.resolve(domain, 'NS')
        for server in answers:
            dns_list.append(server.target)
    except:
        dns_list.append("Did Not Resolve")
print("Now resolving domains to their DNS name:")
print("This will take a few minutes. Check out the progress bar for your status:")
print("I have resolved 0% Domains:")
# This code is for finding the percentages for the total amount of domains to find progress status
def percentageFinder(percent, whole):
    return (percent * whole) / 100
def percentGetter(part, whole):
    return (100 * int(part)/int(whole))
for x in percents:
    percentList.append(int(percentageFinder(x, domainAmount)))
percentL = percentList
#End code for percentage finding
for firstdomain in case_list:
    for domain in firstdomain:
        dnsResolver(domain)
        # dnsResolver appends either the NS target(s) or "Did Not Resolve" to dns_list,
        # so check the last entry to see whether this domain resolved
        if dns_list and dns_list[-1] != "Did Not Resolve":
            rd += 1
        else:
            dnresolve += 1
        # Using w+ to overwrite all Domain Names &
        with open(dnsList, 'w+', newline="") as r:
            writers = csv.writer(r)
            writers.writerows(dns_list)
        # This is used for showing the percentage of the matching you have done
        part += 1
        if part in percentL:
            total = int(percentGetter(part, domainAmount))
            print("I Have Resolved {}".format(total) + "%" + " Domains:")
        else:
            pass
print("Resolving has completed. Statistics Below:")
print("------------------------------------------")
print("You had " + str(rd) + " domains that resolved.")
print("You had " + str(dnresolve) + " domains that did NOT resolve")
# THIS IS THE END OF THE WORKING CODE - LABELED AS #2
# Write the dns Resolver names into a new csv file
print("Now writing your domains & their DNS Name to an Output File:")
with open(outputFile, 'w+', newline="\n") as f:
    writer = csv.writer(f, dialect='excel')
    writer.writerow(fields)
    for row in csv_output:
        writer.writerow(row)
print("Writing a backup CSV File")
# Using this to create a backup in case to contain all domains, and all resolvers
# If someone runs the script with a small list of domains, still want to keep a
# running list of everything in case any questions arise.
# This is done by using 'a' instead of 'w' or 'w+' done above.
# Note: the zip object created earlier is exhausted by the first write loop, so rebuild it here
csv_output = zip(case_list, dns_list)
with open(backupCSV, 'a', newline="") as f:
    writer = csv.writer(f, dialect='excel')
    writer.writerow(fields)
    for row in csv_output:
        writer.writerow(row)
print("Your backup is now done processing. Exiting program")
# Sort the files by keyword, in this case the domain being azure or csc
for r in dns_list:
    if keyAzure in r:
        for x in keyAzure:
            FileName = x
            print(FileName)
exit()
Here is some code I'm working on that requires the string module and the opening and closing of files.
#Importing required Packages---------------------------------------------
import string
# Importing Datasets-----------------------------------------------------
allNames = open("allNames.csv", "r")
onlyNames = open("onlyNames.csv", "r")
#=========Tasks==========================================================
# [1] findName(name, outputFile)-----------------------------------------
# Works ####
def findName(name, outputFile):
    outfile = open(outputFile + ".csv", "w")  # Output file
    outfile.write("Artist \tSong \tYear\n")   # Initial title lines
    alreadyAdded = []  # List of lines already added to remove duplicates
    for aline in allNames:  # Looping through allNames.csv
        fields = aline.split("\t")  # Splitting elements of a line into a list
        if fields[-1] == name + "\n":  # Selecting lines with only the specified name (last element)
            dataline = fields[0] + "\t" + fields[1] + "\t" + fields[3]  # Each line in the .csv file
            if dataline not in alreadyAdded:  # Removing duplicates
                outfile.write(dataline + "\n")  # Writing the file
                alreadyAdded.append(dataline)  # Adding lines already added
    outfile.close()
# findName("Mary Anne", "mary anne")
# findName("Jack", "jack")
# findName("Mary", "mary")
# findName("Peter", "peter")
The code serves its intended purpose, as I get an exported file. However, it only works for one function call at a time; for example, if I try to run both findName("Mary Anne", "mary anne") and findName("Jack", "jack") in the same run, the second call does not work. Moreover, all subsequent functions in the project file do not work unless I comment out this code.
Let me know what the issue is, thank you!
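For what it's worth, the usual cause of this behaviour is that allNames is a file object: the first call to findName iterates it to the end, so every later call (and any later loop over it) sees an already-exhausted iterator. A minimal sketch of one possible fix, re-opening the file inside the function instead of reusing the module-level handle (same filenames and logic as above):
def findName(name, outputFile):
    alreadyAdded = []  # lines already written, to drop duplicates
    with open(outputFile + ".csv", "w") as outfile:
        outfile.write("Artist \tSong \tYear\n")
        # Re-open allNames.csv on every call so each call reads from the top
        with open("allNames.csv", "r") as allNames:
            for aline in allNames:
                fields = aline.split("\t")
                if fields[-1] == name + "\n":
                    dataline = fields[0] + "\t" + fields[1] + "\t" + fields[3]
                    if dataline not in alreadyAdded:
                        outfile.write(dataline + "\n")
                        alreadyAdded.append(dataline)

findName("Mary Anne", "mary anne")
findName("Jack", "jack")
Alternatively, calling allNames.seek(0) at the start of the function resets the existing handle to the beginning of the file.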
I'm attempting to read and parse a .txt file that is continually being updated throughout the day. I want to parse only lines that have not already been consumed. These are then to be sent to a Telegram group.
At present, every time I run the script it parses everything.
import urllib.parse  # needed for urllib.parse.unquote below

selections = []
msgList = []
urr = ""
name = ""
ourLines = len(selections)
while(True):
    file1 = open(r'C:\\urlt\log.txt', 'r')
    Lines = file1.readlines()
    file1.close()
    try:
        while(True):
            if(ourLines == len(Lines)):
                break
            else:
                txt = Lines[ourLines].strip()
                tlist = txt.split("&")
                ourLines = ourLines + 1
                for subtxt in tlist:
                    if "eventurl=" in subtxt:
                        a = subtxt[9:len(subtxt) - 3]
                        url = "www.beefandtuna.com/%23" + a.replace("%23", "/").strip('(')
                        #print(url)
                        urr = url
                    elif "bet=" in subtxt:
                        name = urllib.parse.unquote(subtxt[4:len(subtxt)])
                        #print(name)
                        selections.append(url + name)
                        msg = url + " " '\n' "Name: " + name
                        if msg not in msgList:
                            post_to_telegram(msg)  # post_to_telegram is assumed to be defined elsewhere in the script
                            msgList.append(msg)
                        #time.sleep(0.5)
    except:
        pass
Assuming the new contents are appended to the end of the file: after you finish reading the file, create a copy of it.
The next time you read the file, seek to the position given by the length of that copy.
import os
from shutil import copyfile

in_file_loc = r'C:\\SmartBet.io Bot\placerlog.txt'
backup_file_loc = in_file_loc + ".bak"
while True:
    try:
        file_backup_size = os.stat(backup_file_loc).st_size
    except:
        file_backup_size = 0
    file1 = open(in_file_loc, 'r')
    # move file position to the end of the old file
    file1.seek(file_backup_size)
    # Read all lines in the file after the position we seek-ed to
    Lines = file1.readlines()
    file1.close()
    # copy current version of file to backup
    copyfile(in_file_loc, backup_file_loc)
    # Then do whatever you want to do with Lines
This is probably not the best way to do this because, as rici said in a comment below:
"make a copy" is not an atomic operation, and as the file grows copying will be successively slower. Any data appended to the log file during the copy will never be reported. Furthermore, the copy might happen to include a partial entry, in which case the next scan will start in the middle of an entry.
An alternative is to save the size of the current file in a different one:
in_file_loc = r'C:\\SmartBet.io Bot\placerlog.txt'
size_file_loc = in_file_loc + ".lastsize"
while True:
    # read old size from file
    try:
        with open(size_file_loc, 'r') as f:
            file_size = int(f.read())
    except:
        # if error, file size is zero
        file_size = 0
    file1 = open(in_file_loc, 'r')
    file1.seek(file_size)
    Lines = file1.readlines()
    new_file_size = file1.tell()  # Get the location of the current file marker
    file1.close()
    # write new size to file
    with open(size_file_loc, 'w') as f:
        f.write(str(new_file_size))
    # Then do whatever you want to do with Lines
I am trying to extract values from JSON-LD to CSV, keeping them as they are in the file. There are a couple of issues I am facing.
1. The values being read for different fields are getting truncated in most cases. In the remaining cases, the value of one field appears under another field.
2. I am also getting an error - 'Additional data' - after some 4,000 lines.
The file is quite big (half a GB). I am attaching a shortened version of my code. Please tell me where I am going wrong.
I have shortened the input file and kept it here, as there was no way of putting the whole thing in the post:
https://github.com/Architsi/json-ld-issue
I tried writing this script, and I tried multiple online converters too.
import csv, sys, math, operator, re, os, json, ijson
from pprint import pprint

filelist = []
for file in os.listdir("."):
    if file.endswith(".json"):
        filelist.append(file)
for input in filelist:
    newCsv = []
    splitlist = input.split(".")
    output = splitlist[0] + '.csv'
    newFile = open(output, 'w', newline='')  # wb for windows, else you'll see newlines added to csv
    # initialize csv writer
    writer = csv.writer(newFile)
    # Name of the columns
    header_row = ('Format', 'Description', 'Object', 'DataProvider')
    writer.writerow(header_row)
    with open(input, encoding="utf8") as json_file:
        data = ijson.items(json_file, 'item')
        # passing all the values through try except
        for s in data:
            source = s['_source']
            try:
                source_resource = source['sourceResource']
            except:
                print("Warning: No source resource in record ID: " + id)
            try:
                data_provider = source['dataProvider'].encode()
            except:
                data_provider = "N/A"
            try:
                _object = source['object'].encode()
            except:
                _object = "N/A"
            try:
                descriptions = source_resource['description']
                string = ""
                for item in descriptions:
                    if len(descriptions) > 1:
                        description = item.encode()  # + " | "
                    else:
                        description = item.encode()
                    string = string + description
                description = string.encode()
            except:
                description = "N/A"
            created = ""
            # writing it to csv
            write_tuple = ('format', description, _object, data_provider)
            writer.writerow(write_tuple)
    print("File written to " + output)
    newFile.close()
The error that I am getting is this: raise common.JSONError('Additional data')
The expected result is a CSV file with all the columns and correct values.
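As a side note, and hedged since the full input file isn't shown: ijson raises 'Additional data' when the input contains more than one top-level JSON value, for example a stream of concatenated objects rather than a single array. If that is what the file looks like, recent versions of ijson accept a multiple_values flag; a minimal sketch (the filename is a placeholder):
import ijson

with open("sample.json", encoding="utf8") as json_file:  # placeholder filename
    # prefix "" yields each top-level value; multiple_values=True keeps parsing after the first one
    for record in ijson.items(json_file, "", multiple_values=True):
        print(record.get("_source", {}).get("dataProvider", "N/A"))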
I get files that have NTFS audit permissions and I'm using Python to parse them. The raw CSV files list the path and then which groups have which access, such as this type of pattern:
E:\DIR A, CREATOR OWNER FullControl
E:\DIR A, Sales FullControl
E:\DIR A, HR Full Control
E:\DIR A\SUBDIR, Sales FullControl
E:\DIR A\SUBDIR, HR FullControl
My code parses the file to output this:
File Access for: E:\DIR A
CREATOR OWNER,FullControl
Sales,FullControl
HR,FullControl
File Access For: E:\DIR A\SUBDIR
Sales,FullControl
HR,FullControl
I'm new to generators, but I'd like to use them to optimize my code. Nothing I've tried seems to work, so here is the original code (I know it's ugly). It works, but it's very slow. The only way I could get it to work is by parsing out the paths first, putting them in a list, making a set so that they're unique, then iterating over that set and matching each path against the second list, listing all of the items it finds. Like I said, it's ugly, but it works.
import os, codecs, sys
reload(sys)
sys.setdefaultencoding('utf8')  # to prevent cp-932 errors on screen
file = "aud.csv"
outfile = "access-2.csv"
filelist = []
accesslist = []
with codecs.open(file, "r", 'utf-8-sig') as infile:
    for line in infile:
        newline = line.split(',')
        folder = newline[0].replace("\"", "")
        user = newline[1].replace("\"", "")
        filelist.append(folder)
        accesslist.append(folder + "," + user)
newfl = sorted(set(filelist))
def makeFile():
    print "Starting, please wait"
    for i in range(1, len(newfl)):
        searchItem = str(newfl[i])
        with codecs.open(outfile, "a", 'utf-8-sig') as output:
            outtext = ("\r\nFile access for: " + searchItem + "\r\n")
            output.write(outtext)
            for item in accesslist:
                searchBreak = item.split(",")
                searchTarg = searchBreak[0]
                if searchItem == searchTarg:
                    searchBreaknew = searchBreak[1].replace("FSA-INC01S\\", "")
                    searchBreaknew = str(searchBreaknew)
                    # print(searchBreaknew)
                    searchBreaknew = searchBreaknew.replace(" ", ",")
                    searchBreaknew = searchBreaknew.replace("CREATOR,OWNER", "CREATOR OWNER")
                    output.write(searchBreaknew)
How should I optimize this?
EDIT:
Here is an edited version. It works MUCH faster, though I'm sure it can still be improved:
import os, codecs, sys, csv
reload(sys)
sys.setdefaultencoding('utf8')
file = "aud.csv"
outfile = "access-3.csv"
filelist = []
accesslist = []
with codecs.open(file, "r", 'utf-8-sig') as csvinfile:
    auditfile = csv.reader(csvinfile, delimiter=",")
    for line in auditfile:
        folder = line[0]
        user = line[1].replace("FSA-INC01S\\", "")
        filelist.append(folder)
        accesslist.append(folder + "," + user)
newfl = sorted(set(filelist))
def makeFile():
    print "Starting, please wait"
    for i in xrange(1, len(newfl)):
        searchItem = str(newfl[i])
        outtext = ("\r\nFile access for: " + searchItem + "\r\n")
        accessUserlist = ""
        for item in accesslist:
            searchBreak = item.split(",")
            if searchItem == searchBreak[0]:
                searchBreaknew = str(searchBreak[1]).replace(" ", ",")
                searchBreaknew = searchBreaknew.replace("R,O", "R O")
                accessUserlist += searchBreaknew + "\r\n"
        with codecs.open(outfile, "a", 'utf-8-sig') as output:
            output.write(outtext)
            output.write(accessUserlist)
I was misled by the .csv extension of your file.
The expected output you've given isn't actually CSV, since a record cannot contain a \n.
Here is a proposal using a generator that returns one record at a time:
class Audit(object):
    def __init__(self, fieldnames):
        self.fieldnames = fieldnames
        self.__access = {}

    def append(self, row):
        folder = row[self.fieldnames[0]]
        access = row[self.fieldnames[1]].strip(' ')
        access = access.replace("FSA-INC01S\\", "")
        access = access.split(' ')
        if len(access) == 3:
            if access[0] == 'CREATOR':
                access[0] += ' ' + access[1]
                del access[1]
            elif access[1] == 'Full':
                access[1] += ' ' + access[2]
                del access[2]
        if folder not in self.__access:
            self.__access[folder] = []
        self.__access[folder].append(access)

    # Generator for class Audit
    def __iter__(self):
        record = ''
        for folder in sorted(self.__access):
            record = folder + '\n'
            for access in self.__access[folder]:
                record += '%s\n' % (','.join(access))
            yield record + '\n'
How to use it:
def main():
    import io, csv
    audit = Audit(['Folder', 'Accesslist'])
    # file and outfile are the same paths used in the question's code
    with io.open(file, "r", encoding='utf-8') as csc_in:
        for row in csv.DictReader(csc_in, delimiter=","):
            audit.append(row)
    with io.open(outfile, 'w', newline='', encoding='utf-8') as txt_out:
        for record in audit:
            txt_out.write(record)
Tested with Python:3.4.2 - csv:1.0
I am looking for a Python solution to extract multiple sequences from a FASTA file into multiple files, based on matches to a list of header IDs in a separate file.
This is a slightly more complex version of the problem posted in Extract sequences from a FASTA file based on entries in a separate file and https://www.biostars.org/p/2822/, which only output a single file for all matches.
I am new to Python and am trying to find a way to:
take a file containing strings that will be in the FASTA headers, and
have all records that match a string written to a separate FASTA file.
The header_ID_strings file looks like this:
CAP357_2030_09WPI, CAP357_2040_11WPI, CAP357_2050_13WPI, etc...
A sample of my FASTA file looks like this:
>CAP357_2030_009wpi_v1v3_1_056_00002_000.4
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGG
>CAP357_2040_011wpi_v1v3_1_008_00006_001.1
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGGT
>CAP357_2040_011wpi_v1v3_1_030_00002_000.4
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGGT
>CAP357_2040_011wpi_v1v3_1_004_00001_000.2
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGGT
>CAP357_2050_013wpi_v1v3_1_047_00002_000.4
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGGT
expected output
file1: CAP357_2030_009wpi_v1v3.fasta
>CAP357_2030_009wpi_v1v3_1_056_00002_000.4
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGG
file2: CAP357_2040_011wpi_v1v3.fasta
>CAP357_2040_011wpi_v1v3_1_008_00006_001.1
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGGT
>CAP357_2040_011wpi_v1v3_1_030_00002_000.4
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGGT
>CAP357_2040_011wpi_v1v3_1_004_00001_000.2
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGGT
etc...
This code is from the above link, but I want to have:
* matches written to separate outfiles
* no need to specify each outfile separately, if possible (I will have up to 30 outfiles)
#!/usr/bin/env python
import sys
from Bio import SeqIO
input_file = sys.argv[1]
id_file = sys.argv[2]
output_file = sys.argv[3]
wanted = set(line.rstrip("\n").split(None,1)[0] for line in open(id_file))
print "Found %i unique identifiers in %s" % (len(wanted), id_file)
index = SeqIO.index(input_file, "fasta")
records = (index[r] for r in wanted)
count = SeqIO.write(records, output_file, "fasta")
assert count == len(wanted)
print "Saved %i records from %s to %s" % (count, input_file, output_file)
So far this is what I have come up with (script below), but I don't know how to get around manually specifying all the outfiles and variables (I have only included three here):
from Bio import SeqIO
import pandas as pd
import sys

input_file = sys.argv[1]
id_file = sys.argv[2]
output_file2020 = sys.argv[3]
output_file2030 = sys.argv[4]
output_file2040 = sys.argv[5]
colnames = ["2020", "2030", "2040"]
headerlist = pd.read_csv(id_file, names=colnames, header=None)
infile = list(SeqIO.parse(input_file, "fasta"))
seq_2020 = tuple(headerlist["2020"])
seq_2030 = tuple(headerlist["2030"])
seq_2040 = tuple(headerlist["2040"])
count2020 = 0
count2030 = 0
count2040 = 0
for record in infile:
    if record.id in seq_2020:
        SeqIO.write([record], output_file2020, "fasta")
        count2020 += 1
    elif record.id in seq_2030:
        SeqIO.write([record], output_file2030, "fasta")
        count2030 += 1
    elif record.id in seq_2040:
        SeqIO.write([record], output_file2040, "fasta")
        count2040 += 1
    else:
        print("no matches found")
print("number of SU is", count2020)
print("number of PI is", count2030)
print("number of REC is", count2040)
For the sake of completeness, here is the 'final' script:
#!/usr/bin/env python
# a script to extract fasta records from a fasta file into multiple separate fasta files,
# based on a particular ID (time point) in a particular field, for a given delimiter
# to run, navigate to the file location with a command prompt and enter: python split_fasta_by_collections.py infile.fasta
from Bio import SeqIO
import os
import sys

records = SeqIO.parse(sys.argv[1], "fasta")
collected = {}
for record in records:
    descr = record.description.split("_")[1]  # "_" sets the delimiter; "1" sets the field, counting from 0 for the first field
    try:
        collected[descr].append(record)
    except KeyError:
        collected[descr] = [record, ]
file_name = "outfile%s.fasta"
file_path = os.getcwd()  # sets the output file path to your current working directory
for (descr, records) in collected.items():
    with open(os.path.join(file_path, file_name % descr), 'w') as f:
        SeqIO.write(records, f, 'fasta')
A couple brief suggestions:
If all your headers follow the same pattern, then you can extract the unique elements:
record.description.split("_")[1]
(yields "2040" from "CAP357_2040_011wpi_v1v3_1_008_00006_001.1")
If you use a dict you can assemble collections of records:
collected = {}
for record in records:
    descr = record.description.split("_")[1]
    try:
        collected[descr].append(record)
    except KeyError:
        collected[descr] = [record, ]
Then you can write out each collection to a new file:
file_name = "outfile%s"
for (descr, records) in collected.items(): # iteritems in python2
with open(os.path.join(file_path, file_name % descr), 'w') as f:
SeqIO.write(records, f, 'fasta')