There are a log.txt.
"[25-Feb-2016 11:27:16 +0200]: Login failed .... 212.153.100.19 Get/.... emailaddress#email.com"........
How can i write a script which can grep or regex me only the dates/IP addresses and email addresses and write it out to an other .txt.
The most important thing is that i need dates and the corresponding IPs and emails.
I try it to with the next code, but it is segment all of the data ..
import os
import re
import datetime
filename = 'log.txt'
newfilename = 'output.txt'
if os.path.exists(filename):
data = open(filename,'r')
bulkemails = data.read()
else:
print "File not found."
raise SystemExit
r = re.compile(r'[\w\.-]+#[\w\.-]+\b')
results = r.findall(bulkemails)
emails = ""
for x in results:
emails += str(x)+"\n"
ip = re.compile(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b')
result = ip.findall(bulkemails)
ip =""
for y in result:
ip += str(y)+"\n"
dt = re.compile(r'(\d{4})-(\d{2})-(\d{2})')
result = dt.findall(bulkemails)
dt =""
for z in result:
dt += str(z)+"\n"
def writefile():
f = open(newfilename, 'w')
f.write(emails + ip + dt)
f.close()
print "File written."
def overwrite_ok():
response = raw_input("Are you sure you want to overwrite "+str(newfilename)+"? Yes or No\n")
if response == "Yes":
writefile()
elif response == "No":
print "Aborted."
else:
print "Please enter Yes or No."
overwrite_ok()
if os.path.exists(newfilename):
overwrite_ok()
else:
writefile()
So i whould like to same output.txt what is included the next :
25-Feb-2016 11:27:16 +0200] -- 212.153.100.19 -- emailaddress#email.com"
25-Feb-2016 11:27:16 +0200] -- 212.153.100.10 -- emailaddress1#email.com"
25-Feb-2016 11:27:16 +0200] -- 212.153.100.11 -- emailaddress2#email.com"
Thanks for help, and have a nice day :)
You should make a regex with three groups, one for the time, one for the IP and one for the email.
import re
my_regex = re.compile(r".+?(\d{2}-\w+-\d{4}).+?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}).+?\b([\w.\d]+#[\w.\d]+)(?:\b|$)")
with open("somefile") as f_logs:
logs = f_logs.readlines()
for line in logs:
my_regex.sub(r"[\1] -- \2 -- \3",line)
You can check it on regex101
Related
So I am currently working on a program that was handed down to me from a previous coworker and I am working through a strange bug. When reading data output from 2 separate serial sources byte by byte, python will write to the same cell in the .csv file as well as the console.
import serial
from datetime import datetime
import os
pressure_passed = False
arduino_passed = False
file_passed = False
BAUD_RATE = 115200
GARBAGE_CYCLES = 3 # how many cycles to ignore before logging data
garbage_cycle = 0
# Save data to log file
def LogData(startTime, pressureData, arduinoData, file):
global garbage_cycle
if garbage_cycle < GARBAGE_CYCLES:
garbage_cycle += 1
else:
delta = datetime.now() - startTime
ms = delta.total_seconds() * 1000
dataString = "{:0.2f}, {}, {}\n".format(ms, pressureData, arduinoData)
file.write(dataString)
file.flush()
print(dataString, end = "")
# Get the COM port for the Mark-10 Series 5
while not pressure_passed:
try:
pressure_com = input("Enter Mark-10 Series 5 COM Port #: ")
pressure_ser = serial.Serial("COM" + str(pressure_com), BAUD_RATE)
pressure_passed = True
except:
print("Invalid COM Port, please enter a valid port.\n-----")
# Get the COM port for the Arduino
while not arduino_passed:
try:
arduino_com = input("Enter Ardunio COM Port #: ")
arduino_ser = serial.Serial("COM" + str(arduino_com), BAUD_RATE)
arduino_passed = True
except:
print("Invalid COM Port, please enter a valid port.\n-----")
# Get the name for the log file
while not file_passed:
try:
file_name = input("Enter log file name: ")
# Add extension if not already given
if "." not in file_name:
file_name += ".csv"
log_file = open(file_name, "a")
# Add header row to log file
if os.stat(log_file.name).st_size == 0:
log_file.write("time (ms), pressure, rate (deg/ms)")
file_passed = True
except:
print("Invalid file, or could not open the file specified.\n-----")
start = datetime.now()
# Variables to read serial input
pressure_data = ""
last_pressure = ""
arduino_data = ""
last_arduino = ""
# Main program loop
# Serial is read from byte by byte to better sync the two devices
while True:
try:
x_changed = False
y_changed = False
# Read from Mark-10 serial if available
# x is a byte read from the serial line, converted to ascii
if pressure_ser.in_waiting > 0:
x = pressure_ser.read().decode('ascii')
x_changed = True
# Read from Arduino serial if available
# y is a byte read from the serial line, converted to ascii
if arduino_ser.in_waiting > 0:
y = arduino_ser.read().decode('ascii')
y_changed = True
# If new data received, check if we should log it
if x_changed:
if x == '\n': # New line detected, log the accumulated data
if last_pressure != pressure_data:
LogData(start, last_pressure, last_arduino, log_file)
last_pressure = pressure_data
pressure_data = ""
elif x != '\r': # Otherwise, add the read character to the string
pressure_data += x
if y_changed:
if y == '\n': # New line detected, log the accumulated data
if last_arduino != arduino_data:
LogData(start, last_pressure, last_arduino, log_file)
last_arduino = arduino_data
arduino_data = ""
elif y != '\r': # Otherwise, add the read character to the string
arduino_data += y
except Exception as e:
print(e)
if arduino_ser.isOpen():
arduino_ser.close()
if pressure_ser.isOpen():
pressure_ser.close()
log_file.close()
break
Here is what the file is spitting out, IE the double printing to a single cell. Sample of the data
Any advice is much appreciated, thank you all!
It looks like when a new pressure is read in, but the value has not changed from last time, then it's not resetting the string that's accumulating all the characters and it doubles up. Then on the next pass, when the REAL pressure hasn't changed, it compares the doubled to the non-doubled and writes again, and vice-versa.
Try unindenting the line that resets the string to remove it from the if clause:
# If new data received, check if we should log it
if x_changed:
if x == '\n': # New line detected, log the accumulated data
if last_pressure != pressure_data:
LogData(start, last_pressure, last_arduino, log_file)
last_pressure = pressure_data
pressure_data = ""
elif x != '\r': # Otherwise, add the read character to the string
pressure_data += x
Then the same thing for the arduino value block.
Your logs will probably be much shorter now.
I like your username! My guess is that it is reading from the serial too quickly and going through the loop twice before the arduino has time to change the value of in_waiting.
At the top of your code add:
import time
And in the LogData function add:
time.sleep(0.1)
Give that a shot and let me know if it helps. 0.1s may be too long for your application but it is a good test to see if this is the issue. If it is, you can play around with the amount of time it sleeps.
Based on the sample output provided, I think it's not writing twice but rather the following specific condition is met occasionally which calls two identical LogData() lines.
Only when Condition 1 AND Condition 2 are met - the data is written "twice". Note that the LogData() call is same in both conditions.
Condition 1:
# If new data received, check if we should log it
if x_changed:
if x == '\n': # New line detected, log the accumulated data
if last_pressure != pressure_data:
LogData(start, last_pressure, last_arduino, log_file)
Condition 2:
if y_changed:
if y == '\n': # New line detected, log the accumulated data
if last_arduino != arduino_data:
LogData(start, last_pressure, last_arduino, log_file)
This program works for me except the changes that are made to the input .srt (subtitle) file are displayed but NOT saved back in the input file or a new file. Can anyone please tell me what is missing? Thanks. Stuart.
code follows:
#! /usr/bin/python
import sys;
# Patching SRT files to make them readable on Samsung TVs
# It basically inserts blank subtitles when silence should occur.
seqNo=1
try:
subs = open(sys.argv[1])
except:
print "Please provide subtile file to process"
sys.exit(1)
while True:
srtSeqNo=subs.readline();
try:
begin,arrow,end=subs.readline().rstrip('\n\r').split(" ")
except:
break
srtText = subs.readline();
again = subs.readline();
while len(again.strip('\n\r')) > 0:
srtText = srtText + again
again = subs.readline()
print "%d\n%s --> %s\n%s" % (seqNo, begin, end, srtText)
seqNo = seqNo + 1
print "%d\n%s --> %s\n%s\n" % (seqNo, end, end, " ")
seqNo = seqNo + 1
I have a script here with some basic funtions:
Function 1 - wget, opens a webpage and saves it to a local variable then closes.
Function 2 - scrapes this webpage for md5 hash values.
Function 3 - takes the hash values and cracks them using a dictionary of commonly used passwords.
My problem is getting my output from Function 2 and inserting it into Function 3. This is partly due to the output from Function 2 being a list and Function 3 is looking for just hash values.
You guys will most likely be able to understand more from reading my code, below is my code so far.
import sys, hashlib, re, urllib
def wget(url): # could import webpage_get and use wget() from there instead
'''Read the contents of a webpage from a specified URL'''
print '[+]---------------------------------------------------------------------------- ' #CHANGE THIS
# open URL
webpage = urllib.urlopen(url) # opens url like a file
# get page contents
page_contents = webpage.read() # reads content of webpage
return page_contents
page_contents = webpage.close() # close webpage
def findmd5(text):
'''Find all md5 hash values'''
md5value = re.findall(r'([a-fA-F\d]{32})', text)
count = len(md5value)
print "[+] Total number of md5 hash values found: %s" % count
for x in md5value:
print x
def dict_attack(passwd_hash):
dic = ['123','1234','12345','123456','1234567','12345678','password','qwerty','abc','abcd','abc123','111111','monkey','arsenal','letmein','trustno1','dragon','baseball','superman','iloveyou','starwars','montypython','cheese','123123','football','password','batman']
passwd_found = False
for value in dic:
hashvalue = hashlib.md5(value).hexdigest()
if hashvalue == passwd_hash:
passwd_found = True
recovered_password = value
if passwd_found == True:
print '[+] Password recovered: %s'% (recovered_password)
else:
print '[-] Password not recovered'
def main():
# temp testing url argument
sys.argv.append('URL HERE!')
# Check args
if len(sys.argv) != 2:
print '[-] Usage: email_analysis URL/filename'
return
#call functions
try:
print '[+] md5 values found: '
print findmd5(wget(sys.argv[1]))
print '[+] Cracking hash values: '
except IOError:
print 'Error'
if __name__ == '__main__':
main()
Any help is greatly appreciated!
wget: Set return statement as last statement.
findmd5: Changed from printing it's results, to returning them to a variable in main.
main: added in for loop to iterate over found hashes and apply dict_attack to each value.
I did however not build in any break or stop condition, so even if found, the program will continue running. It will however still print the found result.
import sys, hashlib, re, urllib
def wget(url): # could import webpage_get and use wget() from there instead
'''Read the contents of a webpage from a specified URL'''
print ('[+]---------------------------------------------------------------------------- ') #CHANGE THIS
# open URL
webpage = urllib.urlopen(url) # opens url like a file
# get page contents
page_contents = webpage.read() # reads content of webpage
page_contents = webpage.close() # close webpage
return page_contents
def findmd5(text):
'''Find all md5 hash values'''
md5value = re.findall(r'([a-fA-F\d]{32})', text)
count = len(md5value)
print ("[+] Total number of md5 hash values found: %s" % count)
return md5value
def dict_attack(passwd_hash):
dic = ['123','1234','12345','123456','1234567','12345678','password','qwerty','abc','abcd','abc123','111111','monkey','arsenal','letmein','trustno1','dragon','baseball','superman','iloveyou','starwars','montypython','cheese','123123','football','password','batman']
passwd_found = False
for value in dic:
hashvalue = hashlib.md5(value).hexdigest()
if hashvalue == passwd_hash:
passwd_found = True
recovered_password = value
if passwd_found == True:
print ('[+] Password recovered: %s'% (recovered_password))
else:
print ('[-] Password not recovered')
def main():
# temp testing url argument
sys.argv.append('URL HERE!')
# Check args
if len(sys.argv) != 2:
print ('[-] Usage: email_analysis URL/filename')
return
#call functions
try:
md5Values = findmd5(wget(sys.argv[1]))
for md5value in md5values:
dict_attack(md5value)
print ('[+] Cracking hash values: ')
except IOError:
print ('Error')
if __name__ == '__main__':
main()
Billy, return a a list of the hashes found, instead of printing them (it looks like you are thinking like it was bash, but you don't need to "print" the output of a function, as you did in bash, in python you can literally return an array with the elements found).
Your regexp for the hash uses \d, but that includes - as well, it might bring something that is not a MD5 hash.
I'm new here to StackOverflow, but I have found a LOT of answers on this site. I'm also a programming newbie, so i figured i'd join and finally become part of this community - starting with a question about a problem that's been plaguing me for hours.
I login to a website and scrape a big body of text within the b tag to be converted into a proper table. The layout of the resulting Output.txt looks like this:
BIN STATUS
8FHA9D8H 82HG9F RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
INVENTORY CODE: FPBC *SOUP CANS LENTILS
BIN STATUS
HA8DHW2H HD0138 RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8SHDNADU 00A123 #2956- INVALID STOCK COUPON CODE (MISSING).
93827548 096DBR RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
There are a bunch of pages with the exact same blocks, but i need them to be combined into an ACTUAL table that looks like this:
BIN INV CODE STATUS
HA8DHW2HHD0138 FPBC-*SOUP CANS LENTILS RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8SHDNADU00A123 FPBC-*SOUP CANS LENTILS #2956- INVALID STOCK COUPON CODE (MISSING).
93827548096DBR FPBC-*SOUP CANS LENTILS RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8FHA9D8H82HG9F SSXR-98-20LM NM CORN CREAM RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
Essentially, all separate text blocks in this example would become part of this table, with the inv code repeating with its Bin values. I would post my attempts at parsing this data(have tried Pandas/bs/openpyxl/csv writer), but ill admit they are a little embarrassing, as i cannot find any information on this specific problem. Is there any benevolent soul out there that can help me out? :)
(Also, i am using Python 2.7)
A simple custom parser like the following should do the trick.
from __future__ import print_function
def parse_body(s):
line_sep = '\n'
getting_bins = False
inv_code = ''
for l in s.split(line_sep):
if l.startswith('INVENTORY CODE:') and not getting_bins:
inv_data = l.split()
inv_code = inv_data[2] + '-' + ' '.join(inv_data[3:])
elif l.startswith('INVENTORY CODE:') and getting_bins:
print("unexpected inventory code while reading bins:", l)
elif l.startswith('BIN') and l.endswith('MESSAGE'):
getting_bins = True
elif getting_bins == True and l:
bin_data = l.split()
# need to add exception handling here to make sure:
# 1) we have an inv_code
# 2) bin_data is at least 3 items big (assuming two for
# bin_id and at least one for message)
# 3) maybe some constraint checking to ensure that we have
# a valid instance of an inventory code and bin id
bin_id = ''.join(bin_data[0:2])
message = ' '.join(bin_data[2:])
# we now have a bin, an inv_code, and a message to add to our table
print(bin_id.ljust(20), inv_code.ljust(30), message, sep='\t')
elif getting_bins == True and not l:
# done getting bins for current inventory code
getting_bins = False
inv_code = ''
A rather complex one, but this might get you started:
import re, pandas as pd
from pandas import DataFrame
rx = re.compile(r'''
(?:INVENTORY\ CODE:)\s*
(?P<inv>.+\S)
[\s\S]+?
^BIN.+[\n\r]
(?P<bin_msg>(?:(?!^\ ).+[\n\r])+)
''', re.MULTILINE | re.VERBOSE)
string = your_string_here
# set up the dataframe
df = DataFrame(columns = ['BIN', 'INV', 'MESSAGE'])
for match in rx.finditer(string):
inv = match.group('inv')
bin_msg_raw = match.group('bin_msg').split("\n")
rxbinmsg = re.compile(r'^(?P<bin>(?:(?!\ {2}).)+)\s+(?P<message>.+\S)\s*$', re.MULTILINE)
for item in bin_msg_raw:
for m in rxbinmsg.finditer(item):
# append it to the dataframe
df.loc[len(df.index)] = [m.group('bin'), inv, m.group('message')]
print(df)
Explanation
It looks for INVENTORY CODE and sets up the groups (inv and bin_msg) for further processing in afterwork() (note: it would be easier if you had only one line of bin/msg as you need to split the group here afterwards).
Afterwards, it splits the bin and msg part and appends all to the df object.
I had a code written for a website scrapping which may help you.
Basically what you need to do is write click on the web page go to html and try to find the tag for the table you are looking for and using the module (i am using beautiful soup) extract the information. I am creating a json as I need to store it into mongodb you can create table.
#! /usr/bin/python
import sys
import requests
import re
from BeautifulSoup import BeautifulSoup
import pymongo
def req_and_parsing():
url2 = 'http://businfo.dimts.in/businfo/Bus_info/EtaByRoute.aspx?ID='
list1 = ['534UP','534DOWN']
for Route in list1:
final_url = url2 + Route
#r = requests.get(final_url)
#parsing_file(r.text,Route)
outdict = []
outdict = [parsing_file( requests.get(url2+Route).text,Route) for Route in list1 ]
print outdict
conn = f_connection()
for i in range(len(outdict)):
insert_records(conn,outdict[i])
def parsing_file(txt,Route):
soup = BeautifulSoup(txt)
table = soup.findAll("table",{"id" : "ctl00_ContentPlaceHolder1_GridView2"})
#trtags = table[0].findAll('tr')
tdlist = []
trtddict = {}
"""
for trtag in trtags:
print 'print trtag- ' , trtag.text
tdtags = trtag.findAll('td')
for tdtag in tdtags:
print tdtag.text
"""
divtags = soup.findAll("span",{"id":"ctl00_ContentPlaceHolder1_ErrorLabel"})
for divtag in divtags:
for divtag in divtags:
print "div tag - " , divtag.text
if divtag.text == "Currently no bus is running on this route" or "This is not a cluster (orange bus) route":
print "Page not displayed Errored with below meeeage for Route-", Route," , " , divtag.text
sys.exit()
trtags = table[0].findAll('tr')
for trtag in trtags:
tdtags = trtag.findAll('td')
if len(tdtags) == 2:
trtddict[tdtags[0].text] = sub_colon(tdtags[1].text)
return trtddict
def sub_colon(tag_str):
return re.sub(';',',',tag_str)
def f_connection():
try:
conn=pymongo.MongoClient()
print "Connected successfully!!!"
except pymongo.errors.ConnectionFailure, e:
print "Could not connect to MongoDB: %s" % e
return conn
def insert_records(conn,stop_dict):
db = conn.test
print db.collection_names()
mycoll = db.stopsETA
mycoll.insert(stop_dict)
if __name__ == "__main__":
req_and_parsing()
I want to write a program that sends an e-mail to one or more specified recipients when a certain event occurs. For this I need the user to write the parameters for the mail server into a config. Possible values are for example: serveradress, ports, ssl(true/false) and a list of desired recipients.
Whats the user-friendliest/best-practice way to do this?
I could of course use a python file with the correct parameters and the user has to fill it out, but I wouldn't consider this user friendly. I also read about the 'config' module in python, but it seems to me that it's made for creating config files on its own, and not to have users fill the files out themselves.
Are you saying that the fact that the config file would need to be valid Python makes it unfriendly? It seems like having lines in a file like:
server = 'mail.domain.com'
port = 25
...etc would be intuitive enough while still being valid Python. If you don't want the user to have to know that they have to quote strings, though, you might go the YAML route. I use YAML pretty much exclusively for config files and find it very intuitive, and it would also be intuitive for an end user I think (though it requires a third-party module - PyYAML):
server: mail.domain.com
port: 25
Having pyyaml load it is simple:
>>> import yaml
>>> yaml.load("""a: 1
... b: foo
... """)
{'a': 1, 'b': 'foo'}
With a file it's easy too.
>>> with open('myconfig.yaml', 'r') as cfile:
... config = yaml.load(cfile)
...
config now contains all of the parameters.
I doesn't matter technically proficient your users are; you can count on them to screw up editing a text file. (They'll save it in the wrong place. They'll use MS Word to edit a text file. They'll make typos.) I suggest making a gui that validates the input and creates the configuration file in the correct format and location. A simple gui created in Tkinter would probably fit your needs.
I've been using ConfigParser. It's designed to read .ini style files that have:
[section]
option = value
It's quite easy to use and the documentation is pretty easy to read. Basically you just load the whole file into a ConfigParser object:
import ConfigParser
config = ConfigParser.ConfigParser()
config.read('configfile.txt')
Then you can make sure the users haven't messed anything up by checking the options. I do so with a list:
OPTIONS =
['section,option,defaultvalue',
.
.
.
]
for opt in OPTIONS:
section,option,defaultval = opt.split(',')
if not config.has_option(section,option):
print "Missing option %s in section %s" % (option,section)
Getting the values out is easy too.
val = config.get('section','option')
And I also wrote a function that creates a sample config file using that OPTIONS list.
new_config = ConfigParser.ConfigParser()
for opt in OPTIONS:
section,option,defaultval = opt.split(',')
if not new_config.has_section(section):
new_config.add_section(section)
new_config.set(section, option, defaultval)
with open("sample_configfile.txt", 'wb') as newconfigfile:
new_config.write(newconfigfile)
print "Generated file: sample_configfile.txt"
What are the drawbacks of such a solution:
ch = 'serveradress = %s\nport = %s\nssl = %s'
a = raw_input("Enter the server's address : ")
b = 'a'
bla = "\nEnter the port : "
while not all(x.isdigit() for x in b):
b = raw_input(bla)
bla = "Take care: you must enter digits exclusively\n"\
+" Re-enter the port (digits only) : "
c = ''
bla = "\nChoose the ssl option (t or f) : "
while c not in ('t','f'):
c = raw_input(bla)
bla = "Take care: you must type f or t exclusively\n"\
+" Re-choose the ssl option : "
with open('configfile.txt','w') as f:
f.write(ch % (a,b,c))
.
PS
I've read in the jonesy's post that the value in a config file may have to be quoted. If so, and you want the user not to have to write him/her-self the quotes, you simply add
a = a.join('""')
b = b.join('""')
c = c.join('""')
.
EDIT
ch = 'serveradress = %s\nport = %s\nssl = %s'
d = {0:('',
"Enter the server's address : "),
1:("Take care: you must enter digits exclusively",
"Enter the port : "),
2:("Take care: you must type f or t exclusively",
"Choose the ssl option (t or f) : ") }
def func(i,x):
if x is None:
return False
if i==0:
return True
elif i==1:
try:
ess = int(x)
return True
except:
return False
elif i==2:
if x in ('t','f'):
return True
else:
return False
li = len(d)*[None]
L = range(len(d))
while True:
for n in sorted(L):
bla = d[n][1]
val = None
while not func(n,val):
val = raw_input(bla)
bla = '\n '.join(d[n])
li[n] = val.join('""')
decision = ''
disp = "\n====== If you choose to process, =============="\
+"\n the content of the file will be :\n\n" \
+ ch % tuple(li) \
+ "\n==============================================="\
+ "\n\nDo you want to process (type y) or to correct (type c) : "
while decision not in ('y','c'):
decision = raw_input(disp)
disp = "Do you want to process (type y) or to correct (type c) ? : "
if decision=='y':
break
else:
diag = False
while not diag:
vi = '\nWhat lines do you want to correct ?\n'\
+'\n'.join(str(j)+' - '+line for j,line in enumerate((ch % tuple(li)).splitlines()))\
+'\nType numbers of lines belonging to range(0,'+str(len(d))+') separated by spaces) :\n'
to_modify = raw_input(vi)
try:
diag = all(int(entry) in xrange(len(d)) for entry in to_modify.split())
L = [int(entry) for entry in to_modify.split()]
except:
diag = False
with open('configfile.txt','w') as f:
f.write(ch % tuple(li))
print '-*- Recording of the config file : done -*-'