Hello, I am working on a project that prints extracted URLs to a text file.
I need a little help with my Python program: it doesn't write all the extracted URLs to the file, it only writes the last URL.
def do_this():
    print url
    text_file = open("Output.txt", "w")
    text_file.write("Extracted URL : %s" % url)

while True:
    url, n = getURL(page)
    page = page[n:]
    if url:
        do_this()
    else:
        text_file.close()
        break
I can't find the solution! Sorry for my bad English, please help!
Use a to append; each time you open with w you overwrite the file, while appending will add after any existing lines:
def do_this():
    print url
    text_file = open("Output.txt", "a")
    text_file.write("Extracted URL : %s\n" % url)
It would probably make more sense to open the file just once, moving the logic into the function, something like:
def do_this(page):
    with open("Output.txt", "a") as f:
        while True:
            url, n = getURL(page)
            page = page[n:]
            if not url:
                break
            f.write("Extracted URL : %s\n" % url)
I also added a \n to the write, or all your data would end up on a single line.
I am writing a Python program (because I am lazy) that checks a website for a job opening I have been told about and returns all the jobs on the company's web page.
Here is my code so far (yes, I know the code is janky, but I am just trying to get it working):
import requests
from bs4 import BeautifulSoup
import sys
import os
import hashlib
reload(sys)
sys.setdefaultencoding('utf8')
res = requests.get('WEBSITE URL', verify=False)
res.raise_for_status()
filename = "JobWebsite.txt"
def StartUp():
    if not os.path.isfile(filename):
        try:
            jobfile = open(filename, 'a')
            jobfile = open(filename, 'r+')
            print("[*] Successfully Created output file")
            return jobfile
        except:
            print("[*] Error creating output file!")
            sys.exit(0)
    else:
        try:
            jobfile = open(filename, 'r+')
            print("[*] Successfully Opened output file")
            return jobfile
        except:
            print("[*] Error opening output file!")
            sys.exit(0)

def AnyChange(htmlFile):
    fileCont = htmlFile.read()
    FileHash = hasher(fileCont, "File Code Hashed")
    WebHash = hasher(res.text, "Webpage Code Hashed")
    # !!!!! Here is the problem
    print ("[*] File hash is " + str(FileHash))
    print ("[*] Website hash is " + str(WebHash))
    if FileHash == WebHash:
        print ("[*] Jobs being read from file!")
        num_of_jobs(fileCont)
    else:
        print("[*] Jobs being read from website!")
        num_of_jobs(res.text)
        deleteContent(htmlFile)
        writeWebContent(htmlFile, res.text)

def hasher(content, message):
    content = hashlib.md5(content.encode('utf-8'))
    return content

def num_of_jobs(htmlFile):
    content = BeautifulSoup(htmlFile, "html.parser")
    elems = content.select('.search-result-inner')
    print("[*] There are " + str(len(elems)) + " jobs available!")

def deleteContent(htmlFile):
    print("[*] Deleting Contents of local file! ")
    htmlFile.seek(0)
    htmlFile.truncate()

def writeWebContent(htmlFile, content):
    htmlFile = open(filename, 'r+')
    print("[*] Writing Contents of website to file! ")
    htmlFile.write(content.encode('utf-8'))

jobfile = StartUp()
AnyChange(jobfile)
The problem I currently have is that I hash both the website's HTML code and the file's HTML code, but the two hashes never match. I am not sure why and can only guess that it has something to do with how the contents are saved to the file. The hashes aren't too far apart, but the if statement still fails every time.
[Screenshot: breakpoint in the program showing the two hashes]
The screenshot you have attached shows the memory locations of the two hash objects, FileHash and WebHash; those will naturally be different.
What you really want to compare is the hexdigest() of the two hash objects. Change your if statement to:
if FileHash.hexdigest() == WebHash.hexdigest():
    print ("[*] Jobs being read from file!")
    num_of_jobs(fileCont)
Take a look at this other StackOverflow answer for some more how-to.
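For reference, here is a minimal sketch of that comparison, assuming fileCont and res.text hold the saved and live page text; content_digest is just a hypothetical helper name:

import hashlib

def content_digest(text):
    # hash the UTF-8 bytes and return the hex string, not the md5 object itself
    return hashlib.md5(text.encode('utf-8')).hexdigest()

# comparing the hex strings avoids comparing two distinct md5 objects
if content_digest(fileCont) == content_digest(res.text):
    print("[*] Jobs being read from file!")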
I have a CSV file that has a bunch of URLs in it. I am working on a script that does the following:
1. Read the URLs in from the CSV
2. Pass each URL to requests.get()
3. Get the initial HTTP status along with the redirected URL and its HTTP status
My code works fine when I only pass in a single URL and don't try to read the URLs from a CSV. However, at scale it makes the most sense to read bulk URLs from a file and then write the output to another file. Can you please help me understand where my code is breaking down? Thanks in advance.
# -*- coding: utf-8 -*-
import requests
import csv
import time
# set filename with date + time
timestr = time.strftime("%m-%d-%Y-%H-%M-%S")
filename = str("redirect-output\\redirect-test-"+timestr+".csv")
with open('bulk-url-filename.csv', 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
        urls = row[0]
        #print urls
        r = requests.get(urls)
        f = open(filename, 'a+')
        response = requests.get(r)
        if response.history:
            print "Request was redirected:"
            reqredirected = "Request was redirected:"
            f.write(reqredirected)
            f.write("\n")
            for resp in response.history:
                print resp.status_code
                status = str(resp.status_code)
                f.write(status)
                f.write("\n")
                print resp.url
                url = str(resp.url)
                f.write(url)
                f.write("\n")
            print "Final destination:"
            final = "Final destination:"
            f.write(final)
            f.write("\n")
            print response.status_code
            destinationstatus = str(response.status_code)
            f.write(destinationstatus)
            f.write("\n")
            print response.url
            destinationurl = str(response.url)
            f.write(destinationurl)
            print "\n"
            f.write("\n")
        else:
            print "Request was not redirected"
            noredirect = "Request was not redirected"
            f.write(noredirect)
            f.write("\n")
            responseurl = response.url
            print responseurl
            f.write(responseurl)
            f.write("\n")
I am printing output to the console so I can see what is happening as well as writing lines into the csv for later analysis.
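For what it is worth, here is a minimal sketch of the read-and-check loop, assuming one URL in the first column of each row and reusing the filename built above; note that requests.get() is called on the URL string itself (not on an earlier Response object) and the output file is opened once:

import csv
import requests

with open('bulk-url-filename.csv', 'rb') as infile, open(filename, 'a+') as outfile:
    reader = csv.reader(infile)
    for row in reader:
        url = row[0]
        response = requests.get(url)  # one request per URL string from the CSV
        if response.history:
            outfile.write("Request was redirected:\n")
            for resp in response.history:
                outfile.write("%s\n%s\n" % (resp.status_code, resp.url))
            outfile.write("Final destination:\n")
        else:
            outfile.write("Request was not redirected\n")
        outfile.write("%s\n%s\n" % (response.status_code, response.url))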
I am trying to access opencorporates.com. The page says that this is the authenticated version of the GET method: http://api.opencorporates.com/companies/gb/00102498?api_token=ab123cd45.
This is my code to access the dataset. Here I am cycling through the jurisdiction codes that I collected in a file. Even if I don't use the API token, I am able to collect the same amount of data as I can with it. Am I doing something wrong here?
import urllib2
import json, os

f = open('codes', 'r')
for line in f.readlines():
    id = line.strip('\n')
    url = 'http://api.opencorporates.com/v0.2/companies/search?q=&jurisdiction_code={0}&per_page=26&current_status=Active&page={1}?api_token=ab123cd45'
    i = 0
    directory = id
    os.makedirs(directory)
    while True:
        i += 1
        req = urllib2.Request(url.format(id, i))
        print url.format(id, i)
        try:
            response = urllib2.urlopen(url.format(id, i))
        except urllib2.HTTPError, e:
            break
        content = response.read()
        fo = str(i) + '.json'
        OUTFILE = os.path.join(directory, fo)
        with open(OUTFILE, 'w') as f:
            f.write(content)
The end of your url looks like this: ?api_token=ab123cd45, but it's already in the query string portion of the url, so it should look like: &api_token=ab123cd45. (Replace the ? with a &.)
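Concretely, the corrected template from the question would look like:

url = 'http://api.opencorporates.com/v0.2/companies/search?q=&jurisdiction_code={0}&per_page=26&current_status=Active&page={1}&api_token=ab123cd45'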
You should consider using Requests when working with APIs.
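As a minimal sketch (reusing the id and i variables from the loop above, and the placeholder token from the question), the same search call with Requests lets you pass the query string as a dict instead of formatting it by hand:

import requests

params = {
    'q': '',
    'jurisdiction_code': id,    # jurisdiction code read from the 'codes' file
    'per_page': 26,
    'current_status': 'Active',
    'page': i,
    'api_token': 'ab123cd45',   # placeholder token from the question
}
response = requests.get('http://api.opencorporates.com/v0.2/companies/search', params=params)
response.raise_for_status()     # raises on HTTP errors instead of catching urllib2.HTTPError
content = response.text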
So I am trying to check whether a URL exists, and if it does, I would like to write the URL to a file using Python. I would also like each URL to be on its own line within the file. Here is the code I already have:
import urllib2

# CREATE A BLANK TXT FILE ON THE DESKTOP

urlhere = "http://www.google.com"

print "for url: " + urlhere + ":"

try:
    fileHandle = urllib2.urlopen(urlhere)
    data = fileHandle.read()
    fileHandle.close()
    print "It exists"
    # Then, if the URL does exist, write the URL on a new line in the text file
except urllib2.URLError, e:
    print 'PAGE 404: It Doesnt Exist', e
If the URL doesn't exist, don't write anything to the file.
The way you worded your question is a bit confusing, but if I understand you correctly, all you're trying to do is test whether a URL is valid using urllib2 and, if it is, write the URL to a file? If that is correct, the following should work.
import urllib2

f = open("url_file.txt", "a+")
urlhere = "http://www.google.com"

print "for url: " + urlhere + ":"

try:
    fileHandle = urllib2.urlopen(urlhere)
    data = fileHandle.read()
    fileHandle.close()
    f.write(urlhere + "\n")
    f.close()
    print "It exists"
except urllib2.URLError, e:
    print 'PAGE 404: It Doesnt Exist', e
If you want to test multiple URLs but don't want to edit the Python script, you can run it as python python_script.py "http://url_here.com". This is made possible by the sys module, where sys.argv[1] is equal to the first argument passed to python_script.py, which in this example is the URL ('http://url_here.com').
import urllib2, sys

f = open("url_file.txt", "a+")
urlhere = sys.argv[1]

print "for url: " + urlhere + ":"

try:
    fileHandle = urllib2.urlopen(urlhere)
    data = fileHandle.read()
    fileHandle.close()
    f.write(urlhere + "\n")
    f.close()
    print "It exists"
except urllib2.URLError, e:
    print 'PAGE 404: It Doesnt Exist', e
Or, if you really want to make your job easy, you could use the following script by typing python python_script.py http://url1.com,http://url2.com into the command line, where all the URLs you wish to test are separated by commas with no spaces.
import urllib2, sys

f = open("url_file.txt", "a+")
urlhere_list = sys.argv[1].split(",")

for urls in urlhere_list:
    print "for url: " + urls + ":"
    try:
        fileHandle = urllib2.urlopen(urls)
        data = fileHandle.read()
        fileHandle.close()
        f.write(urls + "\n")
        print "It exists"
    except urllib2.URLError, e:
        print 'PAGE 404: It Doesnt Exist', e
    except:
        print "invalid url"

f.close()
sys.argv[1].split(",") can also be replaced by a Python list within the script if you don't want to use the command-line functionality (see the short example below). Hope this is of some use to you, and good luck with your program.
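A minimal illustration, with placeholder URLs:

# hard-coded list instead of reading from sys.argv
urlhere_list = ["http://url1.com", "http://url2.com"]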
Note: the scripts using command-line inputs were tested on Ubuntu Linux, so if you are using Windows or another operating system I can't guarantee that they will work exactly as described, but they should.
How about something like this:
import urllib2

url = 'http://www.google.com'
data = ''

try:
    data = urllib2.urlopen(url).read()
except urllib2.URLError, e:
    data = 'PAGE 404: It Doesnt Exist ' + str(e)

with open('outfile.txt', 'w') as out_file:
    out_file.write(data)
Use requests:
import requests

def url_checker(urls):
    with open('somefile.txt', 'a') as f:
        for url in urls:
            r = requests.get(url)
            if r.status_code == 200:
                f.write('{0}\n'.format(url))

url_checker(['http://www.google.com', 'http://example.com'])
I'm trying to create a script which makes requests to random URLs from a txt file, e.g.:
import urllib2

with open('urls.txt') as urls:
    for url in urls:
        try:
            r = urllib2.urlopen(url)
        except urllib2.URLError as e:
            r = e
        if r.code in (200, 401):
            print '[{}]: '.format(url), "Up!"
But I want the line containing a URL to be erased from the file when that URL returns 404 Not Found. There is one unique URL per line, so basically the goal is to erase every URL (and its corresponding line) that returns 404 Not Found.
How can I accomplish this?
You could simply save all the URLs that worked, and then rewrite them to the file:
good_urls = []
with open('urls.txt') as urls:
    for url in urls:
        try:
            r = urllib2.urlopen(url)
        except urllib2.URLError as e:
            r = e
        if r.code in (200, 401):
            print '[{}]: '.format(url), "Up!"
            good_urls.append(url)

with open('urls.txt', 'w') as urls:
    urls.write("".join(good_urls))
The easiest way is to read all the lines, loop over the saved lines and try to open them, and then when you are done, if any URLs failed you rewrite the file.
The way to rewrite the file is to write a new file, and then when the new file is successfully written and closed, then you use os.rename() to change the name of the new file to the name of the old file, overwriting the old file. This is the safe way to do it; you never overwrite the good file until you know you have the new file correctly written.
I think the simplest way to do this is just to create a list where you collect the good URLs, plus have a count of failed URLs. If the count is not zero, you need to rewrite the text file. Or, you can collect the bad URLs in another list. I did that in this example code. (I haven't tested this code but I think it should work.)
import os
import urllib2

input_file = "urls.txt"
debug = True

good_urls = []
bad_urls = []

bad, good = range(2)

def track(url, good_flag, code):
    if good_flag == good:
        good_str = "good"
    elif good_flag == bad:
        good_str = "bad"
    else:
        good_str = "ERROR! (" + repr(good) + ")"
    if debug:
        print("DEBUG: %s: '%s' code %s" % (good_str, url, repr(code)))
    if good_flag == good:
        good_urls.append(url)
    else:
        bad_urls.append(url)

with open(input_file) as f:
    for line in f:
        url = line.strip()
        try:
            r = urllib2.urlopen(url)
            if r.code in (200, 401):
                print '[{0}]: '.format(url), "Up!"
            if r.code == 404:
                # URL is bad if it is missing (code 404)
                track(url, bad, r.code)
            else:
                # any code other than 404, assume URL is good
                track(url, good, r.code)
        except urllib2.URLError as e:
            track(url, bad, "exception!")

# if any URLs were bad, rewrite the input file to remove them.
if bad_urls:
    # simple way to get a filename for temp file: append ".tmp" to filename
    temp_file = input_file + ".tmp"
    with open(temp_file, "w") as f:
        for url in good_urls:
            f.write(url + '\n')
    # if we reach this point, temp file is good. Remove old input file
    os.remove(input_file)  # only needed for Windows
    os.rename(temp_file, input_file)  # replace original input file with temp file
EDIT: In the comments, @abarnert suggests that there might be a problem with using os.rename() on Windows (at least I think that is what he/she means). If os.rename() doesn't work, you should be able to use shutil.move() instead.
EDIT: Rewrite code to handle errors.
EDIT: Rewrite to add verbose messages as URLs are tracked. This should help with debugging. Also, I actually tested this version and it works for me.