download text file from url python

I am using Python 2.7 and Flask to develop a web app. I am trying to download a text file from a specific URL and save it in the application's static folder. Here is my code:
from urllib2 import urlopen

uurl = 'http://textfiles.com/100/ad.txt'

def download(t_url):
    response = urlopen(t_url)
    data = response.read()
    txt_str = str(data)
    lines = txt_str.split("\\n")
    des_url = 'static/forcast.txt'
    fx = open(des_url, "w")
    for line in lines:
        fx.write(line + "\n")
    fx.close()

download(uurl)
Now when I run this, I get the following error:
Traceback (most recent call last):
File "/Users/sanam/PycharmProjects/ff/ff.py", line 17, in <module>
download(uurl)
File "/Users/sanam/PycharmProjects/ff/ff.py", line 12, in download
fx = open(des_url,"w")
IOError: [Errno 2] No such file or directory: '/static/forcast.txt'

Nothing is wrong with your code; it downloads the file into the same directory as your Python script, so specify the location of the folder.
from urllib2 import urlopen

uurl = 'http://textfiles.com/100/ad.txt'

def download(t_url):
    response = urlopen(t_url)
    data = response.read()
    txt_str = str(data)
    lines = txt_str.split("\\n")
    des_url = 'folder/forcast.csv'
    fx = open(des_url, "w")
    for line in lines:
        fx.write(line + "\n")
    fx.close()

download(uurl)

Or, with a with-statement (note the open() call needs an `as fx` clause, otherwise fx is undefined inside the block):

from urllib2 import urlopen

uurl = 'http://textfiles.com/100/ad.txt'

def download(t_url):
    response = urlopen(t_url)
    data = response.read()
    txt_str = str(data)
    lines = txt_str.split("\\n")
    des_url = 'folder/add.txt'
    with open(des_url, "w") as fx:
        for line in lines:
            fx.write(line + "\n")

download(uurl)
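A minimal sketch of a slightly more defensive version, assuming Python 2 and urllib2 as in the question: it resolves the destination relative to the script file itself (not the working directory, which caused the original IOError) and creates the folder if it is missing. The dest_dir and filename parameters are illustrative.

import os
from urllib2 import urlopen

def download(t_url, dest_dir="static", filename="forcast.txt"):
    # Resolve the destination relative to this script, not the current
    # working directory, and create the folder if it does not exist yet.
    base = os.path.dirname(os.path.abspath(__file__))
    folder = os.path.join(base, dest_dir)
    if not os.path.isdir(folder):
        os.makedirs(folder)
    response = urlopen(t_url)
    data = response.read()
    with open(os.path.join(folder, filename), "w") as fx:
        fx.write(data)

download('http://textfiles.com/100/ad.txt')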

Related

PYTHON 3 | urlextract package | urlextract.cachefile.CacheFileError: Default cache file does not exist

I have this script from a few months ago which was working fine, but right now it gives me a weird error. The script is a simple one that extracts URLs from emails.
It works correctly in my test environment, but when I export it as an exe file it throws this error:
C:\Users\tyagi\Desktop\Localization_Download.exe
Extracting download links from outlook
Traceback (most recent call last):
File "main.py", line 62, in <module>
File "urlextract\urlextract_core.py", line 97, in __init__
File "urlextract\cachefile.py", line 61, in __init__
File "urlextract\cachefile.py", line 88, in _get_default_cache_file_path
urlextract.cachefile.CacheFileError: Default cache file does not exist
'C:\Users\tyagi\AppData\Local\Temp\_MEI146482\urlextract\data\tlds-alpha-by-domain.txt'!
[7456] Failed to execute script 'main' due to unhandled exception!
This is the script:
##############################################################
# interacting with outlook to fetch the URL & download all the files
##############################################################

print("Extracting download links from outlook")

from win32com.client import Dispatch

outlook = Dispatch("Outlook.Application").GetNamespace("MAPI")
root_folder = outlook.Folders.Item(2)
inbox_folder = root_folder.Folders.Item(2)
localisation_folder = inbox_folder.Folders['localisation']
messages = localisation_folder.items

bodylist = []
for mail in messages:
    body_content = mail.body
    bodylist.append(body_content)

####### exporting all outlook emails as a text file
with open("Emailfile.txt", 'w') as output:
    for row in bodylist:
        output.write(str(row) + '\n')

####### extracting target links from that text file
from urlextract import URLExtract

extractor = URLExtract()
finalUrlList = []
with open("Emailfile.txt") as file:
    for line in file:
        urls = extractor.find_urls(line, True)
        finalUrlList.append(urls)

from pandas import DataFrame

df = DataFrame(finalUrlList, columns=['download urls'])
df = df[df['download urls'].notna()]
df.reset_index(drop=True, inplace=True)
Running it as an administrator is not an option.
In the file cachefile.py, at line 73, change:
return os.path.join(os.path.dirname(__file__), self._DATA_DIR)
to
return ''  # os.path.join(os.path.dirname(__file__), self._DATA_DIR)
and place a copy of the TLD list, downloaded from https://data.iana.org/TLD/tlds-alpha-by-domain.txt, next to your main file.
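A sketch of the second half of that workaround (assuming Python 3, as in the question): fetch the TLD list once at startup so it sits next to the main file, where the patched cachefile.py will now look for it. The file name matches the one in the traceback.

import os
import urllib.request

# The patched cachefile.py looks in the current directory, so download
# the TLD list there once if it is not already present.
TLD_URL = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
TLD_FILE = "tlds-alpha-by-domain.txt"

if not os.path.exists(TLD_FILE):
    urllib.request.urlretrieve(TLD_URL, TLD_FILE)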

urllib.openurl treating http links as local file addresses

I'm trying to write a really basic scraper for YouTube video titles, using a CSV of video links and Beautiful Soup. The script as it currently stands is:
#!/usr/bin/python
from bs4 import BeautifulSoup
import urllib
import csv

with open('url-titles-list.csv', 'wb') as csv_out:
    fieldnames = ['url', 'title']
    writer = csv.DictWriter(csv_out, fieldnames=fieldnames)
    with open('url-nohttps-list.csv', 'rb') as csv_in:
        reader = csv.DictReader(csv_in, fieldnames=['linkurls'])
        writer.writeheader()
        for row in reader:
            link = row['linkurls']
            with urllib.urlopen(link) as response:
                html = response.read()
                soup = BeautifulSoup(html, "html.parser")
                name = soup.title.string
                writer.writerow({'url': row['linkurls'], 'title': name})
This breaks at urllib.urlopen(link) with the following traceback, which makes it look like the URL type is not being recognised correctly and it's trying to open the links as local files:
Traceback (most recent call last):
File "/Users/clarapouletty/Desktop/operation_find_yuzusho/fetcher.py", line 15, in <module>
with urllib.urlopen(link) as response:
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 87, in urlopen
return opener.open(url)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 213, in open
return getattr(self, name)(url)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 469, in open_file
return self.open_local_file(url)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 483, in open_local_file
raise IOError(e.errno, e.strerror, e.filename)
IOError: [Errno 2] No such file or directory: 'linkurls'
Process finished with exit code 1
Any assistance much appreciated!
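A likely cause, sketched here as an assumption: urllib treats any string without a scheme as a local path, and because fieldnames= is passed to DictReader, the header row of the CSV is returned as data rather than skipped, so the literal text 'linkurls' reaches urlopen() and is opened as a local file. Skipping the header row would avoid that (Python 2, as in the question):

import csv
import urllib

with open('url-nohttps-list.csv', 'rb') as csv_in:
    reader = csv.DictReader(csv_in, fieldnames=['linkurls'])
    next(reader)  # skip the header row, which DictReader would otherwise yield as data
    for row in reader:
        response = urllib.urlopen(row['linkurls'])  # Python 2: urlopen has no context manager
        html = response.read()
        response.close()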

memory error when retrieving data from Songkick

I have built a scraper to retrieve concert data from Songkick using their API. However, it takes a lot of time to retrieve all the data for these artists. After scraping for approximately 15 hours the script was still running, but the JSON file wasn't changing anymore. I interrupted the script and checked whether I could still access my data with TinyDB. Unfortunately I get the following error. Does anybody know why this is happening?
Error:
('cannot fetch url', 'http://api.songkick.com/api/3.0/artists/8689004/gigography.json?apikey=###########&min_date=2015-04-25&max_date=2017-03-01')
8961344
Traceback (most recent call last):
File "C:\Users\rmlj\Dropbox\Data\concerts.py", line 42, in <module>
load_events()
File "C:\Users\rmlj\Dropbox\Data\concerts.py", line 27, in load_events
print(artist)
File "C:\Python27\lib\idlelib\PyShell.py", line 1356, in write
return self.shell.write(s, self.tags)
KeyboardInterrupt
>>> mydat = db.all()
Traceback (most recent call last):
File "<pyshell#0>", line 1, in <module>
mydat = db.all()
File "C:\Python27\lib\site-packages\tinydb\database.py", line 304, in all
return list(itervalues(self._read()))
File "C:\Python27\lib\site-packages\tinydb\database.py", line 277, in _read
return self._storage.read()
File "C:\Python27\lib\site-packages\tinydb\database.py", line 31, in read
raw_data = (self._storage.read() or {})[self._table_name]
File "C:\Python27\lib\site-packages\tinydb\storages.py", line 105, in read
return json.load(self._handle)
File "C:\Python27\lib\json\__init__.py", line 287, in load
return loads(fp.read(),
MemoryError
Below you can find my script:
import urllib2
import requests
import json
import csv
import codecs
from tinydb import TinyDB, Query

db = TinyDB('events.json')

def load_events():
    MIN_DATE = "2015-04-25"
    MAX_DATE = "2017-03-01"
    API_KEY = "###############"
    with open('artistid.txt', 'r') as f:
        for a in f:
            artist = a.strip()
            print(artist)
            url_base = 'http://api.songkick.com/api/3.0/artists/{}/gigography.json?apikey={}&min_date={}&max_date={}'
            url = url_base.format(artist, API_KEY, MIN_DATE, MAX_DATE)
            # url = u'http://api.songkick.com/api/3.0/search/artists.json?query='+artist+'&apikey=WBmvXDarTCEfqq7h'
            try:
                r = requests.get(url)
                resp = r.json()
                if resp['resultsPage']['totalEntries']:
                    results = resp['resultsPage']['results']['event']
                    for x in results:
                        print(x)
                        db.insert(x)
            except:
                print('cannot fetch url', url)

load_events()
db.close()
print("End of script")
MemoryError is a built-in Python exception (https://docs.python.org/3.6/library/exceptions.html#MemoryError), so it looks like the process is running out of memory; this isn't really related to Songkick.
This question probably has the information you need to debug it: How to debug a MemoryError in Python? Tools for tracking memory use?
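One way to avoid loading the whole result set back into memory at once, sketched here as an alternative to TinyDB (not part of the answer above): append one JSON document per line and read the events back lazily. TinyDB's all() must json.load() the entire file, which is what fails here.

import json

def save_event(event, path='events.ndjson'):
    # Append a single event as one line of JSON; nothing accumulates in memory.
    with open(path, 'a') as f:
        f.write(json.dumps(event) + '\n')

def iter_events(path='events.ndjson'):
    # Yield events one at a time instead of json.load()-ing one huge file.
    with open(path) as f:
        for line in f:
            yield json.loads(line)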

File Create/Write Issue In Python

I'm trying to create and write to a file. I have the following code:
from urllib2 import urlopen

def crawler(seed_url):
    to_crawl = [seed_url]
    crawled = []
    while to_crawl:
        page = to_crawl.pop()
        page_source = urlopen(page)
        s = page_source.read()
        with open(str(page) + ".txt", "a+") as f:
            f.write(s)
    return crawled

if __name__ == "__main__":
    crawler('http://www.yelp.com/')
However, it returns the error:
Traceback (most recent call last):
File "/Users/adamg/PycharmProjects/NLP-HW1/scrape-test.py", line 29, in <module>
crawler('http://www.yelp.com/')
File "/Users/adamg/PycharmProjects/NLP-HW1/scrape-test.py", line 14, in crawler
with open("./"+str(page)+".txt","a+") as f:
IOError: [Errno 2] No such file or directory: 'http://www.yelp.com/.txt'
I thought that open(file,"a+") is supposed to create and write. What am I doing wrong?
If you want to use the URL as the basis for the file name, you should encode the URL. That way, slashes (among other characters) will be converted to character sequences that won't interfere with the file system or shell.
The urllib library can help with this.
So, for example:
>>> import urllib
>>> urllib.quote_plus('http://www.yelp.com/')
'http%3A%2F%2Fwww.yelp.com%2F'
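Applied to the crawler above, a sketch (Python 2, as in the question) that derives a safe file name from the URL before opening it:

import urllib
from urllib2 import urlopen

def crawler(seed_url):
    page_source = urlopen(seed_url)
    s = page_source.read()
    # Quote the URL so '/' and ':' cannot be read as directory separators.
    safe_name = urllib.quote_plus(seed_url)  # e.g. 'http%3A%2F%2Fwww.yelp.com%2F'
    with open(safe_name + ".txt", "a+") as f:
        f.write(s)

crawler('http://www.yelp.com/')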

getting Images from website in python - 'module' object is not callable

Here is my program. What I need is to collect images with the png extension from a website and save them according to their names. Here is the code:
from urllib.request import urlopen
from urllib.request import urlretrieve
import re

webpage = urlopen('http://www.techradar.com/news/internet/web/12-best-places-to-get-free-images-for-your-site-624818').read()
patFinderImage = re().compile('<img src="(.*)png" />')
filename = ("D:\test\test.json")
imgUrl = re.findall(patFinderImage, webpage)
print("now-----")

actually_download = False
if actually_download:
    filename = imgUrl.split('/')[-1]
    urlretrieve(imgUrl, filename)
    # fileName = basename(urlsplit(imgUrl)[2])
    data = open(filename, 'wb')
    data.write(data)
    data.close()
Here is the error:
pydev debugger: starting
Traceback (most recent call last):
File "C:\Users\joh\Downloads\eclipse\plugins\org.python.pydev_3.3.3.201401272249\pysrc\pydevd.py", line 1738, in <module>
debugger.run(setup['file'], None, None)
File "C:\Users\joh\Downloads\eclipse\plugins\org.python.pydev_3.3.3.201401272249\pysrc\pydevd.py", line 1355, in run
pydev_imports.execfile(file, globals, locals) # execute the script
File "C:\Users\joh\Downloads\eclipse\plugins\org.python.pydev_3.3.3.201401272249\pysrc\_pydev_execfile.py", line 38, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc) #execute the script
File "C:\Users\joh\workspace\new2\url2.py", line 15, in <module>
imgUrl = re().findall(patFinderImage, webpage)
TypeError: 'module' object is not callable
You have an error in this line:
imgUrl = re().findall(patFinderImage, webpage)
Since re is a module, not a function, it should be:
imgUrl = re.findall(patFinderImage, webpage)
But later you have another error.
The corrected code is below (I added .decode("utf-8")): the content returned by read() is of type bytes, so you need to convert it to a string before the regular expression can search it.
import re
from urllib.request import urlopen
from urllib.request import urlretrieve

webpage = urlopen('http://www.techradar.com/news/internet/web/12-best-places-to-get-free-images-for-your-site-624818').read().decode("utf-8")
patFinderImage = re.compile('<img src="(.*)png" />')
filename = ("/tmp/test.json")
imgUrl = re.findall(patFinderImage, webpage)
print("now-----")

actually_download = False
if actually_download:
    # note: imgUrl is a list of matches, so in practice you would loop over it
    filename = imgUrl.split('/')[-1]
    urlretrieve(imgUrl, filename)  # urlretrieve already writes the file to disk
    # fileName = basename(urlsplit(imgUrl)[2])
    # data = open(filename, 'wb')   # redundant: the file was saved by urlretrieve,
    # data.write(data)              # and writing a file object into itself raises
    # data.close()                  # a TypeError
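As an aside, the greedy (.*) in that pattern can swallow everything between the first src=" and the last png on a line. A non-greedy sketch (an alternative, not part of the original answer), shown on a hypothetical two-image snippet:

import re

html = '<img src="http://example.com/a.png" /> <img src="http://example.com/b.png" />'
# Non-greedy (.*?) stops at the first "png", so each tag yields its own URL;
# the trailing "png" sits outside the group, as in the answer above.
pattern = re.compile(r'<img src="(.*?)png" />')
urls = [u + "png" for u in pattern.findall(html)]
print(urls)  # ['http://example.com/a.png', 'http://example.com/b.png']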
