I'm trying to download a book from a link that is read from a separate file. When I execute the script, it runs fine at the beginning but then it crashes:
downloading: http://31.42.184.140/main/94000/e5772a162f57b7c2b22c7ec8f6883002/%D0%91%D0%B0%D1%88%D1%82%D0%B0%20%D0%A2.%D0%9C.%2C%20%D0%B8%20%D0%B4%D1%80.%20-%20%D0%93%D0%B8%D0%B4%D1%80%D0%B0%D0%B2%D0%BB%D0%B8%D0%BA%D0%B0%2C%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D1%8B%20%D0%B8%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BF%D1%80%D0%B8%D0%B2%D0%BE%D0%B4%D1%8B%20%28%D0%A3%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%BB%D1%8F%20%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BE%D1%81%D1%82%D1%80%D0%BE%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D1%85%20%D0%B2%D1%83%D0%B7%D0%BE%D0%B2%29.djvu
Traceback (most recent call last):
File "C:/Users/he/Desktop/New folder (4)/6.py", line 23, in <module>
exec(l)
File "<string>", line 1, in <module>
File "C:/Users/he/Desktop/New folder (4)/6.py", line 9, in download_url
with open(file_name, 'wb') as f:
OSError: [Errno 22] Invalid argument: '%D0%91%D0%B0%D1%88%D1%82%D0%B0%20%D0%A2.%D0%9C.%2C%20%D0%B8%20%D0%B4%D1%80.%20-%20%D0%93%D0%B8%D0%B4%D1%80%D0%B0%D0%B2%D0%BB%D0%B8%D0%BA%D0%B0%2C%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D1%8B%20%D0%B8%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BF%D1%80%D0%B8%D0%B2%D0%BE%D0%B4%D1%8B%20%28%D0%A3%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%BB%D1%8F%20%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BE%D1%81%D1%82%D1%80%D0%BE%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D1%85%20%D0%B2%D1%83%D0%B7%D0%BE%D0%B2%29.djvu'
Here is the code:
import requests
from bs4 import BeautifulSoup
def download_url(url):
    """Fetch *url* and save the body under the URL's last path segment.

    Nothing is written when the server does not answer with an OK status.

    NOTE(review): the file name is the text after the final "/" taken
    verbatim, so percent-escapes in the URL end up in the name passed to
    open(). That is the call failing in the traceback above; behaviour is
    deliberately left as posted.
    """
    print("downloading: ", url)
    file_name = url[url.rfind("/") + 1:]
    r = requests.get(url, stream=True)
    if r.status_code != requests.codes.ok:
        return
    with open(file_name, 'wb') as f:
        # Iterating the response yields the body piece by piece.
        for data in r:
            f.write(data)
def n(f):
    """Read the next line from the open file *f*.

    Returns a one-element list holding the line (including its trailing
    newline, if any), or an empty list once the end of the file is reached.
    """
    # readline() returns '' only at EOF; a blank line is '\n' and is kept.
    line = f.readline()
    return [line] if line else []
# Run every line of 1.txt as a Python statement, one line at a time.
# SECURITY: exec() executes whatever the file contains. Only use this with a
# 1.txt you wrote yourself; for a plain list of URLs, read the URLs and call
# download_url() directly instead.
with open('1.txt') as b:  # the file is closed even when an exec'd line raises
    c = n(b)
    while c:
        for l in c:
            exec(l)
        c = n(b)
File link:
download_url("http://31.42.184.140/main/94000/e5772a162f57b7c2b22c7ec8f6883002/%D0%91%D0%B0%D1%88%D1%82%D0%B0%20%D0%A2.%D0%9C.%2C%20%D0%B8%20%D0%B4%D1%80.%20-%20%D0%93%D0%B8%D0%B4%D1%80%D0%B0%D0%B2%D0%BB%D0%B8%D0%BA%D0%B0%2C%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D1%8B%20%D0%B8%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BF%D1%80%D0%B8%D0%B2%D0%BE%D0%B4%D1%8B%20%28%D0%A3%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%BB%D1%8F%20%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BE%D1%81%D1%82%D1%80%D0%BE%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D1%85%20%D0%B2%D1%83%D0%B7%D0%BE%D0%B2%29.djvu")
Your link has some characters converted to percent-encoded values, which makes the URL safe to use on the internet, but the result is not a usable file name on your system.
But you can convert it back using urllib.parse.unquote():
import urllib.parse

# Percent-encoded file name (the last path segment of the URL). It is split
# into adjacent string literals only to keep the lines short; Python joins
# them into one string with nothing in between.
name = urllib.parse.unquote(
    "%D0%91%D0%B0%D1%88%D1%82%D0%B0%20%D0%A2.%D0%9C.%2C%20%D0%B8%20%D0%B4%D1%80.%20-%20"
    "%D0%93%D0%B8%D0%B4%D1%80%D0%B0%D0%B2%D0%BB%D0%B8%D0%BA%D0%B0%2C%20"
    "%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D1%8B%20%D0%B8%20"
    "%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BF%D1%80%D0%B8%D0%B2%D0%BE%D0%B4%D1%8B%20"
    "%28%D0%A3%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%BB%D1%8F%20"
    "%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BE%D1%81%D1%82%D1%80%D0%BE%D0%B8%D1%82"
    "%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D1%85%20"
    "%D0%B2%D1%83%D0%B7%D0%BE%D0%B2%29.djvu"
)
print(name)
Result:
'Башта Т.М., и др. - Гидравлика, гидромашины и гидроприводы (Учебник для машиностроительных вузов).djvu'
There is a problem with your file name.
You cannot use %D0%91%D0%B0%D1%88%D1%82%D0%B0%20%D0%A2.%D0%9C.%2C%20%D0%B8%20%D0%B4 %D1%80.%20-%20%D0%93%D0%B8%D0%B4%D1%80%D0%B0%D0%B2%D0%BB%D0%B8%D0%BA%D0%B0 %2C%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D1%8B%20 %D0%B8%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BF%D1%80%D0%B8%D0%B2%D0%BE%D0%B4 %D1%8B%20%28%D0%A3%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%BB%D1%8F %20%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BE%D1%81%D1%82%D1%80%D0%BE%D0%B8%D1%82 %D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D1%85%20%D0%B2%D1%83%D0%B7%D0%BE%D0%B2%29.djvu As file name
I guess you want to use e5772a162f57b7c2b22c7ec8f6883002 as the file name; if not, you can revise the code to choose another name.
I have modified your code, I have successfully downloaded the file of your URL, and can view it without problems.
import requests
url = 'http://31.42.184.140/main/94000/e5772a162f57b7c2b22c7ec8f6883002/%D0%91%D0%B0%D1%88%D1%82%D0%B0%20%D0%A2.%D0%9C.%2C%20%D0%B8%20%D0%B4%D1%80.%20-%20%D0%93%D0%B8%D0%B4%D1%80%D0%B0%D0%B2%D0%BB%D0%B8%D0%BA%D0%B0%2C%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D1%8B%20%D0%B8%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BF%D1%80%D0%B8%D0%B2%D0%BE%D0%B4%D1%8B%20%28%D0%A3%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%BB%D1%8F%20%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BE%D1%81%D1%82%D1%80%D0%BE%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D1%85%20%D0%B2%D1%83%D0%B7%D0%BE%D0%B2%29.djvu'
def download_url(url):
    """Download *url* and save it in the current directory.

    The file is named ``<piece>.<ext>`` where ``<piece>`` is element 5 of
    ``url.split('/')`` (the hash-named directory for the URL above) and
    ``<ext>`` is the text after the last dot of the URL. Nothing is written
    when the server does not answer with an OK status.

    Raises:
        IndexError: if the URL has fewer than six '/'-separated pieces.
    """
    print("downloading: ", url)
    stem = url.split('/')[5]
    extension = url.rsplit('.', 1)[-1]
    file_name = f"{stem}.{extension}"
    print(stem)
    # stream=True defers fetching the body; the with-block releases the
    # connection on every path, including a non-OK status.
    with requests.get(url, stream=True) as r:
        if r.status_code == requests.codes.ok:
            with open(file_name, 'wb') as f:
                # Write in chunks instead of r.content, which would load the
                # whole book into memory despite stream=True.
                for chunk in r.iter_content(chunk_size=65536):
                    f.write(chunk)
download_url(url=url)
Related
I've written a script in python to get some tabular content from a webpage and write the same in a csv file. What I wish to do now is let my script write the content in a csv file only if the table (visible as Top Mutual Fund Holders) is available in that page otherwise it will remove the csv file that has been created.
The table is available in this webpage.
The table I'm looking for is not available in this webpage.
This is my try:
import os
import csv
import requests
from bs4 import BeautifulSoup
url = "https://finance.yahoo.com/quote/UBER/holders?p=UBER"
def get_mutual_fund(soup):
    """Return the 'Top Mutual Fund Holders' table as a list of rows.

    Each row is the list of cell texts (``th``/``td``) of one ``tr`` found in
    the element that follows the heading. The caller relies on an
    AttributeError being raised when the heading cannot be located.
    """
    heading = soup.select_one("h3:contains('Top Mutual Fund Holders')")
    table_rows = heading.find_next_sibling().select("table tr")
    return [[cell.text for cell in row.select("th,td")] for row in table_rows]
def get_records(link):
    """Fetch *link*, write the fund-holder rows via the global ``writer``, return them.

    When ``get_mutual_fund`` raises AttributeError the result is an empty
    string and ``mutual_fund.csv`` is removed instead of written.

    NOTE(review): the removal runs while the ``with open(...)`` block in the
    ``__main__`` section still holds the file open; that is the
    PermissionError (WinError 32) reported below. Behaviour is deliberately
    left as posted.
    """
    page = requests.get(link)
    soup_obj = BeautifulSoup(page.text, "lxml")
    try:
        item_one = get_mutual_fund(soup_obj)
    except AttributeError:
        item_one = ""

    if not item_one:
        os.remove("mutual_fund.csv")
    else:
        writer.writerows(item_one)
    return item_one


if __name__ == '__main__':
    with open("mutual_fund.csv", "w", newline="") as f:
        # get_records() looks this name up as a module-level global.
        writer = csv.writer(f)
        rows = get_records(url)
        for elem in rows:
            print(elem)
I've tried it with the link that does not have that table. However, it throws the following error while deleting the csv file:
Traceback (most recent call last):
File "C:\Users\WCS\AppData\Local\Programs\Python\Python37-32\demo.py", line 33, in <module>
for elem in get_records(url):
File "C:\Users\WCS\AppData\Local\Programs\Python\Python37-32\demo.py", line 27, in get_records
os.remove("mutual_fund.csv")
PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'mutual_fund.csv'
How can I delete the csv file when the tabular content is not present?
You are actually deleting the file when it is open to write.
You should change your main function accordingly.
def get_records(link):
    """Fetch *link* and return the fund-holder rows.

    Returns None when ``get_mutual_fund`` raises AttributeError.
    """
    page = requests.get(link)
    soup_obj = BeautifulSoup(page.text, "lxml")
    try:
        return get_mutual_fund(soup_obj)
    except AttributeError:
        return None
if __name__ == '__main__':
    # Fetch first, then decide what to do with the file: the CSV is only
    # created when there is something to put in it, so it is never deleted
    # while still open.
    records = get_records(url)
    if records:
        with open("mutual_fund.csv", "w", newline="") as f:
            # Actually store the rows; printing alone left the CSV empty.
            csv.writer(f).writerows(records)
        for elem in records:
            print(elem)
    elif os.path.exists("mutual_fund.csv"):
        # No table on this page: make sure no CSV from an earlier run remains.
        os.remove("mutual_fund.csv")
If you keep your existing logic as it is and delete the file when the content within csv is nothing then the following should work:
import os
import csv
import requests
from bs4 import BeautifulSoup
# url = "https://finance.yahoo.com/quote/fb/holders?p=FB"
url = "https://finance.yahoo.com/quote/UBER/holders?p=UBER"
def get_mutual_fund(soup):
    """Return the 'Top Mutual Fund Holders' table as a list of rows.

    Each row is the list of cell texts (``th``/``td``) of one ``tr`` found in
    the element that follows the heading. The caller relies on an
    AttributeError being raised when the heading cannot be located.
    """
    heading = soup.select_one("h3:contains('Top Mutual Fund Holders')")
    table_rows = heading.find_next_sibling().select("table tr")
    return [[cell.text for cell in row.select("th,td")] for row in table_rows]
def get_records(link, csv_path="mutual_fund.csv"):
    """Scrape the fund-holder table from *link* and mirror it to *csv_path*.

    Args:
        link: URL of the page to scrape.
        csv_path: CSV file to create; defaults to the name the script has
            always used.

    Returns:
        The table rows (a list of lists of cell text), or "" when
        ``get_mutual_fund`` raised AttributeError.

    The CSV is created only when rows were found. Otherwise any file already
    present at *csv_path* is deleted. The function opens the file itself, so
    it no longer depends on ``writer``/``f`` globals created by the caller.
    """
    r = requests.get(link)
    soup_obj = BeautifulSoup(r.text, "lxml")
    try:
        item_one = get_mutual_fund(soup_obj)
    except AttributeError:
        item_one = ""

    if item_one:
        with open(csv_path, "w", newline="") as f:
            csv.writer(f).writerows(item_one)
    elif os.path.exists(csv_path):
        # Nothing is open at this point, so the removal cannot hit a lock
        # held by this script.
        os.remove(csv_path)
    return item_one


if __name__ == '__main__':
    get_records(url)
I am trying to scrape data from Kickstarter. The code works, but it gives the following error on page 15 (you might get the error on a different page, since the webpage is dynamic):
Traceback (most recent call last):
File "C:\Users\lenovo\kick.py", line 30, in <module>
csvwriter.writerow(row)
File "C:\Users\lenovo\AppData\Local\Programs\Python\Python37\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\uff5c' in position 27: character maps to <undefined>
What might be the issue? Any suggestion?
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
import csv
KICKSTARTER_SEARCH_URL = "https://www.kickstarter.com/discover/advanced?category_id=16&sort=newest&seed=2502593&page={}"
DATA_FILE = "kickstarter.csv"

# NOTE(review): the file is opened without an explicit encoding, so the
# platform default is used (cp1252 in the traceback above). That is what the
# reported UnicodeEncodeError comes from; behaviour is left as posted.
csvfile = open(DATA_FILE, 'w')
csvwriter = csv.writer(csvfile, delimiter=',')

# Walk the result pages until one comes back without any project card.
page_start = 0
while True:
    url = KICKSTARTER_SEARCH_URL.format(page_start)
    print(url)
    html = urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    project_details_divs = soup.findAll('div', {"class": "js-react-proj-card"})
    if not project_details_divs:
        break
    for div in project_details_divs:
        # Each card carries the project description as JSON in data-project.
        project = json.loads(div['data-project'])
        csvwriter.writerow(
            [project[key] for key in ("id", "name", "goal", "pledged")]
        )
    page_start += 1

csvfile.close()
Add the argument encoding to your file-opener. I mean, change
csvfile = open(DATA_FILE, 'w')
into
csvfile = open(DATA_FILE, 'w', encoding='utf-8')
However, the recommended practice is to use a context manager, so the file is closed automatically:
# newline='' leaves line endings to the csv module, as its documentation
# requires; without it every row is followed by a blank line on Windows.
with open(DATA_FILE, 'w', encoding='utf-8', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',')
    # ... the scraping loop from the question goes here, indented under the with-block
I have a text file which contains a list of URLs and I am willing to print the contents of the URL in another text file, along with the URL as the header. I have used this project file https://pypi.org/project/Wikipedia-API/ to extract the content, but I would have to enter the link one after another, which I do not want to delve into, since my list is huge, with at least 3000 links per text file.
Can anyone help me with this, it would be highly appreciated.
EDIT:
I have tried this in the following way, but there is no content in the output txt file.
import urllib
import datetime as dt
from datetime import datetime
import time
# Collect the lines of test.txt into linklist.
linklist = []
with open ("test.txt", 'r', encoding = 'utf=8') as wikitxt :
#content = wikitxt.read().splitlines()
# Iterating the file yields each line as read, still ending with its newline.
for i in wikitxt:
linklist.append (i)
# NOTE(review): the output file is opened here, but nothing in the lines shown
# ever writes to it or closes it, and no URL in linklist is fetched.
output = open('Wikipedia_content.txt', 'w', encoding='utf-8')
# NOTE(review): start and end are taken back to back, so the reported runtime
# does not cover any of the work above.
startTime = time.time()
endTime = time.time()
runTime = endTime - startTime
print("Runtime is %3f seconds" % runTime)
Here is the txt file that I have used https://pastebin.com/Y4bwsHGB , and this is the text file that I need to use : https://pastebin.com/SXDAu8jV.
Thanks in advance.
PROBLEM:
Traceback (most recent call last):
File "C:/Users/suva_/Desktop/Project specification/data/test2.py", line 13, in <module>
output_file.write((urlopen(link).read()))
File "D:\Python 36\lib\urllib\request.py", line 228, in urlopen
return opener.open(url, data, timeout)
File "D:\Python 36\lib\urllib\request.py", line 531, in open
response = self._open(req, data)
File "D:\Python 36\lib\urllib\request.py", line 554, in _open
'unknown_open', req)
File "D:\Python 36\lib\urllib\request.py", line 509, in _call_chain
result = func(*args)
File "D:\Python 36\lib\urllib\request.py", line 1389, in unknown_open
raise URLError('unknown url type: %s' % type)
urllib.error.URLError: <urlopen error unknown url type: https>
FINAL FIX:
import urllib
import datetime as dt
from datetime import datetime
import requests
import time
import re
import html2text
startTime = time.time()
def text_opener():
    """Return the lines of ``test.txt`` with surrounding whitespace removed.

    Every line of the file produces one entry; blank lines come back as
    empty strings.

    Raises:
        OSError: if ``test.txt`` cannot be opened.
    """
    # 'utf-8' is the canonical codec name (the original spelled it 'utf=8').
    with open("test.txt", 'r', encoding='utf-8') as wikitxt:
        # str.strip() cannot raise UnicodeEncodeError, so the former
        # try/except around it was dead code and has been dropped.
        return [line.strip() for line in wikitxt]
linklist = text_opener() # put the content in a list and then opened the text
'''
This is a string of characters which I wanted to remove from the URL content
rejectedChar = list('!"#$%&\'()*+,-./:;<=>?#[\\]^_`{|}~0123456789')
rejectedChar.append("\t")
special="\t"
regexWords = r"[\w']+"
'''
'''STOPWORDS LIST WHICH CONTAINS A BUNCH OF WORDS WHICH I DON"T NEED TO BE PRINTED--- ONLY FOR LARGE FILES
#stopwords = []
#with open('stopwords.txt', 'r', encoding='utf-8') as inFile:
# for i in inFile:
# stopwords.append(i.strip())
'''
# Plain `import urllib` does not guarantee that the request submodule is
# loaded, so import it explicitly before use.
import urllib.request

# Download every link and gather the page texts; they are joined once at the
# end instead of growing one string with += on every iteration.
pages = []
for count, i in enumerate(linklist):
    print(count, " ", i.encode('utf-8'))
    try:
        with urllib.request.urlopen(i) as response:
            raw = response.read()
    except Exception as e:
        # Best effort over a long list: a bad link must not stop the run,
        # but it is reported rather than dropped silently.
        print("skipped", i, "->", e)
        continue
    # read() returns bytes. str(raw) would store the "b'...'" repr with
    # escaped newlines, so decode instead. Assumes the pages are UTF-8;
    # undecodable bytes are replaced rather than raising.
    pages.append(raw.decode('utf-8', errors='replace'))
content = "".join(pages)
#print((linklist[0:4000]).encode('utf-8'))
#combinedstops= rejectedChar+stopwords # combining them together
#for item in combinedstops:
#content=content.replace(item,"") # now this items are removed from the
#content
def output_file(content):
    """Write ``str(content)`` to June_wikipedia_content.txt as UTF-8, replacing the file."""
    text = str(content)
    with open('June_wikipedia_content.txt', mode='w', encoding='utf-8') as sink:
        sink.write(text)
## try:
## output_file (content)
## except UnicodeEncodeError as enror:
## print ("Got lost in the game")
#sky=open("sky.txt",'w')
#sky.write(str(content))
output_file (content)
#print("hahahahahaha",stopwords)
#for i in content:
# i = re.findall(regexWords, i)
# i = [i for i in i if i in stopwords]
# startTime is recorded once at the top of the script. Resetting it here, as
# the original did, made the reported runtime always about zero.
endTime = time.time()
runTime = endTime - startTime
print("Runtime is %3f seconds" % runTime)
You can use the following function to open the text file and store all the links in a list:
with open('links.txt') as f:
content = f.read().splitlines()
The variable content is a list with each element containing the string associated with a URL. This will only work though if your links.txt has the URL's arranged line by line i.e:
www.google.co.in
www.wikipedia.co.in
www.youtube.co.in
Once you get this list you can iterate through it with a simple for loop and do what you desire.
If you want a more detailed answer I suggest posting an example text file of the links.
EDIT :
This works but it dumps the whole data into the file. The data is not formatted correctly. Is this what you need ?
from urllib.request import urlopen
# One URL per line; splitlines() drops the line endings.
with open('links.txt') as f:
    content = f.read().splitlines()

with open('Wikipedia_content.txt', 'w', encoding='utf-8') as output_file:
    for link in content:
        if not link.strip():
            continue  # ignore blank lines in links.txt
        # The URL goes on its own line as a header for the page text.
        output_file.write(link + "\n")
        with urlopen(link) as response:
            # read() returns bytes, which a text-mode file rejects with a
            # TypeError, so decode first. Assumes UTF-8 pages; undecodable
            # bytes are replaced rather than raising.
            output_file.write(response.read().decode('utf-8', errors='replace'))
        output_file.write("\n")
I'm having a problem with my Python app, which downloads videos from the web at a specified time. The program is named tidopy.py,
but I get this Error:
Traceback (most recent call last):
File "tidopy.py", line 29, in
file.write(data)
TypeError: argument 1 must be string or buffer, not instance
I have problem with this part:
# Python 2 fragment (urllib2): save each address in addresslist under the
# matching name in namelist, for indexes x up to coun - 1.
while (coun > x):
file = open(namelist[x], 'wb')
# NOTE(review): .read() is applied to the result of file.write(...), so
# write() receives the urlopen() response object itself rather than its
# bytes. That is the TypeError quoted above.
file.write(urllib2.urlopen(addresslist[x])).read()
file.close()
x = x + 1
x is a variable for the number of videos.
namelist is a list for the name of videos.
addresslist is a list for the address of web videos
How can I fix it?
please help.
Here is a simple code to perform a download from a list.
import requests
import shutil
namelist = [...]
addresslist = [...]
# Pair every target file name with its source URL. The original requested the
# *name* and saved to the *address*, i.e. the two lists were swapped.
# zip() stops at the shorter list if the lengths differ.
for name, address in zip(namelist, addresslist):
    r = requests.get(address, stream=True)
    if r.ok:
        with open(name, 'wb') as f:
            # Undo any gzip/deflate content encoding while copying, so the
            # saved bytes are the real file.
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
So I have the following lines of code in a function
sock = urllib.urlopen(url)
html = sock.read()
sock.close()
and they work fine when I call the function by hand. However, when I call the function in a loop (using the same urls as earlier) I get the following error:
> Traceback (most recent call last):
File "./headlines.py", line 256, in <module>
main(argv[1:])
File "./headlines.py", line 37, in main
write_articles(headline, output_folder + "articles_" + term +"/")
File "./headlines.py", line 232, in write_articles
print get_blogs(headline, 5)
File "/Users/michaelnussbaum08/Documents/College/Sophmore_Year/Quarter_2/Innovation/Headlines/_code/get_content.py", line 41, in get_blogs
sock = urllib.urlopen(url)
File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/urllib.py", line 87, in urlopen
return opener.open(url)
File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/urllib.py", line 203, in open
return getattr(self, name)(url)
File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/urllib.py", line 314, in open_http
if not host: raise IOError, ('http error', 'no host given')
IOError: [Errno http error] no host given
Any ideas?
Edit more code:
def get_blogs(term, num_results):
# Python 2 excerpt: builds a Google blog-search RSS URL for `term` and reads
# the response body into `html`.
# NOTE(review): only spaces are replaced below; any other character in `term`
# (for example a trailing newline) goes into the URL unchanged.
# NOTE(review): `num_results` is not used in the lines shown; the URL
# hard-codes num=10.
search_term = term.replace(" ", "+")
print "search_term: " + search_term
url = 'http://blogsearch.google.com/blogsearch_feeds?hl=en&q='+search_term+'&ie=utf-8&num=10&output=rss'
print "url: " +url
#error occurs on line below
sock = urllib.urlopen(url)
html = sock.read()
sock.close()
def write_articles(headline, output_folder, num_articles=5):
#calls get_blogs
if not os.path.exists(output_folder):
os.makedirs(output_folder)
output_file = output_folder+headline.strip("\n")+".txt"
f = open(output_file, 'a')
articles = get_articles(headline, num_articles)
blogs = get_blogs(headline, num_articles)
#NEW FUNCTION
#the loop that calls write_articles
for term in trend_list:
if do_find_max == True:
fill_search_term(term, output_folder)
headlines = headline_process(term, output_folder, max_headlines, do_find_max)
for headline in headlines:
try:
write_articles(headline, output_folder + "articles_" + term +"/")
except UnicodeEncodeError:
pass
I had this problem when a variable I was concatenating with the url, in your case search_term
url = 'http://blogsearch.google.com/blogsearch_feeds?hl=en&q='+search_term+'&ie=utf-8&num=10&output=rss'
had a newline character at the end. So make sure you do
search_term = search_term.strip()
You might also want to do
search_term = urllib2.quote(search_term)
to make sure your string is safe for a url
use urllib2 instead if you don't want to handle reading on a per block basis yourself.
This probably does what you expect.
import urllib2
req = urllib2.Request(url='http://stackoverflow.com/')
f = urllib2.urlopen(req)
print f.read()
In your function's loop, right before the call to urlopen, perhaps put a print statement:
print(url)
sock = urllib.urlopen(url)
This way, when you run the script and get the IOError, you will see the url which is causing the problem. The error "no host given" can be replicated if url equals something like 'http://'...