Trying to write a really basic scraper for YouTube video titles, using a CSV of video links and Beautiful Soup. The script as it currently stands is:
#!/usr/bin/python
from bs4 import BeautifulSoup
import urllib
import csv

with open('url-titles-list.csv', 'wb') as csv_out:
    fieldnames = ['url', 'title']
    writer = csv.DictWriter(csv_out, fieldnames=fieldnames)
    with open('url-nohttps-list.csv', 'rb') as csv_in:
        reader = csv.DictReader(csv_in, fieldnames=['linkurls'])
        writer.writeheader()
        for row in reader:
            link = row['linkurls']
            with urllib.urlopen(link) as response:
                html = response.read()
            soup = BeautifulSoup(html, "html.parser")
            name = soup.title.string
            writer.writerow({'url': row['linkurls'], 'title': name})
This breaks at urllib.urlopen(link), with the following traceback suggesting that the URL type is not being recognised correctly and that it is trying to open the links as local files:
Traceback (most recent call last):
File "/Users/clarapouletty/Desktop/operation_find_yuzusho/fetcher.py", line 15, in <module>
with urllib.urlopen(link) as response:
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 87, in urlopen
return opener.open(url)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 213, in open
return getattr(self, name)(url)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 469, in open_file
return self.open_local_file(url)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 483, in open_local_file
raise IOError(e.errno, e.strerror, e.filename)
IOError: [Errno 2] No such file or directory: 'linkurls'
Process finished with exit code 1
Any assistance much appreciated!
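A diagnostic worth noting: the "file" urllib is failing on is the literal string 'linkurls', i.e. the CSV's header row. Passing fieldnames= to DictReader tells it the file has no header, so the header line is yielded as data and handed to urlopen. Below is a minimal sketch of one likely fix, assuming Python 2 (as in the traceback) and that the input CSV really does start with a 'linkurls' header row:

#!/usr/bin/python
# Sketch of a likely fix, assuming Python 2 as in the traceback.
# Two changes: skip the header row so the literal string 'linkurls' is never
# treated as a URL, and wrap urllib.urlopen() in contextlib.closing() because
# its return value does not support the "with" statement on Python 2.
import csv
import urllib
from contextlib import closing
from bs4 import BeautifulSoup

with open('url-titles-list.csv', 'wb') as csv_out:
    writer = csv.DictWriter(csv_out, fieldnames=['url', 'title'])
    writer.writeheader()
    with open('url-nohttps-list.csv', 'rb') as csv_in:
        reader = csv.DictReader(csv_in, fieldnames=['linkurls'])
        next(reader)  # the file starts with a 'linkurls' header row; skip it
        for row in reader:
            link = row['linkurls']
            with closing(urllib.urlopen(link)) as response:
                soup = BeautifulSoup(response.read(), "html.parser")
            writer.writerow({'url': link, 'title': soup.title.string})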
I am trying to extract the contents of a table within a PDF using PyPDF2, but I am encountering this error when trying to open the PDF and I am not sure why. How can I fix this? Here is the code:
# PDF Table testing
import PyPDF2
import numpy

pdf_file = open(r"PDFs/murrumbidgee/Murrumbidgee Unregulated River Water Sources 2012_20200815.pdf")
read_pdf = PyPDF2.PdfFileReader(pdf_file)
number_of_pages = read_pdf.getNumPages()
page = read_pdf.getPage(50)
page_content = page.extractText()
print(page_content.encode('utf-8'))
table_list = page_content.split('\n')
l = numpy.array_split(table_list, len(table_list) // 7)  # integer division: array_split needs an int
for i in range(0, 5):
    print(l[i])
This is the error:
PdfReadWarning: PdfFileReader stream/file object is not in binary mode. It may not be read correctly. [pdf.py:1079]
Traceback (most recent call last):
File "C:/Users/benjh/Desktop/project/testing_regex.py", line 103, in <module>
read_pdf = PyPDF2.PdfFileReader(pdf_file)
File "C:\Users\benjh\anaconda3\envs\project\lib\site-packages\PyPDF2\pdf.py", line 1084, in __init__
self.read(stream)
File "C:\Users\benjh\anaconda3\envs\project\lib\site-packages\PyPDF2\pdf.py", line 1689, in read
stream.seek(-1, 2)
io.UnsupportedOperation: can't do nonzero end-relative seeks
What does "nonzero end-relative seeks" mean?
Opening the PDF with 'rb' fixes the error.
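For context: stream.seek(-1, 2) asks for the position one byte before the end of the file (whence=2 means "relative to the end"). Files opened in text mode in Python 3 only allow a zero offset when seeking from the end, hence the io.UnsupportedOperation. Binary mode has no such restriction. A minimal sketch of the fixed open:

import PyPDF2

# Binary mode ("rb") gives PyPDF2 a byte stream it can seek from the end of
with open(r"PDFs/murrumbidgee/Murrumbidgee Unregulated River Water Sources 2012_20200815.pdf", "rb") as pdf_file:
    read_pdf = PyPDF2.PdfFileReader(pdf_file)
    page_content = read_pdf.getPage(50).extractText()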
I am trying to remove the HTML tags from some documents in .txt format. However, there seems to be an error within bs4, as far as I understand. The error that I am getting is the following:
Traceback (most recent call last):
File "E:/Google Drive1/Thesis stuff/Python/database/get_missing_10ks.py", line 13, in <module>
text = BeautifulSoup(file_read, "html.parser")
File "C:\Users\Adrian PC\AppData\Local\Programs\Python\Python37\lib\site-packages\bs4\__init__.py", line 282, in __init__
self._feed()
File "C:\Users\Adrian PC\AppData\Local\Programs\Python\Python37\lib\site-packages\bs4\__init__.py", line 343, in _feed
self.builder.feed(self.markup)
File "C:\Users\Adrian PC\AppData\Local\Programs\Python\Python37\lib\site-packages\bs4\builder\_htmlparser.py", line 247, in feed
parser.feed(markup)
File "C:\Users\Adrian PC\AppData\Local\Programs\Python\Python37\lib\html\parser.py", line 111, in feed
self.goahead(0)
File "C:\Users\Adrian PC\AppData\Local\Programs\Python\Python37\lib\html\parser.py", line 179, in goahead
k = self.parse_html_declaration(i)
File "C:\Users\Adrian PC\AppData\Local\Programs\Python\Python37\lib\html\parser.py", line 264, in parse_html_declaration
return self.parse_marked_section(i)
File "C:\Users\Adrian PC\AppData\Local\Programs\Python\Python37\lib\_markupbase.py", line 160, in parse_marked_section
if not match:
UnboundLocalError: local variable 'match' referenced before assignment
And the code that I am using is the following:
import os
from bs4 import BeautifulSoup

path_to_10k = "D:/10ks/list_missing_10k/"
path_to_saved_10k = "D:/10ks/list_missing_10kp/"
list_txt = os.listdir(path_to_10k)
for name in list_txt:
    file = open(path_to_10k + name, "r+", encoding="utf-8")
    file_read = file.read()
    text = BeautifulSoup(file_read, "html.parser")
    text = text.get_text("\n")
    file2 = open(path_to_saved_10k + name, "w+", encoding="utf-8")
    file2.write(str(text))
    file2.close()
    file.close()
The thing is that I have used this method on 51320 documents and it worked just fine; however, there are a few documents it cannot handle. When I open those HTML documents they look the same to me. If anyone has any indication of what the problem could be and how to fix it, that would be great. Thank you!
EXAMPLE OF FILE: https://files.fm/u/2s45uafp
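One workaround worth trying (my assumption, based on where the traceback points: html.parser's marked-section handling choking on a malformed declaration) is to hand the offending files to a more lenient parser such as lxml:

# Sketch: same pipeline, but with the lxml parser (pip install lxml),
# which tends to be more tolerant of malformed <![...]> declarations.
# path_to_10k and name are as in the question's code.
from bs4 import BeautifulSoup

with open(path_to_10k + name, "r", encoding="utf-8") as fh:
    text = BeautifulSoup(fh.read(), "lxml").get_text("\n")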
https://github.com/scrapy/w3lib
https://w3lib.readthedocs.io/en/latest/
pip install w3lib
and
from w3lib.html import remove_tags
Calling remove_tags(data) then returns the cleaned text.
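A quick illustration of what that looks like (the sample string is mine):

from w3lib.html import remove_tags

print(remove_tags('<p>Item 1. <b>Business</b></p>'))  # prints: Item 1. Business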
Here is a solution that uses a regular expression to remove the HTML tags.
import re

TAG_RE = re.compile(r'<[^>]+>')

def remove_Htmltags(text):
    return TAG_RE.sub('', text)

f = open(r"C:\Temp\Data.txt", "r")
strHtml = f.read()
strClearText = remove_Htmltags(strHtml)
print(strClearText)
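A quick sanity check of the pattern on a small snippet (sample string is mine):

print(remove_Htmltags('<p>Hello <b>world</b>!</p>'))  # prints: Hello world!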
I'm having a hard time reading a PDF from the internet into the Python PdfFileReader object.
My code works for the first url, but it doesn't for the second and I don't know how to fix it.
I can see that in the first example the url refers to a .pdf file, while in the second the PDF is returned as 'application data' in the html body.
So I think this might be the issue. Does anybody know how to fix it so the code also works for the second url?
from pyPdf import PdfFileWriter, PdfFileReader
from io import BytesIO
import requests

def test(url, filename):
    response = requests.get(url)
    pdf_file = BytesIO(response.content)
    existing_pdf = PdfFileReader(pdf_file)
    page = existing_pdf.getPage(0)
    output = PdfFileWriter()
    output.addPage(page)
    outputStream = file(filename, "wb")
    output.write(outputStream)
    outputStream.close()

test('https://s21.q4cdn.com/374334112/files/doc_downloads/test.pdf', 'works.pdf')
test('https://eservices.minfin.fgov.be/mym-api-rest/finform/pdf/2057', 'crashes.pdf')
This is the stacktrace I have with the second call of the test function:
D:\scripts>test.py
Traceback (most recent call last):
File "D:\scripts\test.py", line 21, in <module>
test('https://eservices.minfin.fgov.be/mym-api-rest/finform/pdf/2057','crashes.pdf')
File "D:\scripts\test.py", line 10, in test
page = existing_pdf.getPage(0)
File "C:\Python27\lib\site-packages\pyPdf\pdf.py", line 450, in getPage
self._flatten()
File "C:\Python27\lib\site-packages\pyPdf\pdf.py", line 596, in _flatten
catalog = self.trailer["/Root"].getObject()
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 480, in __getitem__
return dict.__getitem__(self, key).getObject()
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 165, in getObject
return self.pdf.getObject(self).getObject()
File "C:\Python27\lib\site-packages\pyPdf\pdf.py", line 655, in getObject
raise Exception, "file has not been decrypted"
Exception: file has not been decrypted
I found a solution: importing PyPDF2 instead of pyPdf made it work, so it was probably a bug in pyPdf.
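For completeness, a sketch of the working version under that fix. PyPDF2 kept a pyPdf-compatible API, so only the import changes; I also swapped the Python-2-only file() builtin for open() so it runs on Python 3 too:

from PyPDF2 import PdfFileWriter, PdfFileReader
from io import BytesIO
import requests

def test(url, filename):
    response = requests.get(url)
    existing_pdf = PdfFileReader(BytesIO(response.content))
    output = PdfFileWriter()
    output.addPage(existing_pdf.getPage(0))
    with open(filename, "wb") as outputStream:
        output.write(outputStream)

test('https://eservices.minfin.fgov.be/mym-api-rest/finform/pdf/2057', 'crashes.pdf')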
I am using Python 2.7 and Flask for developing a web app. I am trying to download a text file from a specific URL and save it in the static folder in the application. Here is my code:
from urllib2 import urlopen

uurl = 'http://textfiles.com/100/ad.txt'

def download(t_url):
    response = urlopen(t_url)
    data = response.read()
    txt_str = str(data)
    lines = txt_str.split("\\n")
    des_url = 'static/forcast.txt'
    fx = open(des_url, "w")
    for line in lines:
        fx.write(line + "\n")
    fx.close()

download(uurl)
Now I run this and get the following errors:
Traceback (most recent call last):
File "/Users/sanam/PycharmProjects/ff/ff.py", line 17, in <module>
download(uurl)
File "/Users/sanam/PycharmProjects/ff/ff.py", line 12, in download
fx = open(des_url,"w")
IOError: [Errno 2] No such file or directory: '/static/forcast.txt'
Nothing is wrong with your code itself: the file path is resolved relative to the directory your Python script runs from, so specify the location of the folder (and make sure it exists).
from urllib2 import urlopen

uurl = 'http://textfiles.com/100/ad.txt'

def download(t_url):
    response = urlopen(t_url)
    data = response.read()
    txt_str = str(data)
    lines = txt_str.split("\\n")
    des_url = 'folder/forcast.csv'
    fx = open(des_url, "w")
    for line in lines:
        fx.write(line + "\n")
    fx.close()

download(uurl)
Or, using a with statement (note the "as fx" binding, which the original attempt was missing):

from urllib2 import urlopen

uurl = 'http://textfiles.com/100/ad.txt'

def download(t_url):
    response = urlopen(t_url)
    data = response.read()
    txt_str = str(data)
    lines = txt_str.split("\\n")
    des_url = 'folder/add.txt'
    with open(des_url, "w") as fx:
        for line in lines:
            fx.write(line + "\n")

download(uurl)
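If the script must always write into the app's static folder no matter where it is launched from, one option (a sketch, assuming the static folder sits next to the script) is to build an absolute path:

import os

# Resolve the destination relative to this script, not the current working directory
base_dir = os.path.dirname(os.path.abspath(__file__))
des_url = os.path.join(base_dir, 'static', 'forcast.txt')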
Here is my program. What I need is to collect images with the png extension from a website and save them according to their names. Here is the code:
from urllib.request import urlopen
from urllib.request import urlretrieve
import re

webpage = urlopen('http://www.techradar.com/news/internet/web/12-best-places-to-get-free-images-for-your-site-624818').read()
patFinderImage = re.compile('<img src="(.*)png" />')
filename = ("D:\test\test.json")
imgUrl = re().findall(patFinderImage, webpage)
print("now-----")
actually_download = False
if actually_download:
    filename = imgUrl.split('/')[-1]
    urlretrieve(imgUrl, filename)
    # fileName = basename(urlsplit(imgUrl)[2])
data = open(filename, 'wb')
data.write(data)
data.close()
Here is the error:
pydev debugger: starting
Traceback (most recent call last):
File "C:\Users\joh\Downloads\eclipse\plugins\org.python.pydev_3.3.3.201401272249\pysrc\pydevd.py", line 1738, in <module>
debugger.run(setup['file'], None, None)
File "C:\Users\joh\Downloads\eclipse\plugins\org.python.pydev_3.3.3.201401272249\pysrc\pydevd.py", line 1355, in run
pydev_imports.execfile(file, globals, locals) # execute the script
File "C:\Users\joh\Downloads\eclipse\plugins\org.python.pydev_3.3.3.201401272249\pysrc\_pydev_execfile.py", line 38, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc) #execute the script
File "C:\Users\joh\workspace\new2\url2.py", line 15, in <module>
imgUrl = re().findall(patFinderImage, webpage)
TypeError: 'module' object is not callable
You have error in this line:
imgUrl = re().findall(patFinderImage, webpage)
Since re is a module, not a function, it should be:
imgUrl = re.findall(patFinderImage, webpage)
But later you have another error.
Correct code is below (I added .decode("utf-8")): the content from read() is of type bytes, so you need to decode it to a string before running the regex on it.
import re
from urllib.request import urlopen
from urllib.request import urlretrieve

webpage = urlopen('http://www.techradar.com/news/internet/web/12-best-places-to-get-free-images-for-your-site-624818').read().decode("utf-8")
patFinderImage = re.compile('<img src="(.*)png" />')
filename = ("/tmp/test.json")
imgUrl = re.findall(patFinderImage, webpage)
print("now-----")
actually_download = False
if actually_download:
    # findall() returns a list, so download each match separately;
    # urlretrieve() writes the file itself, no extra open()/write() needed
    for img_url in imgUrl:
        filename = img_url.split('/')[-1]
        urlretrieve(img_url, filename)
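One further note on the pattern itself (my suggestion, not part of the original answer): (.*) is greedy, so a single line containing two img tags can collapse into one bogus match. A non-greedy group is usually safer here:

import re

# Non-greedy group: stops at the first "png" after src=", not the last one on the line
patFinderImage = re.compile(r'<img src="(.*?)png" />')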