cannot use a string pattern on a bytes-like object (Python)

I'm creating a crawler in Python to list all the links on a website, but I'm getting an error and I can't see what causes it.
The error is:
Traceback (most recent call last):
File "vul_scanner.py", line 8, in <module>
vuln_scanner.crawl(target_url)
File "C:\Users\Lenovo x240\Documents\website\website\spiders\scanner.py", line 18, in crawl
href_links= self.extract_links_from(url)
File "C:\Users\Lenovo x240\Documents\website\website\spiders\scanner.py", line 15, in extract_links_from
return re.findall('(?:href=")(.*?)"', response.content)
File "C:\Users\Lenovo x240\AppData\Local\Programs\Python\Python38\lib\re.py", line 241, in findall
return _compile(pattern, flags).findall(string)
TypeError: cannot use a string pattern on a bytes-like object
My code is, in the scanner.py file:
# To ignore numpy errors:
# pylint: disable=E1101
import urllib
import requests
import re
from urllib.parse import urljoin

class Scanner:
    def __init__(self, url):
        self.target_url = url
        self.target_links = []

    def extract_links_from(self, url):
        response = requests.get(url)
        return re.findall('(?:href=")(.*?)"', response.content)

    def crawl(self, url):
        href_links = self.extract_links_from(url)
        for link in href_links:
            link = urljoin(url, link)
            if "#" in link:
                link = link.split("#")[0]
            if self.target_url in link and link not in self.target_links:
                self.target_links.append(link)
                print(link)
                self.crawl(link)
and in the vul_scanner.py file:
import scanner
# To ignore numpy errors:
# pylint: disable=E1101
target_url = "https://www.amazon.com"
vuln_scanner = scanner.Scanner(target_url)
vuln_scanner.crawl(target_url)
The command I run is: python vul_scanner.py

return re.findall('(?:href=")(.*?)"', response.content)
response.content in this case is of type bytes. So either you use response.text, which gives you the decoded text and lets you process it the way you plan to, or you can check this out:
Regular expression parsing a binary file?
In case you want to continue down the binary road.
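For example, a minimal sketch of the first option as a drop-in replacement for extract_links_from in scanner.py (the bytes route is shown commented out; if you take it, the rest of the crawler has to work with bytes links too):

    def extract_links_from(self, url):
        response = requests.get(url)
        # response.text is the body decoded to str, so the existing str pattern works
        return re.findall('(?:href=")(.*?)"', response.text)
        # bytes alternative: keep response.content and use a bytes pattern
        # return re.findall(b'(?:href=")(.*?)"', response.content)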
Cheers

Related

delete image using msg_id in python?

I have a camera whose pictures can be accessed through the camera's IP address in a web browser.
I can get the image link and then download the image to my local system.
After that, I have to delete the image using msg_id, token, and param.
I added the image link for deletion using msg_id.
from time import sleep
import os
import sys
import requests
from bs4 import BeautifulSoup
import piexif
import os
from fractions import Fraction

archive_url = "http://192.168.42.1/SD/AMBA/191123000/"

def get_img_links():
    # create response object
    r = requests.get(archive_url)
    # create beautiful-soup object
    soup = BeautifulSoup(r.content, 'html5lib')
    # find all links on web-page
    links = soup.findAll('a')
    # filter the links ending with .JPG
    img_links = [archive_url + link['href'] for link in links if link['href'].endswith('JPG')]
    return img_links

def FileDelete():
    FilesToProcess = get_img_links()
    print(FilesToProcess)
    FilesToProcessStr = "\n".join(FilesToProcess)
    for FileTP in FilesToProcess:
        tosend = '{"msg_id":1281,"token":%s,"param":"%s"}' % (token, FileTP)
        print("Delete successfully")
Getting this error:
NameError: name 'token' is not defined
runfile('D:/EdallSystem/socket_pro/pic/hy/support.py', wdir='D:/EdallSystem/socket_pro/pic/hy')
['http://192.168.42.1/SD/AMBA/191123000/13063800.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13064200.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13064600.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13065000.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13065400.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13065800.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13072700.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13073100.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13073500.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13073900.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13074300.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13074700.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13075100.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13075500.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13075900.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13080300.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13080700.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13081100.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13081500.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13081900.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13082300.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13082700.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13083100.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13083500.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13083900.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13084300.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13084700.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13085100.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13085500.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13085900.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13090300.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13090700.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13091100.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13091500.JPG', 'http://192.168.42.1/SD/AMBA/191123000/13091900.JPG']
Traceback (most recent call last):
File "D:\EdallSystem\socket_pro\pic\hy\support.py", line 82, in <module>
FileDelete()
File "D:\EdallSystem\socket_pro\pic\hy\support.py", line 74, in FileDelete
tosend = '{"msg_id":1281,"token":%s,"param":"%s"}' %( FileTP)
TypeError: not enough arguments for format string
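Both tracebacks point at the same line: the format string has two %s placeholders, so it needs two values, and token has to exist before it is used. A minimal sketch reusing get_img_links from the question, assuming the real token value comes from elsewhere (the value below is only a placeholder):

import json

token = "PLACEHOLDER"  # hypothetical value: the real token must come from the camera's own login/session call

def FileDelete():
    FilesToProcess = get_img_links()
    for FileTP in FilesToProcess:
        # both %s placeholders get a value, in order
        tosend = '{"msg_id":1281,"token":%s,"param":"%s"}' % (token, FileTP)
        # equivalently, json.dumps builds the same payload without manual quoting
        # tosend = json.dumps({"msg_id": 1281, "token": token, "param": FileTP})
        print(tosend)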

python's beautiful soup module giving error

I am using the following code in an attempt to do web scraping.
import sys, os
import requests, webbrowser, bs4
from PIL import Image
import pyautogui

p = requests.get('http://www.goal.com/en-ie/news/ozil-agent-eviscerates-jealous-keown-over-stupid-comments/1javhtwzz72q113dnonn24mnr1')
n = open("exml.txt", 'wb')
for i in p.iter_content(1000):
    n.write(i)
n.close()
n = open("exml.txt", 'r')
soupy = bs4.BeautifulSoup(n, "html.parser")
elems = soupy.select('img[src]')
for u in elems:
    print(u)
What I intend to do is extract all the image links that are in the response obtained from the page.
(Please correct me if I am wrong in thinking that requests.get returns the whole static HTML file of the webpage that opens on entering the URL.)
However in the line :
soupy= bs4.BeautifulSoup(n,"html.parser")
I am getting the following error :
Traceback (most recent call last):
File "../../perl/webscratcher.txt", line 24, in <module>
soupy= bs4.BeautifulSoup(n,"html.parser")
File "C:\Users\Kanishc\AppData\Local\Programs\Python\Python36-32\lib\site-packages\bs4\__init__.py", line 191, in __init__
markup = markup.read()
File "C:\Users\Kanishc\AppData\Local\Programs\Python\Python36-32\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 24662: character maps to <undefined>
I am clueless about the error, and the "AppData" folder is empty.
How do I proceed further?
After trying the suggestions:
I changed the extension of the filename to .py and that error went away. However, on the following line:
soupy = bs4.BeautifulSoup(n, "lxml")
I am getting the following error:
Traceback (most recent call last):
File "C:\perl\webscratcher.py", line 23, in
soupy= bs4.BeautifulSoup(p,"lxml")
File "C:\Users\PREMRAJ\AppData\Local\Programs\Python\Python36-32\lib\site-packages\bs4_init_.py", line 192, in init
elif len(markup) <= 256 and (
TypeError: object of type 'Response' has no len()
How do I tackle this?
You are over-complicating things. Pass the bytes content of the Response object directly into the BeautifulSoup constructor instead of writing it to a file first.
import requests
from bs4 import BeautifulSoup
response = requests.get('http://www.goal.com/en-ie/news/ozil-agent-eviscerates-jealous-keown-over-stupid-comments/1javhtwzz72q113dnonn24mnr1')
soup = BeautifulSoup(response.content, 'lxml')
for element in soup.select('img[src]'):
    print(element)
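(As an aside, if you do want to keep the intermediate file: the earlier UnicodeDecodeError came from reopening the file in text mode, where Windows defaults to the cp1252 codec. Reopening it in binary mode, e.g. n = open("exml.txt", 'rb'), would sidestep that, since BeautifulSoup accepts bytes as well.)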
Okay, so you might want to review working with BeautifulSoup. I referenced an old project of mine, and this is all you need for printing them. Check the BeautifulSoup documentation to find the exact syntax you want with the select method.
This will print all the img tags from the HTML.
import requests, bs4
site = 'http://www.goal.com/en-ie/news/ozil-agent-eviscerates-jealous-keown-over-stupid-comments/1javhtwzz72q113dnonn24mnr1'
p = requests.get(site).text
soupy = bs4.BeautifulSoup(p,"html.parser")
elems = soupy.select('img[src]')
for u in elems:
    print(u)

Finding Mac address for Python3 [duplicate]

I am trying to learn how to automatically fetch urls from a page. In the following code I am trying to get the title of the webpage:
import urllib.request
import re
url = "http://www.google.com"
regex = r'<title>(,+?)</title>'
pattern = re.compile(regex)
with urllib.request.urlopen(url) as response:
    html = response.read()
title = re.findall(pattern, html)
print(title)
And I get this unexpected error:
Traceback (most recent call last):
File "path\to\file\Crawler.py", line 11, in <module>
title = re.findall(pattern, html)
File "C:\Python33\lib\re.py", line 201, in findall
return _compile(pattern, flags).findall(string)
TypeError: can't use a string pattern on a bytes-like object
What am I doing wrong?
You want to convert html (a bytes-like object) into a string using .decode, e.g. html = response.read().decode('utf-8').
See Convert bytes to a Python String
The problem is that your regex is a string, but html is bytes:
>>> type(html)
<class 'bytes'>
Since Python doesn't know how those bytes are encoded, it throws an exception when you try to use a string pattern on them.
You can either decode the bytes to a string:
html = html.decode('ISO-8859-1') # encoding may vary!
title = re.findall(pattern, html) # no more error
Or use a bytes regex:
regex = rb'<title>(,+?)</title>'
# ^
In this particular context, you can get the encoding from the response headers:
with urllib.request.urlopen(url) as response:
    encoding = response.info().get_param('charset', 'utf8')
    html = response.read().decode(encoding)
See the urlopen documentation for more details.
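Putting those pieces together, a minimal sketch of the whole corrected script; note that the (,+?) group in the question's regex only matches commas, so the sketch uses (.+?) instead:

import re
import urllib.request

url = "http://www.google.com"
pattern = re.compile(r'<title>(.+?)</title>')  # the question's (,+?) would only match commas

with urllib.request.urlopen(url) as response:
    encoding = response.info().get_param('charset', 'utf8')
    html = response.read().decode(encoding)

title = re.findall(pattern, html)
print(title)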
Based on the answer above, this was simple to do when reading a PDF as well:
text = text.decode('ISO-8859-1')
Thanks @Aran-fey

Unable to use re.compile()

This program is a very simple example of web scraping. The program's goal is to go on the internet, find a specific stock, and then tell the user the price that the stock is currently trading at. However, when I run the code, this error message comes up:
Traceback (most recent call last):
File "HTMLwebscrape.py", line 15, in <module>
price = re.findall(pattern,htmltext)
File "C:\Python34\lib\re.py", line 210, in findall
return _compile(pattern, flags).findall(string)
TypeError: can't use a string pattern on a bytes-like object
Below is the actual script of the program. I've tried finding ways to solve this, but so far I've been unable to. I've been running this on Python 3 and using Sublime Text as my text editor. Thank you in advance!
import urllib
import re
from urllib.request import Request, urlopen
from urllib.error import URLError

symbolslist = ["AAPL", "SPY", "GOOG", "NFLX"]
i = 0
while i < len(symbolslist):
    url = Request("http://finance.yahoo.com/q?s=AAPL&ql=1")
    htmlfile = urlopen(url)
    htmltext = htmlfile.read()
    regex = '<span id="yfs_184_' + symbolslist[i] + '"">(.+?)</span>'
    pattern = re.compile(regex)
    price = re.findall(pattern, htmltext)
    print(price)
    i += 1
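This is the same bytes-vs-str mismatch as the questions above: htmlfile.read() returns bytes while the pattern is a str. A minimal sketch of the fix for the two lines inside the loop, assuming the page decodes as UTF-8 (reading the charset from the response headers, as shown in the other answers, would be more robust):

    htmltext = htmlfile.read().decode('utf-8')  # decode the bytes before matching a str pattern
    price = re.findall(pattern, htmltext)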

Error for Python 3.4.1 regarding string pattern and bytes-like object

I'm new to Python and Stack Overflow.
I'm trying to follow a YouTube tutorial (outdated, I'm guessing, based on the error I get) about fetching stock prices.
Here is the following program:
import urllib.request
import re
html = urllib.request.urlopen('http://finance.yahoo.com/q?uhb=uh3_finance_vert_gs_ctrl2&fr=&type=2button&s=AAPL')
htmltext = html.read()
regex = '<span id="yfs_l84_aapl">.+?</span>'
pattern = re.compile(regex)
price = re.findall(pattern, htmltext)
print(price)
Since this is Python 3, I had to read up on urllib.request and use those methods instead of a simple urllib.urlopen.
Anyways, when I run it, I get the following error:
Traceback (most recent call last):
File "/Users/Harshil/Desktop/stockFetch.py", line 13, in <module>
price = re.findall(pattern, htmltext)
File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/re.py", line 206, in findall
return _compile(pattern, flags).findall(string)
TypeError: can't use a string pattern on a bytes-like object
I recognized the error and attempted to fix it by adding the following:
codec = html.info().get_param('charset', 'utf8')
htmltext = html.decode(codec)
But it gives me another error:
Traceback (most recent call last):
File "/Users/Harshil/Desktop/stockFetch.py", line 9, in <module>
htmltext = html.decode(codec)
AttributeError: 'HTTPResponse' object has no attribute 'decode'
Hence, after spending a reasonable amount of time on this, I don't know what to do. All I want is to get the price for AAPL, so I can go on to build a general program that fetches prices for an array of stocks and uses them in future programs.
Any help is appreciated. Thanks!
You are barking up the right tree. Try decoding the actual HTML byte string rather than the urlopen HTTPResponse:
htmltext = html.read()
codec = html.info().get_param('charset', 'utf8')
htmltext = htmltext.decode(codec)
price = re.findall(pattern, htmltext)
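For completeness, a minimal end-to-end sketch under the same assumptions (the Yahoo URL and span id come from the question and may no longer return anything):

import re
import urllib.request

html = urllib.request.urlopen('http://finance.yahoo.com/q?uhb=uh3_finance_vert_gs_ctrl2&fr=&type=2button&s=AAPL')
codec = html.info().get_param('charset', 'utf8')   # charset from the response headers, else utf8
htmltext = html.read().decode(codec)               # decode the bytes once, up front

pattern = re.compile('<span id="yfs_l84_aapl">.+?</span>')
print(re.findall(pattern, htmltext))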
