Python: Manga parsing returns empty file

I want to parse images from a "certain" manga and chapter. Here's my code:
import requests, bs4, os, urllib.request

try:
    url = "http://manganelo.com/chapter/read_one_punch_man_manga_online_free3/chapter_136"
    res = requests.get(url)
    print("[+] Asking a request to " + url)
    # slice the url so it only contains the name and chapter
    name = url[34:].replace("/", "_")
    os.mkdir(name)
    print("[+] Making '{}' directory".format(name))
    os.chdir(os.path.join(os.getcwd(), name))
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    for img in soup.findAll("img"):
        manga_url = img.get("src")
        manga_name = img.get("alt") + ".jpg"
        urllib.request.urlretrieve(manga_url, manga_name)
        print("[+] Downloading: " + manga_name)
except Exception as e:
    print("[-] Error: " + str(e))
It works fine, BUT only for specific chapters. Let's say I put chapter 130: when I run the code it returns blank files, but if I put chapter 136 or others it works fine. How can this happen?

You can replace
urllib.request.urlretrieve(manga_url, manga_name)
with:
r = requests.get(manga_url, stream=True)
if r.status_code == 200:
    with open(manga_name, 'wb') as f:
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)
(this needs an import shutil at the top).
The remote server is apparently checking the User-Agent header and rejecting requests from Python's urllib.
Alternatively, you can use:
opener = urllib.request.URLopener()
opener.addheader('User-Agent', 'whatever')
opener.retrieve(manga_url, manga_name)
This works for me.
Hope this helps.
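As an extra sketch that is not part of the original answer, the same User-Agent idea also works with requests alone; the header value below is an arbitrary browser-like placeholder:
import requests, shutil

headers = {"User-Agent": "Mozilla/5.0"}   # placeholder; any browser-like string should do

# manga_url and manga_name come from the loop in the question's code
r = requests.get(manga_url, headers=headers, stream=True)
if r.status_code == 200:
    with open(manga_name, "wb") as f:
        r.raw.decode_content = True        # let requests decompress gzip/deflate
        shutil.copyfileobj(r.raw, f)       # stream the image straight to disk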

Related

How to let multiple web addresses (http) point to the same webpage [duplicate]

I am trying to capture HTTP status code 3XX/302 for a redirection URL, but I cannot get it because it gives a 200 status code.
Here is the code:
import requests
r = requests.get('http://goo.gl/NZek5')
print r.status_code
I suppose this should issue either 301 or 302 because it redirects to another page. I tried a few redirecting URLs (e.g. http://fb.com) but it again issues 200. What should be done to capture the redirection code properly?
requests handles redirects for you, see redirection and history.
Set allow_redirects=False if you don't want requests to handle redirections, or you can inspect the redirection responses contained in the r.history list.
Demo:
>>> import requests
>>> url = 'https://httpbin.org/redirect-to'
>>> params = {"status_code": 301, "url": "https://stackoverflow.com/q/22150023"}
>>> r = requests.get(url, params=params)
>>> r.history
[<Response [301]>, <Response [302]>]
>>> r.history[0].status_code
301
>>> r.history[0].headers['Location']
'https://stackoverflow.com/q/22150023'
>>> r.url
'https://stackoverflow.com/questions/22150023/http-redirection-code-3xx-in-python-requests'
>>> r = requests.get(url, params=params, allow_redirects=False)
>>> r.status_code
301
>>> r.url
'https://httpbin.org/redirect-to?status_code=301&url=https%3A%2F%2Fstackoverflow.com%2Fq%2F22150023'
So if allow_redirects is True, the redirects are followed and the response you get back is the final page. If allow_redirects is False, the first response is returned, even if it is a redirect.
requests.get allows for an optional keyword argument allow_redirects which defaults to True. Setting allow_redirects to False will disable automatically following redirects, as follows:
In [1]: import requests
In [2]: r = requests.get('http://goo.gl/NZek5', allow_redirects=False)
In [3]: print r.status_code
301
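As a small extra sketch not taken from the answers above: if the goal is just to resolve where a short link finally points, you can let requests follow the redirects and read the result back, here with a HEAD request so the body is not downloaded (assuming the target allows HEAD):
import requests

resp = requests.head('http://goo.gl/NZek5', allow_redirects=True)
print(resp.url)                                   # final URL after all redirects
print([r.status_code for r in resp.history])      # status codes of the intermediate hops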
This solution identifies the redirect, displays the history of redirects, and handles common errors. It asks you for the URL in the console.
import requests

def init():
    console = input("Type the URL: ")
    get_status_code_from_request_url(console)

def get_status_code_from_request_url(url, do_restart=True):
    try:
        r = requests.get(url)
        if len(r.history) < 1:
            print("Status Code: " + str(r.status_code))
        else:
            print("Status Code: 301. Below are the redirects")
            h = r.history
            i = 0
            for resp in h:
                print(" " + str(i) + " - URL " + resp.url + " \n")
                i += 1
        if do_restart:
            init()
    except requests.exceptions.MissingSchema:
        print("You forgot the protocol. http://, https://, ftp://")
    except requests.exceptions.ConnectionError:
        print("Sorry, but I couldn't connect. There was a connection problem.")
    except requests.exceptions.Timeout:
        print("Sorry, but I couldn't connect. I timed out.")
    except requests.exceptions.TooManyRedirects:
        print("There were too many redirects. I can't count that high.")

init()
Anyone have the PHP version of this code?

Extract the background-image URL from the style attribute of an HTML file in Python

I'm coding a website cloner in Python. It is doing fine for most files, but I have found a challenge in getting the URL of background images, e.g.
<div style="background-image: url(images/banner.jpg)" >
The script detects background-image as a folder and assumes the URL is 'background-image: url(images/banner.jpg'. How do I make it get the actual URL?
Python 2.7
import urllib2
import sys
import socket
import os
import re

socket.setdefaulttimeout(15)
dataTypesToDownload = [".jpg", ".jpeg", ".png", ".gif", ".ico", ".css", ".js", ".html"]
url = 'http://example.com/'
pathbase = 'theme'

if "http://" not in url and "https://" not in url:
    url = "http://"+url

try:
    os.mkdir(pathbase)
except OSError:
    pass

file = open(pathbase + "/index.html", "w")

try:
    content = urllib2.urlopen(url).read()
except urllib2.URLError as e:
    print "An error occured: " + str(e.reason)
    exit()

resources = re.split("=\"|='", content)
first = False

for resource in resources:
    if first == False:
        first = True
        continue
    resource = re.split("\"|'", resource)[0]
    if any(s in resource for s in dataTypesToDownload):
        print "Downloading " + resource
        try:
            path = resource.split("/")
            if len(path) != 1:
                path.pop(len(path) - 1)
            trail = "./" + pathbase + "/"
            for folder in path:
                trail += folder+"/"
                try:
                    os.mkdir(trail)
                except OSError:
                    pass
        except IOError:
            pass
        try:
            if "?" in resource:
                download = open(pathbase + "/"+resource.split("?")[len(resource.split("?")) - 2], "w")
            else:
                download = open(pathbase + "/"+resource, "w")
            print url+"/"+resource
            dContent = urllib2.urlopen(url+"/"+resource).read()
        except urllib2.URLError as e:
            print "An error occured: " + str(e.reason)
            download.close()
            continue
        except IOError:
            pass
            continue
        download.write(dContent)
        download.close()
        print "Downloaded!"

file.write(content)
file.close()
I expect that when it encounters style="background-image: url(images/banner.jpg)",
it should set resource to images/banner.jpg. But it is setting resource to background-image: url(images/banner.jpg instead.
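No answer is recorded here, but as a hedged sketch (not the poster's code): instead of splitting on =" or =', you can pull the value out of the CSS url(...) wrapper with a regular expression, for example:
import re

html = '<div style="background-image: url(images/banner.jpg)" >'

# Grab everything between url( and ), then trim optional quotes and whitespace.
for raw in re.findall(r'url\(([^)]+)\)', html):
    print(raw.strip().strip('\'"'))   # -> images/banner.jpg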

InvalidSchema(“No connection adapters were found for '%s'” % url)

I'm trying to run my scraper, but there is a problem with the URLs. They look like this: //ocdn.eu/pul...
Error message:
raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for '/http:///http://...
The error is raised at the r = session.get line. Thanks for help!
for post in posts:
    title = post.find('span', {'class': 'title'}).get_text()
    link = post.find("a")['href']
    image_source = post.find('img')['src']
    image_source_solved = "http://".join(image_source)
    # stackoverflow solution
    media_root = '/Users/mat/Desktop/jspython/just django/dashboard/media_root'
    if not image_source.startswith(("data:image", "javascript")):
        local_filename = image_source.split('/')[-1].split("?")[0]
        r = session.get(image_source_solved, stream=True, verify=False)
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                f.write(chunk)
        current_image_absolute_path = os.path.abspath(local_filename)
        shutil.move(current_image_absolute_path, media_root)
I changed this line:
image_source_solved = "http://".join(image_source)
to this line:
image_source_solved = "http:{}".format(image_source)
str.join inserts "http://" between every character of image_source, which is what produced the mangled '/http:///http://... value in the error. The scraped src values are protocol-relative (they start with //), so they only need "http:" prefixed.
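A slightly more general hedged sketch, not from the answer: urllib.parse.urljoin resolves protocol-relative and relative src values against the page URL, so it also covers sources that are not protocol-relative. page_url is a stand-in for whatever page the scraper fetched:
from urllib.parse import urljoin

page_url = "http://example.com/gallery"                 # hypothetical page being scraped
image_source = "//ocdn.eu/pulscms/some-image.jpg"       # protocol-relative src from the HTML

# urljoin keeps absolute URLs as-is, adds the scheme for //host/... values,
# and resolves relative paths against page_url.
image_source_solved = urljoin(page_url, image_source)
print(image_source_solved)                              # -> http://ocdn.eu/pulscms/some-image.jpg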

Python download videos from HTTP URL with bad internet

I have a problem with downloading videos from my server, e.g. http://screensfiles.dbtouch.com/screens2/Companies/89/HD/Cornaro%20USLUGE.mp4
Everything works perfectly when the internet is OK, and it also recovers if I disconnect the LAN cable from the Raspberry Pi for less than 10-15 seconds. But when the internet is off for more than 10-15 seconds, my download does not continue, or the videos are not downloaded properly (I merge them later with MP4Box and they need to be complete). If someone has a suggestion how to solve this problem and can help me, I would appreciate it very much.
Here is my code:
import os
import urllib
import urllib2
import time
import commands
import requests
import shutil
from urllib2 import URLError

urls = ['http://screensfiles.dbtouch.com/screens2/Companies/89/HD/00 APPS OVERVIEW.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/Cornaro USLUGE.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/ILIRIJA BIOGRAD 2016.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/Restoran marina.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/HT Screens.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/Hotels Touch - Tasks.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/Croatia Full of life.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/04 PROJECTS.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/05 ATTEND.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/Cornaro Hotel.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/Plurato dron snimka 2.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/Plurato dron snimka 2.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/Plurato dron snimka 2.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/Cornaro USLUGE.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/Cornaro USLUGE.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/Hotels Touch - Screens.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/Hotels Touch - Screens.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/Hotels Touch - Tasks.mp4',
        'http://screensfiles.dbtouch.com/screens2/Companies/89/HD/Hotels Touch - Screens.mp4']

directory = "/home/pi/pythonSignage/current_playlist/videos_to_merge/"

i=1
for url in urls:
    i += 1
    print("current iter: ")
    print(i)
    if (len(urls) > 1):
        url_formatted = url.split('/')[-1].replace(" ", "").replace("%20", "") + " "
    else:
        url_formatted = url.split('/')[-1].replace(" ", "").replace("%20", "")
    url_formatted_name = url.split('/')[-1].replace(" ", "").replace("%20", "").rstrip()

    while True:
        print("inside while true")
        try:
            """ method 0 doesn't work """
            print("try")
            response = urllib2.urlopen(url, timeout=5)
            content = response.read()
            print("content")
            f = open(directory + url_formatted_name, 'wb')
            f.write(content)
            f.close()
            """ method 1 doesn't work """
            #video_fancy_downloader = urllib.FancyURLopener()
            #video_fancy_downloader.retrieve(url, directory + url_formatted_name)
            """ method 2 - doesn't work """
            #my_file = urllib.URLopener()
            #my_file = retrieve(url, directory + url_formatted_name)
            """ method 3 - doesn't work """
            #response = requests.get(url, stream=True)
            #response.raise_for_status()
            #with open(directory + url_formatted_name, 'wb') as handle:
            #    for block in response.iter_content(1024):
            #        handle.write(block)
        except:
            print("error download, sleep 5 sec")
            time.sleep(5)

print("end")
I have managed to solve my problem. Maybe this is not the best approach, but it works.
Here is the function that downloads a video and returns whether it succeeded:
import subprocess

def do_download(destination, url):
    comm = ["wget", "-c", "-O", destination, "-t", "15000", "-T", "5", url]
    proc = subprocess.Popen(comm, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    tmp = proc.stdout.read()
    if "wget: unable to resolve host address" in tmp:
        return False
    else:
        return True
The core part of the downloading code is almost the same, but now it calls do_download inside the while loop and checks its response:
if os.path.isfile(directory+url_formatted_name) is False:
    print("must download file!")
    while downl_success is False:
        print("inside while true")
        try:
            print("try")
            while(do_download(directory + url_formatted_name, url) is False):
                print(" ------- inside while for download ----------- ")
                time.sleep(5)
            downl_success = True
            print("file downloaded fully!")
            break
        except HTTPError, e:
            print "HTTPError", e.code, url
            time.sleep(5)
        except URLError, e:
            print "URL Error", e.reason, url
            time.sleep(5)
else:
    print("file already downloaded no need to download it again!")

python request urls parallel [duplicate]

This question already has an answer here:
How to send multiple http requests python
(1 answer)
Closed 6 years ago.
I created the following script to download images from an API endpoint, and it works as intended. The thing is that it is rather slow, as all the requests have to wait on each other. What is the correct way to keep the steps sequential for each item I want to fetch, but make them parallel across items? The API is from an online service called
servicem8
So what I hope to achieve is:
fetch all possible job ids => keep the name and other info
fetch the name of the customer
fetch each attachment of a job
These three steps should be done for each job. So I could make things parallel per job, as the jobs do not have to wait on each other.
Update:
The problem I do not understand is how you can bundle, for example, the three calls per item into one unit, since it is only per item that I can do things in parallel. For example, when I want to
fetch item (fetch name => fetch description => fetch id)
it is the "fetch item" as a whole that I want to make parallel?
The current code I have is working but rather slow:
import requests
import dateutil.parser
import shutil
import os

user = "test#test.com"
passw = "test"

print("Read json")
url = "https://api.servicem8.com/api_1.0/job.json"
r = requests.get(url, auth=(user, passw))
print("finished reading jobs.json file")

scheduled_jobs = []
if r.status_code == 200:
    for item in r.json():
        scheduled_date = item['job_is_scheduled_until_stamp']
        try:
            parsed_date = dateutil.parser.parse(scheduled_date)
            if parsed_date.year == 2016:
                if parsed_date.month == 10:
                    if parsed_date.day == 10:
                        url_customer = "https://api.servicem8.com/api_1.0/Company/{}.json".format(item['company_uuid'])
                        c = requests.get(url_customer, auth=(user, passw))
                        cus_name = c.json()['name']
                        scheduled_jobs.append([item['uuid'], item['generated_job_id'], cus_name])
        except ValueError:
            pass

for job in scheduled_jobs:
    print("fetch for job {}".format(job))
    url = "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(job[0])
    r = requests.get(url, auth=(user, passw))
    if r.json() == []:
        pass
    for attachment in r.json():
        if attachment['active'] == 1 and attachment['file_type'] != '.pdf':
            print("fetch for attachment {}".format(attachment))
            url_staff = "https://api.servicem8.com/api_1.0/Staff.json?%24filter=uuid%20eq%20{}".format(attachment['created_by_staff_uuid'])
            s = requests.get(url_staff, auth=(user, passw))
            for staff in s.json():
                tech = "{}_{}".format(staff['first'], staff['last'])
                url = "https://api.servicem8.com/api_1.0/Attachment/{}.file".format(attachment['uuid'])
                r = requests.get(url, auth=(user, passw), stream=True)
                if r.status_code == 200:
                    creation_date = dateutil.parser.parse(attachment['timestamp']).strftime("%d.%m.%y")
                    if not os.path.exists(os.getcwd() + "/{}/{}".format(job[2], job[1])):
                        os.makedirs(os.getcwd() + "/{}/{}".format(job[2], job[1]))
                    path = os.getcwd() + "/{}/{}/SC -O {} {}{}".format(job[2], job[1], creation_date, tech.upper(), attachment['file_type'])
                    print("writing file to path {}".format(path))
                    with open(path, 'wb') as f:
                        r.raw.decode_content = True
                        shutil.copyfileobj(r.raw, f)
                else:
                    print(r.text)
Update [14/10]
I updated the code in the following way with some of the hints given. Thanks a lot for that. The only thing I could still optimize, I guess, is the attachment downloading, but it is working fine now. A funny thing I learned is that you cannot create a CON folder on a Windows machine :-) did not know that.
I use pandas as well, just to try to avoid some loops over my list of dicts, but I am not sure it is as performant as it can be yet. The longest part is actually reading in the full JSON files. I read them in fully because I could not find a way of telling the API to return only the jobs from September 2016. The API query filter seems to work on eq/lt/gt.
import requests
import dateutil.parser
import shutil
import os
import pandas as pd
import grequests

user = ""
passw = ""
FOLDER = os.getcwd()
headers = {"Accept-Encoding": "gzip, deflate"}

urls = [
    'https://api.servicem8.com/api_1.0/job.json',
    'https://api.servicem8.com/api_1.0/Attachment.json',
    'https://api.servicem8.com/api_1.0/Staff.json',
    'https://api.servicem8.com/api_1.0/Company.json'
]

# Create a set of unsent Requests:
print("Read json files")
rs = (grequests.get(u, auth=(user, passw), headers=headers) for u in urls)
# Send them all at the same time:
jobs, attachments, staffs, companies = grequests.map(rs)

# create dataframes
df_jobs = pd.DataFrame(jobs.json())
df_attachments = pd.DataFrame(attachments.json())
df_staffs = pd.DataFrame(staffs.json())
df_companies = pd.DataFrame(companies.json())

#url_customer = "https://api.servicem8.com/api_1.0/Company/{}.json".format(item['company_uuid'])
#c = requests.get(url_customer, auth=(user, passw))
#url = "https://api.servicem8.com/api_1.0/job.json"
#jobs = requests.get(url, auth=(user, passw), headers=headers)
#print("Reading attachments json")
#url = "https://api.servicem8.com/api_1.0/Attachment.json"
#attachments = requests.get(url, auth=(user, passw), headers=headers)
#print("Reading staff.json")
#url_staff = "https://api.servicem8.com/api_1.0/Staff.json"
#staffs = requests.get(url_staff, auth=(user, passw))

scheduled_jobs = []
if jobs.status_code == 200:
    print("finished reading json file")
    for job in jobs.json():
        scheduled_date = job['job_is_scheduled_until_stamp']
        try:
            parsed_date = dateutil.parser.parse(scheduled_date)
            if parsed_date.year == 2016:
                if parsed_date.month == 9:
                    cus_name = df_companies[df_companies.uuid == job['company_uuid']].iloc[0]['name'].upper()
                    cus_name = cus_name.replace('/', '')
                    scheduled_jobs.append([job['uuid'], job['generated_job_id'], cus_name])
        except ValueError:
            pass

print("{} jobs to fetch".format(len(scheduled_jobs)))

for job in scheduled_jobs:
    print("fetch for job attachments {}".format(job))
    #url = "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(job[0])
    if attachments == []:
        pass
    for attachment in attachments.json():
        if attachment['related_object_uuid'] == job[0]:
            if attachment['active'] == 1 and attachment['file_type'] != '.pdf' and attachment['attachment_source'] != 'INVOICE_SIGNOFF':
                for staff in staffs.json():
                    if staff['uuid'] == attachment['created_by_staff_uuid']:
                        tech = "{}_{}".format(staff['first'].split()[-1].strip(), staff['last'])
                        creation_timestamp = dateutil.parser.parse(attachment['timestamp'])
                        creation_date = creation_timestamp.strftime("%d.%m.%y")
                        creation_time = creation_timestamp.strftime("%H_%M_%S")
                        path = FOLDER + "/{}/{}/SC_-O_D{}_T{}_{}{}".format(job[2], job[1], creation_date, creation_time, tech.upper(), attachment['file_type'])
                        # fetch attachment
                        if not os.path.isfile(path):
                            url = "https://api.servicem8.com/api_1.0/Attachment/{}.file".format(attachment['uuid'])
                            r = requests.get(url, auth=(user, passw), stream=True)
                            if r.status_code == 200:
                                if not os.path.exists(FOLDER + "/{}/{}".format(job[2], job[1])):
                                    os.makedirs(FOLDER + "/{}/{}".format(job[2], job[1]))
                                print("writing file to path {}".format(path))
                                with open(path, 'wb') as f:
                                    r.raw.decode_content = True
                                    shutil.copyfileobj(r.raw, f)
                            else:
                                print(r.text)
                        else:
                            print("file already exists")
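A small hedged aside on the pandas point above (not from the question itself): the inner loop over staffs.json() for every attachment could be replaced by a single merge; the column names follow the question's data and are assumptions:
import pandas as pd

# Join each attachment row to its author's staff record in one pass,
# instead of scanning the staff list for every attachment.
df = df_attachments.merge(
    df_staffs[["uuid", "first", "last"]],
    left_on="created_by_staff_uuid",
    right_on="uuid",
    suffixes=("", "_staff"),
)
df["tech"] = df["first"].str.split().str[-1].str.strip() + "_" + df["last"]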
The general idea is to use asynchronous URL requests, and there is a Python module named grequests for that: https://github.com/kennethreitz/grequests
From the documentation:
import grequests

urls = [
    'http://www.heroku.com',
    'http://python-tablib.org',
    'http://httpbin.org',
    'http://python-requests.org',
    'http://fakedomain/',
    'http://kennethreitz.com'
]
# Create a set of unsent Requests:
rs = (grequests.get(u) for u in urls)
# Send them all at the same time:
grequests.map(rs)
And the response:
[<Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, None, <Response [200]>]
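As an additional hedged sketch that is not from the original answers, the same per-job parallelism can be built with only the standard library's concurrent.futures plus requests: the dependent calls for one job stay sequential inside fetch_job, while the jobs themselves run in parallel threads. The URLs and field names mirror the question's code and should be treated as assumptions:
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

BASE = "https://api.servicem8.com/api_1.0"
AUTH = ("user@example.com", "password")           # placeholder credentials

def fetch_job(job):
    # The dependent calls for ONE job run sequentially, as in the question.
    company = requests.get("{}/Company/{}.json".format(BASE, job["company_uuid"]), auth=AUTH).json()
    attachments = requests.get(
        "{}/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(BASE, job["uuid"]),
        auth=AUTH).json()
    return job["uuid"], company.get("name"), attachments

jobs = requests.get("{}/job.json".format(BASE), auth=AUTH).json()

# Independent jobs can be fetched concurrently.
with ThreadPoolExecutor(max_workers=8) as pool:
    futures = [pool.submit(fetch_job, job) for job in jobs]
    for future in as_completed(futures):
        uuid, customer, attachments = future.result()
        print(uuid, customer, len(attachments))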
