Download multiple (consecutive) txt files from multiple (consecutive-day) URLs - python

I'd like to download multiple txt files that store data by day.
The address looks like this:
http://100.200.100.200/cd200730.txt
I'd like to download those txt files given a start and end date as input. I've made it through to getting all the URLs, but I haven't found a way to download and save each file under its name for the day - "cd200730.txt", "cd200731.txt", etc.
import requests
import pandas as pd

# set date_range to start and end
date_range = pd.date_range(start='2018-04-24', end='2018-04-27', freq='D')
df = date_range.strftime('%y%m%d')
df2 = df.to_frame(index=False, name='date')
df2['date'] = df2['date'].apply(lambda x: f"http://100.200.100.200/cd{x}.txt")
for url in df2.date:
    r = requests.get(url, allow_redirects=True)
    open(url, 'wb').write(r.content)
When I run this, I get the following error:
OSError: [Errno 22] Invalid argument: 'http://10.47.149.67/cd180424.txt'
When I run it with the last line changed to open('url.txt', ...), I get only the last file.
I feel like I need another for loop around the open(url) part.
Is there any way I can complete this work?
---Edited--- v0.1
I've gotten as far as the following:
import requests
import pandas as pd

# date_range to start and end
date_range = pd.date_range(start='2018-04-24', end='2018-04-25', freq='D')
df = date_range.strftime('%y%m%d')
df_filename = df.to_frame(index=False, name='file_name')
df_filename['file_name'] = df_filename['file_name'].apply(lambda x: f"cd{x}.txt")
df2 = df.to_frame(index=False, name='date')
df2['date'] = df2['date'].apply(lambda x: f"http://100.200.100.200/cd{x}.txt")
for url in df2.date:
    r = requests.get(url, allow_redirects=False)
    for name in df_filename['file_name']:
        open(name, 'wb').write(r.content)
---Edited--- v0.2
"v0.1" only saves the same data with various date files (cd200718.csv and cd200719 have the same data)
Something little is missing..
---Edited--- v0.3
Finally, the following works perfectly!
for url, name in zip(df2.date, df_filename['file_name']):
    r = requests.get(url, allow_redirects=False)
    open(name, 'wb').write(r.content)
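For anyone tidying this up further, here is a minimal sketch of the same idea without the intermediate DataFrames: the filename is taken from the URL path with urllib.parse, and a with block ensures each file is closed (the host and dates are the placeholder values from the question):

import os
from urllib.parse import urlsplit

import pandas as pd
import requests

# one URL per day in the range (placeholder host from the question)
dates = pd.date_range(start='2018-04-24', end='2018-04-27', freq='D').strftime('%y%m%d')
urls = [f"http://100.200.100.200/cd{d}.txt" for d in dates]

for url in urls:
    # "cd180424.txt" is the last component of the URL path
    name = os.path.basename(urlsplit(url).path)
    r = requests.get(url, allow_redirects=True)
    r.raise_for_status()  # fail loudly if a day's file is missing
    with open(name, 'wb') as f:  # the file is closed when the block exits
        f.write(r.content)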

Related

Use Python to scrape images from xml tags

I am trying to write a short Python program to download a copy of the XML jail roster for the local county, save that file, scrape and save all the names and image links in a CSV file, then download each of the photos with the file name being the person's name.
I've managed to get the XML file, save it locally, and create the CSV file. I was briefly able to write the full XML tag (tag and attribute) to the CSV file, but can't seem to get just the attribute, or the image links.
from datetime import datetime
from datetime import date
import requests
import csv
import bs4 as bs
from bs4 import BeautifulSoup

# get current date
today = date.today()
# convert date to date-sort format
d1 = today.strftime("%Y-%m-%d")
# create filename variable
roster = 'jailroster' + '-' + d1 + '-dev' + '.xml'

# grab xml file from server
url = "fakepath.xml"
print("ATTEMPTING TO GET XML FILE FROM SERVER")
req_xml = requests.get(url)
print("Response code:", req_xml.status_code)
if req_xml.status_code == 200:
    print("XML file downloaded at ", datetime.now())
    soup = BeautifulSoup(req_xml.content, 'lxml')
    # save xml file from get locally
    with open(roster, 'wb') as file:
        file.write(req_xml.content)
    print('Saving local copy of XML as:', roster)

# read xml data from saved copy
infile = open(roster, 'r')
contents = infile.read()
soup = bs.BeautifulSoup(contents, 'lxml')

# variables needed for image list
images = soup.findAll('image1')
fname = soup.findAll('nf')
mname = soup.findAll('nm')
lname = soup.findAll('nl')
baseurl = 'fakepath.com'

with open('image-list.csv', 'w', newline='') as csvfile:
    imagelist = csv.writer(csvfile, delimiter=',')
    print('Image list being created')
    imagelist.writerows(images['src'])
I've gone through about a half dozen tutorials trying to figure all this out, but I think this is at the edge of what I've been able to learn so far, and I haven't even started to figure out how to save the list of images as files. Can anyone help out with a pointer or two, or point me towards tutorials on this?
Update: No, this is not for a mugshot site or any unethical purpose. This data is for a private data project for a non-public public safety project.
This should get you the data you need:
from datetime import date

import requests
from bs4 import BeautifulSoup
import pandas as pd


def extractor(tag: str) -> list:
    return [i.getText() for i in soup.find_all(tag)]


url = "https://legacyweb.randolphcountync.gov/sheriff/jailroster.xml"
soup = BeautifulSoup(requests.get(url).text, features="lxml")

images = [
    f"{'https://legacyweb.randolphcountync.gov'}{i['src'].lstrip('..')}"
    for i in soup.find_all('image1')
]

df = pd.DataFrame(
    zip(extractor("nf"), extractor("nm"), extractor("nl"), images),
    columns=['First Name', 'Middle Name', 'Last Name', 'Mugshot'],
)

df.to_csv(
    f"jailroster-{date.today().strftime('%Y-%m-%d')}-dev.csv",
    index=False,
)
Sample output: a .csv file with the First Name, Middle Name, Last Name and Mugshot columns.
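For the part the asker hadn't started yet, saving the images as files, here is a rough sketch building on the df above; the "first-middle-last.jpg" naming scheme is my own example, not from the original post:

import requests

# download each mugshot listed in the DataFrame's Mugshot column
for _, row in df.iterrows():
    parts = [row['First Name'], row['Middle Name'], row['Last Name']]
    # skip empty name parts (e.g. a missing middle name)
    filename = "-".join(p for p in parts if p).lower() + ".jpg"
    resp = requests.get(row['Mugshot'])
    if resp.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(resp.content)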

How to convert url data to csv using python

I am trying to download the data from the following URL and save it as CSV data, but the output I am getting is a text file. Can anyone please help with what I am doing wrong here? Also, is it possible to add multiple URLs to the same script and download multiple CSV files?
import csv
import pandas as pd
import requests
from datetime import datetime

CSV_URL = ('https://dsv-ops-toolkit.ihsmvals.com/ftp?config=fenics-bgc&file=IRSDATA_20211129_1700_Intra.csv&directory=%2FIRS%2FIntraday%2FDaily')

with requests.Session() as s:
    download = s.get(CSV_URL)
    decoded_content = download.content.decode('utf-8')
    cr = csv.reader(decoded_content.splitlines(), delimiter=',')
    date = datetime.now().strftime('%y%m%d')
    my_list = list(cr)
    df = pd.DataFrame(my_list)
    df.to_csv(f'RFR_{date}')
You can create a list of your necessary URLs like:
urls = ['http://url1.com','http://url2.com','http://url3.com']
Then iterate through the list, keeping your request code as it is for each URL:
for each_url in urls:
    with requests.Session() as s:
        # your_code_here
Hope you'll find this helpful.
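Putting the two together, here is a rough sketch with one output file per URL; the urls list is a placeholder, and note the explicit .csv extension on the output name, which the original script was missing (one likely reason the result was treated as a text file):

import csv
from datetime import datetime

import pandas as pd
import requests

# placeholder URLs; substitute the real download links
urls = ['http://url1.com', 'http://url2.com', 'http://url3.com']

date = datetime.now().strftime('%y%m%d')
with requests.Session() as s:
    for i, each_url in enumerate(urls):
        download = s.get(each_url)
        decoded_content = download.content.decode('utf-8')
        rows = list(csv.reader(decoded_content.splitlines(), delimiter=','))
        df = pd.DataFrame(rows)
        # the .csv extension makes the output open as CSV rather than plain text
        df.to_csv(f'RFR_{date}_{i}.csv', index=False)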

File Not Found Error while Downloading Image files

I am using Windows 8.1. I have been web scraping a lot recently and have been quite successful at tracking down errors, but now I am stuck: the files will not download, and I get a
FileNotFoundError.
I have removed all the unknown characters from the file names but still get this error. Any help?
I have also made the names lowercase just in case. The error happens when I download the 22nd item; the items before it download fine.
My code, and the Excel file, for reference:
import time
import pandas as pd
import requests

Final1 = pd.read_excel("Sneakers.xlsx")
Final1.index += 1

a = Final1.index.tolist()
Images = Final1["Images"].tolist()
Name = Final1["Name"].str.lower().tolist()
Brand = Final1["Brand"].str.lower().tolist()

s = requests.Session()

for i, n, b, l in zip(a, Name, Brand, Images):
    r = s.get(l).content
    with open("Images//" + f"{i}-{n}-{b}.jpg", "wb") as f:
        f.write(r)
Excel File (Google Drive) : Excel File
It seems like you don't have an Images folder in your path.
It's better to use the os.path.join() function for joining paths in Python.
Try the below:
import os
import time
import pandas as pd
import requests

Final1 = pd.read_excel("Sneakers.xlsx")
Final1.index += 1

a = Final1.index.tolist()
Images = Final1["Images"].tolist()
Name = Final1["Name"].str.lower().tolist()
Brand = Final1["Brand"].str.lower().tolist()

# Added: create the Images folder if it doesn't exist
if not os.path.exists("Images"):
    os.mkdir("Images")

s = requests.Session()

for i, n, b, l in zip(a, Name, Brand, Images):
    r = s.get(l).content
    # with open("Images//" + f"{i}-{n}-{b}.jpg","wb") as f:
    with open(os.path.join("Images", f"{i}-{n}-{b}.jpg"), "wb") as f:
        f.write(r)
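As a side note, os.makedirs with exist_ok=True (available since Python 3.2) collapses the existence check and the mkdir call into one line:

import os

# creates the folder if missing; does nothing if it already exists
os.makedirs("Images", exist_ok=True)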

Python. Get file date and time (timestamp) from HTTP server

I need to use a Python program to download a file from an HTTP server while preserving the file's original timestamp.
Accordingly, two questions:
How do I get the file's date from an HTTP server using Python 3.7?
How do I set this date on the downloaded file?
You could have a look at requests to download the file and get the modification date from the headers.
To set the dates you can use os.utime and email.utils.parsedate for parsing the date from the headers (see this answer by tzot).
Here is an example:
import datetime
import os
import time
import requests
import email.utils as eut

url = 'http://www.hamsterdance.org/hamsterdance/index-Dateien/hamster.gif'

# download the file
r = requests.get(url)
with open('output', 'wb') as f:
    f.write(r.content)

# parse the Last-Modified header into a Unix timestamp
last_modified = r.headers['last-modified']
modified = time.mktime(datetime.datetime(*eut.parsedate(last_modified)[:6]).timetuple())

# set access time to now and modification time to the server's value
now = time.mktime(datetime.datetime.today().timetuple())
os.utime('output', (now, modified))
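On Python 3.3 and later, email.utils.parsedate_to_datetime can replace the parsedate/mktime dance; a minimal variant, reusing r and the 'output' file from the example above:

import email.utils as eut
import os

# parse the Last-Modified header straight into a datetime, then to a Unix timestamp
modified = eut.parsedate_to_datetime(r.headers['last-modified']).timestamp()
# set both access and modification times to the server's value
os.utime('output', (modified, modified))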

Download data from URL in Python 3.6

I want to download the data from https://download.bls.gov/pub/time.series/ln/ln.data.1.AllData into a dataframe.
I have tried the script below, but could not succeed.
import requests, io
import pandas as pd
URL = 'https://download.bls.gov/pub/time.series/ln/ln.data.1.AllData'
#1
urlData = requests.get(URL).content
rawData = pd.read_csv(io.StringIO(urlData.decode('utf-8')))
print(len(rawData))
Error: Python IDLE got stuck
#2
r = requests.get(URL)
urlData = pd.read_csv(io.StringIO(r))
print(len(urlData))
Error:
urlData = pd.read_csv(io.StringIO(r))
TypeError: initial_value must be str or None, not Response
#3
urlData = pd.read_csv(URL, header=None)
print(len(urlData))
I got this working with
import requests, io
import pandas as pd
URL = 'https://download.bls.gov/pub/time.series/ln/ln.data.1.AllData'
#1
urlData = requests.get(URL).content
rawData = pd.read_csv(io.StringIO(urlData.decode('utf-8')), sep="\t")
print(rawData.head())
print(rawData.info())
The simplest way is to use urllib (note: on Python 3, the module is urllib.request; the Python 2 urllib2 no longer exists):
import urllib.request

url_name = 'http://abc.pdf'
response = urllib.request.urlopen(url_name)
# write in binary mode, since the response body is bytes
with open(url_name.split('//')[1], 'wb') as file:
    file.write(response.read())
I tried to download the data through the URL, and it does take a very long time. I recommend downloading it with wget and then processing it locally; the script itself seems fine.
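If you do want to keep it all in Python, a streamed download with requests avoids holding the whole response in memory; this is a generic sketch, independent of the answers above:

import requests

URL = 'https://download.bls.gov/pub/time.series/ln/ln.data.1.AllData'

# stream=True fetches the body lazily instead of loading it all into memory
with requests.get(URL, stream=True) as r:
    r.raise_for_status()
    with open('ln.data.1.AllData', 'wb') as f:
        for chunk in r.iter_content(chunk_size=1 << 20):  # 1 MiB at a time
            f.write(chunk)

The saved file can then be read with pd.read_csv(..., sep="\t") as in the accepted answer.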
