I'm sure I'm missing something quite trivial here. I have the code below:
date1 = re.findall('0000(.*)', date1.encode('utf-8'))
str1 = '-'.join(date1)
print str1
print type(str1)
dt = datetime.strptime(str1,"%B %d, %Y ")
and I get an error of
ValueError: time data '' does not match format '%B %d, %Y '
It seems as if str1 is empty, so I checked it with
print str1
print type(str1)
and get the following results
October 24, 2014
<type 'str'>
I can't work out why it thinks str1 is empty. Any ideas?
Full code appended below:
from bs4 import BeautifulSoup
import wikipedia
import re
from datetime import datetime
acq = wikipedia.page('List_of_mergers_and_acquisitions_by_Google')
test = acq.html()
#print test
##html = acq.html()
soup = BeautifulSoup(test)
table = soup.find('table', {'class' : 'wikitable sortable'})
company = ""
date1 = ""
for row in table.findAll('tr'):
    cells = row.findAll('td')
    if len(cells) == 8:
        date1 = cells[1].get_text()
        company = cells[2].get_text()
        ##print date
        date1 = re.findall('0000(.*)', date1)
        str1 = ''.join(date1)
        print str1
        print type(str1)
        dt = datetime.strptime(str1,"%B %d, %Y ")
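One likely reading of the output: the prints show the last row that matched, but the loop keeps going, and on a row where re.findall finds no '0000' the join produces '', which is exactly the empty string strptime then rejects. A minimal guard, assuming the regex can miss on some rows (and stripping the stray whitespace the trailing space in the format string was there to absorb):
for row in table.findAll('tr'):
    cells = row.findAll('td')
    if len(cells) == 8:
        matches = re.findall('0000(.*)', cells[1].get_text())
        if not matches:  # no '0000' prefix in this cell, nothing to parse
            continue
        str1 = ''.join(matches).strip()  # drop stray trailing whitespace
        dt = datetime.strptime(str1, "%B %d, %Y")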
Related
I am scraping a URL (example: https://bitinfocharts.com/top-100-richest-dogecoin-addresses-4.html), where the number at the end of the URL is the page number. I am trying to scrape multiple pages, so I used the following code to loop through them:
for page in range(4, 7): #Range designates the page numbers for the URL
    r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into the url
    print(page)
When I run this snippet and print the page, it returns 4, 5 and 6, meaning that it should be working. However, whenever I run the full code, it only gives me the results for the 6th page.
What I think may be happening is that the code settles on the last number and formats only that into the URL, when it should be formatting each number into the URL in turn.
I have tried looking at other posts with similar issues but haven't been able to find a solution. I believe this may be a code formatting error, but I am not exactly sure. Any advice is greatly appreciated. Thank you.
Here is the remainder of my code:
import csv
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime
import os
import pandas as pd
import openpyxl
# define 1-1-2021 as a datetime object
after_date = datetime(2021, 1, 1)

with requests.Session() as s:
    s.headers = {"User-Agent": "Safari/537.36"}
    for page in range(4, 7): #Range designates the page numbers for the URL
        r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into the url
        print(page)
    soup = bs(r.content, 'lxml')
    # select all tr elements (minus the first one, which is the header)
    table_elements = soup.select('tr')[1:]
    address_links = []
    for element in table_elements:
        children = element.contents # get children of table element
        url = children[1].a['href']
        last_out_str = children[8].text
        if last_out_str != "": # check to make sure the date field isn't empty
            last_out = datetime.strptime(last_out_str, "%Y-%m-%d %H:%M:%S %Z") # load date into a datetime object for comparison
            if last_out > after_date: # check whether the date is after after_date
                address_links.append(url + '-full') #add the link to address_links; -full makes the link show all data
    print(address_links)
    for url in address_links: #loop through the urls in the address_links list
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        ad2 = (soup.title.string) #grab the web title which is used for the filename
        ad2 = ad2.replace('Dogecoin', '')
        ad2 = ad2.replace('Address', '')
        ad2 = ad2.replace('-', '')
        filename = ad2.replace(' ', '')
        sections = soup.find_all(class_='table-striped')
        for section in sections: #This contains the data which is imported into the 'gf' dataframe or the 'info' xlsx sheet
            oldprofit = section.find_all('td')[11].text #Get the profit
            removetext = oldprofit.replace('USD', '')
            removetext = removetext.replace(' ', '')
            removetext = removetext.replace(',', '')
            profit = float(removetext)
            balance = section.find_all('td')[0].text #Get the wallet balance
            amount_recieved = section.find_all('td')[3].text #Get the amount received
            ins = amount_recieved[amount_recieved.find('(') + 1:amount_recieved.find(')')] #Filter out text from amount received
            ins = ins.replace('ins', '')
            ins = ins.replace(' ', '')
            ins = float(ins)
            first_recieved = section.find_all('td')[4].text #Get the date of the first incoming transaction
            fr = first_recieved.replace('first', '')
            fr = fr.replace(':', '')
            fr = fr.replace(' ', '')
            last_recieved = section.find_all('td')[5].text #Get the date of the last incoming transaction
            lr = last_recieved.replace('last', '')
            lr = lr.replace(':', '')
            lr = lr.replace(' ', '')
            amount_sent = section.find_all('td')[7].text #Get the amount sent
            outs = amount_sent[amount_sent.find('(') + 1:amount_sent.find(')')] #Filter out the text
            outs = outs.replace('outs', '')
            outs = outs.replace(' ', '')
            outs = float(outs)
            first_sent = section.find_all('td')[8].text #Get the first outgoing transaction date
            fs = first_sent.replace('first', '') #clean up first outgoing transaction date
            fs = fs.replace(':', '')
            fs = fs.replace(' ', '')
            last_sent = section.find_all('td')[9].text #Get the last outgoing transaction date
            ls = last_sent.replace('last', '') #Clean up last outgoing transaction date
            ls = ls.replace(':', '')
            ls = ls.replace(' ', '')
            dbalance = section.find_all('td')[0].select('b') #get the balance of doge
            dusd = section.find_all('td')[0].select('span')[1] #get the balance of USD
            for data in dbalance: #used to clean the text up
                balance = data.text
            for data1 in dusd: #used to clean the text up
                usd = data1.text
        # Compare profit to goal; if profit doesn't meet the goal, the URL is not scraped
        goal = float(30000)
        if profit < goal:
            continue
        #Select wallets with under 2000 transactions
        trans = float(ins + outs) #adds the number of incoming and outgoing transactions
        trans_limit = float(2000)
        if trans > trans_limit:
            continue
        # Create the info dataframe using the data from above
        info = {
            'Balance': [balance],
            'USD Value': [usd],
            'Wallet Profit': [profit],
            'Amount Received': [amount_recieved],
            'First Received': [fr],
            'Last Received': [lr],
            'Amount Sent': [amount_sent],
            'First Sent': [fs],
            'Last Sent': [ls],
        }
        gf = pd.DataFrame(info)
        a = 'a'
        if a:
            df = \
                pd.read_html(requests.get(url, headers={'User-agent': 'Mozilla/5.0'}).text, attrs={"id": "table_maina"},
                             index_col=None, header=[0])[0] #uses pandas to read the dataframe and save it
        directory = '/Users/chris/Desktop/Files' #directory for the file to go to
        file = f'{filename}.xlsx'
        writer = pd.ExcelWriter(os.path.join(directory, file), engine='xlsxwriter')
        with pd.ExcelWriter(writer) as writer:
            df.to_excel(writer, sheet_name='transactions')
            gf.to_excel(writer, sheet_name='info')
Check your indentation. In your question the loops are on the same level, so the loop that makes the requests iterates over all the pages, but the results are never processed until the iterating is done. That is why you only get results for the last page.
Move the loops that handle the response and extract the elements into your first loop:
...
for page in range(4, 7): #Range designates the page numbers for the URL
    r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into the url
    print(page)
    soup = bs(r.content, 'lxml')
    table_elements = soup.select('tr')[1:]
    address_links = []
    for element in table_elements:
        ...
    for url in address_links:
        ...
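If you would rather keep the two stages separate (fetch all pages first, then scrape each address), another option is to create address_links once, before the page loop, so the links accumulate across pages. A sketch reusing the names from the question:
address_links = []  # created once, so links from every page accumulate
for page in range(4, 7):
    r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html')
    soup = bs(r.content, 'lxml')
    for element in soup.select('tr')[1:]:  # skip the header row
        children = element.contents
        url = children[1].a['href']
        last_out_str = children[8].text
        if last_out_str != "":
            last_out = datetime.strptime(last_out_str, "%Y-%m-%d %H:%M:%S %Z")
            if last_out > after_date:
                address_links.append(url + '-full')

for url in address_links:  # now holds the links from all three pages
    ...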
I am appending to a .csv file with Python. The data is scraped from the web, and I am through with almost everything related to the scraping itself.
The problem comes when I try to append to the file: it writes hundreds of rows of the same data, so I am sure there is a problem with the for loop or if statements that I am not able to identify and solve.
The condition checks for similarity between the data scraped from the web and the data already in the file. If the data doesn't match, the program writes a new row; otherwise it breaks or continues.
Note: csvFileArray is a list holding the data already in the file. For example, print(csvFileArray[0]) gives:
{'Date': '19/05/21', 'Time': '14:51:00', 'Status': 'Waitlisted', 'School': 'MIT Sloan', 'Details': 'GPA: 3.4 Round: Round 2 | Texas'}
Below is the code that has a problem.
file = open('file.csv', 'a')
writer = csv.writer(file)

#loop for page numbers
for page in range(15, 17):
    print("Getting page {}..".format(page))
    params["paged"] = page
    data = requests.post(url, data=params).json()
    soup = BeautifulSoup(data["markup"], "html.parser")
    for entry in soup.select(".livewire-entry"):
        datime = entry.select_one(".adate")
        status = entry.select_one(".status")
        name = status.find_next("strong")
        details = entry.select_one(".lw-details")
        datime = datime.get_text(strip=True)
        datime = datetime.datetime.strptime(datime, '%B %d, %Y %I:%M%p')
        time = datime.time() #returns time
        date = datime.date() #returns date
        for firstentry in csvFileArray:
            condition = (((firstentry['Date']) == date) and ((firstentry['Time']) == time)
                         and ((firstentry['Status']) == (status.get_text(strip=True))) and ((firstentry['School']) == (name.get_text(strip=True)))
                         and ((firstentry['Details']) == details.get_text(strip=True)))
            if condition:
                continue
            else:
                writer.writerow([date, time, status.get_text(strip=True), name.get_text(strip=True), details.get_text(strip=True)])
                #print('ok')
    print("-" * 80)
file.close()
I'm guessing you want to write the line only if it matches none of the csvFileArray entries. Right now, you're writing it once for every csvFileArray entry that doesn't match.
for entry in soup.select(".livewire-entry"):
    datime = entry.select_one(".adate")
    status = entry.select_one(".status")
    name = status.find_next("strong")
    details = entry.select_one(".lw-details")
    datime = datime.get_text(strip=True)
    datime = datetime.datetime.strptime(datime, '%B %d, %Y %I:%M%p')
    time = datime.time() #returns time
    date = datime.date() #returns date
    should_write = True
    for firstentry in csvFileArray:
        if (((firstentry['Date']) == date) and ((firstentry['Time']) == time)
                and ((firstentry['Status']) == (status.get_text(strip=True))) and ((firstentry['School']) == (name.get_text(strip=True)))
                and ((firstentry['Details']) == details.get_text(strip=True))):
            should_write = False
            break
    if should_write:
        writer.writerow([date, time, status.get_text(strip=True), name.get_text(strip=True), details.get_text(strip=True)])
        #print('ok')
You could also collapse this with any() and a generator expression, but because your condition is large, that gets hard to read:
if not any(
        (((firstentry['Date']) == date) and ((firstentry['Time']) == time)
         and ((firstentry['Status']) == (status.get_text(strip=True))) and ((firstentry['School']) == (name.get_text(strip=True)))
         and ((firstentry['Details']) == details.get_text(strip=True)))
        for firstentry in csvFileArray):
    writer.writerow([date, time, status.get_text(strip=True), name.get_text(strip=True), details.get_text(strip=True)])
    #print('ok')
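One design note: date and time here are datetime.date/datetime.time objects, while csvFileArray stores strings ('19/05/21', '14:51:00'), so those comparisons can never be True; format them before comparing. With that done, you can also drop the inner loop entirely by keeping the already-seen rows in a set. A sketch, assuming the file's date and time formats are '%d/%m/%y' and '%H:%M:%S' as in the sample entry:
# Build the set of already-seen rows once from the existing file data.
seen = {
    (e['Date'], e['Time'], e['Status'], e['School'], e['Details'])
    for e in csvFileArray
}

# Inside the entry loop: format date/time the way the file stores them.
row = (date.strftime('%d/%m/%y'), time.strftime('%H:%M:%S'),
       status.get_text(strip=True), name.get_text(strip=True),
       details.get_text(strip=True))
if row not in seen:
    writer.writerow(row)
    seen.add(row)  # so the same entry isn't written twice in one run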
I have the following code that requires the user to input a date in the format 2021/12/31. It then calculates the difference between the date entered and today.
date = input('Enter your date: ')
delta = datetime.strptime(date, '%Y/%m/%d') - datetime.now()
print("difference is {} days".format(delta.days))
I would like a 0 to be displayed if the wrong date format is entered, but I can't quite figure it out. I assume I need if/else like my attempt below, but it's not quite working: it returns 0 no matter what.
date = input('Enter your date: ')
if date == '%Y/%m/%d':
    delta = datetime.strptime(date, '%Y/%m/%d') - datetime.now()
    print("difference is {} days".format(delta.days))
else:
    print('0')
You can use datetime.strptime to check whether the format is right, like:
try:
    datetime.datetime.strptime("99/99/99", "%m/%d/%y")
except ValueError as err:
    print(err)
You can check the date format like this:
from datetime import datetime

date = str(input('Enter your date: '))
print(date)
format = "%Y/%m/%d"  # match the 2021/12/31 format from the question
try:
    delta = datetime.now() - datetime.strptime(date, format)
    print("difference is {} days".format(delta.days))
except ValueError as err:
    print(err)
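To print the 0 the question asks for instead of the error message, put it in the except branch; a minimal variant of the same idea:
from datetime import datetime

date = input('Enter your date: ')
try:
    delta = datetime.strptime(date, '%Y/%m/%d') - datetime.now()
    print("difference is {} days".format(delta.days))
except ValueError:
    print(0)  # wrong format entered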
The goal is to use datetime to iterate over
http://www.harness.org.au/racing/results/?firstDate=01-01-2019
http://www.harness.org.au/racing/results/?firstDate=02-01-2019 ... up to yesterday's date
(this should happen in new_url = base_url + str(enddate1)).
Once on each page, I then want to loop over the meetingListFull table to get the name and href, and from there get the results data for each track that day.
My current error is '<=' not supported between instances of 'datetime.timedelta' and 'str', which comes from my while loop. Why is this? I have never used datetime before.
from datetime import datetime, date, timedelta
import requests
import re
from bs4 import BeautifulSoup

base_url = "http://www.harness.org.au/racing/results/?firstDate="
base1_url = "http://www.harness.org.au"
webpage_response = requests.get('http://www.harness.org.au/racing/results/?firstDate=')
soup = BeautifulSoup(webpage_response.content, "html.parser")

format = "%d-%m-%y"
delta = timedelta(days=1)
yesterday = datetime.today() - timedelta(days=1)
yesterday1 = yesterday.strftime(format)

enddate = datetime(2019, 1, 1)
enddate1 = enddate.strftime(format)

while enddate1 <= yesterday1:
    enddate1 =+ timedelta(days=1)
    new_url = base_url + str(enddate1)
    soup12 = requests.get(new_url)
    soup1 = BeautifulSoup(soup12.content, "html.parser")
    table1 = soup1.find('table', class_='meetingListFull')
    for tr in table1.find_all('tr'):
        all_cells = tr.find_all('td')
        track = all_cells.a.href.get_text()
        href = all_cells.get('href')
        trackresults = base1_url + href
This:
yesterday1 = yesterday.strftime(format)
is a string, because strftime returns a str. Meanwhile, enddate1 =+ timedelta(days=1) (note =+, which is really = +timedelta(days=1), not +=) rebinds enddate1 to a timedelta. So the while condition compares a timedelta to a str, which is exactly the error you are seeing. Keep datetime objects for the arithmetic and comparison, and only call strftime when building the URL.
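A minimal sketch of the corrected loop, keeping a datetime object for the arithmetic and formatting only when building the URL (the %d-%m-%Y format is an assumption based on the example links):
from datetime import datetime, timedelta

base_url = "http://www.harness.org.au/racing/results/?firstDate="
current = datetime(2019, 1, 1)
yesterday = datetime.today() - timedelta(days=1)

while current <= yesterday:  # datetime compared to datetime
    new_url = base_url + current.strftime("%d-%m-%Y")
    # ... fetch and parse new_url here ...
    current += timedelta(days=1)  # note +=, not =+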
I'm not able to filter the results of table[3] to only include rows that have today's date in them. I'm using this url as my data source:
http://tides.mobilegeographics.com/locations/3881.html
I can get all the data back, but my filtering isn't working: I get the entire range, 5 days back. I only want the current day, something like this:
Montauk Point, Long Island Sound, New York
41.0717° N, 71.8567° W
2014-03-13 12:37 PM EDT 0.13 feet Low Tide
2014-03-13 6:51 PM EDT Sunset
2014-03-13 7:13 PM EDT 2.30 feet High Tide
How can I get this, and then calculate whether the tide is moving in or out within the next 40 minutes?
Thanks for helping.
My code is:
import sre, urllib2, sys, BaseHTTPServer, datetime, re, time, pprint, smtplib
from bs4 import BeautifulSoup
from bs4.diagnose import diagnose
data = urllib2.urlopen('http://tides.mobilegeographics.com/locations/3881.html').read()
day = datetime.date.today().day
month = datetime.date.today().month
year = datetime.date.today().year
date = datetime.date.today()
soup = BeautifulSoup(data)
keyinfo = soup.find_all('h2')
str_date = datetime.date.today().strftime("%Y-%m-%d")
time_text = datetime.datetime.now() + datetime.timedelta(minutes = 20)
t_day = time_text.strftime("%Y-%m-%d")
tide_table = soup.find_all('table')[3]
pre = tide_table.findAll('pre')
dailytide = []
pattern = str_date
allmatches = re.findall(r'pattern', pre)
print allmatches
if allmatches:
    print allmatches
else:
    print "Match for " + str_date + " not found in data string \n" + datah
You don't need a regular expression: just split the contents of the pre tag and check if today's date is in the line:
import urllib2
import datetime
from bs4 import BeautifulSoup
URL = 'http://tides.mobilegeographics.com/locations/3881.html'
soup = BeautifulSoup(urllib2.urlopen(URL))
pre = soup.find_all('table')[3].find('pre').text
today = datetime.date.today().strftime("%Y-%m-%d")
for line in pre.split('\n'):
    if today in line:
        print line
prints:
2014-03-13 6:52 PM EDT Sunset
2014-03-13 7:13 PM EDT 2.30 feet High Tide
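The question's second part (is the tide turning within the next 40 minutes) isn't covered above. A hedged sketch, assuming each tide row keeps the sample layout "2014-03-13 7:13 PM EDT 2.30 feet High Tide":
import datetime

def tide_turn_within(lines, minutes=40):
    now = datetime.datetime.now()
    cutoff = now + datetime.timedelta(minutes=minutes)
    for line in lines:
        if 'Tide' not in line:
            continue  # skip Sunrise/Sunset rows
        stamp = ' '.join(line.split()[:3])  # e.g. "2014-03-13 7:13 PM"
        when = datetime.datetime.strptime(stamp, "%Y-%m-%d %I:%M %p")
        if now <= when <= cutoff:
            return line  # a High/Low Tide falls inside the window
    return None

# usage with the pre text from above:
# print tide_turn_within(pre.split('\n'))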