Need to get the first image link with Python

I need to get the link to the first photo on the page "https://www.balticshipping.com/vessel/imo/9127382" using Python.
I am testing with the BeautifulSoup library, but there is no way to get it. From what I can see, the image is not in JPG or PNG format, so my filter does not detect it.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
html = urlopen('https://www.balticshipping.com/vessel/imo/9127382')
bs = BeautifulSoup(html, 'html.parser')
images = bs.find_all('img', {'src':re.compile('.png')})
for image in images:
    print(image['src'] + '\n')
Does anyone have any ideas how to do it?
Full loop code ("s" is a DataFrame containing data for many ships: IMO, date, shipname, ...):
import requests
import pandas as pd

def create_geojson_features(s):
    features = []
    for _, row in s.iterrows():
        vessel_id = row['IMO']
        data = {
            "templates[]": [
                "modal_validation_errors:0",
                "modal_email_verificate:0",
                "r_vessel_types_multi:0",
                "r_positions_single:0",
                "vessel_profile:0",
            ],
            "request[0][module]": "ships",
            "request[0][action]": "list",
            "request[0][id]": "0",
            "request[0][data][0][name]": "imo",
            "request[0][data][0][value]": vessel_id,
            "request[0][sort]": "",
            "request[0][limit]": "1",
            "request[0][stamp]": "0",
            "request[1][module]": "top_stat",
            "request[1][action]": "list",
            "request[1][id]": "0",
            "request[1][data]": "",
            "request[1][sort]": "",
            "request[1][limit]": "",
            "request[1][stamp]": "0",
            "dictionary[]": ["countrys:0", "vessel_types:0", "positions:0"],
        }
        data = requests.post("https://www.balticshipping.com/", data=data).json()
        image = data["data"]["request"][0]["ships"][0]["data"]["gallery"][0]["file"]
        print(image)
        feature = {
            'type': 'Feature',
            'geometry': {
                'type': 'Point',
                'coordinates': [row['lon'], row['lat']]
            },
            'properties': {
                'time': str(pd.to_datetime(row['date'])),
                'popup': "<img src=" + str(image) + " width='250' height='200'/>"
                         + '<br>' + '<br>'
                         + 'Shipname: ' + str(row['shipname']) + '<br>'
                         + 'MMSI: ' + str(row['mmsi']) + '<br>'
                         + 'Group: ' + str(row['group']) + '<br>'
                         + 'Speed: ' + str(row['speed']) + ' knots',
                'style': {'color': ''},
                'icon': 'circle',
                'iconstyle': {
                    'fillColor': row['fillColor'],
                    'fillOpacity': 0.8,
                    'radius': 5
                }
            }
        }
        features.append(feature)
    return features
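For context, feature collections shaped like this are typically fed to folium's TimestampedGeoJson plugin; a minimal usage sketch, assuming folium is the mapping library in play (it does not appear in the code above):

import folium
from folium.plugins import TimestampedGeoJson

# Hypothetical usage: "s" is the ships DataFrame described above.
m = folium.Map(location=[0, 0], zoom_start=2)
TimestampedGeoJson(
    {'type': 'FeatureCollection', 'features': create_geojson_features(s)},
    period='PT1H',        # assumed interval between AIS positions
    add_last_point=True,
).add_to(m)
m.save('ships_map.html')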

The data you see is loaded via Ajax from an external source. You can use this example to get the picture URLs:
import json
import requests
url = "https://www.balticshipping.com/vessel/imo/9127382"
vessel_id = url.split("/")[-1]
data = {
    "templates[]": [
        "modal_validation_errors:0",
        "modal_email_verificate:0",
        "r_vessel_types_multi:0",
        "r_positions_single:0",
        "vessel_profile:0",
    ],
    "request[0][module]": "ships",
    "request[0][action]": "list",
    "request[0][id]": "0",
    "request[0][data][0][name]": "imo",
    "request[0][data][0][value]": vessel_id,
    "request[0][sort]": "",
    "request[0][limit]": "1",
    "request[0][stamp]": "0",
    "request[1][module]": "top_stat",
    "request[1][action]": "list",
    "request[1][id]": "0",
    "request[1][data]": "",
    "request[1][sort]": "",
    "request[1][limit]": "",
    "request[1][stamp]": "0",
    "dictionary[]": ["countrys:0", "vessel_types:0", "positions:0"],
}
data = requests.post("https://www.balticshipping.com/", data=data).json()
# uncomment to print all data:
# print(json.dumps(data, indent=4))
for g in data["data"]["request"][0]["ships"][0]["data"]["gallery"]:
    print(g["file"])
Prints:
https://photos.marinetraffic.com/ais/showphoto.aspx?photoid=2948097
https://photos.marinetraffic.com/ais/showphoto.aspx?photoid=2864147
https://photos.marinetraffic.com/ais/showphoto.aspx?photoid=2830344
https://photos.marinetraffic.com/ais/showphoto.aspx?photoid=2674783
https://photos.marinetraffic.com/ais/showphoto.aspx?photoid=2521379
https://photos.marinetraffic.com/ais/showphoto.aspx?photoid=2083722
https://photos.marinetraffic.com/ais/showphoto.aspx?photoid=2083721
https://photos.marinetraffic.com/ais/showphoto.aspx?photoid=1599301
https://photos.marinetraffic.com/ais/showphoto.aspx?photoid=1464102
https://photos.marinetraffic.com/ais/showphoto.aspx?photoid=1464099
https://photos.marinetraffic.com/ais/showphoto.aspx?photoid=1464093
https://photos.marinetraffic.com/ais/showphoto.aspx?photoid=1464089
https://photos.marinetraffic.com/ais/showphoto.aspx?photoid=1110349
https://photos.marinetraffic.com/ais/showphoto.aspx?photoid=433106
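If you only need the first photo, take the first gallery entry and, optionally, download it. A short sketch, assuming the showphoto.aspx URL ultimately serves the image bytes (after redirects):

first_image_url = data["data"]["request"][0]["ships"][0]["data"]["gallery"][0]["file"]
print(first_image_url)

# Save the photo locally; assumes the URL returns raw image data.
resp = requests.get(first_image_url)
resp.raise_for_status()
with open("first_photo.jpg", "wb") as f:
    f.write(resp.content)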

Related

Why does this print an empty list and dictionary?

import requests
from bs4 import BeautifulSoup
import json

data = {
    0: {
        0: "title",
        1: "dates",
        2: "city/state",
        3: "country"
    },
    1: {
        0: "event",
        1: "reps",
        2: "prize"
    },
    2: {
        0: "results"
    }
}

url = "https://mms.kcbs.us/members/evr_search.php?org_id=KCBA"
response = requests.get(url).text
soup = BeautifulSoup(response, features='lxml')

all_data = []
for element in soup.find_all('div', class_="row"):
    event = {}
    for i, col in enumerate(element.find_all('div', class_='col-md-4')):
        for j, item in enumerate(col.strings):
            event[data[i][j]] = item
    all_data.append(event)

print(json.dumps(all_data, indent=4))
Here's a link to the website: https://mms.kcbs.us/members/evr_search.php?org_id=KCBA
I'm unsure why nothing gets added to the list and dictionaries.
The data you see is loaded from an external URL via JavaScript. To simulate the Ajax request, you can use the following example:
import json
import requests
from bs4 import BeautifulSoup

api_url = "https://mms.kcbs.us/members/evr_search_ol_json.php"
params = {
    "otype": "TEXT",
    "evr_map_type": "2",
    "org_id": "KCBA",
    "evr_begin": "6/16/2022",
    "evr_end": "7/16/2022",
    "evr_address": "",
    "evr_radius": "50",
    "evr_type": "269",
    "evr_openings": "0",
    "evr_region": "",
    "evr_region_type": "1",
    "evr_judge": "0",
    "evr_keyword": "",
    "evr_rep_name": "",
}

soup = BeautifulSoup(
    requests.get(api_url, params=params).content, "html.parser"
)

data = {
    0: {0: "title", 1: "dates", 2: "city/state", 3: "country"},
    1: {0: "event", 1: "reps", 2: "prize"},
    2: {0: "results"},
}

all_data = []
for element in soup.find_all("div", class_="row"):
    event = {}
    for i, col in enumerate(element.find_all("div", class_="col-md-4")):
        for j, item in enumerate(col.strings):
            event[data[i][j]] = item
    all_data.append(event)

print(json.dumps(all_data, indent=4))
Prints:
[
    {
        "title": "Frisco BBQ Challenge",
        "dates": "6/16/2022 - 6/18/2022",
        "city/state": "Frisco, CO 80443",
        "country": "UNITED STATES",
        "event": "STATE CHAMPIONSHIP",
        "reps": "Reps: BUNNY TUTTLE, RICH TUTTLE, MICHAEL WINTER",
        "prize": "Prize Money: $13,050.00",
        "results": "Results Not In"
    },
    {
        "title": "York County BBQ Festival",
        "dates": "6/17/2022 - 6/18/2022",
        "city/state": "Delta, PA 17314",
        "country": "UNITED STATES",
        "event": "STATE CHAMPIONSHIP",
        "reps": "Reps: ANGELA MCKEE, ROBERT MCKEE, LOUISE WEIDNER",
        "prize": "Prize Money: $5,500.00",
        "results": "Results Not In"
    },
...and so on.
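Note that evr_begin and evr_end are hard-coded above. For a rolling window instead, a small sketch (the M/D/YYYY format matches the values shown; that the endpoint accepts any valid date range is an assumption):

from datetime import date, timedelta

def fmt(d):
    # Hypothetical helper: format dates as M/D/YYYY, the style seen above.
    return f"{d.month}/{d.day}/{d.year}"

today = date.today()
params["evr_begin"] = fmt(today)
params["evr_end"] = fmt(today + timedelta(days=30))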

Sending GET request to URL with fragment returns the content of the main page

I am trying to scrape this webpage, but I always end up getting the "main" page (the same URL but without "#face-a-face" at the end). Someone ran into the same problem on another forum and got an answer, but I am not able to generalize it and apply it to the website I want to scrape.
import requests
from bs4 import BeautifulSoup
url_main = "https://www.lequipe.fr/Football/match-direct/ligue-1/2020-2021/ol-dijon-live/477168"
url_target = url_main + "#face-a-face"
soup_main = BeautifulSoup(requests.get(url_main, verify=False).content, "html.parser")
soup_target = BeautifulSoup(requests.get(url_target, verify=False).content, "html.parser")
print(soup_main == soup_target)
prints True, but I would like the two requests to return different content, which is not the case here.
For example, I would like to extract all the "confrontations depuis 2011" (head-to-head results since 2011) from the target webpage. How can I get the final content of this webpage with a GET request (or in another way)? Thanks!
The fragment ("#face-a-face") is never sent to the server, so both requests return exactly the same document; the extra content is loaded client-side.
All the data actually comes from a highly nested JSON file, which you can fetch directly and extract the information you need from.
Here's how:
import json
import requests
endpoint = "https://iphdata.lequipe.fr/iPhoneDatas/EFR/STD/ALL/V2/Football/Prelive/68/477168.json"
team_data = requests.get(endpoint).json()
specifics = team_data["items"][1]["objet"]["matches"][0]["specifics"]
print(json.dumps(specifics, indent=2))
This should get you a dictionary:
{
  "__type": "specifics_sport_collectif",
  "vainqueur": "domicile",
  "score": {
    "__type": "score",
    "exterieur": "1",
    "domicile": "4"
  },
  "exterieur": {
    "__type": "effectif_sport_collectif",
    "equipe": {
      "__type": "equipe",
      "id": "202",
      "url_image": "https://medias.lequipe.fr/logo-football/202/{width}{declinaison}",
      "nom": "Dijon",
      "url_fiche": "https://www.lequipe.fr/Football/FootballFicheClub202.html"
    }
  },
  "domicile": {
    "__type": "effectif_sport_collectif",
    "equipe": {
      "__type": "equipe",
      "id": "22",
      "url_image": "https://medias.lequipe.fr/logo-football/22/{width}{declinaison}",
      "nom": "Lyon",
      "url_fiche": "https://www.lequipe.fr/Football/FootballFicheClub22.html"
    }
  },
  "is_final": false,
  "prolongation": false,
  "vainqueur_final": "domicile",
  "is_qualifier": false
}
And if, for example, you just want the score, add these lines:
just_the_score = specifics["score"]
print(just_the_score)
To get this:
{'__type': 'score', 'exterieur': '1', 'domicile': '4'}
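The "confrontations depuis 2011" (head-to-head) data presumably sits in other entries of the same JSON. A hedged sketch for locating it by listing what each top-level item contains; the "objet"/"__type" keys follow the pattern used above, but the full layout is an assumption:

# Survey the items to find the block holding the head-to-head history.
for i, item in enumerate(team_data["items"]):
    objet = item.get("objet", {})
    print(i, objet.get("__type"))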

How to convert python-request JSON results to csv?

I am trying to get my list of contacts from my Wix website using their API endpoint URL and the requests module in Python. I am totally stuck.
Here's my code so far:
import requests
auth_key = "my auth key"
r = requests.get("https://www.wixapis.com/crm/v1/contacts", headers={"Authorization": auth_key})
print(r.status_code)
dict = r.json()
contacts_list = dict["contacts"]
for i in contacts_list:
    for key in i:
        print(key, ':', i[key])
Here is what I get:
200
id : long id string 1
emails : [{'tag': 'UNTAGGED', 'email': 'sampleemail1@yahoo.com'}]
phones : []
addresses : [{'tag': 'UNTAGGED', 'countryCode': 'US'}]
metadata : {'createdAt': '2020-07-08T22:41:07.135Z', 'updatedAt': '2020-07-08T22:42:19.327Z'}
source : {'sourceType': 'SITE_MEMBERS'}
id : long id string 2
emails : [{'tag': 'UNTAGGED', 'email': 'sampleemail2@yahoo.com'}]
phones : []
addresses : []
metadata : {'createdAt': '2020-07-03T00:51:21.127Z', 'updatedAt': '2020-07-04T03:26:16.370Z'}
source : {'sourceType': 'SITE_MEMBERS'}
Process finished with exit code 0
Each line is a string. I need each row of the CSV to be a new contact (there are two sample contacts above), and the columns should be the keys. I plan to use the csv module's writerow(fields), where fields is a list of key strings such as fields = ['id', 'emails', 'phones', 'addresses', 'metadata', 'source'].
All I really need is the emails in a single column of a CSV, though. Is there a way to maybe just get the email for each contact?
A CSV file with one column is basically just a text file with one item per line, but you can use the csv module to do it if you really want, as shown below.
I commented out the requests calls and used some sample input for testing.
test_data = {
    "contacts": [
        {
            "id": "long id string 1",
            "emails": [
                {
                    "tag": "UNTAGGED",
                    "email": "sampleemail1@yahoo.com"
                }
            ],
            "phones": [],
            "addresses": [
                {
                    "tag": "UNTAGGED",
                    "countryCode": "US"
                }
            ],
            "metadata": {
                "createdAt": "2020-07-08T22:41:07.135Z",
                "updatedAt": "2020-07-08T22:42:19.327Z"
            },
            "source": {
                "sourceType": "SITE_MEMBERS"
            }
        },
        {
            "id": "long id string 2",
            "emails": [
                {
                    "tag": "UNTAGGED",
                    "email": "sampleemail2@yahoo.com"
                }
            ],
            "phones": [],
            "addresses": [],
            "metadata": {
                "createdAt": "2020-07-03T00:51:21.127Z",
                "updatedAt": "2020-07-04T03:26:16.370Z"
            },
            "source": {
                "sourceType": "SITE_MEMBERS"
            }
        }
    ]
}
import csv
import json
import requests

auth_key = "my auth key"
output_filename = 'whatever.csv'

#r = requests.get("https://www.wixapis.com/crm/v1/contacts", headers={"Authorization": auth_key})
#print(r.status_code)
#json_obj = r.json()
json_obj = test_data  # FOR TESTING PURPOSES

contacts_list = json_obj["contacts"]
with open(output_filename, 'w', newline='') as outp:
    writer = csv.writer(outp)
    writer.writerow(['email'])  # Write csv header.
    for contact in contacts_list:
        email = contact['emails'][0]['email']  # Get the first one.
        writer.writerow([email])
print('email csv file written')
Contents of whatever.csv file afterwards:
email
sampleemail1@yahoo.com
sampleemail2@yahoo.com
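If you later want every field rather than just the email, a possible sketch using csv.DictWriter; nested list/dict values are serialized to JSON strings here so that each cell stays plain text:

import csv
import json

with open('contacts.csv', 'w', newline='') as outp:
    writer = csv.DictWriter(outp, fieldnames=contacts_list[0].keys())
    writer.writeheader()
    for contact in contacts_list:
        # Flatten nested structures into JSON strings for CSV cells.
        row = {k: json.dumps(v) if isinstance(v, (list, dict)) else v
               for k, v in contact.items()}
        writer.writerow(row)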
Update:
As pointed out by @martineau, some of the values are arrays holding several items, so you need to handle those; you could join them into a single string, e.g. with ', '.join(...), inside the loop.
You can write the data to CSV like this, using the csv package:
import csv, json, sys
import requests

auth_key = "my auth key"
r = requests.get("https://www.wixapis.com/crm/v1/contacts", headers={"Authorization": auth_key})
print(r.status_code)
response = r.json()
contacts_list = response["contacts"]

output = csv.writer(sys.stdout)
# Insert header (keys) from the first contact.
output.writerow(contacts_list[0].keys())
for i in contacts_list:
    output.writerow(i.values())
At the end, you can print and verify the output.

How to Scrape <Script> tag with Beautiful Soup BS4 (Python)

If you do View Page Source on the link below:
https://www.zoopla.co.uk/for-sale/details/53818653?search_identifier=7e57533214fc2402ba53dd6c14b624f8
line 89 has a <script> tag with information under it, up to line 164. I am trying to extract this with Beautiful Soup but am unable to.
I can successfully extract other tags like h2 or div using the code below (from line 1,028 of the page source):
for item_name in soup.findAll('h2', {'class': 'ui-property-summary__address'}):
    ad = item_name.get_text(strip=True)
Can you please advise how I can extract the <script> tag from line 89?
Thanks
This example will locate the <script> tag and parse some data from it:
import re
import json
import requests
from bs4 import BeautifulSoup
url = 'https://www.zoopla.co.uk/for-sale/details/53818653?search_identifier=7e57533214fc2402ba53dd6c14b624f8'
# locate the tag
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
script = soup.select_one('script:contains("ZPG.trackData.taxonomy")')
# parse some data from script
data1 = re.findall(r'ZPG\.trackData\.ecommerce = ({.*?});', script.text, flags=re.S)[0]
data1 = json.loads( re.sub(r'([^"\s]+):\s', r'"\1": ', data1) )
data2 = re.findall(r'ZPG\.trackData\.taxonomy = ({.*?});', script.text, flags=re.S)[0]
data2 = json.loads( re.sub(r'([^"\s]+):\s', r'"\1": ', data2) )
# print the data
print(json.dumps(data1, indent=4))
print(json.dumps(data2, indent=4))
Prints:
{
    "detail": {
        "products": [
            {
                "brand": "Walton and Allen Estate Agents Ltd",
                "category": "for-sale/resi/agent/pre-owned/gb",
                "id": 53818653,
                "name": "FS_Contact",
                "price": 1,
                "quantity": 1,
                "variant": "standard"
            }
        ]
    }
}
{
    "signed_in_status": "signed out",
    "acorn": 44,
    "acorn_type": 44,
    "area_name": "Aspley, Nottingham",
    "beds_max": 3,
    "beds_min": 3,
    "branch_id": "43168",
    "branch_logo_url": "https://st.zoocdn.com/zoopla_static_agent_logo_(586192).png",
    "branch_name": "Walton & Allen Estate Agents",
    "brand_name": "Walton and Allen Estate Agents Ltd",
    "chain_free": false,
    "company_id": "21619",
    "country_code": "gb",
    "county_area_name": "Nottingham",
    "currency_code": "GBP",
    "display_address": "Melbourne Road, Aspley, Nottingham NG8",
    "furnished_state": "",
    "group_id": "",
    "has_epc": false,
    "has_floorplan": true,
    "incode": "5HN",
    "is_retirement_home": false,
    "is_shared_ownership": false,
    "listing_condition": "pre-owned",
    "listing_id": 53818653,
    "listing_status": "for_sale",
    "listings_category": "residential",
    "location": "Aspley",
    "member_type": "agent",
    "num_baths": 1,
    "num_beds": 3,
    "num_images": 15,
    "num_recepts": 1,
    "outcode": "NG8",
    "post_town_name": "Nottingham",
    "postal_area": "NG",
    "price": 150000,
    "price_actual": 150000,
    "price_max": 150000,
    "price_min": 150000,
    "price_qualifier": "guide_price",
    "property_highlight": "",
    "property_type": "semi_detached",
    "region_name": "East Midlands",
    "section": "for-sale",
    "size_sq_feet": "",
    "tenure": "",
    "zindex": "129806"
}
Alternatively, find all the <script> tags, then search them for the one that contains ZPG.trackData.ecommerce. Note the item.string check: it is None for scripts loaded via a src attribute, which would otherwise raise a TypeError.
ecommerce = None
for item in soup.findAll('script'):
    if item.string and 'ZPG.trackData.ecommerce' in item.string:
        ecommerce = item.string
        break
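To turn the located string into a Python dict, the same quote-the-keys regex and json.loads from the first answer apply; a brief sketch:

import re
import json

# Extract the object literal assigned to ZPG.trackData.ecommerce ...
raw = re.findall(r'ZPG\.trackData\.ecommerce = ({.*?});', ecommerce, flags=re.S)[0]
# ... quote the bare keys so it parses as JSON, then load it.
parsed = json.loads(re.sub(r'([^"\s]+):\s', r'"\1": ', raw))
print(parsed["detail"]["products"][0]["brand"])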

How can I scrape the content of this specific website (cineatlas)?

I am trying to scrape the content of this particular website: https://www.cineatlas.com/
I tried scraping the date part shown in the screenshot (image not reproduced here). I used this basic BeautifulSoup code:
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.cineatlas.com/')
soup = BeautifulSoup(response.text, 'html.parser')
time = soup.find('ul', class_='slidee')
print(time)
This is what I get, instead of the list of dates:
<ul class="slidee">
<!-- adding dates -->
</ul>
The site creates HTML elements dynamically from the JavaScript content. You can get the JS content by using re, for example:
import re
import json
import requests
from ast import literal_eval
url = 'https://www.cineatlas.com/'
html_data = requests.get(url).text
movieData = re.findall(r'movieData = ({.*?}), movieDataByReleaseDate', html_data, flags=re.DOTALL)[0]
movieData = re.sub(r'\s*/\*.*?\*/\s*', '', movieData) # remove comments
movieData = literal_eval(movieData) # in movieData you have now the information about the current movies
print(json.dumps(movieData, indent=4)) # print data to the screen
Prints:
{
    "2019-08-06": [
        {
            "url": "fast--furious--hobbs--shaw",
            "image-portrait": "https://d10u9ygjms7run.cloudfront.net/dd2qd1xaf4pceqxvb41s1xpzs0/1562603443098_891497ecc8b16b3a662ad8b036820ed1_500x735.jpg",
            "image-landscape": "https://d10u9ygjms7run.cloudfront.net/dd2qd1xaf4pceqxvb41s1xpzs0/1562603421049_7c233477779f25725bf22aeaacba469a_700x259.jpg",
            "title": "FAST & FURIOUS : HOBBS & SHAW",
            "releaseDate": "2019-08-07",
            "endpoint": "ST00000392",
            "duration": "120 mins",
            "rating": "Classification TOUT",
            "director": "",
            "actors": "",
            "times": [
                {
                    "time": "7:00pm",
                    "bookingLink": "https://ticketing.eu.veezi.com/purchase/8388?siteToken=b4ehk19v6cqkjfwdsyctqra72m",
                    "attributes": [
                        {
                            "_id": "5d468c20f67cc430833a5a2b",
                            "shortName": "VF",
                            "description": "Version Fran\u00e7aise"
                        },
                        {
                            "_id": "5d468c20f67cc430833a5a2a",
                            "shortName": "3D",
                            "description": "3D"
                        }
                    ]
                },
                {
                    "time": "9:50pm",
                    "bookingLink": "https://ticketing.eu.veezi.com/purchase/8389?siteToken=b4ehk19v6cqkjfwdsyctqra72m",
... and so on.
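Once movieData is parsed, extracting the dates and showtimes is a plain dictionary walk; a short sketch based on the structure printed above:

# Print each screening date with its films and showtimes.
for day, movies in movieData.items():
    print(day)
    for movie in movies:
        showtimes = ', '.join(t['time'] for t in movie.get('times', []))
        print(f"  {movie['title']}: {showtimes}")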
Alternatively, once you have the ul element:
lis = time.findChildren()
This returns a list of its child nodes, but here it will be empty, because the dates are only inserted by JavaScript at runtime.
