-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathmisc_helper_functions.py
More file actions
161 lines (105 loc) · 3.71 KB
/
misc_helper_functions.py
File metadata and controls
161 lines (105 loc) · 3.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
## Misc utilities
import pandas as pd
import os
from datetime import datetime, timedelta
#import wget
import numpy as np
import datetime
from pathlib import Path
from urllib.parse import urlsplit
## Read webpage
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
import requests
## Display pandas dataframe
from IPython.display import display, HTML
# Import packages needed to run GA code
#import datetime
import email.utils as eut
from io import BytesIO
import re
import zipfile
import urllib.request
import requests
import ssl
import shutil
# WARNING(review): this globally disables SSL certificate verification for
# every HTTPS request made through urllib in this process (a module-level
# side effect that affects all importers). Kept for compatibility with the
# GA download code above, but it is a security risk — remove once the
# certificate issues it works around are resolved.
ssl._create_default_https_context = ssl._create_unverified_context
# Source: https://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup
def find_all_links(url, search_string=None):
    """Return the href of every <a> tag on a web page.

    Parameters
    ----------
    url : str
        Page to fetch.
    search_string : str, optional
        If given, only links whose href contains this substring are returned.

    Returns
    -------
    list of str
        The href attributes, in document order.
    """
    resp = requests.get(url)
    # Prefer the encoding declared in the HTML itself over the HTTP header,
    # which is only trusted when the content-type explicitly carries a charset.
    http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, from_encoding=encoding, features="lxml")
    # Comprehension instead of a manual append loop; href=True skips anchors
    # without an href attribute.
    link_list = [link['href'] for link in soup.find_all('a', href=True)]
    if search_string:
        return [x for x in link_list if search_string in x]
    return link_list
def download_file(file_url, new_file_name=None):
    """Download ``file_url`` to a local file, best-effort.

    Tries ``urllib.request.urlretrieve`` first and falls back to ``requests``
    if that fails. Prints a success/failure message rather than raising, so
    callers can continue after a failed download (original behavior).

    Parameters
    ----------
    file_url : str
        URL of the file to download.
    new_file_name : str, optional
        Local destination path. Defaults to the file name taken from the
        URL path (previously the requests fallback crashed on open(None)).
    """
    if new_file_name is None:
        new_file_name = Path(urlsplit(file_url).path).name or 'downloaded_file'
    try:
        try:
            urllib.request.urlretrieve(file_url, new_file_name)
            print('file download success!')
        # urlretrieve raises URLError (an OSError subclass) on network
        # problems and ValueError on unknown URL schemes.
        except (OSError, ValueError):
            r = requests.get(file_url)
            # Don't silently save an error page as the file.
            r.raise_for_status()
            with open(new_file_name, 'wb') as f:
                f.write(r.content)
            print('file download success!')
    except Exception as e:
        # Best-effort contract: report (with the reason) instead of raising.
        print('file download failed!', e)
## Wrapper to unzip files
def unzip(path_to_zip_file, directory_to_extract_to='.'):
    """Extract every member of a zip archive into a directory.

    Parameters
    ----------
    path_to_zip_file : str or os.PathLike
        Archive to read.
    directory_to_extract_to : str, optional
        Destination directory (default: current working directory).
    """
    archive = zipfile.ZipFile(path_to_zip_file, 'r')
    with archive:
        archive.extractall(directory_to_extract_to)
def url_to_soup(data_url):
    """
    Fetch a web page and parse it into a BeautifulSoup object.

    Parameters
    ----------
    data_url: string
        website link

    Returns
    -------
    data_soup: BeautifulSoup object
        HTML code from webpage
    """
    data_page = requests.get(data_url)
    if data_page.status_code == 200:
        print('request successful')
    else:
        # Include the URL so a failure is traceable (the original message
        # was truncated: "request failed for" with nothing after it).
        print('request failed for', data_url)
    # Parse with the stdlib html.parser so no extra parser dependency is needed.
    data_soup = BeautifulSoup(data_page.text, "html.parser")
    return data_soup
def get_json(url):
    """Fetch *url* and return its body parsed as JSON.

    Raises
    ------
    requests.RequestException
        On connection errors or non-2xx status codes.
    ValueError
        When the response body is not valid JSON.
    """
    response = requests.get(url)
    response.raise_for_status()
    return response.json()
def get_bytes(url, force_remote=False):
    """Get the content of a remote URL, caching in a local file.

    The cache file lives in the current working directory and is named
    after the last component of the URL path. If it already exists its
    bytes are returned without any network access, unless ``force_remote``
    is true, in which case the URL is re-fetched and the cache rewritten.

    Parameters
    ----------
    url : str
        URL to fetch.
    force_remote : bool, optional
        Bypass the local cache and re-download (default False).

    Returns
    -------
    bytes
        The (possibly cached) response body.

    Raises
    ------
    requests.RequestException
        On connection errors or non-2xx responses from the remote fetch.
    """
    local_file = Path(Path(urlsplit(url).path).name)
    if local_file.exists() and not force_remote:
        # Cache hit: serve from disk, no network traffic.
        return local_file.read_bytes()
    r = requests.get(url)
    r.raise_for_status()
    local_file.write_bytes(r.content)
    return r.content
def get_metadata_date(metadata_url):
    """For states using ESRI web services, the field metadata includes a timestamp.
    This function fetches, extracts, and parses it, returning a datetime.date.
    """
    editing_info = get_json(metadata_url)['editingInfo']
    # lastEditDate is milliseconds since the epoch; fromtimestamp wants
    # seconds and can raise OverflowError for out-of-range values.
    last_edit_seconds = editing_info['lastEditDate'] / 1000
    return datetime.date.fromtimestamp(last_edit_seconds)