R
R code Copy
# RImports
library("glue")
library("rjson")
# Download a remote file into
# <tmp_dir>/<dataset_name>/<table_name>/<dest_file_name>, creating the
# directory tree first if it does not exist.
#
# src_url        - URL of the file to fetch.
# tmp_dir        - local root directory for temporary data.
# dest_file_name - file name to save the download as.
# table_name     - table subdirectory name.
# dataset_name   - dataset subdirectory name.
#
# Returns the integer status code from download.file() (0 on success),
# same as the original implementation.
download.data.from.remote.web.location <- function(src_url, tmp_dir,
                                                   dest_file_name, table_name,
                                                   dataset_name) {
  # file.path() is the idiomatic, platform-aware replacement for
  # paste(..., sep = "/").
  path <- file.path(tmp_dir, dataset_name, table_name)
  if (!dir.exists(path)) {
    dir.create(path, recursive = TRUE)
  }
  dest <- file.path(path, dest_file_name)
  download.file(src_url, dest)
}
# Read the intermediary CSV for a table into a data frame and apply the
# supplied column names.
#
# static_data_table_name - table name; also the CSV base file name.
# dataset_name           - dataset subdirectory name.
# columns                - JSON-encoded array of column names; its length
#                          must equal the CSV's column count.
# tmp_dir                - local root directory for temporary data.
#
# Returns the data frame with colnames set from `columns`.
read.csv.for.table <- function(static_data_table_name, dataset_name,
                               columns, tmp_dir) {
  columns <- fromJSON(columns)
  # file.path() builds the same path the original glue() template produced.
  csv_path <- file.path(tmp_dir, dataset_name, static_data_table_name,
                        paste0(static_data_table_name, ".csv"))
  # NOTE(review): read.csv() defaults to header = TRUE, while the Python
  # counterparts of this pipeline read with header=None; confirm whether the
  # file still carries a header row at this point.
  data <- read.csv(csv_path)
  colnames(data) <- columns
  # Fix: the original called sprintf() and discarded its return value
  # (sprintf does not print); message() actually emits the status line.
  message(sprintf("Downloaded dataframe with %d rows and %d columns.",
                  nrow(data), ncol(data)))
  return(data)
}
# ## Downloading data from remote web location
# Data for this particular asset(s) is located somewhere on the web.
# We need to download it to a local directory first, before we can
# do anything with it.
# Fetch the COVID tracking CSV. Because tmp_dir already ends in
# "azure-covid/azure-covid", the file lands at
# /tmp/azure-covid/azure-covid/azure-covid/azure-covid/azure-covid.csv.
# NOTE(review): confirm the dataset/table path duplication is intended.
# download.file() returns a status code (0 on success), not the path, so
# `download_location` is not actually a location.
download_location <- download.data.from.remote.web.location(table_name = "azure-covid",
dataset_name = "azure-covid", src_url = "https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/covid_tracking/latest/covid_tracking.csv",
dest_file_name = "azure-covid.csv", tmp_dir = "/tmp/azure-covid/azure-covid")
# ## Removing file header
# We are dealing with a tabular file with a header. Before we can
# process it we need to remove the header.
# Drop the first line (`tail -n +2`) via the shell, writing to a .tmp file
# and moving it into place; system() returns the command's exit status.
# NOTE(review): the command reads "{tmp_dir}/azure-covid.downloaded", but the
# download step above saved "azure-covid.csv" two subdirectories deeper
# ({tmp_dir}/azure-covid/azure-covid/). Because of `&&`, the mv never runs if
# tail fails, so the header is likely never removed — confirm the paths and
# the ".downloaded" suffix against the generator's intent.
remove_header <- system(glue("tail -n +{n} {tmp_dir}/{table_name}.downloaded > {tmp_dir}/{file_name}.tmp && mv {tmp_dir}/{file_name}.tmp {tmp_dir}/{file_name}",
n = "2", file_name = "azure-covid.csv", table_name = "azure-covid",
tmp_dir = "/tmp/azure-covid/azure-covid"))
# ## Create an R data frame from CSV data
# We convert an intermediary CSV file to an R dataframe.
# Load the intermediary CSV into an R data frame. The JSON `columns` string
# lists 31 names that must match the CSV's column count.
# NOTE(review): "on_ventilator_cumultive" is spelled as in the upstream
# schema table — not a typo to fix here without changing the schema too.
r_data <- read.csv.for.table(static_data_table_name = "azure-covid",
tmp_dir = "/tmp/azure-covid/azure-covid", columns = "[\"date\",\"state\",\"positive\",\"negative\",\"pending\",\"hospitalized_currently\",\"hospitalized_cumulative\",\"in_icu_currently\",\"in_icu_cumulative\",\"on_ventilator_currently\",\"on_ventilator_cumultive\",\"recovered\",\"data_quality_grade\",\"last_update_et\",\"hash\",\"date_checked\",\"death\",\"hospitalized\",\"total\",\"total_test_results\",\"pos_neg\",\"fips\",\"death_increase\",\"hospitalized_increase\",\"negative_increase\",\"positive_increase\",\"total_test_results_increase\",\"fips_code\",\"iso_subdivision\",\"load_time\",\"iso_country\"]",
dataset_name = "azure-covid")
Pandas
requirements.txt Copy
pandas==1.2.3
pip command Copy
pip3 install pandas==1.2.3
Pandas code Copy
# PythonImports
import json
import os
import pandas as pd
import subprocess
import urllib.request
def download_data_from_remote_web_location(
    src_url, tmp_dir, dest_file_name, table_name, dataset_name
):
    """Download ``src_url`` into ``<tmp_dir>/<dataset_name>/<table_name>/<dest_file_name>``.

    Creates the destination directory tree if it does not exist.
    Returns None (callers binding the result get None).
    """
    # Consistency fix: the original built the makedirs path by manual "/"
    # concatenation but the file path with os.path.join; use os.path.join
    # for both.
    dest_dir = os.path.join(tmp_dir, dataset_name, table_name)
    os.makedirs(dest_dir, exist_ok=True)
    urllib.request.urlretrieve(src_url, os.path.join(dest_dir, dest_file_name))
def read_csv_into_pandas(
    static_data_table_name, dataset_name, columns, tmp_dir
):
    """Load a headerless CSV into a pandas DataFrame and apply column names.

    ``columns`` is a JSON-encoded list of column names whose length must
    match the CSV's column count. The CSV is read from
    ``<tmp_dir>/<dataset_name>/<static_data_table_name>/<static_data_table_name>.csv``.
    Returns the labelled DataFrame.
    """
    column_names = json.loads(columns)
    csv_path = "%s/%s/%s/%s.csv" % (
        tmp_dir,
        dataset_name,
        static_data_table_name,
        static_data_table_name,
    )
    # header=None: the pipeline's header-removal step runs before this, so
    # the first row is treated as data, not labels.
    frame = pd.read_csv(csv_path, header=None)
    frame.columns = column_names
    rows, cols = frame.shape
    print("Downloaded dataframe with %d rows and %d columns." % (rows, cols))
    return frame
# ## Downloading data from remote web location
# Data for this particular asset(s) is located somewhere on the web.
# We need to download it to a local directory first, before we can
# do anything with it.
# Fetch the COVID tracking CSV. Because tmp_dir already ends in
# "azure-covid/azure-covid", the file lands at
# /tmp/azure-covid/azure-covid/azure-covid/azure-covid/azure-covid.csv.
# NOTE(review): confirm the dataset/table path duplication is intended.
# The function returns None, so `download_location` is not actually a path.
download_location = download_data_from_remote_web_location(
    src_url="https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/covid_tracking/latest/covid_tracking.csv",
    dest_file_name="azure-covid.csv",
    dataset_name="azure-covid",
    table_name="azure-covid",
    tmp_dir="/tmp/azure-covid/azure-covid",
)
# ## Removing file header
# We are dealing with a tabular file with a header. Before we can
# process it we need to remove the header.
# Strip the first line (`tail -n +2`) via the shell, writing to a .tmp file
# and moving it into place.
# NOTE(review): the command reads "<tmp_dir>/azure-covid.downloaded", but the
# download step above saved "azure-covid.csv" two subdirectories deeper
# (<tmp_dir>/azure-covid/azure-covid/). Because of `&&`, the mv never runs if
# tail fails, so the header is likely never removed — confirm the paths and
# the ".downloaded" suffix against the generator's intent.
process = subprocess.Popen(
    "tail -n +{n} {tmp_dir}/{table_name}.downloaded > {tmp_dir}/{file_name}.tmp && mv {tmp_dir}/{file_name}.tmp {tmp_dir}/{file_name}".format(
        n="2",
        file_name="azure-covid.csv",
        table_name="azure-covid",
        tmp_dir="/tmp/azure-covid/azure-covid",
    ),
    stdout=subprocess.PIPE,
    shell=True,
)
# communicate() waits for the shell to finish; stdout is captured, stderr is
# not redirected, and the exit status is not checked.
remove_header, error = process.communicate()
# ## Create Pandas Dataframe from CSV data
# We convert an intermediary CSV file to a Pandas dataframe.
# Build the DataFrame from the intermediary CSV. The JSON `columns` list
# (31 names) must match the CSV's column count.
# NOTE(review): "on_ventilator_cumultive" is spelled as in the upstream
# schema table. Also, read_csv_into_pandas uses header=None, but the
# header-removal step above targeted a different path than the download
# wrote to — the header row may still be present as a data row; verify.
pandas_data = read_csv_into_pandas(
    columns='["date","state","positive","negative","pending","hospitalized_currently","hospitalized_cumulative","in_icu_currently","in_icu_cumulative","on_ventilator_currently","on_ventilator_cumultive","recovered","data_quality_grade","last_update_et","hash","date_checked","death","hospitalized","total","total_test_results","pos_neg","fips","death_increase","hospitalized_increase","negative_increase","positive_increase","total_test_results_increase","fips_code","iso_subdivision","load_time","iso_country"]',
    static_data_table_name="azure-covid",
    tmp_dir="/tmp/azure-covid/azure-covid",
    dataset_name="azure-covid",
)
Numpy
requirements.txt Copy
numpy==1.20.1
pip command Copy
pip3 install numpy==1.20.1
Numpy code Copy
# PythonImports
import csv
import json
import numpy as np
import os
import subprocess
import urllib.request
def download_data_from_remote_web_location(
    src_url, tmp_dir, dest_file_name, table_name, dataset_name
):
    """Download ``src_url`` into ``<tmp_dir>/<dataset_name>/<table_name>/<dest_file_name>``.

    Creates the destination directory tree if it does not exist.
    Returns None (callers binding the result get None).
    """
    # Consistency fix: the original built the makedirs path by manual "/"
    # concatenation but the file path with os.path.join; use os.path.join
    # for both.
    dest_dir = os.path.join(tmp_dir, dataset_name, table_name)
    os.makedirs(dest_dir, exist_ok=True)
    urllib.request.urlretrieve(src_url, os.path.join(dest_dir, dest_file_name))
def read_csv_into_numpy(static_data_table_name, dataset_name, columns, tmp_dir):
    """Load a headerless CSV into a structured numpy ndarray.

    ``columns`` is a JSON-encoded list of field names used for the result's
    dtype; its length must match the CSV's column count. The CSV is read from
    ``<tmp_dir>/<dataset_name>/<static_data_table_name>/<static_data_table_name>.csv``.
    Raises AssertionError if genfromtxt produced no array.
    """
    columns = json.loads(columns)
    file_name = "{tmp_dir}/{dataset_name}/{static_data_table_name}/{static_data_table_name}.csv".format(
        dataset_name=dataset_name,
        static_data_table_name=static_data_table_name,
        tmp_dir=tmp_dir,
    )
    # Re-encode each row as tab-separated text (tabs inside fields squashed
    # to spaces) so genfromtxt can split on "\t" even when fields contain
    # commas.
    # Fix: open the file in a `with` block — the original passed a bare
    # open() into csv.reader and leaked the file handle. newline="" is the
    # documented way to hand a file to csv.reader.
    with open(file_name, newline="") as csv_file:
        escaped = (
            "\t".join(field.replace("\t", " ") for field in row)
            for row in csv.reader(csv_file)
        )
        # Consume the generator while the file is still open.
        data = np.genfromtxt(escaped, delimiter="\t", names=columns, dtype=None)
    assert data is not None
    print("Downloaded ndarray with %d rows." % data.shape[0])
    return data
# ## Downloading data from remote web location
# Data for this particular asset(s) is located somewhere on the web.
# We need to download it to a local directory first, before we can
# do anything with it.
# Fetch the COVID tracking CSV. Because tmp_dir already ends in
# "azure-covid/azure-covid", the file lands at
# /tmp/azure-covid/azure-covid/azure-covid/azure-covid/azure-covid.csv.
# NOTE(review): confirm the dataset/table path duplication is intended.
# The function returns None, so `download_location` is not actually a path.
download_location = download_data_from_remote_web_location(
    src_url="https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/covid_tracking/latest/covid_tracking.csv",
    dest_file_name="azure-covid.csv",
    dataset_name="azure-covid",
    table_name="azure-covid",
    tmp_dir="/tmp/azure-covid/azure-covid",
)
# ## Removing file header
# We are dealing with a tabular file with a header. Before we can
# process it we need to remove the header.
# Strip the first line (`tail -n +2`) via the shell, writing to a .tmp file
# and moving it into place.
# NOTE(review): the command reads "<tmp_dir>/azure-covid.downloaded", but the
# download step above saved "azure-covid.csv" two subdirectories deeper
# (<tmp_dir>/azure-covid/azure-covid/). Because of `&&`, the mv never runs if
# tail fails, so the header is likely never removed — confirm the paths and
# the ".downloaded" suffix against the generator's intent.
process = subprocess.Popen(
    "tail -n +{n} {tmp_dir}/{table_name}.downloaded > {tmp_dir}/{file_name}.tmp && mv {tmp_dir}/{file_name}.tmp {tmp_dir}/{file_name}".format(
        n="2",
        file_name="azure-covid.csv",
        table_name="azure-covid",
        tmp_dir="/tmp/azure-covid/azure-covid",
    ),
    stdout=subprocess.PIPE,
    shell=True,
)
# communicate() waits for the shell to finish; stdout is captured, stderr is
# not redirected, and the exit status is not checked.
remove_header, error = process.communicate()
# ## Create Numpy ndarray from CSV data
# We convert an intermediary CSV file to a Numpy ndarray.
# Build the structured ndarray from the intermediary CSV. The JSON `columns`
# list (31 names) must match the CSV's column count.
# NOTE(review): "on_ventilator_cumultive" is spelled as in the upstream
# schema table. Also note the header-removal step above targeted a different
# path than the download wrote to — the header row may still be present as a
# data row; verify before relying on dtype inference.
numpy_data = read_csv_into_numpy(
    dataset_name="azure-covid",
    tmp_dir="/tmp/azure-covid/azure-covid",
    columns='["date","state","positive","negative","pending","hospitalized_currently","hospitalized_cumulative","in_icu_currently","in_icu_cumulative","on_ventilator_currently","on_ventilator_cumultive","recovered","data_quality_grade","last_update_et","hash","date_checked","death","hospitalized","total","total_test_results","pos_neg","fips","death_increase","hospitalized_increase","negative_increase","positive_increase","total_test_results_increase","fips_code","iso_subdivision","load_time","iso_country"]',
    static_data_table_name="azure-covid",
)
Schema
Name | Type |
---|---|
date | DateString |
state | StringIdentifier |
positive | NaturalNumber |
negative | NaturalNumber |
pending | NaturalNumber |
hospitalized_currently | NaturalNumber |
hospitalized_cumulative | NaturalNumber |
in_icu_currently | NaturalNumber |
in_icu_cumulative | NaturalNumber |
on_ventilator_currently | NaturalNumber |
on_ventilator_cumultive | NaturalNumber |
recovered | NaturalNumber |
data_quality_grade | Empty |
last_update_et | DateString |
hash | FreeText |
date_checked | DateString |
death | NaturalNumber |
hospitalized | NaturalNumber |
total | NaturalNumber |
total_test_results | NaturalNumber |
pos_neg | NaturalNumber |
fips | FIPSStateCode |
death_increase | IntegerNumber |
hospitalized_increase | IntegerNumber |
negative_increase | IntegerNumber |
positive_increase | IntegerNumber |
total_test_results_increase | IntegerNumber |
fips_code | FIPSStateCode |
iso_subdivision | StringIdentifier |
load_time | DateString |
iso_country | StringIdentifier |
WebLocation Storage
Param | Value |
---|---|
address | https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/covid_tracking/latest/covid_tracking.csv |