Code to download data. Choose a dataset, then a table, and get code to download it as Pandas or Numpy.

hub > azure-covid > azure-covid

R

R code Copy

# RImports
library("glue")
library("rjson")

download.data.from.remote.web.location <- function(src_url, tmp_dir, 
    dest_file_name, table_name, dataset_name) {
    # Download `src_url` into <tmp_dir>/<dataset_name>/<table_name>/<dest_file_name>,
    # creating the directory tree on first use.
    target_dir <- file.path(tmp_dir, dataset_name, table_name)
    if (!dir.exists(target_dir)) {
        dir.create(target_dir, recursive = TRUE)
    }
    # Returns download.file()'s status code (0 on success).
    download.file(src_url, file.path(target_dir, dest_file_name))
}
read.csv.for.table <- function(static_data_table_name, dataset_name, 
    columns, tmp_dir) {
    # Read the intermediary CSV for one table into a data frame.
    #
    # Args:
    #   static_data_table_name: table name; also the CSV file's base name.
    #   dataset_name: dataset directory under tmp_dir.
    #   columns: JSON-encoded character array of column names.
    #   tmp_dir: scratch directory holding the downloaded data.
    # Returns: a data.frame with `columns` as its column names.
    columns <- fromJSON(columns)
    # header = FALSE: the header row was already stripped by the earlier
    # "remove header" step (the Pandas variant passes header=None for the
    # same reason). The previous default of header = TRUE silently
    # consumed the first data row as column names.
    data <- read.csv(glue("{tmp_dir}/{dataset_name}/{static_data_table_name}/{static_data_table_name}.csv", 
        tmp_dir = tmp_dir, dataset_name = dataset_name, static_data_table_name = static_data_table_name), 
        header = FALSE)
    colnames(data) <- columns
    # message() actually emits the summary; the original bare sprintf()
    # inside a function body just discarded its value.
    message(sprintf("Downloaded dataframe with %d rows and %d columns.", 
        nrow(data), ncol(data)))
    return(data)
}
# ## Downloading data from remote web location
# Data for this particular asset(s) is located somewhere on the web.
# We need to download it to a local directory first, before we can
# do anything with it.
download_location <- download.data.from.remote.web.location(
    src_url = "https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/covid_tracking/latest/covid_tracking.csv", 
    tmp_dir = "/tmp/azure-covid/azure-covid", 
    dest_file_name = "azure-covid.csv", 
    table_name = "azure-covid", 
    dataset_name = "azure-covid")

# ## Removing file header
# We are dealing with a tabular file with a header. Before we can
# process it we need to remove the header.
# NOTE(review): the original command read "{tmp_dir}/{table_name}.downloaded",
# a file the download step never creates, and wrote its result to
# "{tmp_dir}/{file_name}" where read.csv.for.table() never looks. Both
# paths now point at the file actually written by
# download.data.from.remote.web.location().
remove_header <- system(glue(paste0(
    "tail -n +{n} {tmp_dir}/{dataset_name}/{table_name}/{file_name} > ", 
    "{tmp_dir}/{dataset_name}/{table_name}/{file_name}.tmp && ", 
    "mv {tmp_dir}/{dataset_name}/{table_name}/{file_name}.tmp ", 
    "{tmp_dir}/{dataset_name}/{table_name}/{file_name}"), 
    n = "2", file_name = "azure-covid.csv", table_name = "azure-covid", 
    dataset_name = "azure-covid", tmp_dir = "/tmp/azure-covid/azure-covid"))

# ## Create an R data frame from CSV data
# We convert an intermediary CSV file to an R dataframe.
r_data <- read.csv.for.table(dataset_name = "azure-covid", 
    static_data_table_name = "azure-covid", 
    columns = "[\"date\",\"state\",\"positive\",\"negative\",\"pending\",\"hospitalized_currently\",\"hospitalized_cumulative\",\"in_icu_currently\",\"in_icu_cumulative\",\"on_ventilator_currently\",\"on_ventilator_cumultive\",\"recovered\",\"data_quality_grade\",\"last_update_et\",\"hash\",\"date_checked\",\"death\",\"hospitalized\",\"total\",\"total_test_results\",\"pos_neg\",\"fips\",\"death_increase\",\"hospitalized_increase\",\"negative_increase\",\"positive_increase\",\"total_test_results_increase\",\"fips_code\",\"iso_subdivision\",\"load_time\",\"iso_country\"]", 
    tmp_dir = "/tmp/azure-covid/azure-covid")

Pandas

requirements.txt Copy

pandas==1.2.3
pip command Copy

pip3 install pandas==1.2.3
Pandas code Copy

# PythonImports
import json
import os
import pandas as pd
import subprocess
import urllib.request


def download_data_from_remote_web_location(
    src_url, tmp_dir, dest_file_name, table_name, dataset_name
):
    """Fetch src_url into <tmp_dir>/<dataset_name>/<table_name>/<dest_file_name>.

    Creates the per-dataset/per-table directory tree if it does not exist
    yet, then streams the remote resource to disk.
    """
    target_dir = os.path.join(tmp_dir, dataset_name, table_name)
    os.makedirs(target_dir, exist_ok=True)
    urllib.request.urlretrieve(src_url, os.path.join(target_dir, dest_file_name))


def read_csv_into_pandas(
    static_data_table_name, dataset_name, columns, tmp_dir
):
    """Load the header-less intermediary CSV into a pandas DataFrame.

    `columns` is a JSON-encoded list of column names applied after the
    read; the file itself has had its header row stripped earlier in the
    pipeline, hence header=None.
    """
    column_names = json.loads(columns)
    csv_path = "{0}/{1}/{2}/{2}.csv".format(
        tmp_dir, dataset_name, static_data_table_name
    )
    data = pd.read_csv(csv_path, header=None)
    data.columns = column_names
    print("Downloaded dataframe with %d rows and %d columns." % data.shape)
    return data


# ## Downloading data from remote web location
# Data for this particular asset(s) is located somewhere on the web.
# We need to download it to a local directory first, before we can
# do anything with it.
download_location = download_data_from_remote_web_location(
    src_url="https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/covid_tracking/latest/covid_tracking.csv",
    tmp_dir="/tmp/azure-covid/azure-covid",
    dest_file_name="azure-covid.csv",
    table_name="azure-covid",
    dataset_name="azure-covid",
)

# ## Removing file header
# We are dealing with a tabular file with a header. Before we can
# process it we need to remove the header.
# NOTE(review): the original command targeted
# "{tmp_dir}/{table_name}.downloaded", a file the download step never
# creates, and wrote its output to "{tmp_dir}/{file_name}" where
# read_csv_into_pandas() never looks. Both paths now point at the file
# actually written by download_data_from_remote_web_location().
process = subprocess.Popen(
    "tail -n +{n} {tmp_dir}/{dataset_name}/{table_name}/{file_name} "
    "> {tmp_dir}/{dataset_name}/{table_name}/{file_name}.tmp "
    "&& mv {tmp_dir}/{dataset_name}/{table_name}/{file_name}.tmp "
    "{tmp_dir}/{dataset_name}/{table_name}/{file_name}".format(
        n="2",
        file_name="azure-covid.csv",
        dataset_name="azure-covid",
        table_name="azure-covid",
        tmp_dir="/tmp/azure-covid/azure-covid",
    ),
    stdout=subprocess.PIPE,
    shell=True,
)
# communicate() waits for the shell pipeline and collects its stdout.
remove_header, error = process.communicate()

# ## Create Pandas Dataframe from CSV data
# We convert an intermediary CSV file to a Pandas dataframe.
pandas_data = read_csv_into_pandas(
    static_data_table_name="azure-covid",
    dataset_name="azure-covid",
    tmp_dir="/tmp/azure-covid/azure-covid",
    columns='["date","state","positive","negative","pending","hospitalized_currently","hospitalized_cumulative","in_icu_currently","in_icu_cumulative","on_ventilator_currently","on_ventilator_cumultive","recovered","data_quality_grade","last_update_et","hash","date_checked","death","hospitalized","total","total_test_results","pos_neg","fips","death_increase","hospitalized_increase","negative_increase","positive_increase","total_test_results_increase","fips_code","iso_subdivision","load_time","iso_country"]',
)

Numpy

requirements.txt Copy

numpy==1.20.1
pip command Copy

pip3 install numpy==1.20.1
Numpy code Copy

# PythonImports
import csv
import json
import numpy as np
import os
import subprocess
import urllib.request


def download_data_from_remote_web_location(
    src_url, tmp_dir, dest_file_name, table_name, dataset_name
):
    """Download src_url into the per-dataset/per-table scratch directory.

    The destination is <tmp_dir>/<dataset_name>/<table_name>/<dest_file_name>;
    missing parent directories are created first.
    """
    destination = os.path.join(tmp_dir, dataset_name, table_name, dest_file_name)
    os.makedirs(os.path.dirname(destination), exist_ok=True)
    urllib.request.urlretrieve(src_url, destination)


def read_csv_into_numpy(static_data_table_name, dataset_name, columns, tmp_dir):
    """Load the header-less intermediary CSV into a structured ndarray.

    Args:
        static_data_table_name: table name; also the CSV file's base name.
        dataset_name: dataset directory under ``tmp_dir``.
        columns: JSON-encoded list of field names for the ndarray.
        tmp_dir: scratch directory holding the downloaded data.

    Returns:
        A structured numpy array with one field per entry in ``columns``.
    """
    columns = json.loads(columns)
    file_name = "{tmp_dir}/{dataset_name}/{static_data_table_name}/{static_data_table_name}.csv".format(
        dataset_name=dataset_name,
        static_data_table_name=static_data_table_name,
        tmp_dir=tmp_dir,
    )
    # Re-emit each CSV row tab-delimited (tabs inside fields collapsed to
    # spaces) so genfromtxt can split unambiguously on "\t".
    # The original leaked the file handle by calling open() inline inside
    # the generator; the context manager guarantees it is closed once
    # genfromtxt has consumed the rows.
    with open(file_name) as handle:
        escaped = (
            "\t".join(field.replace("\t", " ") for field in row)
            for row in csv.reader(handle)
        )
        data = np.genfromtxt(escaped, delimiter="\t", names=columns, dtype=None)
    assert data is not None
    print("Downloaded ndarray with %d rows." % data.shape[0])
    return data


# ## Downloading data from remote web location
# Data for this particular asset(s) is located somewhere on the web.
# We need to download it to a local directory first, before we can
# do anything with it.
download_location = download_data_from_remote_web_location(
    src_url="https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/covid_tracking/latest/covid_tracking.csv",
    tmp_dir="/tmp/azure-covid/azure-covid",
    dest_file_name="azure-covid.csv",
    table_name="azure-covid",
    dataset_name="azure-covid",
)

# ## Removing file header
# We are dealing with a tabular file with a header. Before we can
# process it we need to remove the header.
# NOTE(review): the original command targeted
# "{tmp_dir}/{table_name}.downloaded", a file the download step never
# creates, and wrote its output to "{tmp_dir}/{file_name}" where
# read_csv_into_numpy() never looks. Both paths now point at the file
# actually written by download_data_from_remote_web_location().
process = subprocess.Popen(
    "tail -n +{n} {tmp_dir}/{dataset_name}/{table_name}/{file_name} "
    "> {tmp_dir}/{dataset_name}/{table_name}/{file_name}.tmp "
    "&& mv {tmp_dir}/{dataset_name}/{table_name}/{file_name}.tmp "
    "{tmp_dir}/{dataset_name}/{table_name}/{file_name}".format(
        n="2",
        file_name="azure-covid.csv",
        dataset_name="azure-covid",
        table_name="azure-covid",
        tmp_dir="/tmp/azure-covid/azure-covid",
    ),
    stdout=subprocess.PIPE,
    shell=True,
)
# communicate() waits for the shell pipeline and collects its stdout.
remove_header, error = process.communicate()

# ## Create Numpy ndarray from CSV data
# We convert an intermediary CSV file to a Numpy ndarray.
numpy_data = read_csv_into_numpy(
    static_data_table_name="azure-covid",
    dataset_name="azure-covid",
    tmp_dir="/tmp/azure-covid/azure-covid",
    columns='["date","state","positive","negative","pending","hospitalized_currently","hospitalized_cumulative","in_icu_currently","in_icu_cumulative","on_ventilator_currently","on_ventilator_cumultive","recovered","data_quality_grade","last_update_et","hash","date_checked","death","hospitalized","total","total_test_results","pos_neg","fips","death_increase","hospitalized_increase","negative_increase","positive_increase","total_test_results_increase","fips_code","iso_subdivision","load_time","iso_country"]',
)

Schema

NameType
dateDateString
stateStringIdentifier
positiveNaturalNumber
negativeNaturalNumber
pendingNaturalNumber
hospitalized_currentlyNaturalNumber
hospitalized_cumulativeNaturalNumber
in_icu_currentlyNaturalNumber
in_icu_cumulativeNaturalNumber
on_ventilator_currentlyNaturalNumber
on_ventilator_cumultiveNaturalNumber
recoveredNaturalNumber
data_quality_gradeEmpty
last_update_etDateString
hashFreeText
date_checkedDateString
deathNaturalNumber
hospitalizedNaturalNumber
totalNaturalNumber
total_test_resultsNaturalNumber
pos_negNaturalNumber
fipsFIPSStateCode
death_increaseIntegerNumber
hospitalized_increaseIntegerNumber
negative_increaseIntegerNumber
positive_increaseIntegerNumber
total_test_results_increaseIntegerNumber
fips_codeFIPSStateCode
iso_subdivisionStringIdentifier
load_timeDateString
iso_countryStringIdentifier

WebLocation Storage

ParamValue
addresshttps://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/covid_tracking/latest/covid_tracking.csv