Code to download data. Choose a dataset, then a table, and get code to download it as an R data frame, a Pandas DataFrame, or a NumPy array.

hub > nyt-covid > us_states

R

R code Copy

# RImports
library("glue")
library("reticulate")
library("rjson")

# Python source for a helper that downloads a single file from GitHub via
# raw.githubusercontent.com; executed later through reticulate::py_run_string.
# FIX: the URL template previously ended in a literal "(unknown)" even though
# `filename=path` is passed to .format(); it must be the {filename} placeholder.
download_data_from_github_location <- '
import os
import urllib.request
def download_data_from_github_location(organization, repository, branch,
    path, tmp_dir, dataset_name, dest_file_name, table_name):
    os.makedirs(tmp_dir + \'/\' + dataset_name + \'/\' + table_name, exist_ok=True)
    clone_url = (
        \'https://raw.githubusercontent.com/{org}/{repo}/{branch}/{filename}\'
        .format(org=organization, repo=repository, branch=branch, filename=
        path))
    print(\'Downloading data from: %s\' % clone_url)
    urllib.request.urlretrieve(clone_url, os.path.join(tmp_dir,
        dataset_name, table_name, dest_file_name))
'
read.csv.for.table <- function(static_data_table_name, dataset_name, 
    columns, tmp_dir) {
    # Load a header-less intermediary CSV file into an R data frame.
    #
    # Args:
    #   static_data_table_name: table name; also the CSV's directory/file stem.
    #   dataset_name: dataset directory name under tmp_dir.
    #   columns: JSON-encoded array of column names to apply.
    #   tmp_dir: base directory holding the downloaded data.
    #
    # Returns: a data.frame with the supplied column names.
    columns <- fromJSON(columns)
    # FIX: the header row is stripped earlier in the pipeline, so read with
    # header = FALSE (matching the Python version's header=None); the default
    # header = TRUE would consume the first data row as column names.
    data <- read.csv(
        glue("{tmp_dir}/{dataset_name}/{static_data_table_name}/{static_data_table_name}.csv",
            tmp_dir = tmp_dir, dataset_name = dataset_name,
            static_data_table_name = static_data_table_name),
        header = FALSE)
    colnames(data) <- columns
    # FIX: the sprintf() result was previously discarded; emit it as a status
    # message, mirroring the Python implementation's print().
    message(sprintf("Downloaded dataframe with %d rows and %d columns.",
        nrow(data), ncol(data)))
    return(data)
}
# ## Downloading data from Github
# Data is stored in Github. We will only do a shallow clone of the file
# from the repo, without downloading the entire dataset.
# Runs the Python helper defined above through reticulate, concatenating the
# helper source and a call to it into one script for py_run_string().
# NOTE(review): the helper saves the file under
# tmp_dir/dataset_name/table_name/, i.e.
# /tmp/nyt-covid/us_states/nyt-covid/us_states/us_states.csv — confirm the
# later shell step reads from the same location.
download_location <- py_run_string(paste(download_data_from_github_location, 
    "download_data_from_github_location(repository='covid-19-data', branch=\n    'master', dataset_name='nyt-covid', path='us-states.csv', table_name=\n    'us_states', organization='nytimes', tmp_dir='/tmp/nyt-covid/us_states',\n    dest_file_name='us_states.csv')\n", 
    sep = "\n"))

# ## Removing file header
# We are dealing with a tabular file with a header. Before we can
# process it we need to remove the header.
# `tail -n +2` drops the first line, writes to a .tmp file, then renames it
# into place.
# NOTE(review): this reads /tmp/nyt-covid/us_states/us_states.downloaded,
# while the download step writes .../nyt-covid/us_states/us_states.csv — the
# paths look inconsistent; verify against the generator of this script.
remove_header <- system(glue("tail -n +{n} {tmp_dir}/{table_name}.downloaded > {tmp_dir}/{file_name}.tmp && mv {tmp_dir}/{file_name}.tmp {tmp_dir}/{file_name}", 
    n = "2", file_name = "us_states.csv", table_name = "us_states", 
    tmp_dir = "/tmp/nyt-covid/us_states"))

# ## Create an R data frame from CSV data
# We convert an intermediary CSV file to an R dataframe.
# Column names are supplied as a JSON-encoded array and applied after the
# read; the path is built as tmp_dir/dataset_name/table_name/table_name.csv.
r_data <- read.csv.for.table(static_data_table_name = "us_states", 
    tmp_dir = "/tmp/nyt-covid/us_states", columns = "[\"date\",\"state\",\"fips\",\"cases\",\"deaths\"]", 
    dataset_name = "nyt-covid")

Pandas

requirements.txt Copy

pandas==1.2.3
pip command Copy

pip3 install pandas==1.2.3
Pandas code Copy

# PythonImports
import json
import os
import pandas as pd
import subprocess
import urllib.request


def download_data_from_github_location(
    organization,
    repository,
    branch,
    path,
    tmp_dir,
    dataset_name,
    dest_file_name,
    table_name,
):
    """Download a single file from GitHub via raw.githubusercontent.com.

    Args:
        organization: GitHub organization or user name.
        repository: Repository name.
        branch: Branch (or any git ref) to read from.
        path: Path of the file inside the repository.
        tmp_dir: Local base directory for downloads.
        dataset_name: Dataset directory name created under tmp_dir.
        dest_file_name: File name to save the download as.
        table_name: Table directory name created under the dataset directory.
    """
    os.makedirs(os.path.join(tmp_dir, dataset_name, table_name), exist_ok=True)
    # FIX: the template previously ended in a literal "(unknown)" even though
    # `filename=path` was passed to format(); use the {filename} placeholder.
    clone_url = "https://raw.githubusercontent.com/{org}/{repo}/{branch}/{filename}".format(
        org=organization, repo=repository, branch=branch, filename=path
    )
    print("Downloading data from: %s" % clone_url)
    urllib.request.urlretrieve(
        clone_url,
        os.path.join(tmp_dir, dataset_name, table_name, dest_file_name),
    )


def read_csv_into_pandas(
    static_data_table_name, dataset_name, columns, tmp_dir
):
    """Load a header-less intermediary CSV file into a Pandas DataFrame.

    Args:
        static_data_table_name: Table name; also the CSV's directory/file stem.
        dataset_name: Dataset directory name under tmp_dir.
        columns: JSON-encoded list of column names to apply.
        tmp_dir: Base directory holding the downloaded data.

    Returns:
        A DataFrame whose column names come from `columns`.
    """
    column_names = json.loads(columns)
    csv_path = "{0}/{1}/{2}/{2}.csv".format(
        tmp_dir, dataset_name, static_data_table_name
    )
    # The header row was stripped earlier in the pipeline, so read raw rows
    # and attach the supplied names afterwards.
    data = pd.read_csv(csv_path, header=None)
    data.columns = column_names
    print("Downloaded dataframe with %d rows and %d columns." % data.shape)
    return data


# ## Downloading data from Github
# Data is stored in Github. We will only do a shallow clone of the file
# from the repo, without downloading the entire dataset.
# The helper saves the file under tmp_dir/dataset_name/table_name/ and has no
# return statement, so download_location is always None here.
# NOTE(review): the resulting path is
# /tmp/nyt-covid/us_states/nyt-covid/us_states/us_states.csv — confirm the
# later header-stripping step reads from the same location.
download_location = download_data_from_github_location(
    repository="covid-19-data",
    branch="master",
    dataset_name="nyt-covid",
    path="us-states.csv",
    table_name="us_states",
    organization="nytimes",
    tmp_dir="/tmp/nyt-covid/us_states",
    dest_file_name="us_states.csv",
)

# ## Removing file header
# We are dealing with a tabular file with a header. Before we can
# process it we need to remove the header.
# `tail -n +2` drops the first line, writes to a .tmp file, then renames it
# into place.
# NOTE(review): this reads /tmp/nyt-covid/us_states/us_states.downloaded,
# while the download step writes .../nyt-covid/us_states/us_states.csv — the
# paths look inconsistent; verify against the generator of this script.
process = subprocess.Popen(
    "tail -n +{n} {tmp_dir}/{table_name}.downloaded > {tmp_dir}/{file_name}.tmp && mv {tmp_dir}/{file_name}.tmp {tmp_dir}/{file_name}".format(
        n="2",
        file_name="us_states.csv",
        table_name="us_states",
        tmp_dir="/tmp/nyt-covid/us_states",
    ),
    stdout=subprocess.PIPE,
    shell=True,
)
# communicate() returns (stdout, stderr); stderr is None because only stdout
# was piped.
remove_header, error = process.communicate()

# ## Create Pandas Dataframe from CSV data
# We convert an intermediary CSV file to a Pandas dataframe.
# Column names are supplied as a JSON-encoded list and applied after the
# header-less read; the path is tmp_dir/dataset_name/table_name/table_name.csv.
pandas_data = read_csv_into_pandas(
    columns='["date","state","fips","cases","deaths"]',
    static_data_table_name="us_states",
    tmp_dir="/tmp/nyt-covid/us_states",
    dataset_name="nyt-covid",
)

Numpy

requirements.txt Copy

numpy==1.20.1
pip command Copy

pip3 install numpy==1.20.1
Numpy code Copy

# PythonImports
import csv
import json
import numpy as np
import os
import subprocess
import urllib.request


def download_data_from_github_location(
    organization,
    repository,
    branch,
    path,
    tmp_dir,
    dataset_name,
    dest_file_name,
    table_name,
):
    """Download a single file from GitHub via raw.githubusercontent.com.

    Args:
        organization: GitHub organization or user name.
        repository: Repository name.
        branch: Branch (or any git ref) to read from.
        path: Path of the file inside the repository.
        tmp_dir: Local base directory for downloads.
        dataset_name: Dataset directory name created under tmp_dir.
        dest_file_name: File name to save the download as.
        table_name: Table directory name created under the dataset directory.
    """
    os.makedirs(os.path.join(tmp_dir, dataset_name, table_name), exist_ok=True)
    # FIX: the template previously ended in a literal "(unknown)" even though
    # `filename=path` was passed to format(); use the {filename} placeholder.
    clone_url = "https://raw.githubusercontent.com/{org}/{repo}/{branch}/{filename}".format(
        org=organization, repo=repository, branch=branch, filename=path
    )
    print("Downloading data from: %s" % clone_url)
    urllib.request.urlretrieve(
        clone_url,
        os.path.join(tmp_dir, dataset_name, table_name, dest_file_name),
    )


def read_csv_into_numpy(static_data_table_name, dataset_name, columns, tmp_dir):
    """Load a header-less intermediary CSV file into a structured NumPy array.

    Args:
        static_data_table_name: Table name; also the CSV's directory/file stem.
        dataset_name: Dataset directory name under tmp_dir.
        columns: JSON-encoded list of field names for the structured array.
        tmp_dir: Base directory holding the downloaded data.

    Returns:
        A structured ndarray with one field per entry of `columns`; field
        dtypes are inferred per column (dtype=None).
    """
    column_names = json.loads(columns)
    file_name = "{tmp_dir}/{dataset_name}/{static_data_table_name}/{static_data_table_name}.csv".format(
        dataset_name=dataset_name,
        static_data_table_name=static_data_table_name,
        tmp_dir=tmp_dir,
    )
    # Re-delimit rows with tabs so commas inside quoted CSV fields survive
    # genfromtxt; any tabs inside fields are squashed to spaces.
    # FIX: the file handle was previously opened inside a generator expression
    # and never closed; use a with-block so it is released deterministically.
    with open(file_name, newline="") as csv_file:
        escaped = [
            "\t".join(field.replace("\t", " ") for field in row)
            for row in csv.reader(csv_file)
        ]
    data = np.genfromtxt(escaped, delimiter="\t", names=column_names, dtype=None)
    assert data is not None
    print("Downloaded ndarray with %d rows." % data.shape[0])
    return data


# ## Downloading data from Github
# Data is stored in Github. We will only do a shallow clone of the file
# from the repo, without downloading the entire dataset.
# The helper saves the file under tmp_dir/dataset_name/table_name/ and has no
# return statement, so download_location is always None here.
# NOTE(review): the resulting path is
# /tmp/nyt-covid/us_states/nyt-covid/us_states/us_states.csv — confirm the
# later header-stripping step reads from the same location.
download_location = download_data_from_github_location(
    repository="covid-19-data",
    branch="master",
    dataset_name="nyt-covid",
    path="us-states.csv",
    table_name="us_states",
    organization="nytimes",
    tmp_dir="/tmp/nyt-covid/us_states",
    dest_file_name="us_states.csv",
)

# ## Removing file header
# We are dealing with a tabular file with a header. Before we can
# process it we need to remove the header.
# `tail -n +2` drops the first line, writes to a .tmp file, then renames it
# into place.
# NOTE(review): this reads /tmp/nyt-covid/us_states/us_states.downloaded,
# while the download step writes .../nyt-covid/us_states/us_states.csv — the
# paths look inconsistent; verify against the generator of this script.
process = subprocess.Popen(
    "tail -n +{n} {tmp_dir}/{table_name}.downloaded > {tmp_dir}/{file_name}.tmp && mv {tmp_dir}/{file_name}.tmp {tmp_dir}/{file_name}".format(
        n="2",
        file_name="us_states.csv",
        table_name="us_states",
        tmp_dir="/tmp/nyt-covid/us_states",
    ),
    stdout=subprocess.PIPE,
    shell=True,
)
# communicate() returns (stdout, stderr); stderr is None because only stdout
# was piped.
remove_header, error = process.communicate()

# ## Create Numpy ndarray from CSV data
# We convert an intermediary CSV file to a Numpy ndarray.
# Field names are supplied as a JSON-encoded list; the path is
# tmp_dir/dataset_name/table_name/table_name.csv.
numpy_data = read_csv_into_numpy(
    dataset_name="nyt-covid",
    tmp_dir="/tmp/nyt-covid/us_states",
    columns='["date","state","fips","cases","deaths"]',
    static_data_table_name="us_states",
)

Schema

Name      Type
date      DateString
state     RegionName
fips      FIPSStateCode
cases     NaturalNumber
deaths    NaturalNumber

GithubLocation Storage

Param         Value
organization  nytimes
path          us-states.csv
repository    covid-19-data