R
R code Copy
# RImports
library("glue")
library("reticulate")
library("rjson")
# Python helper source, executed later via reticulate::py_run_string().
# Downloads one raw file from GitHub (raw.githubusercontent.com) instead
# of cloning the whole repository.
# Fix: the URL template previously read ".../(unknown)" — the
# .format(filename=path) call needs a {filename} placeholder to substitute.
download_data_from_github_location <- '
import os
import urllib.request

def download_data_from_github_location(organization, repository, branch,
        path, tmp_dir, dataset_name, dest_file_name, table_name):
    # Ensure the destination directory exists before downloading.
    os.makedirs(tmp_dir + \'/\' + dataset_name + \'/\' + table_name, exist_ok=True)
    clone_url = (
        \'https://raw.githubusercontent.com/{org}/{repo}/{branch}/{filename}\'
        .format(org=organization, repo=repository, branch=branch,
                filename=path))
    print(\'Downloading data from: %s\' % clone_url)
    urllib.request.urlretrieve(clone_url, os.path.join(tmp_dir,
        dataset_name, table_name, dest_file_name))
'
# Read an intermediate headerless CSV into an R data frame.
#
# Args:
#   static_data_table_name: table directory and file stem under dataset_name.
#   dataset_name: dataset directory under tmp_dir.
#   columns: JSON-encoded array of column names (applied as colnames).
#   tmp_dir: root temporary directory holding the downloaded data.
#
# Returns:
#   A data.frame whose column names come from `columns`.
read.csv.for.table <- function(static_data_table_name, dataset_name,
                               columns, tmp_dir) {
  columns <- fromJSON(columns)
  # The header row is stripped by an earlier pipeline step, so read the
  # file headerless (header = FALSE) and apply the schema names explicitly.
  # This mirrors the pandas (header=None) and numpy variants; the default
  # header = TRUE would silently consume the first data row as names.
  data <- read.csv(glue("{tmp_dir}/{dataset_name}/{static_data_table_name}/{static_data_table_name}.csv",
    tmp_dir = tmp_dir, dataset_name = dataset_name,
    static_data_table_name = static_data_table_name), header = FALSE)
  colnames(data) <- columns
  # message() actually emits the status line; a bare sprintf() value is
  # silently discarded inside a function body.
  message(sprintf("Downloaded dataframe with %d rows and %d columns.",
                  nrow(data), ncol(data)))
  return(data)
}
# ## Downloading data from Github
# Data is stored in Github. We will only do a shallow clone of the file
# from the repo, without downloading the entire dataset.
#
# Run the embedded Python helper plus a call to it in one py_run_string();
# the download happens as a side effect, and py_run_string() returns the
# Python __main__ module environment.
# NOTE(review): the helper's URL template above reads ".../(unknown)" —
# presumably a mangled "{filename}" placeholder; as written the
# .format(filename=path) substitution has no target. Confirm before running.
download_location <- py_run_string(paste(download_data_from_github_location,
"download_data_from_github_location(repository='covid-19-data', branch=\n 'master', dataset_name='nyt-covid', path='us-counties.csv', table_name=\n 'us_counties', organization='nytimes', tmp_dir=\n '/tmp/nyt-covid/us_counties', dest_file_name='us_counties.csv')\n",
sep = "\n"))
# ## Removing file header
# We are dealing with a tabular file with a header. Before we can
# process it we need to remove the header.
# Strip the first line (the CSV header) with tail -n +2, writing to a .tmp
# file and then renaming over the target.
# NOTE(review): this reads {tmp_dir}/us_counties.downloaded, but the download
# step above stores the file at
# {tmp_dir}/nyt-covid/us_counties/us_counties.csv — the input path likely
# never exists, making this a no-op. Confirm the intended intermediate paths.
remove_header <- system(glue("tail -n +{n} {tmp_dir}/{table_name}.downloaded > {tmp_dir}/{file_name}.tmp && mv {tmp_dir}/{file_name}.tmp {tmp_dir}/{file_name}",
n = "2", file_name = "us_counties.csv", table_name = "us_counties",
tmp_dir = "/tmp/nyt-covid/us_counties"))
# ## Create an R data frame from CSV data
# We convert an intermediary CSV file to an R dataframe.
# Load the prepared CSV into a data frame; the JSON column list must match
# the schema order of the file (date, state, county, fips, cases, deaths).
r_data <- read.csv.for.table(static_data_table_name = "us_counties",
tmp_dir = "/tmp/nyt-covid/us_counties", columns = "[\"date\",\"state\",\"county\",\"fips\",\"cases\",\"deaths\"]",
dataset_name = "nyt-covid")
Pandas
requirements.txt Copy
pandas==1.2.3
pip command Copy
pip3 install pandas==1.2.3
Pandas code Copy
# PythonImports
import json
import os
import pandas as pd
import subprocess
import urllib.request
def download_data_from_github_location(
    organization,
    repository,
    branch,
    path,
    tmp_dir,
    dataset_name,
    dest_file_name,
    table_name,
):
    """Download a single raw file from a GitHub repository.

    Fetches ``path`` from the raw.githubusercontent.com mirror of
    ``organization/repository`` at ``branch`` and stores it at
    ``tmp_dir/dataset_name/table_name/dest_file_name``.
    """
    # Ensure the destination directory exists before downloading.
    os.makedirs(os.path.join(tmp_dir, dataset_name, table_name), exist_ok=True)
    # Fix: the template previously contained "(unknown)" where {filename}
    # must be for the .format(filename=path) substitution to take effect.
    clone_url = "https://raw.githubusercontent.com/{org}/{repo}/{branch}/{filename}".format(
        org=organization, repo=repository, branch=branch, filename=path
    )
    print("Downloading data from: %s" % clone_url)
    urllib.request.urlretrieve(
        clone_url,
        os.path.join(tmp_dir, dataset_name, table_name, dest_file_name),
    )
def read_csv_into_pandas(
    static_data_table_name, dataset_name, columns, tmp_dir
):
    """Load a headerless intermediate CSV into a pandas DataFrame.

    ``columns`` is a JSON-encoded list of names applied to the frame; the
    file is expected at
    ``{tmp_dir}/{dataset_name}/{static_data_table_name}/{static_data_table_name}.csv``.
    """
    column_names = json.loads(columns)
    csv_path = "%s/%s/%s/%s.csv" % (
        tmp_dir,
        dataset_name,
        static_data_table_name,
        static_data_table_name,
    )
    # The header row was stripped by an earlier step, so read headerless
    # and attach the schema names ourselves.
    frame = pd.read_csv(csv_path, header=None)
    frame.columns = column_names
    row_count, col_count = frame.shape
    print("Downloaded dataframe with %d rows and %d columns." % (row_count, col_count))
    return frame
# ## Downloading data from Github
# Data is stored in Github. We will only do a shallow clone of the file
# from the repo, without downloading the entire dataset.
#
# Fetch the raw CSV from the nytimes/covid-19-data repository. The helper
# has no return statement, so download_location is None; the download to
# tmp_dir/dataset_name/table_name/dest_file_name is the side effect.
download_location = download_data_from_github_location(
    repository="covid-19-data",
    branch="master",
    dataset_name="nyt-covid",
    path="us-counties.csv",
    table_name="us_counties",
    organization="nytimes",
    tmp_dir="/tmp/nyt-covid/us_counties",
    dest_file_name="us_counties.csv",
)
# ## Removing file header
# We are dealing with a tabular file with a header. Before we can
# process it we need to remove the header.
# Strip the header line with tail -n +2 into a .tmp file, then rename over
# the target. shell=True is tolerable here only because every substituted
# value is a hard-coded constant — never pass user input through this.
# NOTE(review): the tail input is /tmp/nyt-covid/us_counties/us_counties.downloaded,
# but the download step above wrote to
# /tmp/nyt-covid/us_counties/nyt-covid/us_counties/us_counties.csv —
# confirm an intermediate rename exists, otherwise tail fails silently.
process = subprocess.Popen(
    "tail -n +{n} {tmp_dir}/{table_name}.downloaded > {tmp_dir}/{file_name}.tmp && mv {tmp_dir}/{file_name}.tmp {tmp_dir}/{file_name}".format(
        n="2",
        file_name="us_counties.csv",
        table_name="us_counties",
        tmp_dir="/tmp/nyt-covid/us_counties",
    ),
    stdout=subprocess.PIPE,
    shell=True,
)
# communicate() waits for the shell pipeline and captures its stdout;
# error is None because stderr was not piped.
remove_header, error = process.communicate()
# ## Create Pandas Dataframe from CSV data
# We convert an intermediary CSV file to a Pandas dataframe.
# Load the prepared CSV into a DataFrame; the JSON column list must match
# the schema order of the file (date, state, county, fips, cases, deaths).
pandas_data = read_csv_into_pandas(
    columns='["date","state","county","fips","cases","deaths"]',
    static_data_table_name="us_counties",
    tmp_dir="/tmp/nyt-covid/us_counties",
    dataset_name="nyt-covid",
)
Numpy
requirements.txt Copy
numpy==1.20.1
pip command Copy
pip3 install numpy==1.20.1
Numpy code Copy
# PythonImports
import csv
import json
import numpy as np
import os
import subprocess
import urllib.request
def download_data_from_github_location(
    organization,
    repository,
    branch,
    path,
    tmp_dir,
    dataset_name,
    dest_file_name,
    table_name,
):
    """Download a single raw file from a GitHub repository.

    Fetches ``path`` from the raw.githubusercontent.com mirror of
    ``organization/repository`` at ``branch`` and stores it at
    ``tmp_dir/dataset_name/table_name/dest_file_name``.
    """
    # Ensure the destination directory exists before downloading.
    os.makedirs(os.path.join(tmp_dir, dataset_name, table_name), exist_ok=True)
    # Fix: the template previously contained "(unknown)" where {filename}
    # must be for the .format(filename=path) substitution to take effect.
    clone_url = "https://raw.githubusercontent.com/{org}/{repo}/{branch}/{filename}".format(
        org=organization, repo=repository, branch=branch, filename=path
    )
    print("Downloading data from: %s" % clone_url)
    urllib.request.urlretrieve(
        clone_url,
        os.path.join(tmp_dir, dataset_name, table_name, dest_file_name),
    )
def read_csv_into_numpy(static_data_table_name, dataset_name, columns, tmp_dir):
    """Load a headerless intermediate CSV into a structured numpy ndarray.

    ``columns`` is a JSON-encoded list of field names; the file is expected
    at ``{tmp_dir}/{dataset_name}/{static_data_table_name}/{static_data_table_name}.csv``.
    """
    column_names = json.loads(columns)
    csv_path = "%s/%s/%s/%s.csv" % (
        tmp_dir,
        dataset_name,
        static_data_table_name,
        static_data_table_name,
    )
    # Re-emit each CSV record as tab-separated text (tabs inside fields
    # become spaces) so quoted commas survive genfromtxt's single-character
    # delimiter.
    tab_rows = (
        "\t".join(field.replace("\t", " ") for field in record)
        for record in csv.reader(open(csv_path))
    )
    data = np.genfromtxt(tab_rows, delimiter="\t", names=column_names, dtype=None)
    assert data is not None
    print("Downloaded ndarray with %d rows." % data.shape[0])
    return data
# ## Downloading data from Github
# Data is stored in Github. We will only do a shallow clone of the file
# from the repo, without downloading the entire dataset.
#
# Fetch the raw CSV from the nytimes/covid-19-data repository. The helper
# has no return statement, so download_location is None; the download to
# tmp_dir/dataset_name/table_name/dest_file_name is the side effect.
download_location = download_data_from_github_location(
    repository="covid-19-data",
    branch="master",
    dataset_name="nyt-covid",
    path="us-counties.csv",
    table_name="us_counties",
    organization="nytimes",
    tmp_dir="/tmp/nyt-covid/us_counties",
    dest_file_name="us_counties.csv",
)
# ## Removing file header
# We are dealing with a tabular file with a header. Before we can
# process it we need to remove the header.
# Strip the header line with tail -n +2 into a .tmp file, then rename over
# the target. shell=True is tolerable here only because every substituted
# value is a hard-coded constant — never pass user input through this.
# NOTE(review): the tail input is /tmp/nyt-covid/us_counties/us_counties.downloaded,
# but the download step above wrote to
# /tmp/nyt-covid/us_counties/nyt-covid/us_counties/us_counties.csv —
# confirm an intermediate rename exists, otherwise tail fails silently.
process = subprocess.Popen(
    "tail -n +{n} {tmp_dir}/{table_name}.downloaded > {tmp_dir}/{file_name}.tmp && mv {tmp_dir}/{file_name}.tmp {tmp_dir}/{file_name}".format(
        n="2",
        file_name="us_counties.csv",
        table_name="us_counties",
        tmp_dir="/tmp/nyt-covid/us_counties",
    ),
    stdout=subprocess.PIPE,
    shell=True,
)
# communicate() waits for the shell pipeline and captures its stdout;
# error is None because stderr was not piped.
remove_header, error = process.communicate()
# ## Create Numpy ndarray from CSV data
# We convert an intermediary CSV file to a Numpy ndarray.
# Load the prepared CSV into a structured ndarray; the JSON column list
# must match the schema order (date, state, county, fips, cases, deaths).
numpy_data = read_csv_into_numpy(
    dataset_name="nyt-covid",
    tmp_dir="/tmp/nyt-covid/us_counties",
    columns='["date","state","county","fips","cases","deaths"]',
    static_data_table_name="us_counties",
)
Schema
Name | Type |
---|---|
date | DateString |
state | RegionName |
county | RegionName |
fips | FIPSCountyCode |
cases | NaturalNumber |
deaths | NaturalNumber |
GithubLocation Storage
Param | Value |
---|---|
organization | nytimes |
path | us-counties.csv |
repository | covid-19-data |