R
R code Copy
# RImports
library("glue")
library("reticulate")
library("rjson")
#' Download a remote file into the working directory of a table.
#'
#' The file is stored at
#' `<tmp_dir>/<dataset_name>/<table_name>/<dest_file_name>`. The directory is
#' created (including missing parents) when it does not exist yet.
#'
#' @param src_url URL of the file to download.
#' @param tmp_dir Root directory below which the working directory is built.
#' @param dest_file_name File name given to the downloaded copy.
#' @param table_name Table name; innermost directory level.
#' @param dataset_name Dataset name; directory level right below `tmp_dir`.
#' @return The value of `download.file()` (an invisible integer status code).
download.data.from.remote.web.location <- function(src_url, tmp_dir,
                                                   dest_file_name, table_name,
                                                   dataset_name) {
  path <- file.path(tmp_dir, dataset_name, table_name)
  if (!dir.exists(path)) {
    dir.create(path, recursive = TRUE)
  }
  dest <- file.path(path, dest_file_name)
  # Binary mode keeps compressed payloads byte-exact on platforms where text
  # mode would rewrite line endings.
  download.file(src_url, dest, mode = "wb")
}
# Python source, executed through reticulate, that defines `tsv_to_csv`.
# The Python function copies
# <tmp_dir>/<dataset_name>/<table_name>/<src_file_name> to <dest_file_name> in
# the same directory, replacing every tab with a comma, 10240 characters at a
# time. The body has to stay indented because Python derives block structure
# from indentation. `\\t` reaches Python as the escape sequence `\t`.
tsv_to_csv <- "
import os.path
def tsv_to_csv(tmp_dir, src_file_name, dest_file_name, dataset_name, table_name
):
    source = os.path.join(tmp_dir, dataset_name, table_name, src_file_name)
    dest = os.path.join(tmp_dir, dataset_name, table_name, dest_file_name)
    with open(source) as sourcef, open(dest, 'w') as destf:
        while True:
            chunk = sourcef.read(10240)
            if not chunk:
                break
            destf.write(chunk.replace('\\t', ','))
"
#' Load the intermediate CSV file of a table into a data frame.
#'
#' Reads
#' `<tmp_dir>/<dataset_name>/<static_data_table_name>/<static_data_table_name>.csv`
#' and names the columns after the entries of `columns`.
#'
#' @param static_data_table_name Table name; used both as directory and as the
#'   base name of the CSV file.
#' @param dataset_name Dataset name; directory level right below `tmp_dir`.
#' @param columns JSON array (as a string) holding the column names.
#' @param tmp_dir Root directory below which the CSV file is located.
#' @return A data frame with one column per entry of `columns`.
read.csv.for.table <- function(static_data_table_name, dataset_name,
                               columns, tmp_dir) {
  columns <- fromJSON(columns)
  csv_path <- file.path(
    tmp_dir, dataset_name, static_data_table_name,
    paste0(static_data_table_name, ".csv")
  )
  # The pipeline strips the header before the CSV is written, so the first
  # line is data and must not be consumed as column names.
  data <- read.csv(csv_path, header = FALSE)
  colnames(data) <- columns
  # A bare sprintf() inside a function is never shown; emit the status.
  message(sprintf(
    "Downloaded dataframe with %d rows and %d columns.",
    nrow(data), ncol(data)
  ))
  data
}
# ## Pipeline settings
# The helper functions read and write below
# <tmp_dir>/<dataset_name>/<table_name>, so the shell steps are pointed at
# that same directory instead of at <tmp_dir> itself.
tmp_dir <- "/tmp/snap/p2p_gnutella04"
dataset_name <- "snap"
table_name <- "p2p_gnutella04"
downloaded_file_name <- paste0(table_name, ".downloaded")
downloaded_path <- file.path(
  tmp_dir, dataset_name, table_name, downloaded_file_name
)
scratch_path <- paste0(downloaded_path, ".tmp")
# ## Downloading data from remote web location
# Data for this particular asset(s) is located somewhere on the web.
# We need to download it to a local directory first, before we can
# do anything with it.
download_location <- download.data.from.remote.web.location(
  table_name = table_name, dataset_name = dataset_name,
  src_url = "https://snap.stanford.edu/data/p2p-Gnutella04.txt.gz",
  dest_file_name = downloaded_file_name, tmp_dir = tmp_dir
)
# ## Decompressing Gzip file
# Data for this particular asset(s) is compressed with the GZIP
# algorithm. Before we can process it further we need to decompress it.
decompress_gzip <- system(glue(
  "gunzip --suffix=downloaded -c {src} > {tmp} && mv {tmp} {src}",
  src = shQuote(downloaded_path), tmp = shQuote(scratch_path)
))
if (decompress_gzip != 0) {
  # Later steps would otherwise run on the still-compressed file.
  stop("Decompressing ", downloaded_path, " failed.", call. = FALSE)
}
# ## Removing file header
# We are dealing with a tabular file with a header. Before we can
# process it we need to remove the header.
remove_header <- system(glue(
  "tail -n +{n} {src} > {tmp} && mv {tmp} {src}",
  n = "5", src = shQuote(downloaded_path), tmp = shQuote(scratch_path)
))
if (remove_header != 0) {
  stop("Removing the header of ", downloaded_path, " failed.", call. = FALSE)
}
# ## Convert TSV data to CSV
# We need to convert the TSV data to CSV
# format to process it further.
convert_csv <- py_run_string(paste(
  tsv_to_csv,
  sprintf(
    paste0(
      "tsv_to_csv(table_name='%s', src_file_name='%s', tmp_dir='%s', ",
      "dataset_name='%s', dest_file_name='%s')\n"
    ),
    table_name, downloaded_file_name, tmp_dir, dataset_name,
    paste0(table_name, ".csv")
  ),
  sep = "\n"
))
# ## Create an R data frame from CSV data
# We convert an intermediary CSV file to an R dataframe.
r_data <- read.csv.for.table(
  static_data_table_name = table_name, tmp_dir = tmp_dir,
  columns = "[\"from_id\",\"to_id\"]", dataset_name = dataset_name
)
Pandas
requirements.txt Copy
pandas==1.2.3
pip command Copy
pip3 install pandas==1.2.3
Pandas code Copy
# PythonImports
import json
import os
import os.path
import pandas as pd
import subprocess
import urllib.request
def download_data_from_remote_web_location(
    src_url, tmp_dir, dest_file_name, table_name, dataset_name
):
    """Download ``src_url`` into the working directory of a table.

    The file is written to
    ``<tmp_dir>/<dataset_name>/<table_name>/<dest_file_name>``; missing
    directories are created.

    Args:
        src_url: URL of the file to fetch.
        tmp_dir: Root directory below which the working directory is built.
        dest_file_name: File name given to the downloaded copy.
        table_name: Table name; innermost directory level.
        dataset_name: Dataset name; directory level right below ``tmp_dir``.

    Returns:
        The path of the downloaded file.
    """
    # Build the directory once so the created directory and the download
    # target can never disagree.
    target_dir = os.path.join(tmp_dir, dataset_name, table_name)
    os.makedirs(target_dir, exist_ok=True)
    destination = os.path.join(target_dir, dest_file_name)
    urllib.request.urlretrieve(src_url, destination)
    return destination
def tsv_to_csv(
    tmp_dir, src_file_name, dest_file_name, dataset_name, table_name
):
    """Copy a tab-separated file to a comma-separated one.

    Both files live in ``<tmp_dir>/<dataset_name>/<table_name>``. Every tab
    character in the source is replaced by a comma; nothing else is changed.

    Args:
        tmp_dir: Root directory below which the working directory is located.
        src_file_name: Name of the tab-separated input file.
        dest_file_name: Name of the comma-separated output file.
        dataset_name: Dataset name; directory level right below ``tmp_dir``.
        table_name: Table name; innermost directory level.
    """
    table_dir = os.path.join(tmp_dir, dataset_name, table_name)
    source = os.path.join(table_dir, src_file_name)
    dest = os.path.join(table_dir, dest_file_name)
    # The with block flushes and closes both files, so no manual close.
    with open(source) as sourcef, open(dest, "w") as destf:
        # Read in 10240-character chunks until read() returns "".
        for chunk in iter(lambda: sourcef.read(10240), ""):
            destf.write(chunk.replace("\t", ","))
def read_csv_into_pandas(
    static_data_table_name, dataset_name, columns, tmp_dir
):
    """Load the header-less intermediate CSV of a table into a DataFrame.

    The file read is
    ``<tmp_dir>/<dataset_name>/<static_data_table_name>/<static_data_table_name>.csv``.

    Args:
        static_data_table_name: Table name; directory and CSV base name.
        dataset_name: Dataset name; directory level right below ``tmp_dir``.
        columns: JSON array (as a string) with the column names to assign.
        tmp_dir: Root directory below which the CSV file is located.

    Returns:
        The DataFrame, with its columns named after ``columns``.
    """
    column_names = json.loads(columns)
    csv_path = (
        f"{tmp_dir}/{dataset_name}/{static_data_table_name}/"
        f"{static_data_table_name}.csv"
    )
    frame = pd.read_csv(csv_path, header=None)
    frame.columns = column_names
    row_count, column_count = frame.shape
    print(
        "Downloaded dataframe with %d rows and %d columns."
        % (row_count, column_count)
    )
    return frame
# ## Pipeline settings
# The helper functions read and write below
# <tmp_dir>/<dataset_name>/<table_name>, so the shell steps are pointed at
# that same directory instead of at <tmp_dir> itself.
tmp_dir = "/tmp/snap/p2p_gnutella04"
dataset_name = "snap"
table_name = "p2p_gnutella04"
downloaded_file_name = table_name + ".downloaded"
downloaded_path = os.path.join(
    tmp_dir, dataset_name, table_name, downloaded_file_name
)
scratch_path = downloaded_path + ".tmp"
# ## Downloading data from remote web location
# Data for this particular asset(s) is located somewhere on the web.
# We need to download it to a local directory first, before we can
# do anything with it.
download_location = download_data_from_remote_web_location(
    src_url="https://snap.stanford.edu/data/p2p-Gnutella04.txt.gz",
    dest_file_name=downloaded_file_name,
    dataset_name=dataset_name,
    table_name=table_name,
    tmp_dir=tmp_dir,
)
# ## Decompressing Gzip file
# Data for this particular asset(s) is compressed with the GZIP
# algorithm. Before we can process it further we need to decompress it.
# check=True stops the pipeline if the command fails, so later steps never
# run on a still-compressed file.
process = subprocess.run(
    "gunzip --suffix=downloaded -c {src} > {tmp} && mv {tmp} {src}".format(
        src=downloaded_path, tmp=scratch_path
    ),
    stdout=subprocess.PIPE,
    shell=True,
    check=True,
)
decompress_gzip, error = process.stdout, process.stderr
# ## Removing file header
# We are dealing with a tabular file with a header. Before we can
# process it we need to remove the header.
process = subprocess.run(
    "tail -n +{n} {src} > {tmp} && mv {tmp} {src}".format(
        n="5", src=downloaded_path, tmp=scratch_path
    ),
    stdout=subprocess.PIPE,
    shell=True,
    check=True,
)
remove_header, error = process.stdout, process.stderr
# ## Convert TSV data to CSV
# We need to convert the TSV data to CSV
# format to process it further.
convert_csv = tsv_to_csv(
    table_name=table_name,
    src_file_name=downloaded_file_name,
    tmp_dir=tmp_dir,
    dataset_name=dataset_name,
    dest_file_name=table_name + ".csv",
)
# ## Create Pandas Dataframe from CSV data
# We convert an intermediary CSV file to a Pandas dataframe.
pandas_data = read_csv_into_pandas(
    columns='["from_id","to_id"]',
    static_data_table_name=table_name,
    tmp_dir=tmp_dir,
    dataset_name=dataset_name,
)
Numpy
requirements.txt Copy
numpy==1.20.1
pip command Copy
pip3 install numpy==1.20.1
Numpy code Copy
# PythonImports
import csv
import json
import numpy as np
import os
import os.path
import subprocess
import urllib.request
def download_data_from_remote_web_location(
    src_url, tmp_dir, dest_file_name, table_name, dataset_name
):
    """Download ``src_url`` into the working directory of a table.

    The file is written to
    ``<tmp_dir>/<dataset_name>/<table_name>/<dest_file_name>``; missing
    directories are created.

    Args:
        src_url: URL of the file to fetch.
        tmp_dir: Root directory below which the working directory is built.
        dest_file_name: File name given to the downloaded copy.
        table_name: Table name; innermost directory level.
        dataset_name: Dataset name; directory level right below ``tmp_dir``.

    Returns:
        The path of the downloaded file.
    """
    # Build the directory once so the created directory and the download
    # target can never disagree.
    target_dir = os.path.join(tmp_dir, dataset_name, table_name)
    os.makedirs(target_dir, exist_ok=True)
    destination = os.path.join(target_dir, dest_file_name)
    urllib.request.urlretrieve(src_url, destination)
    return destination
def tsv_to_csv(
    tmp_dir, src_file_name, dest_file_name, dataset_name, table_name
):
    """Copy a tab-separated file to a comma-separated one.

    Both files live in ``<tmp_dir>/<dataset_name>/<table_name>``. Every tab
    character in the source is replaced by a comma; nothing else is changed.

    Args:
        tmp_dir: Root directory below which the working directory is located.
        src_file_name: Name of the tab-separated input file.
        dest_file_name: Name of the comma-separated output file.
        dataset_name: Dataset name; directory level right below ``tmp_dir``.
        table_name: Table name; innermost directory level.
    """
    table_dir = os.path.join(tmp_dir, dataset_name, table_name)
    source = os.path.join(table_dir, src_file_name)
    dest = os.path.join(table_dir, dest_file_name)
    # The with block flushes and closes both files, so no manual close.
    with open(source) as sourcef, open(dest, "w") as destf:
        # Read in 10240-character chunks until read() returns "".
        for chunk in iter(lambda: sourcef.read(10240), ""):
            destf.write(chunk.replace("\t", ","))
def read_csv_into_numpy(static_data_table_name, dataset_name, columns, tmp_dir):
    """Load the intermediate CSV of a table into a structured ndarray.

    The file read is
    ``<tmp_dir>/<dataset_name>/<static_data_table_name>/<static_data_table_name>.csv``.
    Each CSV row is re-joined with tabs (tabs inside a field become spaces)
    and handed to ``np.genfromtxt`` with the given field names.

    Args:
        static_data_table_name: Table name; directory and CSV base name.
        dataset_name: Dataset name; directory level right below ``tmp_dir``.
        columns: JSON array (as a string) with the field names to assign.
        tmp_dir: Root directory below which the CSV file is located.

    Returns:
        A one-dimensional structured ndarray with one entry per CSV row.
    """
    columns = json.loads(columns)
    file_name = "{tmp_dir}/{dataset_name}/{static_data_table_name}/{static_data_table_name}.csv".format(
        dataset_name=dataset_name,
        static_data_table_name=static_data_table_name,
        tmp_dir=tmp_dir,
    )
    # The generator is lazy, so parsing has to finish while the file is still
    # open; the with block then closes the handle.
    with open(file_name, newline="") as csv_file:
        escaped = (
            "\t".join(field.replace("\t", " ") for field in row)
            for row in csv.reader(csv_file)
        )
        data = np.genfromtxt(
            escaped, delimiter="\t", names=columns, dtype=None
        )
    # A single-row file yields a 0-d array; promote it so shape[0] is valid.
    data = np.atleast_1d(data)
    print("Downloaded ndarray with %d rows." % data.shape[0])
    return data
# ## Pipeline settings
# The helper functions read and write below
# <tmp_dir>/<dataset_name>/<table_name>, so the shell steps are pointed at
# that same directory instead of at <tmp_dir> itself.
tmp_dir = "/tmp/snap/p2p_gnutella04"
dataset_name = "snap"
table_name = "p2p_gnutella04"
downloaded_file_name = table_name + ".downloaded"
downloaded_path = os.path.join(
    tmp_dir, dataset_name, table_name, downloaded_file_name
)
scratch_path = downloaded_path + ".tmp"
# ## Downloading data from remote web location
# Data for this particular asset(s) is located somewhere on the web.
# We need to download it to a local directory first, before we can
# do anything with it.
download_location = download_data_from_remote_web_location(
    src_url="https://snap.stanford.edu/data/p2p-Gnutella04.txt.gz",
    dest_file_name=downloaded_file_name,
    dataset_name=dataset_name,
    table_name=table_name,
    tmp_dir=tmp_dir,
)
# ## Decompressing Gzip file
# Data for this particular asset(s) is compressed with the GZIP
# algorithm. Before we can process it further we need to decompress it.
# check=True stops the pipeline if the command fails, so later steps never
# run on a still-compressed file.
process = subprocess.run(
    "gunzip --suffix=downloaded -c {src} > {tmp} && mv {tmp} {src}".format(
        src=downloaded_path, tmp=scratch_path
    ),
    stdout=subprocess.PIPE,
    shell=True,
    check=True,
)
decompress_gzip, error = process.stdout, process.stderr
# ## Removing file header
# We are dealing with a tabular file with a header. Before we can
# process it we need to remove the header.
process = subprocess.run(
    "tail -n +{n} {src} > {tmp} && mv {tmp} {src}".format(
        n="5", src=downloaded_path, tmp=scratch_path
    ),
    stdout=subprocess.PIPE,
    shell=True,
    check=True,
)
remove_header, error = process.stdout, process.stderr
# ## Convert TSV data to CSV
# We need to convert the TSV data to CSV
# format to process it further.
convert_csv = tsv_to_csv(
    table_name=table_name,
    src_file_name=downloaded_file_name,
    tmp_dir=tmp_dir,
    dataset_name=dataset_name,
    dest_file_name=table_name + ".csv",
)
# ## Create Numpy ndarray from CSV data
# We convert an intermediary CSV file to a Numpy ndarray.
numpy_data = read_csv_into_numpy(
    dataset_name=dataset_name,
    tmp_dir=tmp_dir,
    columns='["from_id","to_id"]',
    static_data_table_name=table_name,
)
Schema
Name | Type |
---|---|
from_id | NumericIdentifier |
to_id | NumericIdentifier |
WebLocation Storage
Param | Value |
---|---|
address | https://snap.stanford.edu/data/p2p-Gnutella04.txt.gz |