From dd14ee2f2586ba2544d5e616d0bf035d2fc89e93 Mon Sep 17 00:00:00 2001
From: Jeff Wu
Date: Thu, 2 May 2019 20:06:40 -0700
Subject: [PATCH] first commit, readme and download

---
 .gitignore          |  2 ++
 README.md           | 41 ++++++++++++++++++++++++++++++++++++++++-
 download_dataset.py | 29 +++++++++++++++++++++++++++++
 3 files changed, 71 insertions(+), 1 deletion(-)
 create mode 100644 .gitignore
 create mode 100644 download_dataset.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c8c3210
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.mypy_cache/
+data/
diff --git a/README.md b/README.md
index 119c50a..7378467 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,41 @@
 # gpt-2-output-dataset
-Dataset of GPT-2 outputs for research in detection, biases, and more
+
+This dataset contains:
+- 250K samples from the WebText test set
+- For each GPT-2 model (trained on the WebText training set), 250K plain samples (temperature 1, no truncation) and 250K samples generated with top-k 40 truncation
+
+We look forward to the research produced using this data!
+
+### Download
+
+For each of these, we have a training split of 250K samples, as well as validation and test splits of 5K samples each.
+
+For each model, we're releasing both the temperature 1 samples and the top-k 40 samples.
+
+All data is located in Google Cloud Storage, under the directory `gs://gpt-2/output-dataset/v1`.
+
+There, you will find the following files:
+
+- `webtext.${split}.jsonl`
+- `small-117M.${split}.jsonl`
+- `small-117M-k40.${split}.jsonl`
+- `medium-345M.${split}.jsonl`
+- `medium-345M-k40.${split}.jsonl`
+- `large-762M.${split}.jsonl`
+- `large-762M-k40.${split}.jsonl`
+- `xl-1542M.${split}.jsonl`
+- `xl-1542M-k40.${split}.jsonl`
+
+where `${split}` is one of `train`, `test`, and `valid`.
+
+We've provided a script to download all of them, in `download_dataset.py`.
+
+### Detectability baselines
+
+We're interested in seeing research on the detectability of our models' generations.
+
+We've provided a baseline of logistic regression on tf-idf, in `baseline.py`.
+
+### Data removal requests
+
+If you believe your work is included in our dataset and would like us to remove it, please let us know at webtextdata@openai.com.
diff --git a/download_dataset.py b/download_dataset.py
new file mode 100644
index 0000000..cb4147b
--- /dev/null
+++ b/download_dataset.py
@@ -0,0 +1,29 @@
+import os
+import sys
+import requests
+from tqdm import tqdm
+
+subdir = 'data'
+if not os.path.exists(subdir):
+    os.makedirs(subdir)
+subdir = subdir.replace('\\','/') # needed for Windows
+
+for ds in [
+    'webtext',
+    'small-117M', 'small-117M-k40',
+    'medium-345M', 'medium-345M-k40',
+    'large-762M', 'large-762M-k40',
+    'xl-1542M', 'xl-1542M-k40',
+]:
+    for split in ['train', 'valid', 'test']:
+        filename = ds + "." + split + '.jsonl'
+        r = requests.get("https://storage.googleapis.com/gpt-2/output-dataset/v1/" + filename, stream=True)
+
+        with open(os.path.join(subdir, filename), 'wb') as f:
+            file_size = int(r.headers["content-length"])
+            chunk_size = 1000
+            with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
+                # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
+                for chunk in r.iter_content(chunk_size=chunk_size):
+                    f.write(chunk)
+                    pbar.update(len(chunk))  # count bytes actually written so the bar matches file_size
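
The README added above lists the per-split `.jsonl` files but doesn't show how to read them once `download_dataset.py` has fetched them. Below is a minimal sketch of loading one split, assuming one JSON record per line with a `text` field holding the sample text; the field name and the `load_split` helper are illustrative assumptions, not part of this commit.

```python
# Minimal sketch: load one split of the downloaded dataset.
# Assumes download_dataset.py has already populated data/ and that each
# line of a .jsonl file is a standalone JSON record with a "text" field
# (the field name is an assumption; this patch doesn't document the schema).
import json
import os

def load_split(dataset, split, data_dir='data'):
    path = os.path.join(data_dir, f'{dataset}.{split}.jsonl')
    texts = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            record = json.loads(line)
            texts.append(record.get('text', ''))
    return texts

webtext_valid = load_split('webtext', 'valid')
generated_valid = load_split('xl-1542M-k40', 'valid')
print(len(webtext_valid), len(generated_valid))  # per the README, 5K samples each
```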
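The README also references a logistic regression baseline on tf-idf in `baseline.py`, which isn't included in this commit. As a rough sketch of what such a detector could look like, using scikit-learn and the hypothetical `load_split` helper above (this is not the actual `baseline.py`):

```python
# Rough sketch of a tf-idf + logistic regression detector for real-vs-generated
# text, assuming load_split() from the previous sketch. Illustrative only.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

real_train = load_split('webtext', 'train')
fake_train = load_split('xl-1542M-k40', 'train')
real_valid = load_split('webtext', 'valid')
fake_valid = load_split('xl-1542M-k40', 'valid')

train_texts = real_train + fake_train
train_labels = [0] * len(real_train) + [1] * len(fake_train)
valid_texts = real_valid + fake_valid
valid_labels = [0] * len(real_valid) + [1] * len(fake_valid)

# Fit tf-idf features on the training texts only, then reuse them for validation.
vectorizer = TfidfVectorizer(max_features=2**16)
X_train = vectorizer.fit_transform(train_texts)
X_valid = vectorizer.transform(valid_texts)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, train_labels)
print('valid accuracy:', accuracy_score(valid_labels, clf.predict(X_valid)))
```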