[Tips] サクッと MySQL と PostgreSQL と Redshiftに大量データを作成する方法

2019.05.24

この記事は公開されてから1年以上経過しています。情報が古い可能性がありますので、ご注意ください。

はじめに

前々から社内で書く書くって言ってた、サクッと大量データを作成する方法を紹介します。(これで書く書く詐欺って言われない♪)

大量データを作成の共通点

大量データを作成の流れは、大量データ用テーブルに自らの空レコードをコピーすることで大量のレコードを作成します。作成したいレコード数に達すると、一気に乱数を用いてレコードに値を設定します。今回の例では、以下のバリエーションのデータに対して値を設定しています。

  • オートインクリメントの主キーであるid
  • 可変長文字列であるnamedescription
  • 符号なしINTであるprice
  • フラグであるdelete_flag
  • 日時データであるcreated_atupdated_at

MySQL5.7 / Amazon Aurora(MySQL5.7互換) の場合

items テーブルのidカラムは、AUTO_INCREMENTを用いて自動採番します。

以下のクエリで、インスタンスタイプによりますが、t2.smallでも15分程度で16777216レコード生成します。レコード数の調整は、13〜36行のINSERT INTO items (id) SELECT 0 FROM items;の行数を増減することで調整します。

-- 大量データ用テーブル
CREATE TABLE items (
id INT PRIMARY KEY AUTO_INCREMENT,
name VARCHAR(16),
description VARCHAR(30),
price INT UNSIGNED,
delete_flag tinyint(4) NOT NULL DEFAULT '0',
created_at timestamp,
updated_at timestamp
);

-- レコード作成(1,2,4,8...16777216と倍々に増加する)
INSERT INTO items () VALUES ();
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;

-- 乱数を用いて値を設定する
UPDATE items SET
name = CONCAT('item', id),
description = SUBSTRING(MD5(RAND()), 1, 30),
price = CEIL(RAND() * 10000),
delete_flag = MOD((RAND() * 100), 1),
created_at = ADDTIME(CONCAT_WS(' ','2014-01-01' + INTERVAL RAND() * 365 DAY, '00:00:00'), SEC_TO_TIME(FLOOR(0 + (RAND() * 86401)))),
updated_at = ADDTIME(CONCAT_WS(' ','2015-01-01' + INTERVAL RAND() * 365 DAY, '00:00:00'), SEC_TO_TIME(FLOOR(0 + (RAND() * 86401))))
;

実際に実行した結果は、以下のとおりです。

mysql> CREATE TABLE items (
-> id INT PRIMARY KEY AUTO_INCREMENT,
-> name VARCHAR(16),
-> description VARCHAR(30),
-> price INT UNSIGNED,
-> delete_flag tinyint(4) NOT NULL DEFAULT '0',
-> created_at timestamp,
-> updated_at timestamp
-> );
Query OK, 0 rows affected (0.09 sec)

mysql> INSERT INTO items () VALUES ();
Query OK, 1 row affected (0.17 sec)

INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;

mysql> INSERT INTO items (id) SELECT 0 FROM items;
Query OK, 1 row affected (0.15 sec)
Records: 1 Duplicates: 0 Warnings: 0

mysql> INSERT INTO items (id) SELECT 0 FROM items;
Query OK, 2 rows affected (0.17 sec)
Records: 2 Duplicates: 0 Warnings: 0
:
(中略)
:
mysql> INSERT INTO items (id) SELECT 0 FROM items;
Query OK, 4194304 rows affected (17.10 sec)
Records: 4194304 Duplicates: 0 Warnings: 0

mysql> INSERT INTO items (id) SELECT 0 FROM items;
Query OK, 8388608 rows affected (34.62 sec)
Records: 8388608 Duplicates: 0 Warnings: 0

mysql> select count(*) from items;
+----------+
| count(*) |
+----------+
| 16777216 |
+----------+
1 row in set (5.13 sec)

mysql> UPDATE items SET
-> name = CONCAT('item', id),
-> description = SUBSTRING(MD5(RAND()), 1, 30),
-> price = CEIL(RAND() * 10000),
-> delete_flag = MOD((RAND() * 100), 1),
-> created_at = ADDTIME(CONCAT_WS(' ','2014-01-01' + INTERVAL RAND() * 365 DAY, '00:00:00'), SEC_TO_TIME(FLOOR(0 + (RAND() * 86401)))),
-> updated_at = ADDTIME(CONCAT_WS(' ','2015-01-01' + INTERVAL RAND() * 365 DAY, '00:00:00'), SEC_TO_TIME(FLOOR(0 + (RAND() * 86401))))
-> ;
Query OK, 16777216 rows affected (10 min 55.06 sec)
Rows matched: 16777216 Changed: 16777216 Warnings: 0

mysql> select * from items limit 3;
+----+-------+--------------------------------+-------+-------------+---------------------+---------------------+
| id | name | description | price | delete_flag | created_at | updated_at |
+----+-------+--------------------------------+-------+-------------+---------------------+---------------------+
| 1 | item1 | f7fe10e769f7c7faf0f90316b5ef1b | 5722 | 0 | 2014-11-05 02:04:49 | 2015-11-25 05:41:08 |
| 2 | item2 | 336368370a27aebdbdde7aa9b81b26 | 7210 | 1 | 2014-07-27 09:24:44 | 2015-04-07 03:21:14 |
| 3 | item3 | 1586507aa044dcffa98b6cf39e66ea | 1256 | 0 | 2014-02-17 22:28:20 | 2015-04-20 16:22:42 |
+----+-------+--------------------------------+-------+-------------+---------------------+---------------------+
3 rows in set (0.07 sec)

PostgreSQL 9.6.9-R1 / Amazon Aurora(PostgreSQL 9.6.9互換) の場合

items テーブルのidカラムは、serial型を指定して自動的に作成したシーケンスを用いて自動採番します。

以下のクエリで、インスタンスタイプによりますが、r5.largeでも10分程度で16777216レコード生成します。レコード数の調整は、13〜36行のINSERT INTO items (name) select name from items;の行数を増減することで調整します。

-- 大量データ用テーブル
CREATE TABLE items (
id SERIAL,
name VARCHAR(16),
description VARCHAR(30),
price INT,
delete_flag boolean,
created_at timestamp,
updated_at timestamp,
PRIMARY KEY (id)
);

-- レコード作成(1,2,4,8...16777216と倍々に増加する)
INSERT INTO items (name) values(null);
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;

-- 乱数を用いて値を設定する
UPDATE items SET
name = CONCAT('item-', id),
description = SUBSTRING(md5(clock_timestamp()::text), 1, 30)::varchar,
price = CEIL(random() * 10000),
delete_flag = mod((random() * 100)::int,2)::boolean,
created_at = to_date('2016-' || round((random() * (12 - 1))::numeric, 0) + 1 || '-' || round((random() * (31 - 1))::numeric, 0) + 1, 'YYYY-MM-DD'),
updated_at = to_date('2017-' || round((random() * (12 - 1))::numeric, 0) + 1 || '-' || round((random() * (31 - 1))::numeric, 0) + 1, 'YYYY-MM-DD')
;

実際に実行した結果は、以下のとおりです。

postgresql969=> CREATE TABLE items (
postgresql969(> id SERIAL,
postgresql969(> name VARCHAR(16),
postgresql969(> description VARCHAR(30),
postgresql969(> price INT,
postgresql969(> delete_flag boolean,
postgresql969(> created_at timestamp,
postgresql969(> updated_at timestamp,
postgresql969(> PRIMARY KEY (id)
postgresql969(> );
CREATE TABLE

postgresql969=> INSERT INTO items (name) values(null);
INSERT 0 1
postgresql969=> INSERT INTO items (name) select name from items;
INSERT 0 1
:
(中略)
:
postgresql969=> INSERT INTO items (name) select name from items;
INSERT 0 4194304
postgresql969=> INSERT INTO items (name) select name from items;
INSERT 0 8388608

postgresql969=> UPDATE items SET
postgresql969-> name = CONCAT('item-', id),
postgresql969-> description = SUBSTRING(md5(clock_timestamp()::text), 1, 30)::varchar,
postgresql969-> price = CEIL(random() * 10000),
postgresql969-> delete_flag = mod((random() * 100)::int,2)::boolean,
postgresql969-> created_at = to_date('2016-' || round((random() * (12 - 1))::numeric, 0) + 1 || '-' || round((random() * (31 - 1))::numeric, 0) + 1, 'YYYY-MM-DD'),
postgresql969-> updated_at = to_date('2017-' || round((random() * (12 - 1))::numeric, 0) + 1 || '-' || round((random() * (31 - 1))::numeric, 0) + 1, 'YYYY-MM-DD')
postgresql969-> ;
UPDATE 16777216

postgresql969=> select * from items limit 3;
id | name | description | price | delete_flag | created_at | updated_at
----+--------+--------------------------------+-------+-------------+---------------------+---------------------
1 | item-1 | 6e99262491738ca5c3e89301772497 | 441 | t | 2016-01-11 00:00:00 | 2017-02-08 00:00:00
2 | item-2 | f9901c57a47bf1b30b9da656f1f82a | 4734 | f | 2016-11-26 00:00:00 | 2017-06-15 00:00:00
3 | item-3 | 2ce993faf3265c177802e6576ff6e2 | 5755 | t | 2016-10-10 00:00:00 | 2017-05-13 00:00:00
(3 rows)

Amazon Redshift の場合

items テーブルのidカラムは、IDENTITY(1,1)を指定して自動採番します。

以下のクエリで、インスタンスタイプによりますが、dc2.largeの2ノードクラスタ構成で1分程度で16777216レコード生成します。レコード数の調整は、30〜53行のINSERT INTO temp (name) select name from temp;の行数を増減することで調整します。

PostgreSQLとの相違点は、大量データ作成用一時テーブルにデータを作成した後、大量データ用テーブルにディープコピーしています。ディープコピーしている理由は、データをソートした状態で格納するためです。

-- 大量データ用テーブル
CREATE TABLE items (
id INT,
name VARCHAR(16),
description VARCHAR(30),
price INT,
delete_flag boolean,
created_at timestamp,
updated_at timestamp,
PRIMARY KEY (id)
)
DISTSTYLE EVEN
SORTKEY(id)
;

-- 大量データ作成用一時テーブル
DROP TABLE temp;
CREATE TEMP TABLE temp (
id INT PRIMARY KEY IDENTITY(1,1),
name VARCHAR(16),
description VARCHAR(30),
price INT,
delete_flag boolean,
created_at timestamp,
updated_at timestamp
);

-- レコード作成(1,2,4,8...16777216と倍々に増加する)
INSERT INTO temp (name) values(null);
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;

-- 乱数を用いて値を設定する
UPDATE temp SET
name = CONCAT('item-', id),
description = SUBSTRING(MD5(random()), 1, 30)::varchar,
price = CEIL(random() * 10000),
delete_flag = mod((random() * 100)::int,2),
created_at = dateadd(s,ceil(random() * 31536000)::int,'2016-01-01'),
updated_at = dateadd(s,ceil(random() * 31536000)::int,'2017-01-01')
;

-- 大量データ用テーブルにディープコピー(データのソート)
INSERT INTO items SELECT * FROM temp;

実際に実行した結果は、以下のとおりです。

cmdb=# CREATE TABLE items (
cmdb(# id INT,
cmdb(# name VARCHAR(16),
cmdb(# description VARCHAR(30),
cmdb(# price INT,
cmdb(# delete_flag boolean,
cmdb(# created_at timestamp,
cmdb(# updated_at timestamp,
cmdb(# PRIMARY KEY (id)
cmdb(# )
cmdb-# DISTSTYLE EVEN
cmdb-# SORTKEY(id)
cmdb-# ;
CREATE TABLE

cmdb=# CREATE TEMP TABLE temp (
cmdb(# id INT PRIMARY KEY IDENTITY(1,1),
cmdb(# name VARCHAR(16),
cmdb(# description VARCHAR(30),
cmdb(# price INT,
cmdb(# delete_flag boolean,
cmdb(# created_at timestamp,
cmdb(# updated_at timestamp
cmdb(# );
CREATE TABLE

cmdb=# INSERT INTO temp (name) values(null);
INSERT 0 1
cmdb=# INSERT INTO temp (name) select name from temp;
INSERT 0 1
cmdb=# INSERT INTO temp (name) select name from temp;
INSERT 0 2
:
(中略)
:
cmdb=# INSERT INTO temp (name) select name from temp;
INSERT 0 4194304
cmdb=# INSERT INTO temp (name) select name from temp;
INSERT 0 8388608

cmdb=# UPDATE temp SET
cmdb-# name = CONCAT('item-', id),
cmdb-# description = SUBSTRING(MD5(random()), 1, 30)::varchar,
cmdb-# price = CEIL(random() * 10000),
cmdb-# delete_flag = mod((random() * 100)::int,2),
cmdb-# created_at = dateadd(s,ceil(random() * 31536000)::int,'2016-01-01'),
cmdb-# updated_at = dateadd(s,ceil(random() * 31536000)::int,'2017-01-01')
cmdb-# ;
UPDATE 16777216

cmdb=# INSERT INTO items SELECT * FROM temp;
INSERT 0 16777216

cmdb=# select * from items limit 3;
id | name | description | price | delete_flag | created_at | updated_at
----+---------+--------------------------------+-------+-------------+---------------------+---------------------
2 | item-2 | e96dbd1000cbc0b6d8f89595bd3328 | 7756 | f | 2016-04-23 22:32:24 | 2017-05-02 18:07:17
3 | item-3 | 49104498585009fe27b1c27735cc58 | 2797 | t | 2016-08-18 22:32:26 | 2017-04-15 22:25:45
14 | item-14 | ea0fcb62d24241af8ac711161a083a | 6695 | f | 2016-12-16 14:24:12 | 2017-06-07 18:03:41
(3 rows)

最後に

もっと良いやり方があるかもしれませんが、私なりのやり方を紹介しました。ザックリとストレージサイズを試算したり、クエリのパフォーマンス検証、DMSのリプリケーション動作の検証などなど、様々な用途に利用できるはずです。