[Tips] サクッと MySQL と PostgreSQL と Redshiftに大量データを作成する方法

はじめに

前々から社内で書く書くって言ってた、サクッと大量データを作成する方法を紹介します。(これで書く書く詐欺って言われない♪)

大量データを作成の共通点

大量データを作成の流れは、大量データ用テーブルに自らの空レコードをコピーすることで大量のレコードを作成します。作成したいレコード数に達すると、一気に乱数を用いてレコードに値を設定します。今回の例では、以下のバリエーションのデータに対して値を設定しています。

  • オートインクリメントの主キーであるid
  • 可変長文字列であるnamedescription
  • 符号なしINTであるprice
  • フラグであるdelete_flag
  • 日時データであるcreated_atupdated_at

MySQL5.7 / Amazon Aurora(MySQL5.7互換) の場合

items テーブルのidカラムは、AUTO_INCREMENTを用いて自動採番します。

以下のクエリで、インスタンスタイプによりますが、t2.smallでも15分程度で16777216レコード生成します。レコード数の調整は、13〜36行のINSERT INTO items (id) SELECT 0 FROM items;の行数を増減することで調整します。

-- 大量データ用テーブル
CREATE TABLE items (
  id INT PRIMARY KEY AUTO_INCREMENT,
  name VARCHAR(16),
  description VARCHAR(30),
  price INT UNSIGNED,
  delete_flag tinyint(4) NOT NULL DEFAULT '0',
  created_at timestamp,
  updated_at timestamp
);

-- レコード作成(1,2,4,8...16777216と倍々に増加する)
INSERT INTO items () VALUES (); 
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;

-- 乱数を用いて値を設定する
UPDATE items SET
  name = CONCAT('item', id),
  description = SUBSTRING(MD5(RAND()), 1, 30),
  price = CEIL(RAND() * 10000),
  delete_flag = MOD((RAND() * 100), 1),
  created_at = ADDTIME(CONCAT_WS(' ','2014-01-01' + INTERVAL RAND() * 365 DAY, '00:00:00'), SEC_TO_TIME(FLOOR(0 + (RAND() * 86401)))),
  updated_at = ADDTIME(CONCAT_WS(' ','2015-01-01' + INTERVAL RAND() * 365 DAY, '00:00:00'), SEC_TO_TIME(FLOOR(0 + (RAND() * 86401))))
;

実際に実行した結果は、以下のとおりです。

mysql> CREATE TABLE items (
    ->   id INT PRIMARY KEY AUTO_INCREMENT,
    ->   name VARCHAR(16),
    ->   description VARCHAR(30),
    ->   price INT UNSIGNED,
    ->   delete_flag tinyint(4) NOT NULL DEFAULT '0',
    ->   created_at timestamp,
    ->   updated_at timestamp
    -> );
Query OK, 0 rows affected (0.09 sec)

mysql> INSERT INTO items () VALUES ();
Query OK, 1 row affected (0.17 sec)

INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;
INSERT INTO items (id) SELECT 0 FROM items;

mysql> INSERT INTO items (id) SELECT 0 FROM items;
Query OK, 1 row affected (0.15 sec)
Records: 1  Duplicates: 0  Warnings: 0

mysql> INSERT INTO items (id) SELECT 0 FROM items;
Query OK, 2 rows affected (0.17 sec)
Records: 2  Duplicates: 0  Warnings: 0
  :
 (中略)
  :
mysql> INSERT INTO items (id) SELECT 0 FROM items;
Query OK, 4194304 rows affected (17.10 sec)
Records: 4194304  Duplicates: 0  Warnings: 0

mysql> INSERT INTO items (id) SELECT 0 FROM items;
Query OK, 8388608 rows affected (34.62 sec)
Records: 8388608  Duplicates: 0  Warnings: 0

mysql> select count(*) from items;
+----------+
| count(*) |
+----------+
| 16777216 |
+----------+
1 row in set (5.13 sec)

mysql> UPDATE items SET
    ->   name = CONCAT('item', id),
    ->   description = SUBSTRING(MD5(RAND()), 1, 30),
    ->   price = CEIL(RAND() * 10000),
    ->   delete_flag = MOD((RAND() * 100), 1),
    ->   created_at = ADDTIME(CONCAT_WS(' ','2014-01-01' + INTERVAL RAND() * 365 DAY, '00:00:00'), SEC_TO_TIME(FLOOR(0 + (RAND() * 86401)))),
    ->   updated_at = ADDTIME(CONCAT_WS(' ','2015-01-01' + INTERVAL RAND() * 365 DAY, '00:00:00'), SEC_TO_TIME(FLOOR(0 + (RAND() * 86401))))
    -> ;
Query OK, 16777216 rows affected (10 min 55.06 sec)
Rows matched: 16777216  Changed: 16777216  Warnings: 0

mysql> select * from items limit 3;
+----+-------+--------------------------------+-------+-------------+---------------------+---------------------+
| id | name  | description                    | price | delete_flag | created_at          | updated_at          |
+----+-------+--------------------------------+-------+-------------+---------------------+---------------------+
|  1 | item1 | f7fe10e769f7c7faf0f90316b5ef1b |  5722 |           0 | 2014-11-05 02:04:49 | 2015-11-25 05:41:08 |
|  2 | item2 | 336368370a27aebdbdde7aa9b81b26 |  7210 |           1 | 2014-07-27 09:24:44 | 2015-04-07 03:21:14 |
|  3 | item3 | 1586507aa044dcffa98b6cf39e66ea |  1256 |           0 | 2014-02-17 22:28:20 | 2015-04-20 16:22:42 |
+----+-------+--------------------------------+-------+-------------+---------------------+---------------------+
3 rows in set (0.07 sec)

PostgreSQL 9.6.9-R1 / Amazon Aurora(PostgreSQL 9.6.9互換) の場合

items テーブルのidカラムは、serial型を指定して自動的に作成したシーケンスを用いて自動採番します。

以下のクエリで、インスタンスタイプによりますが、r5.largeでも10分程度で16777216レコード生成します。レコード数の調整は、13〜36行のINSERT INTO items (name) select name from items;の行数を増減することで調整します。

-- 大量データ用テーブル
CREATE TABLE items (
  id SERIAL,
  name VARCHAR(16),
  description VARCHAR(30),
  price INT,
  delete_flag boolean,
  created_at timestamp,
  updated_at timestamp,
  PRIMARY KEY (id)
);

-- レコード作成(1,2,4,8...16777216と倍々に増加する)
INSERT INTO items (name) values(null);
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;
INSERT INTO items (name) select name from items;

-- 乱数を用いて値を設定する
UPDATE items SET
  name = CONCAT('item-', id),
  description = SUBSTRING(md5(clock_timestamp()::text), 1, 30)::varchar,
  price = CEIL(random() * 10000),
  delete_flag = mod((random() * 100)::int,2)::boolean,
  created_at = to_date('2016-' || round((random() * (12 - 1))::numeric, 0) + 1 || '-' || round((random() * (31 - 1))::numeric, 0) + 1, 'YYYY-MM-DD'),
  updated_at = to_date('2017-' || round((random() * (12 - 1))::numeric, 0) + 1 || '-' || round((random() * (31 - 1))::numeric, 0) + 1, 'YYYY-MM-DD')
;

実際に実行した結果は、以下のとおりです。

postgresql969=> CREATE TABLE items (
postgresql969(>   id SERIAL,
postgresql969(>   name VARCHAR(16),
postgresql969(>   description VARCHAR(30),
postgresql969(>   price INT,
postgresql969(>   delete_flag boolean,
postgresql969(>   created_at timestamp,
postgresql969(>   updated_at timestamp,
postgresql969(>   PRIMARY KEY (id)
postgresql969(> );
CREATE TABLE

postgresql969=> INSERT INTO items (name) values(null);
INSERT 0 1
postgresql969=> INSERT INTO items (name) select name from items;
INSERT 0 1
  :
 (中略)
  :
postgresql969=> INSERT INTO items (name) select name from items;
INSERT 0 4194304
postgresql969=> INSERT INTO items (name) select name from items;
INSERT 0 8388608

postgresql969=> UPDATE items SET
postgresql969->   name = CONCAT('item-', id),
postgresql969->   description = SUBSTRING(md5(clock_timestamp()::text), 1, 30)::varchar,
postgresql969->   price = CEIL(random() * 10000),
postgresql969->   delete_flag = mod((random() * 100)::int,2)::boolean,
postgresql969->   created_at = to_date('2016-' || round((random() * (12 - 1))::numeric, 0) + 1 || '-' || round((random() * (31 - 1))::numeric, 0) + 1, 'YYYY-MM-DD'),
postgresql969->   updated_at = to_date('2017-' || round((random() * (12 - 1))::numeric, 0) + 1 || '-' || round((random() * (31 - 1))::numeric, 0) + 1, 'YYYY-MM-DD')
postgresql969-> ;
UPDATE 16777216

postgresql969=> select * from items limit 3;
 id |  name  |          description           | price | delete_flag |     created_at      |     updated_at
----+--------+--------------------------------+-------+-------------+---------------------+---------------------
  1 | item-1 | 6e99262491738ca5c3e89301772497 |   441 | t           | 2016-01-11 00:00:00 | 2017-02-08 00:00:00
  2 | item-2 | f9901c57a47bf1b30b9da656f1f82a |  4734 | f           | 2016-11-26 00:00:00 | 2017-06-15 00:00:00
  3 | item-3 | 2ce993faf3265c177802e6576ff6e2 |  5755 | t           | 2016-10-10 00:00:00 | 2017-05-13 00:00:00
(3 rows)

Amazon Redshift の場合

items テーブルのidカラムは、IDENTITY(1,1)を指定して自動採番します。

以下のクエリで、インスタンスタイプによりますが、dc2.largeの2ノードクラスタ構成で1分程度で16777216レコード生成します。レコード数の調整は、30〜53行のINSERT INTO temp (name) select name from temp;の行数を増減することで調整します。

PostgreSQLとの相違点は、大量データ作成用一時テーブルにデータを作成した後、大量データ用テーブルにディープコピーしています。ディープコピーしている理由は、データをソートした状態で格納するためです。

-- 大量データ用テーブル
CREATE TABLE items (
  id INT,
  name VARCHAR(16),
  description VARCHAR(30),
  price INT,
  delete_flag boolean,
  created_at timestamp,
  updated_at timestamp,
  PRIMARY KEY (id) 
)
DISTSTYLE EVEN
SORTKEY(id)
;

-- 大量データ作成用一時テーブル
DROP TABLE temp;
CREATE TEMP TABLE temp (
  id INT PRIMARY KEY IDENTITY(1,1),
  name VARCHAR(16),
  description VARCHAR(30),
  price INT,
  delete_flag boolean,
  created_at timestamp,
  updated_at timestamp
);

-- レコード作成(1,2,4,8...16777216と倍々に増加する)
INSERT INTO temp (name) values(null);
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;
INSERT INTO temp (name) select name from temp;

-- 乱数を用いて値を設定する
UPDATE temp SET
  name = CONCAT('item-', id),
  description = SUBSTRING(MD5(random()), 1, 30)::varchar,
  price = CEIL(random() * 10000),
  delete_flag = mod((random() * 100)::int,2),
  created_at = dateadd(s,ceil(random() * 31536000)::int,'2016-01-01'),
  updated_at = dateadd(s,ceil(random() * 31536000)::int,'2017-01-01')
;

-- 大量データ用テーブルにディープコピー(データのソート)
INSERT INTO items SELECT * FROM temp;

実際に実行した結果は、以下のとおりです。

cmdb=# CREATE TABLE items (
cmdb(#   id INT,
cmdb(#   name VARCHAR(16),
cmdb(#   description VARCHAR(30),
cmdb(#   price INT,
cmdb(#   delete_flag boolean,
cmdb(#   created_at timestamp,
cmdb(#   updated_at timestamp,
cmdb(#   PRIMARY KEY (id)
cmdb(# )
cmdb-# DISTSTYLE EVEN
cmdb-# SORTKEY(id)
cmdb-# ;
CREATE TABLE

cmdb=# CREATE TEMP TABLE temp (
cmdb(#   id INT PRIMARY KEY IDENTITY(1,1),
cmdb(#   name VARCHAR(16),
cmdb(#   description VARCHAR(30),
cmdb(#   price INT,
cmdb(#   delete_flag boolean,
cmdb(#   created_at timestamp,
cmdb(#   updated_at timestamp
cmdb(# );
CREATE TABLE

cmdb=# INSERT INTO temp (name) values(null);
INSERT 0 1
cmdb=# INSERT INTO temp (name) select name from temp;
INSERT 0 1
cmdb=# INSERT INTO temp (name) select name from temp;
INSERT 0 2
  :
 (中略)
  :
cmdb=# INSERT INTO temp (name) select name from temp;
INSERT 0 4194304
cmdb=# INSERT INTO temp (name) select name from temp;
INSERT 0 8388608

cmdb=# UPDATE temp SET
cmdb-#   name = CONCAT('item-', id),
cmdb-#   description = SUBSTRING(MD5(random()), 1, 30)::varchar,
cmdb-#   price = CEIL(random() * 10000),
cmdb-#   delete_flag = mod((random() * 100)::int,2),
cmdb-#   created_at = dateadd(s,ceil(random() * 31536000)::int,'2016-01-01'),
cmdb-#   updated_at = dateadd(s,ceil(random() * 31536000)::int,'2017-01-01')
cmdb-# ;
UPDATE 16777216

cmdb=# INSERT INTO items SELECT * FROM temp;
INSERT 0 16777216

cmdb=# select * from items limit 3;
 id |  name   |          description           | price | delete_flag |     created_at      |     updated_at
----+---------+--------------------------------+-------+-------------+---------------------+---------------------
  2 | item-2  | e96dbd1000cbc0b6d8f89595bd3328 |  7756 | f           | 2016-04-23 22:32:24 | 2017-05-02 18:07:17
  3 | item-3  | 49104498585009fe27b1c27735cc58 |  2797 | t           | 2016-08-18 22:32:26 | 2017-04-15 22:25:45
 14 | item-14 | ea0fcb62d24241af8ac711161a083a |  6695 | f           | 2016-12-16 14:24:12 | 2017-06-07 18:03:41
(3 rows)

最後に

もっと良いやり方があるかもしれませんが、私なりのやり方を紹介しました。ザックリとストレージサイズを試算したり、クエリのパフォーマンス検証、DMSのリプリケーション動作の検証などなど、様々な用途に利用できるはずです。