バックアップ戦略、しましょうか

Posted on Sun, Jul 28, 2013 in Code tagged with Amazon Web Services, Python

はじめに

こんばんは yosida95 です。生きています。

ぼくの自室環境では、仮想マシンのイメージや Mac の TimeMachine のバックアップイメージなどを RAID-Z2 on OpenIndiana (WD Red 使用)なストレージサーバーに保管しています。

そのためハードウェア障害程度には耐えられそうですが、何があるのか分からないのでさらにバックアップするに越したことはありません。そこで、 Amazon Web Services が提供している Amazon Glacier に保管することにしました。

いつものアレ

というわけで、バッチを書きました。バックアップが存在しないものはバックアップを行い、すでにバックアップが存在するものは modify time と SHA-1 ハッシュを前回バックアップ時のものと比較して、変更があればバックアップを行い、変更がなければバックアップをスキップします。

# -*- coding: utf-8 -*-

import argparse
import fcntl
import hashlib
import logging
import os
import re
import unicodedata

from boto.glacier import connect_to_region as connect_to_glacier
from boto.sdb import connect_to_region as connect_to_sdb

# Console output: every record is written to a stream handler, prefixed
# with its timestamp and severity.
formatter = logging.Formatter(u'[%(asctime)s][%(levelname)s] %(message)s')

handler = logging.StreamHandler()
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)

# Module-level logger; both the logger and its handler are set to INFO, so
# debug() calls made through it are not emitted.
logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.setLevel(logging.INFO)


class GlacierBackuper(object):
    """Back up files to an Amazon Glacier vault, tracked in a SimpleDB domain.

    For every uploaded file one SimpleDB item is stored.  The item is named
    after the Glacier archive id and carries the attributes ``filepath``,
    ``mtime`` and ``hash`` (SHA-1 hex digest), which are used on later runs
    to decide whether the file has to be uploaded again.
    """

    # Chunk size used for both hashing and uploading (1 MiB).
    _CHUNK_SIZE = 1 * 2 ** 20

    def __init__(self, aws_access_key_id, aws_secret_access_key,
                 region, glacier_vault, sdb_domain,
                 pattern, ignore_pattern=None):
        """Connect to Glacier and SimpleDB and remember the path filters.

        :param aws_access_key_id: AWS access key id used for both services.
        :param aws_secret_access_key: AWS secret access key.
        :param region: region name used for both connections.
        :param glacier_vault: name of the Glacier vault to store archives in.
        :param sdb_domain: name of the SimpleDB domain holding the metadata.
        :param pattern: compiled regular expression; only paths it matches
            (``search``) are archived.
        :param ignore_pattern: optional compiled regular expression; paths it
            matches are skipped.  Any falsy value disables the filter.
        """
        self.glacier = connect_to_glacier(
            region, aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key
        ).get_vault(glacier_vault)
        self.sdb = connect_to_sdb(
            region, aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key
        ).get_domain(sdb_domain)
        self.pattern = pattern
        self.ignore_pattern = ignore_pattern

    def _compute_hash(self, fd):
        """Return the SHA-1 hex digest of the whole content of *fd*.

        The file is read from offset 0 to EOF.  The position *fd* had on
        entry is restored afterwards, even when reading fails.
        """
        seek_point = fd.tell()
        result = hashlib.sha1()

        try:
            fd.seek(0)
            buf = fd.read(self._CHUNK_SIZE)
            while buf:
                result.update(buf)
                buf = fd.read(self._CHUNK_SIZE)
        finally:
            fd.seek(seek_point)

        return result.hexdigest()

    def archive(self, path):
        """Upload the file at *path* unless an up-to-date archive exists.

        :param path: absolute path of the file (asserted).
        :returns: ``None`` when the path is filtered out by the patterns,
            ``True`` when the file was uploaded or was already up to date.

        The file is held under an exclusive ``flock`` while it is hashed and
        uploaded; the lock and the file descriptor are always released, also
        on the early "up to date" exit and when an exception is raised.
        """
        assert os.path.isabs(path)

        ignored = self.ignore_pattern and self.ignore_pattern.search(path)
        if ignored or self.pattern.search(path) is None:
            logger.info(u'skip archiving of %s' % (path, ))
            return

        logger.info(u'start archiving of %s' % (path, ))

        fd = open(path, u'a+b')
        try:
            fd.seek(0)
            fcntl.flock(fd, fcntl.LOCK_EX)
            try:
                uploaded = self._archive_locked(fd, path)
            finally:
                fcntl.flock(fd, fcntl.LOCK_UN)
        finally:
            fd.close()

        if uploaded:
            logger.info(u'archiving has finished: %s' % (path, ))
        return True

    def _archive_locked(self, fd, path):
        """Compare *fd* with its recorded metadata and upload it if needed.

        Must be called with *fd* already locked.  Returns ``True`` when a new
        archive was uploaded and ``False`` when the existing one is current.
        """
        logger.debug(u'computing hash')
        digest = self._compute_hash(fd)

        logger.debug(u'getting modified time of file to archive')
        mtime = os.path.getmtime(path)

        # Single quotes inside the path are escaped by doubling them.
        query = u"select * from %s where `filepath` = '%s'" % (
            self.sdb.name, path.replace('\'', '\'\'')
        )
        logger.debug(query)
        rows = self.sdb.select(query)
        try:
            metadata = next(rows)
        except StopIteration:
            logger.info(u'archive not found')
        else:
            if float(metadata[u'mtime']) == mtime and\
                    metadata[u'hash'] == digest:
                logger.info(u'archive has already exist and is up to date')
                return False

            logger.info(u'archive has already exist'
                        u' but file has been modified')
            # NOTE(review): the old archive is removed before the new one is
            # uploaded, so a failed upload leaves the file without a backup.
            self.glacier.delete_archive(metadata.name)
            metadata.delete()

        # Upload from the beginning of the file regardless of where the
        # descriptor was left.
        fd.seek(0)
        writer = self.glacier.create_archive_writer()
        buf = fd.read(self._CHUNK_SIZE)
        while buf:
            writer.write(buf)
            buf = fd.read(self._CHUNK_SIZE)
        writer.close()

        metadata = self.sdb.new_item(writer.get_archive_id())
        metadata[u'filepath'] = path
        # Stored as repr() so that float() gives back exactly the same value;
        # str() of a float keeps only 12 significant digits on Python 2,
        # which would make the mtime comparison above fail on later runs if
        # the client stringifies the value that way.
        metadata[u'mtime'] = repr(mtime)
        metadata[u'hash'] = digest
        metadata.save()
        return True

    def archive_dir(self, dirname, archived=None):
        """Recursively archive every regular file below *dirname*.

        :param dirname: directory to walk.  Entries are joined onto it and
            passed to :meth:`archive`, which asserts that paths are absolute.
        :param archived: optional list to append results to; a fresh list is
            created for each call when omitted.
        :returns: the list of file paths for which :meth:`archive` returned a
            true value (uploaded or already up to date).

        An ``OSError`` raised while handling one entry is logged and the walk
        continues with the next entry.
        """
        if archived is None:
            archived = []

        for filename in os.listdir(dirname):
            filename = unicodedata.normalize(u'NFC', filename)
            filepath = os.path.join(dirname, filename)

            try:
                if os.path.isfile(filepath):
                    if self.archive(filepath):
                        archived.append(filepath)
                elif os.path.isdir(filepath):
                    self.archive_dir(filepath, archived)
            except OSError as e:
                # Best effort: one unreadable entry must not stop the walk.
                logger.warning(u'skip %s because of an OS error: %r',
                               filepath, e)
                continue

        return archived


def get_args(argv=None):
    """Parse the command-line options of the backup script.

    :param argv: optional list of argument strings; when ``None`` the
        arguments are taken from ``sys.argv`` by argparse.
    :returns: the parsed namespace with the attributes ``access_id``,
        ``access_secret``, ``aws_region``, ``glacier_vault``, ``sdb_domain``,
        ``pattern`` and ``ignore_pattern``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(u'--access_id',
                        help=u'AWS Access Key ID', required=True)
    parser.add_argument(u'--access_secret',
                        help=u'AWS Secret Access Key', required=True)
    parser.add_argument(u'--aws_region', help=u'Region Name', required=True)
    parser.add_argument(u'--glacier_vault',
                        help=u'Glacier Vault Name', required=True)
    parser.add_argument(u'--sdb_domain',
                        help=u'SimpleDB Domain Name', required=True)
    # The default contains no backslash, so a plain unicode literal has the
    # same value as the Python-2-only ``ur''`` form.
    parser.add_argument(u'--pattern', default=u'.+',
                        help=u'RegExp pattern which match filepath to upload')
    # An empty default means "no ignore pattern".
    parser.add_argument(u'--ignore-pattern', default=u'',
                        help=u'RegExp pattern'
                             u' which match filepath to skip uploading')

    return parser.parse_args(argv)


def main():
    """Archive every matching file below the current working directory."""
    args = get_args()

    flags = re.UNICODE | re.IGNORECASE
    pattern = re.compile(unicode(args.pattern), flags)
    if args.ignore_pattern:
        ignore_pattern = re.compile(unicode(args.ignore_pattern), flags)
    else:
        # An empty option value disables the ignore filter.
        ignore_pattern = None

    backuper = GlacierBackuper(args.access_id, args.access_secret,
                               args.aws_region, args.glacier_vault,
                               args.sdb_domain,
                               pattern, ignore_pattern)
    # os.getcwdu() returns the working directory as unicode directly;
    # unicode(os.getcwd()) decodes with the default ASCII codec and raises
    # UnicodeDecodeError when the path contains non-ASCII characters.
    backuper.archive_dir(os.getcwdu())


if __name__ == u'__main__':
    main()

Amazon Glacier

Amazon Glacier は大容量なデータを低価格で保管できるストレージサービスです。月額 0.01USD/GB で使えます。

1ファイルをアーカイブという1単位とし、そのアーカイブを Vault という容れ物に格納する仕組みです。ただし、アーカイブや Vault は階層構造を持てない上、もともとのファイルパスや modify time といった付加情報を保持できません。アーカイブには一意な ID が割り当てられ、この ID によって取り出しなどの操作を行います。

そこで、同じく Amazon Web Services が提供している Amazon SimpleDB に、バックアップ元のファイルパスと modify time 、そして SHA-1 ハッシュと、それに対応するアーカイブ ID を記録しています。

また、 Amazon Glacier には「アーカイブの更新」という概念がないため、このバッチでは変更があったアーカイブについては古いものを削除した上で新たにアーカイブを作っています。ここで注意が必要なのは、 Amazon Glacier は長期間のファイル保存を前提として作られているサービスであるため、作られてから90日経過していないアーカイブを削除すると "Early Delete" として、アーカイブサイズに応じた料金を請求される点です。

Amazon Glacier のもっと詳しい特徴などについては各々調べて下さい。

最後に

夏休みに入ったし、 LeapMotion も手に入れたので、また近いうちに記事書きます。

yosida95

著者について / About yosida95

ここで述べる内容は私個人の見解に基づくものであり、私の雇用者や私が所属する団体とは一切の関係がありません。

Follow @yosida95

バックアップ戦略、しましょうか

はじめに

いつものアレ

Amazon Glacier

最後に

yosida95

Recent Articles

Categories

Archives