(linenum→info "unix/slp.c:2238")

qemu/0.9.1/block-qcow2.c

    1: /*
    2:  * Block driver for the QCOW version 2 format
    3:  *
    4:  * Copyright (c) 2004-2006 Fabrice Bellard
    5:  *
    6:  * Permission is hereby granted, free of charge, to any person obtaining a copy
    7:  * of this software and associated documentation files (the "Software"), to deal
    8:  * in the Software without restriction, including without limitation the rights
    9:  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   10:  * copies of the Software, and to permit persons to whom the Software is
   11:  * furnished to do so, subject to the following conditions:
   12:  *
   13:  * The above copyright notice and this permission notice shall be included in
   14:  * all copies or substantial portions of the Software.
   15:  *
   16:  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   17:  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   18:  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
   19:  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   20:  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   21:  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
   22:  * THE SOFTWARE.
   23:  */
   24: #include "qemu-common.h"
   25: #include "block_int.h"
   26: #include <zlib.h>
   27: #include "aes.h"
   28: #include <assert.h>
   29: 
   30: /*
   31:   Differences with QCOW:
   32: 
   33:   - Support for multiple incremental snapshots.
   34:   - Memory management by reference counts.
   35:   - Clusters which have a reference count of one have the bit
   36:     QCOW_OFLAG_COPIED to optimize write performance.
   37:   - Size of compressed clusters is stored in sectors to reduce bit usage
   38:     in the cluster offsets.
   39:   - Support for storing additional data (such as the VM state) in the
   40:     snapshots.
   41:   - If a backing store is used, the cluster size is not constrained
   42:     (could be backported to QCOW).
   43:   - L2 tables have always a size of one cluster.
   44: */
   45: 
   46: //#define DEBUG_ALLOC
   47: //#define DEBUG_ALLOC2
   48: 
   49: #define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
   50: #define QCOW_VERSION 2
   51: 
   52: #define QCOW_CRYPT_NONE 0
   53: #define QCOW_CRYPT_AES  1
   54: 
   55: /* indicate that the refcount of the referenced cluster is exactly one. */
   56: #define QCOW_OFLAG_COPIED     (1LL << 63)
   57: /* indicate that the cluster is compressed (they never have the copied flag) */
   58: #define QCOW_OFLAG_COMPRESSED (1LL << 62)
   59: 
   60: #define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
   61: 
   62: #ifndef offsetof
   63: #define offsetof(type, field) ((size_t) &((type *)0)->field)
   64: #endif
   65: 
   66: typedef struct QCowHeader {
   67:     uint32_t magic;
   68:     uint32_t version;
   69:     uint64_t backing_file_offset;
   70:     uint32_t backing_file_size;
   71:     uint32_t cluster_bits;
   72:     uint64_t size; /* in bytes */
   73:     uint32_t crypt_method;
   74:     uint32_t l1_size; /* XXX: save number of clusters instead ? */
   75:     uint64_t l1_table_offset;
   76:     uint64_t refcount_table_offset;
   77:     uint32_t refcount_table_clusters;
   78:     uint32_t nb_snapshots;
   79:     uint64_t snapshots_offset;
   80: } QCowHeader;
   81: 
   82: typedef struct __attribute__((packed)) QCowSnapshotHeader {
   83:     /* header is 8 byte aligned */
   84:     uint64_t l1_table_offset;
   85: 
   86:     uint32_t l1_size;
   87:     uint16_t id_str_size;
   88:     uint16_t name_size;
   89: 
   90:     uint32_t date_sec;
   91:     uint32_t date_nsec;
   92: 
   93:     uint64_t vm_clock_nsec;
   94: 
   95:     uint32_t vm_state_size;
   96:     uint32_t extra_data_size; /* for extension */
   97:     /* extra data follows */
   98:     /* id_str follows */
   99:     /* name follows  */
  100: } QCowSnapshotHeader;
  101: 
  102: #define L2_CACHE_SIZE 16
  103: 
  104: typedef struct QCowSnapshot {
  105:     uint64_t l1_table_offset;
  106:     uint32_t l1_size;
  107:     char *id_str;
  108:     char *name;
  109:     uint32_t vm_state_size;
  110:     uint32_t date_sec;
  111:     uint32_t date_nsec;
  112:     uint64_t vm_clock_nsec;
  113: } QCowSnapshot;
  114: 
  115: typedef struct BDRVQcowState {
  116:     BlockDriverState *hd;
  117:     int cluster_bits;
  118:     int cluster_size;
  119:     int cluster_sectors;
  120:     int l2_bits;
  121:     int l2_size;
  122:     int l1_size;
  123:     int l1_vm_state_index;
  124:     int csize_shift;
  125:     int csize_mask;
  126:     uint64_t cluster_offset_mask;
  127:     uint64_t l1_table_offset;
  128:     uint64_t *l1_table;
  129:     uint64_t *l2_cache;
  130:     uint64_t l2_cache_offsets[L2_CACHE_SIZE];
  131:     uint32_t l2_cache_counts[L2_CACHE_SIZE];
  132:     uint8_t *cluster_cache;
  133:     uint8_t *cluster_data;
  134:     uint64_t cluster_cache_offset;
  135: 
  136:     uint64_t *refcount_table;
  137:     uint64_t refcount_table_offset;
  138:     uint32_t refcount_table_size;
  139:     uint64_t refcount_block_cache_offset;
  140:     uint16_t *refcount_block_cache;
  141:     int64_t free_cluster_index;
  142:     int64_t free_byte_offset;
  143: 
  144:     uint32_t crypt_method; /* current crypt method, 0 if no key yet */
  145:     uint32_t crypt_method_header;
  146:     AES_KEY aes_encrypt_key;
  147:     AES_KEY aes_decrypt_key;
  148:     uint64_t snapshots_offset;
  149:     int snapshots_size;
  150:     int nb_snapshots;
  151:     QCowSnapshot *snapshots;
  152: } BDRVQcowState;
  153: 
  154: static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
  155: static int qcow_read(BlockDriverState *bs, int64_t sector_num,
  156:                      uint8_t *buf, int nb_sectors);
  157: static int qcow_read_snapshots(BlockDriverState *bs);
  158: static void qcow_free_snapshots(BlockDriverState *bs);
  159: static int refcount_init(BlockDriverState *bs);
  160: static void refcount_close(BlockDriverState *bs);
  161: static int get_refcount(BlockDriverState *bs, int64_t cluster_index);
  162: static int update_cluster_refcount(BlockDriverState *bs,
  163:                                    int64_t cluster_index,
  164:                                    int addend);
  165: static void update_refcount(BlockDriverState *bs,
  166:                             int64_t offset, int64_t length,
  167:                             int addend);
  168: static int64_t alloc_clusters(BlockDriverState *bs, int64_t size);
  169: static int64_t alloc_bytes(BlockDriverState *bs, int size);
  170: static void free_clusters(BlockDriverState *bs,
  171:                           int64_t offset, int64_t size);
  172: #ifdef DEBUG_ALLOC
  173: static void check_refcounts(BlockDriverState *bs);
  174: #endif
  175: 
  176: static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
  177: {
  178:     const QCowHeader *cow_header = (const void *)buf;
  179: 
  180:     if (buf_size >= sizeof(QCowHeader) &&
  181:         be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
  182:         be32_to_cpu(cow_header->version) == QCOW_VERSION)
  183:         return 100;
  184:     else
  185:         return 0;
  186: }
  187: 
  188: static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
  189: {
  190:     BDRVQcowState *s = bs->opaque;
  191:     int len, i, shift, ret;
  192:     QCowHeader header;
  193: 
  194:     ret = bdrv_file_open(&s->hd, filename, flags);
  195:     if (ret < 0)
  196:         return ret;
  197:     if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
  198:         goto fail;
  199:     be32_to_cpus(&header.magic);
  200:     be32_to_cpus(&header.version);
  201:     be64_to_cpus(&header.backing_file_offset);
  202:     be32_to_cpus(&header.backing_file_size);
  203:     be64_to_cpus(&header.size);
  204:     be32_to_cpus(&header.cluster_bits);
  205:     be32_to_cpus(&header.crypt_method);
  206:     be64_to_cpus(&header.l1_table_offset);
  207:     be32_to_cpus(&header.l1_size);
  208:     be64_to_cpus(&header.refcount_table_offset);
  209:     be32_to_cpus(&header.refcount_table_clusters);
  210:     be64_to_cpus(&header.snapshots_offset);
  211:     be32_to_cpus(&header.nb_snapshots);
  212: 
  213:     if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
  214:         goto fail;
  215:     if (header.size <= 1 ||
  216:         header.cluster_bits < 9 ||
  217:         header.cluster_bits > 16)
  218:         goto fail;
  219:     if (header.crypt_method > QCOW_CRYPT_AES)
  220:         goto fail;
  221:     s->crypt_method_header = header.crypt_method;
  222:     if (s->crypt_method_header)
  223:         bs->encrypted = 1;
  224:     s->cluster_bits = header.cluster_bits;
  225:     s->cluster_size = 1 << s->cluster_bits;
  226:     s->cluster_sectors = 1 << (s->cluster_bits - 9);
  227:     s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
  228:     s->l2_size = 1 << s->l2_bits;
  229:     bs->total_sectors = header.size / 512;
  230:     s->csize_shift = (62 - (s->cluster_bits - 8));
  231:     s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
  232:     s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
  233:     s->refcount_table_offset = header.refcount_table_offset;
  234:     s->refcount_table_size =
  235:         header.refcount_table_clusters << (s->cluster_bits - 3);
  236: 
  237:     s->snapshots_offset = header.snapshots_offset;
  238:     s->nb_snapshots = header.nb_snapshots;
  239: 
  240:     /* read the level 1 table */
  241:     s->l1_size = header.l1_size;
  242:     shift = s->cluster_bits + s->l2_bits;
  243:     s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift;
  244:     /* the L1 table must contain at least enough entries to put
  245:        header.size bytes */
  246:     if (s->l1_size < s->l1_vm_state_index)
  247:         goto fail;
  248:     s->l1_table_offset = header.l1_table_offset;
  249:     s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
  250:     if (!s->l1_table)
  251:         goto fail;
  252:     if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
  253:         s->l1_size * sizeof(uint64_t))
  254:         goto fail;
  255:     for(i = 0;i < s->l1_size; i++) {
  256:         be64_to_cpus(&s->l1_table[i]);
  257:     }
  258:     /* alloc L2 cache */
  259:     s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
  260:     if (!s->l2_cache)
  261:         goto fail;
  262:     s->cluster_cache = qemu_malloc(s->cluster_size);
  263:     if (!s->cluster_cache)
  264:         goto fail;
  265:     /* one more sector for decompressed data alignment */
  266:     s->cluster_data = qemu_malloc(s->cluster_size + 512);
  267:     if (!s->cluster_data)
  268:         goto fail;
  269:     s->cluster_cache_offset = -1;
  270: 
  271:     if (refcount_init(bs) < 0)
  272:         goto fail;
  273: 
  274:     /* read the backing file name */
  275:     if (header.backing_file_offset != 0) {
  276:         len = header.backing_file_size;
  277:         if (len > 1023)
  278:             len = 1023;
  279:         if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
  280:             goto fail;
  281:         bs->backing_file[len] = '\0';
  282:     }
  283:     if (qcow_read_snapshots(bs) < 0)
  284:         goto fail;
  285: 
  286: #ifdef DEBUG_ALLOC
  287:     check_refcounts(bs);
  288: #endif
  289:     return 0;
  290: 
  291:  fail:
  292:     qcow_free_snapshots(bs);
  293:     refcount_close(bs);
  294:     qemu_free(s->l1_table);
  295:     qemu_free(s->l2_cache);
  296:     qemu_free(s->cluster_cache);
  297:     qemu_free(s->cluster_data);
  298:     bdrv_delete(s->hd);
  299:     return -1;
  300: }
  301: 
  302: static int qcow_set_key(BlockDriverState *bs, const char *key)
  303: {
  304:     BDRVQcowState *s = bs->opaque;
  305:     uint8_t keybuf[16];
  306:     int len, i;
  307: 
  308:     memset(keybuf, 0, 16);
  309:     len = strlen(key);
  310:     if (len > 16)
  311:         len = 16;
  312:     /* XXX: we could compress the chars to 7 bits to increase
  313:        entropy */
  314:     for(i = 0;i < len;i++) {
  315:         keybuf[i] = key[i];
  316:     }
  317:     s->crypt_method = s->crypt_method_header;
  318: 
  319:     if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
  320:         return -1;
  321:     if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
  322:         return -1;
  323: #if 0
  324:     /* test */
  325:     {
  326:         uint8_t in[16];
  327:         uint8_t out[16];
  328:         uint8_t tmp[16];
  329:         for(i=0;i<16;i++)
  330:             in[i] = i;
  331:         AES_encrypt(in, tmp, &s->aes_encrypt_key);
  332:         AES_decrypt(tmp, out, &s->aes_decrypt_key);
  333:         for(i = 0; i < 16; i++)
  334:             printf(" %02x", tmp[i]);
  335:         printf("\n");
  336:         for(i = 0; i < 16; i++)
  337:             printf(" %02x", out[i]);
  338:         printf("\n");
  339:     }
  340: #endif
  341:     return 0;
  342: }
  343: 
  344: /* The crypt function is compatible with the linux cryptoloop
  345:    algorithm for < 4 GB images. NOTE: out_buf == in_buf is
  346:    supported */
  347: static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
  348:                             uint8_t *out_buf, const uint8_t *in_buf,
  349:                             int nb_sectors, int enc,
  350:                             const AES_KEY *key)
  351: {
  352:     union {
  353:         uint64_t ll[2];
  354:         uint8_t b[16];
  355:     } ivec;
  356:     int i;
  357: 
  358:     for(i = 0; i < nb_sectors; i++) {
  359:         ivec.ll[0] = cpu_to_le64(sector_num);
  360:         ivec.ll[1] = 0;
  361:         AES_cbc_encrypt(in_buf, out_buf, 512, key,
  362:                         ivec.b, enc);
  363:         sector_num++;
  364:         in_buf += 512;
  365:         out_buf += 512;
  366:     }
  367: }
  368: 
  369: static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
  370:                         uint64_t cluster_offset, int n_start, int n_end)
  371: {
  372:     BDRVQcowState *s = bs->opaque;
  373:     int n, ret;
  374: 
  375:     n = n_end - n_start;
  376:     if (n <= 0)
  377:         return 0;
  378:     ret = qcow_read(bs, start_sect + n_start, s->cluster_data, n);
  379:     if (ret < 0)
  380:         return ret;
  381:     if (s->crypt_method) {
  382:         encrypt_sectors(s, start_sect + n_start,
  383:                         s->cluster_data,
  384:                         s->cluster_data, n, 1,
  385:                         &s->aes_encrypt_key);
  386:     }
  387:     ret = bdrv_write(s->hd, (cluster_offset >> 9) + n_start,
  388:                      s->cluster_data, n);
  389:     if (ret < 0)
  390:         return ret;
  391:     return 0;
  392: }
  393: 
  394: static void l2_cache_reset(BlockDriverState *bs)
  395: {
  396:     BDRVQcowState *s = bs->opaque;
  397: 
  398:     memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
  399:     memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
  400:     memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
  401: }
  402: 
  403: static inline int l2_cache_new_entry(BlockDriverState *bs)
  404: {
  405:     BDRVQcowState *s = bs->opaque;
  406:     uint32_t min_count;
  407:     int min_index, i;
  408: 
  409:     /* find a new entry in the least used one */
  410:     min_index = 0;
  411:     min_count = 0xffffffff;
  412:     for(i = 0; i < L2_CACHE_SIZE; i++) {
  413:         if (s->l2_cache_counts[i] < min_count) {
  414:             min_count = s->l2_cache_counts[i];
  415:             min_index = i;
  416:         }
  417:     }
  418:     return min_index;
  419: }
  420: 
  421: static int64_t align_offset(int64_t offset, int n)
  422: {
  423:     offset = (offset + n - 1) & ~(n - 1);
  424:     return offset;
  425: }
  426: 
  427: static int grow_l1_table(BlockDriverState *bs, int min_size)
  428: {
  429:     BDRVQcowState *s = bs->opaque;
  430:     int new_l1_size, new_l1_size2, ret, i;
  431:     uint64_t *new_l1_table;
  432:     uint64_t new_l1_table_offset;
  433:     uint64_t data64;
  434:     uint32_t data32;
  435: 
  436:     new_l1_size = s->l1_size;
  437:     if (min_size <= new_l1_size)
  438:         return 0;
  439:     while (min_size > new_l1_size) {
  440:         new_l1_size = (new_l1_size * 3 + 1) / 2;
  441:     }
  442: #ifdef DEBUG_ALLOC2
  443:     printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
  444: #endif
  445: 
  446:     new_l1_size2 = sizeof(uint64_t) * new_l1_size;
  447:     new_l1_table = qemu_mallocz(new_l1_size2);
  448:     if (!new_l1_table)
  449:         return -ENOMEM;
  450:     memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
  451: 
  452:     /* write new table (align to cluster) */
  453:     new_l1_table_offset = alloc_clusters(bs, new_l1_size2);
  454: 
  455:     for(i = 0; i < s->l1_size; i++)
  456:         new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
  457:     ret = bdrv_pwrite(s->hd, new_l1_table_offset, new_l1_table, new_l1_size2);
  458:     if (ret != new_l1_size2)
  459:         goto fail;
  460:     for(i = 0; i < s->l1_size; i++)
  461:         new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
  462: 
  463:     /* set new table */
  464:     data64 = cpu_to_be64(new_l1_table_offset);
  465:     if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_table_offset),
  466:                     &data64, sizeof(data64)) != sizeof(data64))
  467:         goto fail;
  468:     data32 = cpu_to_be32(new_l1_size);
  469:     if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size),
  470:                     &data32, sizeof(data32)) != sizeof(data32))
  471:         goto fail;
  472:     qemu_free(s->l1_table);
  473:     free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
  474:     s->l1_table_offset = new_l1_table_offset;
  475:     s->l1_table = new_l1_table;
  476:     s->l1_size = new_l1_size;
  477:     return 0;
  478:  fail:
  479:     qemu_free(s->l1_table);
  480:     return -EIO;
  481: }
  482: 
  483: /* 'allocate' is:
  484:  *
  485:  * 0 not to allocate.
  486:  *
  487:  * 1 to allocate a normal cluster (for sector indexes 'n_start' to
  488:  * 'n_end')
  489:  *
  490:  * 2 to allocate a compressed cluster of size
  491:  * 'compressed_size'. 'compressed_size' must be > 0 and <
  492:  * cluster_size
  493:  *
  494:  * return 0 if not allocated.
  495:  */
  496: static uint64_t get_cluster_offset(BlockDriverState *bs,
  497:                                    uint64_t offset, int allocate,
  498:                                    int compressed_size,
  499:                                    int n_start, int n_end)
  500: {
  501:     BDRVQcowState *s = bs->opaque;
  502:     int min_index, i, j, l1_index, l2_index, ret;
  503:     uint64_t l2_offset, *l2_table, cluster_offset, tmp, old_l2_offset;
  504: 
  505:     l1_index = offset >> (s->l2_bits + s->cluster_bits);
  506:     if (l1_index >= s->l1_size) {
  507:         /* outside l1 table is allowed: we grow the table if needed */
  508:         if (!allocate)
  509:             return 0;
  510:         if (grow_l1_table(bs, l1_index + 1) < 0)
  511:             return 0;
  512:     }
  513:     l2_offset = s->l1_table[l1_index];
  514:     if (!l2_offset) {
  515:         if (!allocate)
  516:             return 0;
  517:     l2_allocate:
  518:         old_l2_offset = l2_offset;
  519:         /* allocate a new l2 entry */
  520:         l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
  521:         /* update the L1 entry */
  522:         s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
  523:         tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED);
  524:         if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
  525:                         &tmp, sizeof(tmp)) != sizeof(tmp))
  526:             return 0;
  527:         min_index = l2_cache_new_entry(bs);
  528:         l2_table = s->l2_cache + (min_index << s->l2_bits);
  529: 
  530:         if (old_l2_offset == 0) {
  531:             memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
  532:         } else {
  533:             if (bdrv_pread(s->hd, old_l2_offset,
  534:                            l2_table, s->l2_size * sizeof(uint64_t)) !=
  535:                 s->l2_size * sizeof(uint64_t))
  536:                 return 0;
  537:         }
  538:         if (bdrv_pwrite(s->hd, l2_offset,
  539:                         l2_table, s->l2_size * sizeof(uint64_t)) !=
  540:             s->l2_size * sizeof(uint64_t))
  541:             return 0;
  542:     } else {
  543:         if (!(l2_offset &a