commit 1150b29fbe76efc9767f35744c341c6276b6556b Author: gabest11 Date: Thu Jul 22 04:28:38 2010 +0000 Just uploading some initial code, it can read objects until the root dir. No error checking and little-endian only. diff --git a/common.props b/common.props new file mode 100644 index 0000000..4fdc8d0 --- /dev/null +++ b/common.props @@ -0,0 +1,25 @@ + + + + + + $(SolutionDir)bin\$(Configuration)\ + false + + + + ProgramDatabase + Level3 + WIN32;_WINDOWS;RESTRICT=__restrict;_USE_MATH_DEFINES;%(PreprocessorDefinitions) + 4091;4995;4996;4793;4100;4512 + true + true + true + false + + + true + + + + \ No newline at end of file diff --git a/debug.props b/debug.props new file mode 100644 index 0000000..dd5b97b --- /dev/null +++ b/debug.props @@ -0,0 +1,16 @@ + + + + + + + Disabled + MultiThreadedDebugDLL + _DEBUG;DEBUG;%(PreprocessorDefinitions) + + + true + + + + \ No newline at end of file diff --git a/release.props b/release.props new file mode 100644 index 0000000..44508ce --- /dev/null +++ b/release.props @@ -0,0 +1,26 @@ + + + + + + + MaxSpeed + AnySuitable + true + Speed + true + true + MultiThreadedDLL + _SECURE_SCL=0;NDEBUG;%(PreprocessorDefinitions) + false + + + true + true + + + true + + + + \ No newline at end of file diff --git a/zfs-win.sln b/zfs-win.sln new file mode 100644 index 0000000..92a79c8 --- /dev/null +++ b/zfs-win.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "zfs-win", "zfs-win\zfs-win.vcxproj", "{4A7767E8-F121-4CA9-9147-D3B29A0F831E}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Release|Win32 = Release|Win32 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {4A7767E8-F121-4CA9-9147-D3B29A0F831E}.Debug|Win32.ActiveCfg = Debug|Win32 + {4A7767E8-F121-4CA9-9147-D3B29A0F831E}.Debug|Win32.Build.0 = Debug|Win32 + 
{4A7767E8-F121-4CA9-9147-D3B29A0F831E}.Release|Win32.ActiveCfg = Release|Win32 + {4A7767E8-F121-4CA9-9147-D3B29A0F831E}.Release|Win32.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/zfs-win/BlockReader.cpp b/zfs-win/BlockReader.cpp new file mode 100644 index 0000000..80b59b5 --- /dev/null +++ b/zfs-win/BlockReader.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (C) 2010 Gabest + * http://code.google.com/p/zfs-win/ + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "BlockReader.h"
+#include "Compress.h"
+
+namespace ZFS
+{
+	// Queues up to `count` block pointers from `bp` (stopping at the first
+	// DMU_OT_NONE entry) to be read from `pool`.
+	BlockReader::BlockReader(Pool* pool, blkptr_t* bp, size_t count)
+		: m_pool(pool)
+	{
+		Insert(bp, count);
+	}
+
+	// Frees every block pointer copy that is still queued.
+	BlockReader::~BlockReader()
+	{
+		for(auto i = m_bpl.begin(); i != m_bpl.end(); i++)
+		{
+			delete *i;
+		}
+	}
+
+	// Reads the block referenced by the next queued block pointer into `buff`.
+	// While the pointer refers to an indirect block (lvl > 0), that block is read
+	// and the block pointers it contains are queued ahead of the remaining ones.
+	// Returns false when the queue is empty or a block cannot be read.
+	bool BlockReader::ReadNext(std::vector<uint8_t>& buff)
+	{
+		if(m_bpl.empty())
+		{
+			return false;
+		}
+
+		std::auto_ptr<blkptr_t> bp(m_bpl.front());
+
+		m_bpl.pop_front();
+
+		while(bp->lvl > 0)
+		{
+			if(!Read(bp.get(), buff))
+			{
+				return false;
+			}
+
+			Insert((blkptr_t*)buff.data(), buff.size() / sizeof(blkptr_t));
+
+			// Insert() may add nothing (first entry is DMU_OT_NONE); calling
+			// front() on an empty list would be undefined behaviour.
+			if(m_bpl.empty())
+			{
+				return false;
+			}
+
+			bp.reset(m_bpl.front());
+
+			m_bpl.pop_front();
+		}
+
+		return Read(bp.get(), buff);
+	}
+
+	// Replaces the contents of `dst` with the concatenation of every block
+	// returned by ReadNext(). Always returns true: ReadNext() reports "queue
+	// empty" and "read failed" the same way, so a failure just ends the data early.
+	bool BlockReader::ReadToEnd(std::vector<uint8_t>& dst)
+	{
+		dst.clear();
+
+		// TODO: resize/reserve (how much?)
+
+		std::vector<uint8_t> buff;
+
+		while(ReadNext(buff))
+		{
+			dst.insert(dst.end(), buff.begin(), buff.end());
+		}
+
+		return true;
+	}
+
+	// Copies block pointers from `bp` and places them, in order, at the front of
+	// the queue. Copying stops after `count` entries or at the first entry whose
+	// type is DMU_OT_NONE.
+	void BlockReader::Insert(blkptr_t* bp, size_t count)
+	{
+		std::list<blkptr_t*> l;
+
+		for(size_t i = 0; i < count && bp[i].type != DMU_OT_NONE; i++)
+		{
+			l.push_back(new blkptr_t(bp[i]));
+		}
+
+		// Equivalent to appending when the queue is empty.
+		m_bpl.splice(m_bpl.begin(), l);
+	}
+
+	// Reads the block described by `bp` into `dst`, decompressing it if needed.
+	// The three DVAs are tried in order and the first one whose vdev id matches
+	// a vdev of the pool is used. Returns false if no vdev matches, the device
+	// read fails, decompression fails or the compression type is unsupported.
+	bool BlockReader::Read(blkptr_t* bp, std::vector<uint8_t>& dst)
+	{
+		// Sizes are stored biased by one, in units of 512 bytes.
+		size_t psize = (size_t)(bp->psize + 1) << 9;
+		size_t lsize = (size_t)(bp->lsize + 1) << 9;
+
+		for(int i = 0; i < 3; i++)
+		{
+			dva_t* addr = &bp->blk_dva[i];
+
+			ASSERT(addr->gang == 0); // TODO: zio_gbh_phys_t (not used ??? never encountered one, yet)
+
+			for(auto v = m_pool->m_vdevs.begin(); v != m_pool->m_vdevs.end(); v++)
+			{
+				VirtualDevice* vdev = *v;
+
+				if(vdev->id != addr->vdev)
+				{
+					continue;
+				}
+
+				std::vector<uint8_t> src(psize);
+
+				if(!vdev->Read(src, psize, addr->offset << 9))
+				{
+					// TODO: try the remaining DVAs instead of giving up
+
+					return false;
+				}
+
+				// TODO: verify bp->chksum
+
+				switch(bp->comp)
+				{
+				case ZIO_COMPRESS_ON:
+				case ZIO_COMPRESS_LZJB:
+					dst.resize(lsize);
+					if(lzjb_decompress(src.data(), dst.data(), psize, lsize) != 0)
+					{
+						return false;
+					}
+					break;
+				case ZIO_COMPRESS_OFF:
+					dst.swap(src);
+					break;
+				default: // TODO: gzip, zle
+					ASSERT(0);
+					return false;
+				}
+
+				return true;
+			}
+		}
+
+		return false;
+	}
+}
diff --git a/zfs-win/BlockReader.h b/zfs-win/BlockReader.h
new file mode 100644
index 0000000..d79c2e3
--- /dev/null
+++ b/zfs-win/BlockReader.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2010 Gabest
+ * http://code.google.com/p/zfs-win/
+ *
+ * This Program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This Program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Make; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "zfs.h"
+#include "Pool.h"
+#include "Device.h"
+
+namespace ZFS
+{
+	// Reads the blocks referenced by a set of block pointers from a pool,
+	// one block per ReadNext() call.
+	class BlockReader
+	{
+		Pool* m_pool;
+		std::list<blkptr_t*> m_bpl; // queued block pointer copies, owned by this object
+
+		void Insert(blkptr_t* bp, size_t count);
+		bool Read(blkptr_t* bp, std::vector<uint8_t>& buff);
+
+	public:
+		BlockReader(Pool* pool, blkptr_t* bp, size_t count);
+		virtual ~BlockReader();
+
+		bool ReadNext(std::vector<uint8_t>& buff);
+		bool ReadToEnd(std::vector<uint8_t>& buff);
+
+		// TODO: function to read only part of the whole data (for large files)
+	};
+}
\ No newline at end of file
diff --git a/zfs-win/Compress.cpp b/zfs-win/Compress.cpp
new file mode 100644
index 0000000..e554056
--- /dev/null
+++ b/zfs-win/Compress.cpp
@@ -0,0 +1,155 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "stdafx.h"
+#include "Compress.h"
+
+#define NBBY 8
+#define MATCH_BITS 6
+#define MATCH_MIN 3
+#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1))
+#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
+#define LEMPEL_SIZE 1024
+
+/*
+ * We keep our own copy of this algorithm for 3 main reasons:
+ * 1. 
If we didn't, anyone modifying common/os/compress.c would
+ *	directly break our on disk format
+ * 2. Our version of lzjb does not have a number of checks that the
+ *	common/os version needs and uses
+ * 3. We initialize the lempel to ensure deterministic results,
+ *	so that identical blocks can always be deduplicated.
+ * In particular, we are adding the "feature" that compress() can
+ * take a destination buffer size and returns the compressed length, or the
+ * source length if compression would overflow the destination buffer.
+ */
+
+/*
+ * Compresses s_len bytes at s_start into d_start (capacity d_len).
+ * Returns the number of bytes written, or s_len when the output would not
+ * fit into the destination buffer.
+ */
+size_t lzjb_compress(void* s_start, void* d_start, size_t s_len, size_t d_len)
+{
+	uint8_t* src = (uint8_t*)s_start;
+	uint8_t* dst = (uint8_t*)d_start;
+	uint8_t* cpy;
+	uint8_t* copymap;
+	int copymask = 1 << (NBBY - 1);
+	int mlen, offset, hash;
+	uint16_t* hp;
+	uint16_t lempel[LEMPEL_SIZE] = { 0 };
+
+	while(src < (uint8_t*)s_start + s_len)
+	{
+		if((copymask <<= 1) == (1 << NBBY))
+		{
+			if(dst >= (uint8_t *)d_start + d_len - 1 - 2 * NBBY)
+			{
+				return s_len;
+			}
+
+			copymask = 1;
+			copymap = dst;
+			*dst++ = 0;
+		}
+
+		if(src > (uint8_t*)s_start + s_len - MATCH_MAX)
+		{
+			*dst++ = *src++;
+
+			continue;
+		}
+
+		hash = (src[0] << 16) + (src[1] << 8) + src[2];
+		hash += hash >> 9;
+		hash += hash >> 5;
+		hp = &lempel[hash & (LEMPEL_SIZE - 1)];
+		offset = (intptr_t)(src - *hp) & OFFSET_MASK;
+		*hp = (uint16_t)(uintptr_t)src;
+		cpy = src - offset;
+
+		if(cpy >= (uint8_t *)s_start && cpy != src && src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2])
+		{
+			*copymap |= copymask;
+
+			for(mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++)
+			{
+				if(src[mlen] != cpy[mlen])
+				{
+					break;
+				}
+			}
+
+			*dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) | (offset >> NBBY);
+			*dst++ = (uint8_t)offset;
+			src += mlen;
+		}
+		else
+		{
+			*dst++ = *src++;
+		}
+	}
+
+	return dst - (uint8_t*)d_start;
+}
+
+/*
+ * Decompresses the stream at s_start (s_len bytes) into d_start until exactly
+ * d_len bytes have been produced.
+ * Returns 0 on success and -1 when the stream is malformed: a match that
+ * refers to data before d_start, or input that ends before d_len bytes were
+ * produced (the source buffer is never read past s_start + s_len).
+ */
+int lzjb_decompress(void* s_start, void* d_start, size_t s_len, size_t d_len)
+{
+	uint8_t* src = (uint8_t*)s_start;
+	uint8_t* s_end = (uint8_t*)s_start + s_len;
+	uint8_t* dst = (uint8_t*)d_start;
+	uint8_t* d_end = (uint8_t*)d_start + d_len;
+	uint8_t* cpy;
+	uint8_t copymap = 0;
+	int copymask = 1 << (NBBY - 1);
+
+	while(dst < d_end)
+	{
+		/* every NBBY items are preceded by one byte of literal/match flags */
+		if((copymask <<= 1) == (1 << NBBY))
+		{
+			if(src >= s_end)
+			{
+				return -1;
+			}
+
+			copymask = 1;
+			copymap = *src++;
+		}
+
+		if(copymap & copymask)
+		{
+			/* a match is encoded in two bytes: length and offset */
+			if(s_end - src < 2)
+			{
+				return -1;
+			}
+
+			int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
+			int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK;
+
+			src += 2;
+
+			if((cpy = dst - offset) < (uint8_t*)d_start)
+			{
+				return -1;
+			}
+
+			while(--mlen >= 0 && dst < d_end)
+			{
+				*dst++ = *cpy++;
+			}
+		}
+		else
+		{
+			if(src >= s_end)
+			{
+				return -1;
+			}
+
+			*dst++ = *src++;
+		}
+	}
+
+	return 0;
+}
diff --git a/zfs-win/Compress.h b/zfs-win/Compress.h
new file mode 100644
index 0000000..b23d39c
--- /dev/null
+++ b/zfs-win/Compress.h
@@ -0,0 +1,31 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */ + +#pragma once + +extern size_t lzjb_compress(void* s_start, void* d_start, size_t s_len, size_t d_len); +extern int lzjb_decompress(void* s_start, void* d_start, size_t s_len, size_t d_len); + +// TODO: gzip, zle diff --git a/zfs-win/Device.cpp b/zfs-win/Device.cpp new file mode 100644 index 0000000..cffe888 --- /dev/null +++ b/zfs-win/Device.cpp @@ -0,0 +1,351 @@ +/* + * Copyright (C) 2010 Gabest + * http://code.google.com/p/zfs-win/ + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "Device.h" + +namespace ZFS +{ + // VirtualDevice + + void VirtualDevice::Init(NameValueList* nvl) + { + dev = NULL; + type = nvl->at("type")->str[0]; + id = nvl->at("id")->u64[0]; + guid = nvl->at("guid")->u64[0]; + metaslab_array = nvl->find("metaslab_array") != nvl->end() ? nvl->at("metaslab_array")->u64[0] : 0; + metaslab_shift = nvl->find("metaslab_shift") != nvl->end() ? nvl->at("metaslab_shift")->u64[0] : 0; + ashift = nvl->find("ashift") != nvl->end() ? nvl->at("ashift")->u64[0] : 0; + asize = nvl->find("asize") != nvl->end() ? 
nvl->at("asize")->u64[0] : 0;
+		if(nvl->find("path") != nvl->end()) path = nvl->at("path")->str[0];
+		if(nvl->find("devid") != nvl->end()) devid = nvl->at("devid")->str[0];
+		nparity = nvl->find("nparity") != nvl->end() ? nvl->at("nparity")->u64[0] : 0;
+		whole_disk = nvl->find("whole_disk") != nvl->end() ? nvl->at("whole_disk")->u64[0] : 0;
+		is_log = nvl->find("is_log") != nvl->end() ? nvl->at("is_log")->u64[0] : 0;
+
+		if(nvl->find("children") != nvl->end())
+		{
+			NameValuePair* nvp = nvl->at("children");
+
+			children.resize(nvp->count);
+
+			for(uint32_t i = 0; i < nvp->count; i++)
+			{
+				children[i].Init(&nvp->list[i]);
+			}
+		}
+	}
+
+	// Reads `size` bytes at `offset` into `buff` (resized to `size`).
+	// "disk"/"file" vdevs read from their own device, "mirror" uses the first
+	// child whose device is present and returns the full amount, "raidz" gathers
+	// the columns computed by raidz_map_t. Every device access adds 0x400000 to
+	// the offset. Returns false if the data could not be read.
+	bool VirtualDevice::Read(std::vector<uint8_t>& buff, uint64_t size, uint64_t offset)
+	{
+		// TODO: handle chksum errors
+
+		buff.resize((size_t)size);
+
+		if(type == "disk" || type == "file")
+		{
+			if(dev != NULL)
+			{
+				dev->Seek(offset + 0x400000);
+
+				if(dev->Read(buff.data(), size) == size)
+				{
+					return true;
+				}
+			}
+		}
+		else if(type == "mirror")
+		{
+			for(auto i = children.begin(); i != children.end(); i++)
+			{
+				VirtualDevice& vdev = *i;
+
+				if(vdev.dev != NULL)
+				{
+					// read through the child's device, not this vdev's own `dev`
+
+					vdev.dev->Seek(offset + 0x400000);
+
+					if(vdev.dev->Read(buff.data(), size) == size)
+					{
+						return true;
+					}
+				}
+			}
+		}
+		else if(type == "raidz")
+		{
+			raidz_map_t rm(offset, size, (uint32_t)ashift, children.size(), (uint32_t)nparity);
+
+			// NOTE(review): both loops start at column 1, i.e. they skip exactly
+			// one column; presumably that is the parity column of a
+			// single-parity layout - confirm against raidz_map_t.
+
+			uint64_t total = 0;
+
+			for(size_t i = 1; i < rm.m_col.size(); i++)
+			{
+				total += rm.m_col[i].size;
+			}
+
+			if(total > buff.size())
+			{
+				return false;
+			}
+
+			uint8_t* p = buff.data();
+
+			for(size_t i = 1; i < rm.m_col.size(); i++)
+			{
+				VirtualDevice& vdev = children[(size_t)rm.m_col[i].devidx];
+
+				// TODO: reconstruct data if vdev.dev is missing or Read fails
+
+				if(vdev.dev != NULL)
+				{
+					vdev.dev->Seek(rm.m_col[i].offset + 0x400000);
+
+					if(vdev.dev->Read(p, rm.m_col[i].size) != rm.m_col[i].size)
+					{
+						return false;
+					}
+				}
+
+				p += rm.m_col[i].size;
+			}
+
+			return true;
+		}
+		else
+		{
+			ASSERT(0);
+		}
+
+		return false;
+	}
+
+	// Returns this vdev or the first descendant whose guid equals
+	// `guid_to_find`, or NULL if there is none.
+	VirtualDevice* VirtualDevice::Find(uint64_t guid_to_find)
+	{
+		if(guid == guid_to_find)
+		{
+			return this;
+		}
+
+		for(auto i = children.begin(); i != children.end(); i++)
+		{
+			VirtualDevice* vdev = i->Find(guid_to_find);
+
+			if(vdev != NULL)
+			{
+				return vdev;
+			}
+		}
+
+		return NULL;
+	}
+
+	// Appends every vdev of this subtree that has no children to `leaves`.
+	void VirtualDevice::GetLeaves(std::list<VirtualDevice*>& leaves)
+	{
+		if(children.empty())
+		{
+			leaves.push_back(this);
+		}
+		else
+		{
+			for(auto i = children.begin(); i != children.end(); i++)
+			{
+				i->GetLeaves(leaves);
+			}
+		}
+	}
+
+	// Device
+
+	Device::Device()
+		: m_handle(NULL)
+		, m_start(0)
+		, m_size(0)
+		, m_label(NULL)
+		, m_active(NULL)
+	{
+	}
+
+	Device::~Device()
+	{
+		Close();
+	}
+
+	// Opens `path` read-only, optionally steps into an MBR partition
+	// (`partition` holds one zero-based index per byte, lowest byte first),
+	// then reads the vdev label and selects the uberblock with the highest txg.
+	// Returns false if the file cannot be opened, the label cannot be read
+	// completely or its name/value list cannot be parsed. Resources acquired
+	// before a failure are released by Close().
+	bool Device::Open(const wchar_t* path, uint32_t partition)
+	{
+		Close();
+
+		m_handle = CreateFile(path, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, OPEN_EXISTING, FILE_FLAG_SEQUENTIAL_SCAN, (HANDLE)NULL);
+
+		if(m_handle == INVALID_HANDLE_VALUE)
+		{
+			m_handle = NULL;
+
+			return false;
+		}
+
+		if(!GetFileSizeEx(m_handle, (LARGE_INTEGER*)&m_size))
+		{
+			DISK_GEOMETRY_EX dg;
+			DWORD sz;
+
+			if(DeviceIoControl(m_handle, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX, NULL, 0, &dg, sizeof(dg), &sz, NULL))
+			{
+				m_size = dg.DiskSize.QuadPart;
+			}
+		}
+
+		for(int level = 0; level < 2; level++, partition >>= 8)
+		{
+			uint8_t mbr[0x200];
+
+			// a boot sector is only valid if both signature bytes match
+
+			if(Read(mbr, sizeof(mbr)) == sizeof(mbr) && mbr[0x1fe] == 0x55 && mbr[0x1ff] == 0xaa)
+			{
+				uint8_t* ptr = &mbr[0x1be];
+
+				uint32_t index = partition & 0xff;
+
+				if(index < 4)
+				{
+					uint64_t start = *(uint32_t*)&ptr[index * 16 + 8];
+					uint64_t size = *(uint32_t*)&ptr[index * 16 + 12];
+
+					if(start != 0 && size != 0)
+					{
+						m_start += start << 9;
+						m_size = size << 9;
+					}
+				}
+			}
+
+			Seek(0);
+		}
+
+		m_label = new vdev_label_t();
+
+		if(Read(m_label, sizeof(vdev_label_t)) != sizeof(vdev_label_t))
+		{
+			return false;
+		}
+
+		// TODO: verify m_label->vdev_phys.zbt.chksum
+
+		if(!m_desc.Init(m_label->vdev_phys))
+		{
+			return false;
+		}
+
+		for(size_t i = 0; i < sizeof(m_label->uberblock); i += m_desc.ub_size)
+		{
+			uberblock_t* ub = (uberblock_t*)&m_label->uberblock[i];
+
+			if(ub->magic == BSWAP_64(UBERBLOCK_MAGIC))
+			{
+				// TODO: be <-> le
+			}
+
+			if(ub->magic != UBERBLOCK_MAGIC)
+			{
+				continue;
+			}
+
+			if(m_active == NULL || ub->txg > m_active->txg)
+			{
+				m_active = ub;
+			}
+		}
+
+		return true;
+	}
+
+	// Closes the handle, frees the label and resets the object to its initial state.
+	void Device::Close()
+	{
+		if(m_handle != NULL)
+		{
+			CloseHandle(m_handle);
+
+			m_handle = NULL;
+		}
+
+		if(m_label != NULL)
+		{
+			delete m_label;
+
+			m_label = NULL;
+		}
+
+		m_start = 0;
+		m_size = 0;
+
+		m_active = NULL;
+	}
+
+	// Moves the file pointer to `pos` relative to the start of the selected
+	// partition and returns the requested absolute position (m_start + pos).
+	// The result of SetFilePointerEx is not checked.
+	uint64_t Device::Seek(uint64_t pos)
+	{
+		LARGE_INTEGER li, li2;
+
+		li.QuadPart = m_start + pos;
+
+		SetFilePointerEx(m_handle, li, &li2, FILE_BEGIN);
+
+		return li.QuadPart;
+	}
+
+	// Reads up to `size` bytes at the current position into `buff` and returns
+	// the number of bytes actually read (0 on failure). `size` is truncated to
+	// a DWORD.
+	size_t Device::Read(void* buff, uint64_t size)
+	{
+		DWORD read = 0;
+
+		ReadFile(m_handle, buff, (DWORD)size, &read, NULL);
+
+		return (size_t)read;
+	}
+
+	// DeviceDesc
+
+	bool DeviceDesc::Init(vdev_phys_t& vd)
+	{
+		NameValueList nvl;
+
+		nvl.Read(&vd.nvlist[4], sizeof(vd.nvlist) - 4);
+
+		try
+		{
+			guid = nvl.at("guid")->u64[0];
+			top_guid = nvl.at("top_guid")->u64[0];
+			state = nvl.at("state")->u64[0];
+			host.id = nvl.at("hostid")->u64[0];
+			host.name = nvl.at("hostname")->str[0];
+			pool.guid = nvl.at("pool_guid")->u64[0];
+			pool.name = nvl.at("name")->str[0];
+			txg = nvl.at("txg")->u64[0];
+			version = nvl.at("version")->u64[0];
+			top.Init(nvl.at("vdev_tree")->list);
+			ub_size = 1 << std::max((int)top.ashift, UBERBLOCK_SHIFT);
+		}
+		catch(...)
+ { + return false; + } + + return true; + } +} diff --git a/zfs-win/Device.h b/zfs-win/Device.h new file mode 100644 index 0000000..ba4e0a5 --- /dev/null +++ b/zfs-win/Device.h @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2010 Gabest + * http://code.google.com/p/zfs-win/ + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#pragma once + +#include "zfs.h" +#include "NameValueList.h" + +namespace ZFS +{ + class Device; + + class VirtualDevice + { + public: + Device* dev; + std::string type; + uint64_t id; + uint64_t guid; + uint64_t metaslab_array; + uint64_t metaslab_shift; + uint64_t ashift; + uint64_t asize; + std::string path; + std::string devid; + uint64_t nparity; + uint64_t whole_disk; + uint64_t is_log; + std::vector children; + + void Init(NameValueList* nvl) throw(...); + bool Read(std::vector& buff, uint64_t size, uint64_t offset); + VirtualDevice* Find(uint64_t guid_to_find); + void GetLeaves(std::list& leaves); + }; + + class DeviceDesc + { + public: + uint64_t guid; + uint64_t top_guid; + uint64_t state; + struct {uint64_t id; std::string name;} host; + struct {uint64_t guid; std::string name;} pool; + uint64_t txg; + uint64_t version; + VirtualDevice top; + size_t ub_size; + + bool Init(vdev_phys_t& vd); + }; + + class Device + { + public: + DeviceDesc 
m_desc; + HANDLE m_handle; + uint64_t m_start; + uint64_t m_size; + vdev_label_t* m_label; + uberblock_t* m_active; + + public: + Device(); + virtual ~Device(); + + bool Open(const wchar_t* path, uint32_t partition = 0); // partition 0x0000EEPP (PP primary, EE extended, zero based index) + void Close(); + + uint64_t Seek(uint64_t pos); + size_t Read(void* buff, uint64_t size); + }; +} \ No newline at end of file diff --git a/zfs-win/Hash.cpp b/zfs-win/Hash.cpp new file mode 100644 index 0000000..77e54a5 --- /dev/null +++ b/zfs-win/Hash.cpp @@ -0,0 +1,247 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Fletcher Checksums + * ------------------ + * + * ZFS's 2nd and 4th order Fletcher checksums are defined by the following + * recurrence relations: + * + * a = a + f + * i i-1 i-1 + * + * b = b + a + * i i-1 i + * + * c = c + b (fletcher-4 only) + * i i-1 i + * + * d = d + c (fletcher-4 only) + * i i-1 i + * + * Where + * a_0 = b_0 = c_0 = d_0 = 0 + * and + * f_0 .. f_(n-1) are the input data. 
+ * + * Using standard techniques, these translate into the following series: + * + * __n_ __n_ + * \ | \ | + * a = > f b = > i * f + * n /___| n - i n /___| n - i + * i = 1 i = 1 + * + * + * __n_ __n_ + * \ | i*(i+1) \ | i*(i+1)*(i+2) + * c = > ------- f d = > ------------- f + * n /___| 2 n - i n /___| 6 n - i + * i = 1 i = 1 + * + * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators. + * Since the additions are done mod (2^64), errors in the high bits may not + * be noticed. For this reason, fletcher-2 is deprecated. + * + * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators. + * A conservative estimate of how big the buffer can get before we overflow + * can be estimated using f_i = 0xffffffff for all i: + * + * % bc + * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4 + * 2264 + * quit + * % + * + * So blocks of up to 2k will not overflow. Our largest block size is + * 128k, which has 32k 4-byte words, so we can compute the largest possible + * accumulators, then divide by 2^64 to figure the max amount of overflow: + * + * % bc + * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c } + * a/2^64;b/2^64;c/2^64;d/2^64 + * 0 + * 0 + * 1365 + * 11186858 + * quit + * % + * + * So a and b cannot overflow. To make sure each bit of input has some + * effect on the contents of c and d, we can look at what the factors of + * the coefficients in the equations for c_n and d_n are. The number of 2s + * in the factors determines the lowest set bit in the multiplier. Running + * through the cases for n*(n+1)/2 reveals that the highest power of 2 is + * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow + * the 64-bit accumulators, every bit of every f_i effects every accumulator, + * even for 128k blocks. 
+ * + * If we wanted to make a stronger version of fletcher4 (fletcher4c?), + * we could do our calculations mod (2^32 - 1) by adding in the carries + * periodically, and store the number of carries in the top 32-bits. + * + * -------------------- + * Checksum Performance + * -------------------- + * + * There are two interesting components to checksum performance: cached and + * uncached performance. With cached data, fletcher-2 is about four times + * faster than fletcher-4. With uncached data, the performance difference is + * negligible, since the cost of a cache fill dominates the processing time. + * Even though fletcher-4 is slower than fletcher-2, it is still a pretty + * efficient pass over the data. + * + * In normal operation, the data which is being checksummed is in a buffer + * which has been filled either by: + * + * 1. a compression step, which will be mostly cached, or + * 2. a bcopy() or copyin(), which will be uncached (because the + * copy is cache-bypassing). + * + * For both cached and uncached data, both fletcher checksums are much faster + * than sha-256, and slower than 'off', which doesn't touch the data at all. 
+ */ + +#include "stdafx.h" +#include "Hash.h" + +void fletcher_2_native(const void* buf, uint64_t size, cksum_t* zcp) +{ + const uint64_t* ip = (const uint64_t*)buf; + const uint64_t* ipend = ip + (size / sizeof(uint64_t)); + + uint64_t a0, b0, a1, b1; + + for(a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) + { + a0 += ip[0]; + a1 += ip[1]; + b0 += a0; + b1 += a1; + } + + zcp->set(a0, a1, b0, b1); +} + +void fletcher_2_byteswap(const void* buf, uint64_t size, cksum_t* zcp) +{ + const uint64_t* ip = (const uint64_t*)buf; + const uint64_t* ipend = ip + (size / sizeof(uint64_t)); + + uint64_t a0, b0, a1, b1; + + for(a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) + { + a0 += BSWAP_64(ip[0]); + a1 += BSWAP_64(ip[1]); + b0 += a0; + b1 += a1; + } + + zcp->set(a0, a1, b0, b1); +} + +void fletcher_4_native(const void* buf, uint64_t size, cksum_t* zcp) +{ + const uint32_t* ip = (const uint32_t*)buf; + const uint32_t* ipend = ip + (size / sizeof(uint32_t)); + + uint64_t a, b, c, d; + + for(a = b = c = d = 0; ip < ipend; ip++) + { + a += ip[0]; + b += a; + c += b; + d += c; + } + + zcp->set(a, b, c, d); +} + +void fletcher_4_byteswap(const void* buf, uint64_t size, cksum_t* zcp) +{ + const uint32_t* ip = (const uint32_t*)buf; + const uint32_t* ipend = ip + (size / sizeof(uint32_t)); + + uint64_t a, b, c, d; + + for(a = b = c = d = 0; ip < ipend; ip++) + { + a += BSWAP_32(ip[0]); + b += a; + c += b; + d += c; + } + + zcp->set(a, b, c, d); +} + +void fletcher_4_incremental_native(const void* buf, uint64_t size, cksum_t* zcp) +{ + const uint32_t* ip = (const uint32_t*)buf; + const uint32_t* ipend = ip + (size / sizeof(uint32_t)); + + uint64_t a, b, c, d; + + a = zcp->word[0]; + b = zcp->word[1]; + c = zcp->word[2]; + d = zcp->word[3]; + + for(; ip < ipend; ip++) + { + a += ip[0]; + b += a; + c += b; + d += c; + } + + zcp->set(a, b, c, d); +} + +void fletcher_4_incremental_byteswap(const void* buf, uint64_t size, cksum_t* zcp) +{ + const uint32_t* ip = (const uint32_t*)buf; + const 
uint32_t* ipend = ip + (size / sizeof(uint32_t)); + + uint64_t a, b, c, d; + + a = zcp->word[0]; + b = zcp->word[1]; + c = zcp->word[2]; + d = zcp->word[3]; + + for(; ip < ipend; ip++) + { + a += BSWAP_32(ip[0]); + b += a; + c += b; + d += c; + } + + zcp->set(a, b, c, d); +} diff --git a/zfs-win/Hash.h b/zfs-win/Hash.h new file mode 100644 index 0000000..7333cfe --- /dev/null +++ b/zfs-win/Hash.h @@ -0,0 +1,138 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Fletcher Checksums + * ------------------ + * + * ZFS's 2nd and 4th order Fletcher checksums are defined by the following + * recurrence relations: + * + * a = a + f + * i i-1 i-1 + * + * b = b + a + * i i-1 i + * + * c = c + b (fletcher-4 only) + * i i-1 i + * + * d = d + c (fletcher-4 only) + * i i-1 i + * + * Where + * a_0 = b_0 = c_0 = d_0 = 0 + * and + * f_0 .. f_(n-1) are the input data. 
+ * + * Using standard techniques, these translate into the following series: + * + * __n_ __n_ + * \ | \ | + * a = > f b = > i * f + * n /___| n - i n /___| n - i + * i = 1 i = 1 + * + * + * __n_ __n_ + * \ | i*(i+1) \ | i*(i+1)*(i+2) + * c = > ------- f d = > ------------- f + * n /___| 2 n - i n /___| 6 n - i + * i = 1 i = 1 + * + * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators. + * Since the additions are done mod (2^64), errors in the high bits may not + * be noticed. For this reason, fletcher-2 is deprecated. + * + * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators. + * A conservative estimate of how big the buffer can get before we overflow + * can be estimated using f_i = 0xffffffff for all i: + * + * % bc + * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4 + * 2264 + * quit + * % + * + * So blocks of up to 2k will not overflow. Our largest block size is + * 128k, which has 32k 4-byte words, so we can compute the largest possible + * accumulators, then divide by 2^64 to figure the max amount of overflow: + * + * % bc + * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c } + * a/2^64;b/2^64;c/2^64;d/2^64 + * 0 + * 0 + * 1365 + * 11186858 + * quit + * % + * + * So a and b cannot overflow. To make sure each bit of input has some + * effect on the contents of c and d, we can look at what the factors of + * the coefficients in the equations for c_n and d_n are. The number of 2s + * in the factors determines the lowest set bit in the multiplier. Running + * through the cases for n*(n+1)/2 reveals that the highest power of 2 is + * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow + * the 64-bit accumulators, every bit of every f_i effects every accumulator, + * even for 128k blocks. 
+ * + * If we wanted to make a stronger version of fletcher4 (fletcher4c?), + * we could do our calculations mod (2^32 - 1) by adding in the carries + * periodically, and store the number of carries in the top 32-bits. + * + * -------------------- + * Checksum Performance + * -------------------- + * + * There are two interesting components to checksum performance: cached and + * uncached performance. With cached data, fletcher-2 is about four times + * faster than fletcher-4. With uncached data, the performance difference is + * negligible, since the cost of a cache fill dominates the processing time. + * Even though fletcher-4 is slower than fletcher-2, it is still a pretty + * efficient pass over the data. + * + * In normal operation, the data which is being checksummed is in a buffer + * which has been filled either by: + * + * 1. a compression step, which will be mostly cached, or + * 2. a bcopy() or copyin(), which will be uncached (because the + * copy is cache-bypassing). + * + * For both cached and uncached data, both fletcher checksums are much faster + * than sha-256, and slower than 'off', which doesn't touch the data at all. 
+ */ + +#include "stdafx.h" +#include "zfs.h" + +extern void fletcher_2_native(const void* buf, uint64_t size, cksum_t* zcp); +extern void fletcher_2_byteswap(const void* buf, uint64_t size, cksum_t* zcp); +extern void fletcher_4_native(const void* buf, uint64_t size, cksum_t* zcp); +extern void fletcher_4_byteswap(const void* buf, uint64_t size, cksum_t* zcp); +extern void fletcher_4_incremental_native(const void* buf, uint64_t size, cksum_t* zcp); +extern void fletcher_4_incremental_byteswap(const void* buf, uint64_t size, cksum_t* zcp); + +// TODO: sha256 \ No newline at end of file diff --git a/zfs-win/NameValueList.cpp b/zfs-win/NameValueList.cpp new file mode 100644 index 0000000..c0c1662 --- /dev/null +++ b/zfs-win/NameValueList.cpp @@ -0,0 +1,249 @@ +/* + * Copyright (C) 2010 Gabest + * http://code.google.com/p/zfs-win/ + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "NameValueList.h" + +namespace ZFS +{ + NameValuePair::NameValuePair() + { + u8 = NULL; + } + + NameValuePair::~NameValuePair() + { + if(u8 != NULL) + { + switch(type) + { + case TYPE_INT8: + case TYPE_INT8_ARRAY: + case TYPE_UINT8: + case TYPE_UINT8_ARRAY: + delete [] u8; + break; + case TYPE_INT16: + case TYPE_INT16_ARRAY: + case TYPE_UINT16: + case TYPE_UINT16_ARRAY: + delete [] u16; + break; + case TYPE_INT32: + case TYPE_INT32_ARRAY: + case TYPE_UINT32: + case TYPE_UINT32_ARRAY: + delete [] u32; + break; + case TYPE_INT64: + case TYPE_INT64_ARRAY: + case TYPE_UINT64: + case TYPE_UINT64_ARRAY: + delete [] u64; + break; + case TYPE_STRING: + case TYPE_STRING_ARRAY: + delete [] str; + break; + case TYPE_NVLIST: + case TYPE_NVLIST_ARRAY: + delete [] list; + break; + default: + break; + } + + u8 = NULL; + } + } + + NameValueList::NameValueList() + { + } + + NameValueList::~NameValueList() + { + for(auto i = begin(); i != end(); i++) + { + delete i->second; + } + } + + const uint8_t* NameValueList::Read(const uint8_t* ptr, size_t size) + { + const uint8_t* ptr_end = ptr + size; + + uint32_t version = ReadU32(ptr); + uint32_t flags = ReadU32(ptr); + + while(ptr < ptr_end) + { + const uint8_t* ptr_start = ptr; + + uint32_t esize = ReadU32(ptr); + uint32_t dsize = ReadU32(ptr); + + if(esize == 0 && dsize == 0) break; + + NameValuePair* pair = new NameValuePair(); + + std::string name = ReadString(ptr); + + pair->type = (NameValueType)ReadU32(ptr); + pair->count = ReadU32(ptr); + + if(pair->count > 0) + { + switch(pair->type) + { + case TYPE_BOOLEAN: + case TYPE_BOOLEAN_ARRAY: // ??? + break; + case TYPE_BYTE: + case TYPE_BYTE_ARRAY: // ??? 
+ break; + case TYPE_INT8: + case TYPE_INT8_ARRAY: + case TYPE_UINT8: + case TYPE_UINT8_ARRAY: + pair->u8 = new uint8_t[pair->count]; + for(uint32_t i = 0; i < pair->count; i++) + pair->u8[i] = ReadU8(ptr); + break; + case TYPE_INT16: + case TYPE_INT16_ARRAY: + case TYPE_UINT16: + case TYPE_UINT16_ARRAY: + pair->u16 = new uint16_t[pair->count]; + for(uint32_t i = 0; i < pair->count; i++) + pair->u16[i] = ReadU16(ptr); + break; + case TYPE_INT32: + case TYPE_INT32_ARRAY: + case TYPE_UINT32: + case TYPE_UINT32_ARRAY: + pair->u32 = new uint32_t[pair->count]; + for(uint32_t i = 0; i < pair->count; i++) + pair->u32[i] = ReadU32(ptr); + break; + case TYPE_INT64: + case TYPE_INT64_ARRAY: + case TYPE_UINT64: + case TYPE_UINT64_ARRAY: + pair->u64 = new uint64_t[pair->count]; + for(uint32_t i = 0; i < pair->count; i++) + pair->u64[i] = ReadU64(ptr); + break; + case TYPE_STRING: + case TYPE_STRING_ARRAY: + pair->str = new std::string[pair->count]; + for(uint32_t i = 0; i < pair->count; i++) + pair->str[i] = ReadString(ptr); + break; + case TYPE_NVLIST: + case TYPE_NVLIST_ARRAY: + pair->list = new NameValueList[pair->count]; + for(uint32_t i = 0; i < pair->count; i++) + ptr = pair->list[i].Read(ptr, ptr_start + esize - ptr); + break; + case TYPE_BOOLEAN_VALUE: // ??? + ASSERT(0); + break; + case TYPE_HRTIME: // ??? + ASSERT(0); + break; + case TYPE_DOUBLE: // ??? 
+ ASSERT(0); + break; + case TYPE_UNKNOWN: + default: + ASSERT(0); + break; + } + + auto i = find(name); + + if(i != end()) + { + delete i->second; + + erase(i); + } + + (*this)[name] = pair; + } + + ptr = ptr_start + esize; + } + + return ptr; + } + + uint8_t NameValueList::ReadU8(const uint8_t*& ptr) + { + uint8_t v = ptr[0]; + + ptr += 1; + + return v; + } + + uint16_t NameValueList::ReadU16(const uint8_t*& ptr) + { + uint16_t v = (ptr[0] << 8) | ptr[1]; + + ptr += 2; + + return v; + } + + uint32_t NameValueList::ReadU32(const uint8_t*& ptr) + { + uint32_t v = (ptr[0] << 24) | (ptr[1] << 16) | (ptr[2] << 8) | (ptr[3] << 0); + + ptr += 4; + + return v; + } + + uint64_t NameValueList::ReadU64(const uint8_t*& ptr) + { + uint64_t v; + + ((uint32_t*)&v)[1] = (ptr[0] << 24) | (ptr[1] << 16) | (ptr[2] << 8) | ptr[3]; + ((uint32_t*)&v)[0] = (ptr[4] << 24) | (ptr[5] << 16) | (ptr[6] << 8) | ptr[7]; + + ptr += 8; + + return v; + } + + std::string NameValueList::ReadString(const uint8_t*& ptr) + { + uint32_t size = ReadU32(ptr); + + std::string s((const char*)ptr, size); + + ptr += (size + 3) & ~3; + + return s; + } +} \ No newline at end of file diff --git a/zfs-win/NameValueList.h b/zfs-win/NameValueList.h new file mode 100644 index 0000000..f5baf7f --- /dev/null +++ b/zfs-win/NameValueList.h @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2010 Gabest + * http://code.google.com/p/zfs-win/ + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#pragma once + +namespace ZFS +{ + enum NameValueType + { + TYPE_UNKNOWN = 0, + TYPE_BOOLEAN, + TYPE_BYTE, + TYPE_INT16, + TYPE_UINT16, + TYPE_INT32, + TYPE_UINT32, + TYPE_INT64, + TYPE_UINT64, + TYPE_STRING, + TYPE_BYTE_ARRAY, + TYPE_INT16_ARRAY, + TYPE_UINT16_ARRAY, + TYPE_INT32_ARRAY, + TYPE_UINT32_ARRAY, + TYPE_INT64_ARRAY, + TYPE_UINT64_ARRAY, + TYPE_STRING_ARRAY, + TYPE_HRTIME, + TYPE_NVLIST, + TYPE_NVLIST_ARRAY, + TYPE_BOOLEAN_VALUE, + TYPE_INT8, + TYPE_UINT8, + TYPE_BOOLEAN_ARRAY, + TYPE_INT8_ARRAY, + TYPE_UINT8_ARRAY, + TYPE_DOUBLE + }; + + class NameValueList; + + class NameValuePair + { + public: + NameValueType type; + uint32_t count; + + union + { + int8_t* i8; + uint8_t* u8; + int16_t* i16; + uint16_t* u16; + int32_t* i32; + uint32_t* u32; + int64_t* i64; + uint64_t* u64; + std::string* str; + NameValueList* list; + }; + + NameValuePair(); + virtual ~NameValuePair(); + }; + + class NameValueList : public std::map + { + uint8_t ReadU8(const uint8_t*& ptr); + uint16_t ReadU16(const uint8_t*& ptr); + uint32_t ReadU32(const uint8_t*& ptr); + uint64_t ReadU64(const uint8_t*& ptr); + std::string ReadString(const uint8_t*& ptr); + + public: + NameValueList(); + virtual ~NameValueList(); + + const uint8_t* Read(const uint8_t* ptr, size_t size); + }; +} \ No newline at end of file diff --git a/zfs-win/Pool.cpp b/zfs-win/Pool.cpp new file mode 100644 index 0000000..e6ef115 --- /dev/null +++ b/zfs-win/Pool.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2010 Gabest + * http://code.google.com/p/zfs-win/ + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or 
(at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "Pool.h" +#include "BlockReader.h" + +namespace ZFS +{ + Pool::Pool() + : m_guid(0) + { + } + + Pool::~Pool() + { + Close(); + } + + bool Pool::Open(const char* name, const std::list& paths) + { + Close(); + + m_name = name; + + for(auto i = paths.begin(); i != paths.end(); i++) + { + Device* dev = new Device(); + + if(!dev->Open(i->c_str())) + { + return false; + } + + if(m_name == dev->m_desc.pool.name && (m_guid == 0 || m_guid == dev->m_desc.pool.guid)) + { + m_guid = dev->m_desc.pool.guid; + + m_devs.push_back(dev); + + auto cmp_tree = [&] (const VirtualDevice* vdev) -> bool {return vdev->guid == dev->m_desc.top.guid;}; + + if(std::find_if(m_vdevs.begin(), m_vdevs.end(), cmp_tree) == m_vdevs.end()) + { + m_vdevs.push_back(&dev->m_desc.top); + } + } + else + { + delete dev; + } + } + + if(m_devs.empty()) + { + return false; + } + + for(auto i = m_vdevs.begin(); i != m_vdevs.end(); i++) + { + std::list leaves; + + VirtualDevice* vdev = *i; + + vdev->GetLeaves(leaves); + + for(auto j = leaves.begin(); j != leaves.end(); j++) + { + VirtualDevice* leaf = *j; + + for(auto k = m_devs.begin(); k != m_devs.end(); k++) + { + Device* dev = *k; + + if(leaf->guid == dev->m_desc.guid) + { + ASSERT(vdev->guid == dev->m_desc.top_guid); + + leaf->dev = dev; + + break; + } + } + + if(leaf->dev == NULL) + { + return false; + } + } + } + + return true; + } + + void Pool::Close() + { + for(auto i = 
m_devs.begin(); i != m_devs.end(); i++) + { + delete *i; + } + + m_guid = 0; + m_name.clear(); + m_devs.clear(); + m_vdevs.clear(); + } + + bool Pool::Read(std::vector& buff, blkptr_t* bp, size_t count) + { + BlockReader r(this, bp, count); + + return r.ReadToEnd(buff); + } +} diff --git a/zfs-win/Pool.h b/zfs-win/Pool.h new file mode 100644 index 0000000..76f57ec --- /dev/null +++ b/zfs-win/Pool.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2010 Gabest + * http://code.google.com/p/zfs-win/ + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + * + */ + +#pragma once + +#include "zfs.h" +#include "Device.h" + +namespace ZFS +{ + class Pool + { + public: + uint64_t m_guid; + std::string m_name; + std::vector m_devs; + std::vector m_vdevs; + + public: + Pool(); + virtual ~Pool(); + + bool Open(const char* name, const std::list& paths); + void Close(); + + bool Read(std::vector& buff, blkptr_t* bp, size_t count); + }; +} \ No newline at end of file diff --git a/zfs-win/ZapObject.cpp b/zfs-win/ZapObject.cpp new file mode 100644 index 0000000..3dc0bc9 --- /dev/null +++ b/zfs-win/ZapObject.cpp @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2010 Gabest + * http://code.google.com/p/zfs-win/ + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "StdAfx.h" +#include "ZapObject.h" + +namespace ZFS +{ + ZapObject::ZapObject(std::vector& buff) + { + if(buff.size() >= sizeof(uint64_t)) + { + uint64_t* ptr = (uint64_t*)buff.data(); + + if(*ptr == ZBT_MICRO) ParseMicro(buff); + else if(*ptr == ZBT_HEADER) ParseFat(buff); + } + } + + ZapObject::~ZapObject() + { + for(auto i = begin(); i != end(); i++) + { + delete i->second; + } + } + + bool ZapObject::Lookup(const char* name, uint64_t& value) + { + auto i = find(name); + + if(i != end()) + { + if(i->second->size() == 8) + { + value = BSWAP_64(*(uint64_t*)i->second->data()); // fat zap big endian??? + + return true; + } + } + + return false; + } + + bool ZapObject::Lookup(const char* name, std::string& value) + { + auto i = find(name); + + if(i != end()) + { + value = std::string((char*)i->second->data(), i->second->size()); + + return true; + } + + return false; + } + + void ZapObject::ParseMicro(std::vector& buff) + { + mzap_phys_t* mzap = (mzap_phys_t*)buff.data(); + + for(size_t i = 0, n = buff.size() / MZAP_ENT_LEN - 1; i < n; i++) + { + std::string name = mzap->chunk[i].name; + + if(name.empty()) continue; + + auto j = find(name); + + if(j != end()) + { + delete j->second; + + erase(j); + } + + std::vector* value = new std::vector(sizeof(uint64_t)); + + uint64_t tmp = BSWAP_64(mzap->chunk[i].value); // make the same as fat zap + + memcpy(value->data(), &tmp, sizeof(uint64_t)); + + (*this)[name] = value; + } + } + + void ZapObject::ParseFat(std::vector& buff) + { + size_t half_size = buff.size() / 2; // first half wasted ??? 
+ + zap_phys_t* zap = (zap_phys_t*)buff.data(); + zap_leaf_phys_t* leaf = (zap_leaf_phys_t*)(buff.data() + half_size); + + zap_leaf_entry_t* e = (zap_leaf_entry_t*)(uint8_t*)&leaf->hash[half_size / 32]; + zap_leaf_entry_t* e_end = (zap_leaf_entry_t*)(buff.data() + buff.size()); + + for(size_t i = 0, n = e_end - e; i < n; i++) + { + if(e[i].type != ZAP_CHUNK_ENTRY) + { + continue; + } + + std::vector name(e[i].name_numints); + + if(!ParseArray(name, e, e[i].name_chunk) || name.empty()) + { + continue; + } + + std::vector* value = new std::vector(e[i].value_numints * e[i].value_intlen); + + if(!ParseArray(*value, e, e[i].value_chunk)) + { + delete value; + + continue; + } + + std::string s((char*)name.data(), name.size() - 1); + + auto j = find(s); + + if(j != end()) + { + delete j->second; + + erase(j); + } + + (*this)[s] = value; + } + } + + bool ZapObject::ParseArray(std::vector& buff, zap_leaf_entry_t* e, uint16_t index) + { + uint8_t* ptr = buff.data(); + size_t size = buff.size(); + + while(index != 0xffff) + { + zap_leaf_array_t* l = (zap_leaf_array_t*)&e[index]; + + if(l->type != ZAP_CHUNK_ARRAY) + { + ASSERT(0); + + break; + } + + size_t n = std::min(size, ZAP_LEAF_ARRAY_BYTES); + + memcpy(ptr, l->buff, n); + + ptr += n; + size -= n; + + index = l->next; + } + + return size == 0; + } +} \ No newline at end of file diff --git a/zfs-win/ZapObject.h b/zfs-win/ZapObject.h new file mode 100644 index 0000000..82c14b4 --- /dev/null +++ b/zfs-win/ZapObject.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2010 Gabest + * http://code.google.com/p/zfs-win/ + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. 
+ * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#pragma once + +#include "zfs.h" + +namespace ZFS +{ + class ZapObject : public std::map*> + { + void ParseMicro(std::vector& buff); + void ParseFat(std::vector& buff); + bool ParseArray(std::vector& buff, zap_leaf_entry_t* e, uint16_t index); + + public: + ZapObject(std::vector& buff); + virtual ~ZapObject(); + + bool Lookup(const char* name, uint64_t& value); + bool Lookup(const char* name, std::string& value); + }; +} \ No newline at end of file diff --git a/zfs-win/main.cpp b/zfs-win/main.cpp new file mode 100644 index 0000000..fe8dbd6 --- /dev/null +++ b/zfs-win/main.cpp @@ -0,0 +1,210 @@ +/* + * Copyright (C) 2010 Gabest + * http://code.google.com/p/zfs-win/ + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "Pool.h" +#include "Device.h" +#include "ZapObject.h" + +int _tmain(int argc, _TCHAR* argv[]) +{ + // this is just a test, recreating the steps of "ZFS On-Disk Data Walk (Or: Where's My Data)" (google for it) + + std::list paths; + /* + for(int i = 2; i < argc; i++) + { + paths.push_back(argv[i]); + } + */ + + const char* name = "mpool"; + + paths.push_back(L"D:\\Virtual Machines\\ZFSVM\\ZFSVM1-flat.vmdk"); + /* + paths.push_back(L"D:\\Virtual Machines\\ZFSVM\\ZFSVM2-flat.vmdk"); + paths.push_back(L"D:\\Virtual Machines\\ZFSVM\\ZFSVM3-flat.vmdk"); + paths.push_back(L"D:\\Virtual Machines\\ZFSVM\\ZFSVM4-flat.vmdk"); + paths.push_back(L"D:\\Virtual Machines\\ZFSVM\\ZFSVM5-flat.vmdk"); + paths.push_back(L"D:\\Virtual Machines\\ZFSVM\\ZFSVM6-flat.vmdk"); + paths.push_back(L"D:\\Virtual Machines\\ZFSVM\\ZFSVM7-flat.vmdk"); + paths.push_back(L"D:\\Virtual Machines\\ZFSVM\\ZFSVM8-flat.vmdk"); + */ + /* + const char* name = "share"; + + paths.push_back(L"\\\\.\\PhysicalDrive1"); + paths.push_back(L"\\\\.\\PhysicalDrive2"); + paths.push_back(L"\\\\.\\PhysicalDrive3"); + paths.push_back(L"\\\\.\\PhysicalDrive4"); + */ + /* + const char* name = "rpool"; + + paths.push_back(L"D:\\Virtual Machines\\OpenSolaris\\OpenSolaris-flat.vmdk"); + */ + ZFS::Pool p; + + if(!p.Open(name, paths)) + { + return -1; + } + + ZFS::Device* dev = p.m_devs.front(); + + if(dev->m_active->rootbp.type == DMU_OT_OBJSET) + { + std::vector buff; + + if(p.Read(buff, &dev->m_active->rootbp, 1)) + { + objset_phys_t* os = (objset_phys_t*)buff.data(); + + if(os->type == DMU_OST_META && os->meta_dnode.type == DMU_OT_DNODE) + { + std::vector buff; + + if(p.Read(buff, os->meta_dnode.blkptr, os->meta_dnode.nblkptr)) + { + dnode_phys_t* dn = (dnode_phys_t*)buff.data(); + + size_t count = buff.size() / sizeof(dnode_phys_t); + + ASSERT(count > 2); + + dnode_phys_t* root_dataset = NULL; + + ASSERT(dn[1].type == 
DMU_OT_OBJECT_DIRECTORY); + + if(dn[1].type == DMU_OT_OBJECT_DIRECTORY) + { + std::vector buff; + + if(p.Read(buff, dn[1].blkptr, dn[1].nblkptr)) + { + ZFS::ZapObject zap(buff); + + uint64_t index; + + if(zap.Lookup("root_dataset", index)) + { + if(index < count && dn[index].type == DMU_OT_DSL_DIR) + { + root_dataset = &dn[index]; + } + } + } + } + + dnode_phys_t* head_dataset = NULL; + + if(root_dataset != NULL) + { + dsl_dir_phys_t* dir = (dsl_dir_phys_t*)root_dataset->bonus; + + size_t index = (size_t)dir->head_dataset_obj; + + if(index < count && dn[index].type == DMU_OT_DSL_DATASET) + { + head_dataset = &dn[index]; + } + } + + if(head_dataset != NULL) + { + dsl_dataset_phys_t* ds = (dsl_dataset_phys_t*)head_dataset->bonus; + + if(ds->bp.type == DMU_OT_OBJSET) + { + std::vector buff; + + if(p.Read(buff, &ds->bp, 1)) + { + objset_phys_t* os = (objset_phys_t*)buff.data(); + + if(os->type == DMU_OST_ZFS && os->meta_dnode.type == DMU_OT_DNODE) + { + std::vector buff; + + if(p.Read(buff, os->meta_dnode.blkptr, os->meta_dnode.nblkptr)) + { + dnode_phys_t* dn = (dnode_phys_t*)buff.data(); + + size_t count = buff.size() / sizeof(dnode_phys_t); + + ASSERT(count > 2); + + dnode_phys_t* root = NULL; + + ASSERT(dn[1].type == DMU_OT_MASTER_NODE); + + if(dn[1].type == DMU_OT_MASTER_NODE) + { + std::vector buff; + + if(p.Read(buff, dn[1].blkptr, dn[1].nblkptr)) + { + ZFS::ZapObject zap(buff); + + uint64_t index; + + if(zap.Lookup("ROOT", index)) // NOTE: the ROOT dataset may not contain too many files, don't be surprised + { + if(index < count && dn[index].type == DMU_OT_DIRECTORY_CONTENTS) + { + root = &dn[index]; + } + else + { + ASSERT(0); + } + } + } + } + + if(root != NULL) + { + znode_phys_t* node = (znode_phys_t*)root->bonus; + + std::vector buff; + + if(p.Read(buff, root->blkptr, root->nblkptr)) + { + mzap_phys_t* mzap = (mzap_phys_t*)buff.data(); + + // finally, arrived at the root directory + + int i = 0; + } + } + } + } + } + } + } + } + } + } + } + + return 0; +} 
+ diff --git a/zfs-win/stdafx.cpp b/zfs-win/stdafx.cpp new file mode 100644 index 0000000..5bec69f --- /dev/null +++ b/zfs-win/stdafx.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2010 Gabest + * http://code.google.com/p/zfs-win/ + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +// stdafx.cpp : source file that includes just the standard includes +// zfs-win.pch will be the pre-compiled header +// stdafx.obj will contain the pre-compiled type information + +#include "stdafx.h" + +// TODO: reference any additional headers you need in STDAFX.H +// and not in this file diff --git a/zfs-win/stdafx.h b/zfs-win/stdafx.h new file mode 100644 index 0000000..b622737 --- /dev/null +++ b/zfs-win/stdafx.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2010 Gabest + * http://code.google.com/p/zfs-win/ + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +// stdafx.h : include file for standard system include files, +// or project specific include files that are used frequently, but +// are changed infrequently +// + +#pragma once + +#include "targetver.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef ASSERT + #if defined(_DEBUG) && defined(_MSC_VER) + #include + #define ASSERT assert + #else + #define ASSERT(exp) ((void)0) + #endif +#endif + +// TODO: reference additional headers your program requires here diff --git a/zfs-win/targetver.h b/zfs-win/targetver.h new file mode 100644 index 0000000..a8c66e2 --- /dev/null +++ b/zfs-win/targetver.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2010 Gabest + * http://code.google.com/p/zfs-win/ + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#pragma once + +// Including SDKDDKVer.h defines the highest available Windows platform. 
+ +// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and +// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h. + +#include diff --git a/zfs-win/zfs-win.vcxproj b/zfs-win/zfs-win.vcxproj new file mode 100644 index 0000000..56480ca --- /dev/null +++ b/zfs-win/zfs-win.vcxproj @@ -0,0 +1,93 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + + {4A7767E8-F121-4CA9-9147-D3B29A0F831E} + Win32Proj + zfs-win + + + + Application + true + Unicode + + + Application + false + Unicode + true + + + + + + + + + + + + + + + + + + + + Use + _CONSOLE;%(PreprocessorDefinitions) + + + Console + + + + + Use + _CONSOLE;%(PreprocessorDefinitions) + + + Console + + + + + + + + + + + + + + + + + + + + + Create + Create + + + + + + + + + + \ No newline at end of file diff --git a/zfs-win/zfs-win.vcxproj.filters b/zfs-win/zfs-win.vcxproj.filters new file mode 100644 index 0000000..d424819 --- /dev/null +++ b/zfs-win/zfs-win.vcxproj.filters @@ -0,0 +1,78 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + \ No newline at end of file diff --git a/zfs-win/zfs.h b/zfs-win/zfs.h new file mode 100644 index 0000000..5cce57f --- /dev/null +++ b/zfs-win/zfs.h @@ -0,0 +1,1112 @@ +/* + * This file contains the *_phys_t structs and misc macros gathered from the Solaris ZFS driver + */ + +#pragma once + 
+#pragma pack(push, 1) + +#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) + +#define BSWAP_8(x) ((x) & 0xff) +#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) +#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) +#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) + +#define BMASK_8(x) ((x) & 0xff) +#define BMASK_16(x) ((x) & 0xffff) +#define BMASK_32(x) ((x) & 0xffffffff) +#define BMASK_64(x) (x) + +#define SPA_MINBLOCKSHIFT 9 +#define SPA_MAXBLOCKSHIFT 17 +#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) +#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) +#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1) +#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE +#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - sizeof (zio_eck_t)) / sizeof (blkptr_t)) +#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - sizeof (zio_eck_t) - (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) / sizeof (uint64_t)) +#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ +#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ + +union dva_t +{ + struct + { + struct {uint32_t asize:24; uint32_t grid:8;}; + uint32_t vdev; + struct {uint64_t offset:63; uint64_t gang:1;}; + }; + + uint64_t word[2]; +}; + +struct cksum_t +{ + uint64_t word[4]; + + void set(uint64_t a, uint64_t b, uint64_t c, uint64_t d) + { + word[0] = a; + word[1] = b; + word[2] = c; + word[3] = d; + } +}; + +/* + * vdev virtual device ID + * offset offset into virtual device + * LSIZE logical size + * PSIZE physical size (after compression) + * ASIZE allocated size (including RAID-Z parity and gang block headers) + * GRID RAID-Z layout information (reserved for future use) + * cksum checksum function + * comp compression function + * G gang block indicator + * B byteorder (endianness) + * D dedup + * X unused + * lvl level of indirection + * type DMU object type + * phys birth txg of block allocation; zero if same as logical birth txg + * log. 
birth transaction group in which the block was logically born + * fill count number of non-zero blocks under this bp + * checksum[4] 256-bit checksum of the data this bp describes + */ + +struct blkptr_t +{ + dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ + + union /* size, compression, type, etc */ + { + struct + { + uint16_t lsize; + uint16_t psize; + uint8_t comp; + uint8_t chksum; + uint8_t type; + struct {uint8_t lvl:5; uint8_t x:1; uint8_t d:1; uint8_t b:1;}; + }; + + uint64_t prop; + }; + + uint64_t pad[2]; /* Extra space for the future */ + uint64_t phys_birth; /* txg when block was allocated */ + uint64_t birth; /* transaction group at birth */ + uint64_t fill; /* fill count */ + cksum_t cksum; /* 256-bit checksum */ +}; + +#define ZEC_MAGIC 0x210da7ab10c7a11ULL + +struct zio_eck_t +{ + uint64_t magic; /* for validation, endianness (ZEC_MAGIC) */ + cksum_t cksum; /* 256-bit checksum */ +}; + +struct zio_gbh_phys_t +{ + blkptr_t blkptr[SPA_GBH_NBLKPTRS]; + uint64_t filler[SPA_GBH_FILLER]; + zio_eck_t tail; +}; + +enum zio_checksum +{ + ZIO_CHECKSUM_INHERIT = 0, + ZIO_CHECKSUM_ON, + ZIO_CHECKSUM_OFF, + ZIO_CHECKSUM_LABEL, + ZIO_CHECKSUM_GANG_HEADER, + ZIO_CHECKSUM_ZILOG, + ZIO_CHECKSUM_FLETCHER_2, + ZIO_CHECKSUM_FLETCHER_4, + ZIO_CHECKSUM_SHA256, + ZIO_CHECKSUM_ZILOG2, + ZIO_CHECKSUM_FUNCTIONS +}; + +#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 +#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON + +#define ZIO_CHECKSUM_MASK 0xffULL +#define ZIO_CHECKSUM_VERIFY (1 << 8) + +#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 +#define ZIO_DEDUPDITTO_MIN 100 + +enum zio_compress +{ + ZIO_COMPRESS_INHERIT = 0, + ZIO_COMPRESS_ON, + ZIO_COMPRESS_OFF, + ZIO_COMPRESS_LZJB, + ZIO_COMPRESS_EMPTY, + ZIO_COMPRESS_GZIP_1, + ZIO_COMPRESS_GZIP_2, + ZIO_COMPRESS_GZIP_3, + ZIO_COMPRESS_GZIP_4, + ZIO_COMPRESS_GZIP_5, + ZIO_COMPRESS_GZIP_6, + ZIO_COMPRESS_GZIP_7, + ZIO_COMPRESS_GZIP_8, + ZIO_COMPRESS_GZIP_9, + ZIO_COMPRESS_ZLE, + ZIO_COMPRESS_FUNCTIONS +}; + +#define 
ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB +#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF + +#define BOOTFS_COMPRESS_VALID(c) ((c) == ZIO_COMPRESS_LZJB || ((c) == ZIO_COMPRESS_ON && ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || (c) == ZIO_COMPRESS_OFF) + +// + +#define VDEV_RAIDZ_MAXPARITY 3 + +#define VDEV_PAD_SIZE (8 << 10) +/* 2 padding areas (vl_pad1 and vl_pad2) to skip */ +#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 +#define VDEV_PHYS_SIZE (112 << 10) +#define VDEV_UBERBLOCK_RING (128 << 10) + +/* + * Size and offset of embedded boot loader region on each label. + * The total size of the first two labels plus the boot area is 4MB. + */ + +#define VDEV_BOOT_OFFSET (2 * sizeof(vdev_label_t)) +#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ + +/* + * Size of label regions at the start and end of each leaf device. + */ + +#define VDEV_LABEL_START_SIZE (2 * sizeof(vdev_label_t) + VDEV_BOOT_SIZE) +#define VDEV_LABEL_END_SIZE (2 * sizeof(vdev_label_t)) +#define VDEV_LABELS 4 + + +struct vdev_phys_t +{ + uint8_t nvlist[VDEV_PHYS_SIZE - sizeof(zio_eck_t)]; + zio_eck_t zbt; +}; + +struct vdev_label_t /* 256K total */ +{ + uint8_t pad1[VDEV_PAD_SIZE]; /* 8K */ + uint8_t pad2[VDEV_PAD_SIZE]; /* 8K */ + vdev_phys_t vdev_phys; /* 112K */ + uint8_t uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ +}; + +#define UBERBLOCK_MAGIC 0x00bab10cULL /* oo-ba-bloc! 
*/ +#define UBERBLOCK_SHIFT 10 /* up to 1K */ + +struct uberblock_t +{ + uint64_t magic; /* UBERBLOCK_MAGIC */ + uint64_t version; /* SPA_VERSION */ + uint64_t txg; /* txg of last sync */ + uint64_t guid_sum; /* sum of all vdev guids */ + uint64_t timestamp; /* UTC time of last sync */ + blkptr_t rootbp; /* MOS objset_phys_t */ +}; + +// + +#define OBJSET_PHYS_SIZE 2048 +#define OBJSET_OLD_PHYS_SIZE 1024 +#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0) + +#define DNODE_SHIFT 9 /* 512 bytes */ +#define DN_MIN_INDBLKSHIFT 10 /* 1k */ +#define DN_MAX_INDBLKSHIFT 14 /* 16k */ +#define DNODE_BLOCK_SHIFT 14 /* 16k */ +#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */ +#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */ +#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ +#define DNODE_SIZE (1 << DNODE_SHIFT) +#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) +#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) +#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) +#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1) + +enum dmu_object_type +{ + DMU_OT_NONE = 0, + /* general: */ + DMU_OT_OBJECT_DIRECTORY, /* ZAP */ + DMU_OT_OBJECT_ARRAY, /* UINT64 */ + DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ + DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ + DMU_OT_BPLIST, /* UINT64 */ + DMU_OT_BPLIST_HDR, /* UINT64 */ + /* spa: */ + DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ + DMU_OT_SPACE_MAP = 8, /* UINT64 */ + /* zil: */ + DMU_OT_INTENT_LOG, /* UINT64 */ + /* dmu: */ + DMU_OT_DNODE, /* DNODE */ + DMU_OT_OBJSET, /* OBJSET */ + /* dsl: */ + DMU_OT_DSL_DIR, /* UINT64 */ /* bonus = dsl_dir_phys_t */ + DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ + DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ + DMU_OT_DSL_PROPS, /* ZAP */ + DMU_OT_DSL_DATASET = 16, /* UINT64 */ + /* zpl: */ + DMU_OT_ZNODE, /* ZNODE */ + DMU_OT_OLDACL, /* Old ACL */ + DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ + DMU_OT_DIRECTORY_CONTENTS, /* ZAP 
*/ + DMU_OT_MASTER_NODE, /* ZAP */ + DMU_OT_UNLINKED_SET, /* ZAP */ + /* zvol: */ + DMU_OT_ZVOL, /* UINT8 */ + DMU_OT_ZVOL_PROP = 24, /* ZAP */ + /* other; for testing only! */ + DMU_OT_PLAIN_OTHER, /* UINT8 */ + DMU_OT_UINT64_OTHER, /* UINT64 */ + DMU_OT_ZAP_OTHER, /* ZAP */ + /* new object types: */ + DMU_OT_ERROR_LOG, /* ZAP */ + DMU_OT_SPA_HISTORY, /* UINT8 */ + DMU_OT_SPA_HISTORY_OFFSETS, /* spa_history_phys_t */ + DMU_OT_POOL_PROPS, /* ZAP */ + DMU_OT_DSL_PERMS = 32, /* ZAP */ + DMU_OT_ACL, /* ACL */ + DMU_OT_SYSACL, /* SYSACL */ + DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ + DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ + DMU_OT_NEXT_CLONES, /* ZAP */ + DMU_OT_SCRUB_QUEUE, /* ZAP */ + DMU_OT_USERGROUP_USED, /* ZAP */ + DMU_OT_USERGROUP_QUOTA = 40, /* ZAP */ + DMU_OT_USERREFS, /* ZAP */ + DMU_OT_DDT_ZAP, /* ZAP */ + DMU_OT_DDT_STATS, /* ZAP */ + DMU_OT_NUMTYPES +}; + +enum dmu_objset_type +{ + DMU_OST_NONE, + DMU_OST_META, + DMU_OST_ZFS, + DMU_OST_ZVOL, + DMU_OST_OTHER, /* For testing only! */ + DMU_OST_ANY, /* Be careful! 
*/ + DMU_OST_NUMTYPES +}; + +struct zil_header_t +{ + uint64_t claim_txg; /* txg in which log blocks were claimed */ + uint64_t replay_seq; /* highest replayed sequence number */ + blkptr_t log; /* log chain */ + uint64_t claim_blk_seq; /* highest claimed block sequence number */ + uint64_t flags; /* header flags */ + uint64_t claim_lr_seq; /* highest claimed lr sequence number */ + uint64_t pad[3]; +}; + +struct dnode_phys_t +{ + uint8_t type; /* dmu_object_type_t */ + uint8_t indblkshift; /* ln2(indirect block size) */ + uint8_t nlevels; /* 1=dn_blkptr->data blocks */ + uint8_t nblkptr; /* length of dn_blkptr */ + uint8_t bonustype; /* type of data in bonus buffer */ + uint8_t checksum; /* ZIO_CHECKSUM type */ + uint8_t compress; /* ZIO_COMPRESS type */ + uint8_t flags; /* DNODE_FLAG_* */ + uint16_t datablkszsec; /* data block size in 512b sectors */ + uint16_t bonuslen; /* length of dn_bonus */ + uint8_t pad2[4]; + + /* accounting is protected by dn_dirty_mtx */ + uint64_t maxblkid; /* largest allocated block ID */ + uint64_t used; /* bytes (or sectors) of disk space */ + + uint64_t pad3[4]; + + blkptr_t blkptr[1]; + uint8_t bonus[DN_MAX_BONUSLEN]; +}; + +struct objset_phys_t +{ + union + { + struct + { + dnode_phys_t meta_dnode; + zil_header_t zil_header; + uint64_t type; + uint64_t flags; + }; + + uint8_t pad[OBJSET_PHYS_SIZE]; + }; + + dnode_phys_t userused_dnode; + dnode_phys_t groupused_dnode; +}; + +#define DS_FLAG_INCONSISTENT (1ULL<<0) + +/* + * NB: nopromote can not yet be set, but we want support for it in this + * on-disk version, so that we don't need to upgrade for it later. It + * will be needed when we implement 'zfs split' (where the split off + * clone should not be promoted). + */ + +#define DS_FLAG_NOPROMOTE (1ULL<<1) + +/* + * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly + * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE, + * refquota/refreservations). 
+ */ + +#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2) + +/* + * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called + * on a dataset. This allows the dataset to be destroyed using 'zfs release'. + */ + +#define DS_FLAG_DEFER_DESTROY (1ULL<<3) + +/* + * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose + * name lookups should be performed case-insensitively. + */ + +#define DS_FLAG_CI_DATASET (1ULL<<16) + +#define DS_IS_INCONSISTENT(ds) ((ds)->phys->flags & DS_FLAG_INCONSISTENT) +#define DS_IS_DEFER_DESTROY(ds) ((ds)->phys->flags & DS_FLAG_DEFER_DESTROY) + +struct dsl_dataset_phys_t +{ + uint64_t dir_obj; /* DMU_OT_DSL_DIR */ + uint64_t prev_snap_obj; /* DMU_OT_DSL_DATASET */ + uint64_t prev_snap_txg; + uint64_t next_snap_obj; /* DMU_OT_DSL_DATASET */ + uint64_t snapnames_zapobj; /* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */ + uint64_t num_children; /* clone/snap children; ==0 for head */ + uint64_t creation_time; /* seconds since 1970 */ + uint64_t creation_txg; + uint64_t deadlist_obj; /* DMU_OT_BPLIST */ + uint64_t used_bytes; + uint64_t compressed_bytes; + uint64_t uncompressed_bytes; + uint64_t unique_bytes; /* only relevant to snapshots */ + + /* + * The ds_fsid_guid is a 56-bit ID that can change to avoid + * collisions. The ds_guid is a 64-bit ID that will never + * change, so there is a small probability that it will collide. 
+ */ + + uint64_t fsid_guid; + uint64_t guid; + uint64_t flags; /* DS_FLAG_* */ + blkptr_t bp; + uint64_t next_clones_obj; /* DMU_OT_DSL_CLONES */ + uint64_t props_obj; /* DMU_OT_DSL_PROPS for snaps */ + uint64_t userrefs_obj; /* DMU_OT_USERREFS */ + uint64_t pad[5]; /* pad out to 320 bytes for good measure */ +}; + +enum dd_used +{ + DD_USED_HEAD, + DD_USED_SNAP, + DD_USED_CHILD, + DD_USED_CHILD_RSRV, + DD_USED_REFRSRV, + DD_USED_NUM +}; + +#define DD_FLAG_USED_BREAKDOWN (1<<0) + +struct dsl_dir_phys_t +{ + uint64_t creation_time; /* not actually used */ + uint64_t head_dataset_obj; + uint64_t parent_obj; + uint64_t origin_obj; + uint64_t child_dir_zapobj; + + /* + * how much space our children are accounting for; for leaf + * datasets, == physical space used by fs + snaps + */ + + uint64_t used_bytes; + uint64_t compressed_bytes; + uint64_t uncompressed_bytes; + + /* Administrative quota setting */ + + uint64_t quota; + + /* Administrative reservation setting */ + + uint64_t reserved; + uint64_t props_zapobj; + uint64_t deleg_zapobj; /* dataset delegation permissions */ + uint64_t flags; // DD_FLAG_USED_BREAKDOWN + uint64_t used_breakdown[DD_USED_NUM]; + uint64_t pad[14]; /* pad out to 256 bytes for good measure */ +}; + +#define ZAP_MAGIC 0x2F52AB2ABULL + +#define ZBT_LEAF ((1ULL << 63) + 0) +#define ZBT_HEADER ((1ULL << 63) + 1) +#define ZBT_MICRO ((1ULL << 63) + 3) + +#define MZAP_ENT_LEN 64 +#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) +#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT +#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT) + +struct mzap_ent_phys_t +{ + uint64_t value; + uint32_t cd; + uint16_t pad; /* in case we want to chain them someday */ + char name[MZAP_NAME_LEN]; +}; + +struct mzap_phys_t +{ + uint64_t block_type; /* ZBT_MICRO */ + uint64_t salt; + uint64_t normflags; + uint64_t pad[5]; + mzap_ent_phys_t chunk[1]; + + /* actually variable size depending on block size */ +}; + +/* + * The (fat) zap is stored in one object. 
It is an array of + * 1<= 6] [zap_leaf_t] [ptrtbl] ... + * + */ + +/* any other values are ptrtbl blocks */ + +#define FZAP_BLOCK_SHIFT(zap) ((zap)->f.block_shift) + +/* + * the embedded pointer table takes up half a block: + * block size / entry size (2^3) / 2 + */ + +#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1) + +/* + * The embedded pointer table starts half-way through the block. Since + * the pointer table itself is half the block, it starts at (64-bit) + * word number (1<f.phys)[(idx) + (1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap))] + +/* + * TAKE NOTE: + * If zap_phys_t is modified, zap_byteswap() must be modified. + */ + +struct zap_table_phys_t +{ + uint64_t blk; /* starting block number */ + uint64_t numblks; /* number of blocks */ + uint64_t shift; /* bits to index it */ + uint64_t nextblk; /* next (larger) copy start block */ + uint64_t blks_copied; /* number source blocks copied */ +}; + +struct zap_phys_t +{ + uint64_t block_type; /* ZBT_HEADER */ + uint64_t magic; /* ZAP_MAGIC */ + zap_table_phys_t ptrtbl; + uint64_t freeblk; /* the next free block */ + uint64_t num_leafs; /* number of leafs */ + uint64_t num_entries; /* number of entries */ + uint64_t salt; /* salt to stir into hash function */ + uint64_t normflags; /* flags for u8_textprep_str() */ + uint64_t flags; /* zap_flags_t */ + + /* + * This structure is followed by padding, and then the embedded + * pointer table. The embedded pointer table takes up second + * half of the block. It is accessed using the + * ZAP_EMBEDDED_PTRTBL_ENT() macro. 
+ */ +}; + +#define ZAP_LEAF_MAGIC 0x2AB1EAF + +/* chunk size = 24 bytes */ + +#define ZAP_LEAF_CHUNKSIZE 24 + +/* + * The amount of space available for chunks is: + * block size (1<l_bs) - hash entry size (2) * number of hash + * entries - header space (2*chunksize) + */ + +#define ZAP_LEAF_NUMCHUNKS(l) (((1 << (l)->bs) - 2 * ZAP_LEAF_HASH_NUMENTRIES(l)) / ZAP_LEAF_CHUNKSIZE - 2) + +/* + * The amount of space within the chunk available for the array is: + * chunk size - space for type (1) - space for next pointer (2) + */ +#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3) + +#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) (((bytes) + ZAP_LEAF_ARRAY_BYTES - 1) / ZAP_LEAF_ARRAY_BYTES) + +/* + * Low water mark: when there are only this many chunks free, start + * growing the ptrtbl. Ideally, this should be larger than a + * "reasonably-sized" entry. 20 chunks is more than enough for the + * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value), + * while still being only around 3% for 16k blocks. + */ + +#define ZAP_LEAF_LOW_WATER (20) + +/* + * The leaf hash table has block size / 2^5 (32) number of entries, + * which should be more than enough for the maximum number of entries, + * which is less than block size / CHUNKSIZE (24) / minimum number of + * chunks per entry (3). + */ + +#define ZAP_LEAF_HASH_SHIFT(l) ((l)->bs - 5) +#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l)) + +/* + * The chunks start immediately after the hash table. The end of the + * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a + * chunk_t. 
+ */ + +#define ZAP_LEAF_CHUNK(l, idx) ((zap_leaf_chunk_t *) ((l)->phys->hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx] +#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).entry) + +enum zap_chunk_type +{ + ZAP_CHUNK_FREE = 253, + ZAP_CHUNK_ENTRY = 252, + ZAP_CHUNK_ARRAY = 251, + ZAP_CHUNK_TYPE_MAX = 250 +}; + +#define ZLF_ENTRIES_CDSORTED (1<<0) + +/* + * TAKE NOTE: + * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified. + */ + +struct zap_leaf_phys_t +{ + uint64_t block_type; /* ZBT_LEAF */ + uint64_t pad1; + uint64_t prefix; /* hash prefix of this leaf */ + uint32_t magic; /* ZAP_LEAF_MAGIC */ + uint16_t nfree; /* number free chunks */ + uint16_t nentries; /* number of entries */ + uint16_t prefix_len; /* num bits used to id this */ + + /* above is accessable to zap, below is zap_leaf private */ + + uint16_t freelist; /* chunk head of free list */ + uint8_t flags; /* ZLF_* flags */ + uint8_t pad2[11]; + + /* 2 24-byte chunks */ + + /* + * The header is followed by a hash table with + * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is + * followed by an array of ZAP_LEAF_NUMCHUNKS(zap) + * zap_leaf_chunk structures. These structures are accessed + * with the ZAP_LEAF_CHUNK() macro. 
+ */ + + uint16_t hash[1]; +}; + +struct zap_leaf_entry_t +{ + uint8_t type; /* always ZAP_CHUNK_ENTRY */ + uint8_t value_intlen; /* size of value's ints */ + uint16_t next; /* next entry in hash chain */ + uint16_t name_chunk; /* first chunk of the name */ + uint16_t name_numints; /* ints in name (incl null) */ + uint16_t value_chunk; /* first chunk of the value */ + uint16_t value_numints; /* value length in ints */ + uint32_t cd; /* collision differentiator */ + uint64_t hash; /* hash value of the name */ +}; + +struct zap_leaf_array_t +{ + uint8_t type; /* always ZAP_CHUNK_ARRAY */ + uint8_t buff[ZAP_LEAF_ARRAY_BYTES]; + uint16_t next; /* next blk or CHAIN_END */ +}; + +struct zap_leaf_free_t +{ + uint8_t type; /* always ZAP_CHUNK_FREE */ + uint8_t pad[ZAP_LEAF_ARRAY_BYTES]; + uint16_t next; /* next in free list, or CHAIN_END */ +}; + +struct bplist_phys_t +{ + /* + * This is the bonus buffer for the dead lists. The object's + * contents is an array of bpl_entries blkptr_t's, representing + * a total of bpl_bytes physical space. + */ + + uint64_t entries; + uint64_t bytes; + uint64_t comp; + uint64_t uncomp; +}; + +/* + * On-disk DDT formats, in the desired search order (newest version first). + */ + +enum ddt_type +{ + DDT_TYPE_ZAP = 0, + DDT_TYPES +}; + +/* + * DDT classes, in the desired search order (highest replication level first). + */ + +enum ddt_class +{ + DDT_CLASS_DITTO = 0, + DDT_CLASS_DUPLICATE, + DDT_CLASS_UNIQUE, + DDT_CLASSES +}; + +#define DDT_TYPE_CURRENT 0 + +#define DDT_COMPRESS_BYTEORDER_MASK 0x80 +#define DDT_COMPRESS_FUNCTION_MASK 0x7f + +/* + * On-disk ddt entry: key (name) and physical storage (value). 
+ */ + +struct ddt_key_t +{ + cksum_t cksum; /* 256-bit block checksum */ + uint64_t prop; /* LSIZE, PSIZE, compression */ +}; + +/* + * ddk_prop layout: + * + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | 0 | 0 | 0 | comp | PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + */ + +#define DDT_KEY_WORDS (sizeof(ddt_key_t) / sizeof (uint64_t)) + +struct ddt_phys_t +{ + dva_t dva[SPA_DVAS_PER_BP]; + uint64_t refcnt; + uint64_t phys_birth; +}; + +enum ddt_phys_type +{ + DDT_PHYS_DITTO, + DDT_PHYS_SINGLE, + DDT_PHYS_DOUBLE, + DDT_PHYS_TRIPLE, + DDT_PHYS_TYPES +}; + +struct spa_history_phys_t +{ + uint64_t pool_create_len; /* ending offset of zpool create */ + uint64_t phys_max_off; /* physical EOF */ + uint64_t bof; /* logical BOF */ + uint64_t eof; /* logical EOF */ + uint64_t records_lost; /* num of records overwritten */ +}; + +#define ACE_SLOT_CNT 6 +#define ZFS_ACL_VERSION_INITIAL 0ULL +#define ZFS_ACL_VERSION_FUID 1ULL +#define ZFS_ACL_VERSION ZFS_ACL_VERSION_FUID + +/* + * ZFS ACLs are store in various forms. + * Files created with ACL version ZFS_ACL_VERSION_INITIAL + * will all be created with fixed length ACEs of type + * zfs_oldace_t. + * + * Files with ACL version ZFS_ACL_VERSION_FUID will be created + * with various sized ACEs. The abstraction entries will utilize + * zfs_ace_hdr_t, normal user/group entries will use zfs_ace_t + * and some specialized CIFS ACEs will use zfs_object_ace_t. + */ + +/* + * All ACEs have a common hdr. For + * owner@, group@, and everyone@ this is all + * thats needed. + */ + +struct zfs_ace_hdr_t +{ + uint16_t type; + uint16_t flags; + uint32_t access_mask; +}; + +typedef zfs_ace_hdr_t zfs_ace_abstract_t; + +/* + * Standard ACE + */ + +struct zfs_ace_t +{ + zfs_ace_hdr_t hdr; + uint64_t fuid; +}; + +/* + * The following type only applies to ACE_ACCESS_ALLOWED|DENIED_OBJECT_ACE_TYPE + * and will only be set/retrieved in a CIFS context. 
+ */ + +struct zfs_object_ace_t +{ + zfs_ace_t ace; + uint8_t object_type[16]; /* object type */ + uint8_t inherit_type[16]; /* inherited object type */ +}; + +struct zfs_oldace_t +{ + uint32_t fuid; /* "who" */ + uint32_t access_mask; /* access mask */ + uint16_t flags; /* flags, i.e inheritance */ + uint16_t type; /* type of entry allow/deny */ +}; + +struct zfs_acl_phys_v0_t +{ + uint64_t acl_extern_obj; /* ext acl pieces */ + uint32_t acl_count; /* Number of ACEs */ + uint16_t acl_version; /* acl version */ + uint16_t acl_pad; /* pad */ + zfs_oldace_t ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */ +}; + +#define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT) + +struct zfs_acl_phys_t +{ + uint64_t acl_extern_obj; /* ext acl pieces */ + uint32_t acl_size; /* Number of bytes in ACL */ + uint16_t acl_version; /* acl version */ + uint16_t acl_count; /* ace count */ + uint8_t ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ +}; + +/* + * Additional file level attributes, that are stored + * in the upper half of zp_flags + */ + +#define ZFS_READONLY 0x0000000100000000ULL +#define ZFS_HIDDEN 0x0000000200000000ULL +#define ZFS_SYSTEM 0x0000000400000000ULL +#define ZFS_ARCHIVE 0x0000000800000000ULL +#define ZFS_IMMUTABLE 0x0000001000000000ULL +#define ZFS_NOUNLINK 0x0000002000000000ULL +#define ZFS_APPENDONLY 0x0000004000000000ULL +#define ZFS_NODUMP 0x0000008000000000ULL +#define ZFS_OPAQUE 0x0000010000000000ULL +#define ZFS_AV_QUARANTINED 0x0000020000000000ULL +#define ZFS_AV_MODIFIED 0x0000040000000000ULL +#define ZFS_REPARSE 0x0000080000000000ULL + +#define ZFS_ATTR_SET(zp, attr, value) { if(value) zp->phys->flags |= attr; else zp->phys->flags &= ~attr; } + +/* + * Define special zfs pflags + */ + +#define ZFS_XATTR 0x1 /* is an extended attribute */ +#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ +#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ +#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */ +#define ZFS_ACL_PROTECTED 0x10 
/* ACL protected */ +#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ +#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */ +#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ +#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */ + +/* + * Is ID ephemeral? + */ + +#define IS_EPHEMERAL(x) (x > MAXUID) + +/* + * Should we use FUIDs? + */ + +#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID && spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) + +#define MASTER_NODE_OBJ 1 + +/* + * Special attributes for master node. + * "userquota@" and "groupquota@" are also valid (from + * zfs_userquota_prop_prefixes[]). + */ + +#define ZFS_FSID "FSID" +#define ZFS_UNLINKED_SET "DELETE_QUEUE" +#define ZFS_ROOT_OBJ "ROOT" +#define ZPL_VERSION_STR "VERSION" +#define ZFS_FUID_TABLES "FUID" +#define ZFS_SHARES_DIR "SHARES" + +#define ZFS_MAX_BLOCKSIZE SPA_MAXBLOCKSIZE + +/* Path component length */ + +/* + * The generic fs code uses MAXNAMELEN to represent + * what the largest component length is. Unfortunately, + * this length includes the terminating NULL. ZFS needs + * to tell the users via pathconf() and statvfs() what the + * true maximum length of a component is, excluding the NULL. + */ + +#define ZFS_MAXNAMELEN (MAXNAMELEN - 1) + +/* + * Convert mode bits (zp_mode) to BSD-style DT_* values for storing in + * the directory entries. + */ + +#ifndef IFTODT +#define IFTODT(mode) (((mode) & S_IFMT) >> 12) +#endif + +/* + * The directory entry has the type (currently unused on Solaris) in the + * top 4 bits, and the object number in the low 48 bits. The "middle" + * 12 bits are unused. + */ + +#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4) +#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) + +/* + * This is the persistent portion of the znode. It is stored + * in the "bonus buffer" of the file. Short symbolic links + * are also stored in the bonus buffer. 
+ */ + +struct znode_phys_t +{ + uint64_t atime[2]; /* 0 - last file access time */ + uint64_t mtime[2]; /* 16 - last file modification time */ + uint64_t ctime[2]; /* 32 - last file change time */ + uint64_t crtime[2]; /* 48 - creation time */ + uint64_t gen; /* 64 - generation (txg of creation) */ + uint64_t mode; /* 72 - file mode bits */ + uint64_t size; /* 80 - size of file */ + uint64_t parent; /* 88 - directory parent (`..') */ + uint64_t links; /* 96 - number of links to file */ + uint64_t xattr; /* 104 - DMU object for xattrs */ + uint64_t rdev; /* 112 - dev_t for VBLK & VCHR files */ + uint64_t flags; /* 120 - persistent flags */ + uint64_t uid; /* 128 - file owner */ + uint64_t gid; /* 136 - owning group */ + uint64_t zap; /* 144 - extra attributes */ + uint64_t pad[3]; /* 152 - future */ + zfs_acl_phys_t acl; /* 176 - 263 ACL */ + + /* + * Data may pad out any remaining bytes in the znode buffer, eg: + * + * |<---------------------- dnode_phys (512) ------------------------>| + * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| + * |<---- znode (264) ---->|<---- data (56) ---->| + * + * At present, we use this space for the following: + * - symbolic links + * - 32-byte anti-virus scanstamp (regular files only) + */ +}; + +struct raidz_col_t +{ + uint64_t devidx; /* child device index for I/O */ + uint64_t offset; /* device offset */ + uint64_t size; /* I/O size */ +}; + +class raidz_map_t +{ +public: + uint32_t m_cols; /* Regular column count */ + uint32_t m_scols; /* Count including skipped columns */ + uint32_t m_bigcols; /* Number of oversized columns */ + uint32_t m_firstdatacol; /* First data column/parity count */ + uint64_t m_nskip; /* Skipped sectors for padding */ + uint32_t m_skipstart; /* Column index of padding start */ + uint64_t m_asize; /* Actual total I/O size */ + std::vector m_col; /* Flexible array of I/O columns */ + +public: + raidz_map_t(uint64_t offset, uint64_t psize, uint32_t ashift, uint32_t dcols, uint32_t 
nparity) + : m_cols(dcols) + , m_scols(dcols) + { + uint64_t b = offset >> ashift; + uint64_t s = psize >> ashift; + uint32_t f = (uint32_t)(b % dcols); + uint64_t o = (b / dcols) << ashift; + uint64_t q = s / (dcols - nparity); + uint32_t r = (uint32_t)(s - q * (dcols - nparity)); + uint32_t bc = (r == 0 ? 0 : r + nparity); + uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + if(q == 0) + { + m_cols = bc; + m_scols = std::min(dcols, roundup(bc, nparity + 1)); + } + + // ASSERT3U(m_cols, <=, m_scols); + + m_bigcols = bc; + m_skipstart = bc; + m_firstdatacol = nparity; + + m_col.resize(m_scols); + + uint64_t asize = 0; + + for(uint32_t c = 0; c < m_scols; c++) + { + uint32_t col = f + c; + uint64_t coff = o; + + if(col >= dcols) + { + col -= dcols; + coff += 1ULL << ashift; + } + + m_col[c].devidx = col; + m_col[c].offset = coff; + + if(c >= m_cols) + { + m_col[c].size = 0; + } + else if(c < bc) + { + m_col[c].size = (q + 1) << ashift; + } + else + { + m_col[c].size = q << ashift; + } + + asize += m_col[c].size; + } + + m_asize = roundup(asize, (nparity + 1) << ashift); + m_nskip = roundup(tot, nparity + 1) - tot; + + /* + * If all data stored spans all columns, there's a danger that parity + * will always be on the same device and, since parity isn't read + * during normal operation, that that device's I/O bandwidth won't be + * used effectively. We therefore switch the parity every 1MB. + * + * ... at least that was, ostensibly, the theory. As a practical + * matter unless we juggle the parity between all devices evenly, we + * won't see any benefit. Further, occasional writes that aren't a + * multiple of the LCM of the number of children and the minimum + * stripe width are sufficient to avoid pessimal behavior. + * Unfortunately, this decision created an implicit on-disk format + * requirement that we need to support for all eternity, but only + * for single-parity RAID-Z. 
+ * + * If we intend to skip a sector in the zeroth column for padding + * we must make sure to note this swap. We will never intend to + * skip the first column since at least one data and one parity + * column must appear in each row. + */ + + if(m_firstdatacol == 1 && (offset & (1ULL << 20))) + { + uint64_t devidx = m_col[0].devidx; + uint64_t offset = m_col[0].offset; + m_col[0].devidx = m_col[1].devidx; + m_col[0].offset = m_col[1].offset; + m_col[1].devidx = devidx; + m_col[1].offset = offset; + + if(m_skipstart == 0) + { + m_skipstart = 1; + } + } + } +}; + +#pragma pack(pop)