Upgrading from 6x 500GB RAID5 to 4x 2TB RAID6
Shut down server and move to test bench
Prepare new disks
Disconnect old disks
Connect 4 new disks
Enable SMART
- Samsung disks had SMART disabled by default
# for i in sd{b..e} ; do smartctl -s on /dev/$i ; done
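- Verify it took with smartctl -i (reports "SMART support is: Enabled")
# for i in sd{b..e} ; do smartctl -i /dev/$i | grep 'SMART support' ; done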
Random write
- Suse FDE
- Default cipher and mode is "aes-cbc-essiv:sha256"
- Default keysize is 128 bits
# for i in sd{b..e} ; do dd_rescue -m 10M /dev/urandom /dev/$i ; done
# for i in sd{b..e} ; do echo -n $TEST_PW | cryptsetup luksFormat --key-file=- /dev/$i ; done
# for i in sd{b..e} ; do echo -n $TEST_PW | cryptsetup luksOpen --key-file=- /dev/$i crypt-$i ; done
# for i in sd{b..e} ; do dd_rescue /dev/zero /dev/mapper/crypt-$i & done
# for i in sd{b..e} ; do cryptsetup luksClose crypt-$i ; done
NOTE: dd_rescue doesn't stop when it reaches the end of the output device!
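Plain dd does stop at the end of a block device, so an alternative along these lines (a sketch, not what was run) would avoid babysitting the writes:
# for i in sd{b..e} ; do dd if=/dev/zero of=/dev/mapper/crypt-$i bs=1M & done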
Benchmark
New Kernel
- Compile new kernel for missing crypto modules
Build and install XTS
$ echo "CONFIG_CRYPTO_XTS=m" >> .config
$ make modules
# cp crypto/xts.ko /lib/modules/2.6.31-pmp/kernel/crypto/
# depmod -A
# modprobe xts
# mount -o remount,rw /boot
# cp .config /boot/config-2.6.31-pmp
# mount -o remount,ro /boot
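Sanity check that the module actually loaded:
# lsmod | grep xts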
Partition types
- Update parted to 1.8.8
- DOS partition
# cat > sfdisk.tempraid0.format
,1024,fd
,2048,fd
^D
# for dev in {b..e} ; do cat sfdisk.tempraid0.format | sfdisk -uM /dev/sd$dev ; done
- GPT partition (annoying buggy parted means manual label creation)
# for dev in {b..e} ; do parted /dev/sd$dev mklabel ; done
# for dev in {b..e} ; do parted /dev/sd$dev mkpart primary 0 1024 mkpart primary 1024 3072 ; done
- Arrays
# mdadm --create --verbose --metadata=1.2 --level=raid0 --raid-devices=4 /dev/md66 /dev/sd{b,c,d,e}1
# mdadm --create --verbose --assume-clean --metadata=1.2 --level=raid6 --raid-devices=4 /dev/md67 /dev/sd{b,c,d,e}2
# for dev in md66 md67 ; do mkfs.ext4 -m 0 /dev/$dev && mkdir -p /mnt/$dev && mount /dev/$dev /mnt/$dev ; done
- Results (some 3908MB, some 3750MB)
# bonnie++ -f -d /mnt/md66 -s 3750 -n 0 -u root
Version 1.03d      ------Sequential Output------ --Sequential Input- --Random-
                   -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
Machine       Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP  /sec %CP
msdos,3908M,,,374639,44,110712,13,,,317465,19,372.9,0,,,,,,,,,,,,,
msdos,3908M,,,400310,49,107278,14,,,343700,19,366.2,0,,,,,,,,,,,,,
gpt,3750M,,,373408,44,106484,13,,,335955,22,383.3,0,,,,,,,,,,,,,
gpt-raid0,3750M,,,379201,45,105965,13,,,334808,19,379.7,0,,,,,,,,,,,,,
gpt-raid6,3750M,,,129930,19,47485,6,,,120628,8,360.5,0,,,,,,,,,,,,,
# bonnie++ -f -d /mnt/md67 -s 3750 -n 0 -u root
No obvious difference between DOS and GPT for RAID0
- Stop
# for dev in md66 md67 ; do umount /mnt/$dev ; mdadm --stop /dev/$dev ; done
Cipher and RAID Benchmarks
CHUNK, CRYPT_KEY_SIZE, CIPHER, NCQ
64, 256, lrw-benbi, 1
hastur,3750M,,,77502,10,29570,4,,,76041,5,328.2,1,,,,,,,,,,,,,
hastur,3750M,,,77913,11,29512,4,,,75647,6,322.5,0,,,,,,,,,,,,,
hastur,3750M,,,77937,10,29149,4,,,76300,5,322.9,0,,,,,,,,,,,,,
64, 256, cbc-essiv, 1
hastur,3750M,,,77501,10,29415,4,,,75281,5,330.2,1,,,,,,,,,,,,,
hastur,3750M,,,77880,10,28867,4,,,76466,6,326.1,1,,,,,,,,,,,,,
hastur,3750M,,,78451,10,29881,4,,,73462,5,330.0,1,,,,,,,,,,,,,
64, 256, xts-plain, 1
hastur,3750M,,,76612,10,28949,4,,,76289,5,309.5,0,,,,,,,,,,,,,
hastur,3750M,,,78131,10,29525,4,,,76528,5,317.0,1,,,,,,,,,,,,,
hastur,3750M,,,77624,10,29122,4,,,76401,5,322.7,1,,,,,,,,,,,,,
64, 512, lrw-benbi, 1
hastur,3750M,,,76133,11,28562,4,,,71292,5,289.9,0,,,,,,,,,,,,,
hastur,3750M,,,76470,10,27983,4,,,70690,5,312.1,0,,,,,,,,,,,,,
hastur,3750M,,,74969,10,28450,4,,,69769,5,299.2,1,,,,,,,,,,,,,
64, 512, cbc-essiv, 1
hastur,3750M,,,76839,10,27509,4,,,71383,5,292.8,0,,,,,,,,,,,,,
hastur,3750M,,,74941,10,28484,4,,,71224,5,283.4,1,,,,,,,,,,,,,
hastur,3750M,,,75474,10,29006,4,,,74678,5,307.6,1,,,,,,,,,,,,,
64, 512, xts-plain, 1
hastur,3750M,,,76980,10,28483,4,,,77147,6,321.1,0,,,,,,,,,,,,,
hastur,3750M,,,78038,10,28926,4,,,75617,5,331.4,0,,,,,,,,,,,,,
hastur,3750M,,,77566,11,29640,4,,,73846,5,326.2,0,,,,,,,,,,,,,
128, 256, lrw-benbi, 1
hastur,3750M,,,76583,10,27767,4,,,78447,6,328.8,0,,,,,,,,,,,,,
hastur,3750M,,,76805,10,28012,4,,,78318,5,325.3,1,,,,,,,,,,,,,
hastur,3750M,,,75651,10,28269,4,,,80328,5,319.7,0,,,,,,,,,,,,,
128, 256, cbc-essiv, 1
hastur,3750M,,,73652,10,27561,4,,,76043,5,312.7,0,,,,,,,,,,,,,
hastur,3750M,,,74091,10,27321,4,,,74952,5,302.3,0,,,,,,,,,,,,,
hastur,3750M,,,74929,10,26958,4,,,75964,5,308.0,1,,,,,,,,,,,,,
128, 256, xts-plain, 1
hastur,3750M,,,73128,10,27043,3,,,74821,5,316.5,0,,,,,,,,,,,,,
hastur,3750M,,,73468,10,26747,3,,,73302,5,292.7,0,,,,,,,,,,,,,
hastur,3750M,,,73360,10,26452,3,,,72755,5,297.9,0,,,,,,,,,,,,,
128, 512, lrw-benbi, 1
hastur,3750M,,,74233,10,27291,4,,,74660,5,310.1,1,,,,,,,,,,,,,
hastur,3750M,,,74540,10,26414,4,,,73114,5,288.7,1,,,,,,,,,,,,,
hastur,3750M,,,73015,9,27081,3,,,73988,5,303.1,0,,,,,,,,,,,,,
128, 512, cbc-essiv, 1
hastur,3750M,,,74490,10,27901,4,,,74019,5,292.8,0,,,,,,,,,,,,,
hastur,3750M,,,75435,10,27576,4,,,73163,5,308.6,0,,,,,,,,,,,,,
hastur,3750M,,,74645,10,27287,4,,,73003,5,316.2,1,,,,,,,,,,,,,
128, 512, xts-plain, 1
hastur,3750M,,,75475,10,28163,4,,,78059,6,340.1,1,,,,,,,,,,,,,
hastur,3750M,,,75484,10,27796,4,,,76429,5,321.1,0,,,,,,,,,,,,,
hastur,3750M,,,75956,10,27038,4,,,75931,5,311.3,1,,,,,,,,,,,,,
- NCQ=1 is significantly faster
- sdc and sde are doing significantly more work
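The matrix above was presumably produced by a driver looping over chunk size, key size, and cipher. A sketch of such a loop, reusing the md67/crypt-md67 names from earlier (the rebuild steps, output naming, and NCQ handling here are assumptions, not the original script):
```
#!/bin/bash
# Sketch of a benchmark driver for the matrix above -- structure and file
# names are assumptions. NCQ would be toggled separately, e.g. via
# /sys/block/sdX/device/queue_depth, before a run.
for CHUNK in 64 128 ; do
  for KEYSIZE in 256 512 ; do
    for CIPHER in aes-lrw-benbi aes-cbc-essiv:sha256 aes-xts-plain ; do
      # rebuild the RAID6 test array with the chunk size under test
      mdadm --create --run --verbose --metadata=1.2 --assume-clean \
        --chunk $CHUNK --level=raid6 --raid-devices=4 /dev/md67 /dev/sd{b,c,d,e}2
      # layer dmcrypt on top with the cipher/key size under test
      echo -n $TEST_PW | cryptsetup -q -c $CIPHER -s $KEYSIZE \
        luksFormat --key-file=- /dev/md67
      echo -n $TEST_PW | cryptsetup luksOpen --key-file=- /dev/md67 crypt-md67
      mkfs.ext4 -q -m 0 /dev/mapper/crypt-md67
      mkdir -p /mnt/md67 && mount /dev/mapper/crypt-md67 /mnt/md67
      # three bonnie++ passes -> the three CSV rows per config above
      bonnie++ -q -f -x 3 -s 3750 -n 0 -u root -d /mnt/md67 \
        >> bench.chunk$CHUNK.key$KEYSIZE.$CIPHER.out
      umount /mnt/md67
      cryptsetup luksClose crypt-md67
      mdadm --stop /dev/md67
    done
  done
done
```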
Fix IO load problem
Even after rearranging disks in the array, the same disks (the Samsungs) have roughly double the TPS on writes.
chunk=64 :: key_size=512 :: cipher=aes-xts-plain :: RA /dev/mapper/crypt-md67=256 /dev/md67=512 raw=128
hastur,3750M,,,70425,9,26040,3,,,63046,4,295.2,0,,,,,,,,,,,,,
RAID5, for comparison
hastur,3750M,,,80460,10,29684,4,,,76040,5,270.2,0,,,,,,,,,,,,,
RAID0, for comparison
hastur,3750M,,,99602,12,40017,6,,,94923,7,291.6,0,,,,,,,,,,,,,
- Taking dmcrypt out of the equation evens out the IO load. It's misalignment of dmcrypt/ext4, not the Samsungs.
# cat bench.raid0.nocrypt.chunk32.out
hastur,3750M,,,429599,49,98253,12,,,310307,22,320.6,0,,,,,,,,,,,,,
hastur,3750M,,,418548,49,100704,13,,,345801,24,309.8,0,,,,,,,,,,,,,
# cat bench.raid0.nocrypt.chunk64.out
hastur,3750M,,,426720,51,103288,13,,,304494,19,337.2,0,,,,,,,,,,,,,
hastur,3750M,,,420937,50,105754,13,,,341867,19,333.3,0,,,,,,,,,,,,,
# cat bench.raid0.crypt_align256.chunk64.out
hastur,3750M,,,99446,11,40287,6,,,98194,7,308.0,0,,,,,,,,,,,,,
hastur,3750M,,,98978,12,40615,6,,,100049,7,307.4,0,,,,,,,,,,,,,
Seems to be mitigated by setting readahead and the stripe cache.
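Roughly, the knobs involved (values illustrative, not the final tuning):
# blockdev --setra 512 /dev/md67                    # device readahead, in 512-byte sectors
# echo 4096 > /sys/block/md67/md/stripe_cache_size  # RAID6 stripe cache, in pages per device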
Prepare new array
Alignments
- Partitions to HDD sectors: 512B or 4kB
- RAID chunks to HDD sectors: 4kB
- dmcrypt sectors to RAID stripes: max_stripe_width * chunk_size
- LVM sectors to ?
- ext4 sectors to ?
Partition
Alignment
- Necessary only if 4kB disks are used; otherwise parted >=1.7 automatically aligns to physical sector boundaries
Layout
- 300GB RAID0 (75GB per disk)
- 3850GB RAID6 (1925GB per disk)
# for dev in {b..e} ; do parted /dev/sd$dev mklabel ; done
Warning: The existing disk label on /dev/sdb will be destroyed and all data on this disk will be lost.
Do you want to continue?
Yes/No? y
New disk label type? [gpt]?
...
# for dev in {b..e} ; do parted /dev/sd$dev -- mkpart primary 0 76800 mkpart primary 76800 -0 ; done
Buggy parted CLI means creating labels manually.
- The -- argument lets you specify -0 on the command line
- Check with "unit s print"
# for dev in {b..e} ; do parted /dev/sd$dev unit s print ; done
Decided against partitioning.
- RAID0 isn't expandable
- LVM will provide the separation that partitioning would have
RAID
Alignment
- With bitmap, default offset is 136 sectors (68kB)
- Check with mdadm -E (see below)
- Data offsets: 4GB raid0 16 sectors (8kB), 4GB raid6 24 sectors (12kB)
- No alignment necessary since the offset is a multiple of the physical sector size (512B or 4kB)
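The check itself, picking out the offset fields:
# mdadm -E /dev/sdb | grep -i offset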
Chunk Size
- http://www.zdnet.com/blog/storage/chunks-the-hidden-key-to-raid-performance/130
- Small chunks for: few large I/O requests -> increased bandwidth
- Big chunks for: many small I/O requests (DB) -> increased IOPS (one disk per request)
# mdadm --create --metadata=1.2 --verbose --chunk 64 --level=raid6 --raid-devices=4 /dev/md6 /dev/sd{b..e}
Encryption
Alignment
- http://kerneltrap.org/mailarchive/linux-raid/2010/1/4/6683163
- --align-payload=value, in 512-byte sectors. Align to full stripe boundaries.
- Default alignment is 4040 sectors (2020kB)
- Align to maximum planned stripe-width = (8-2) x 64k = 384k = 768 sectors
- Check the alignment
# cryptsetup luksDump /dev/md1
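With --align-payload=2048 (as used below), I'd expect the dump to report the offset accordingly, e.g.:
# cryptsetup luksDump /dev/md6 | grep -i 'payload offset'
Payload offset: 2048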
Cipher
- aes-cbc-essiv vs aes-xts-plain
- aes-xts needs double the key size: the key is split into two equal AES keys (data and tweak)
- aes-xts has no ESSIV so no :hash is specified
- Default luksFormat password hash is sha1. It's not vulnerable in the same way as signed certs are.
- Specifying --hash for luksFormat is supported but seems largely unnecessary
- 2048 is a safe offset
# cryptsetup -c aes-xts-plain -s 512 --align-payload=2048 luksFormat /dev/md6
# cryptsetup luksOpen /dev/md6 crypt-md6
Logical Volumes
Alignment
- Theodore Ts'o
- http://www.mail-archive.com/linux-raid@vger.kernel.org/msg09685.html
- LVM auto-aligns on RAID, but does it on dmcrypt?
- Want to align the LVM data to RAID chunk boundaries.
- For chunk sizes >= 128k, subtract 63k
# pvcreate --metadatasize 193k /dev/mapper/crypt-md6   # pads up to the next 64kB boundary
- Check alignment
# pvs -o+pe_start /dev/mapper/crypt-md6
  /dev/dm-6 lvm2 -- 3.64T 3.64T 256.00K
Layout
- home and media kept separate, to allow quotas to be configured on home
PV
# pvcreate --metadatasize 193k /dev/mapper/crypt-md6
VG
# vgcreate vg-md6 /dev/mapper/crypt-md6
LV
# lvcreate -n media vg-md6 -L2500G
# lvcreate -n home vg-md6 -L500G
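Quick check of the result:
# vgs vg-md6
# lvs vg-md6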
Filesystems
- EXT4
Resize reservation
- -E resize=
- tune2fs -l lists max fs blocks. (Default allows for 16TB)
Bytes per Inode
- -i 65536 (reduces inode overhead, default is 16384)
Stride and Stripe
- http://www.ep.ph.bham.ac.uk/general/support/raid/raidperf11.html
- http://busybox.net/~aldot/mkfs_stride.html
- Stride controls the spacing of metadata blocks. With stride equal to the RAID chunk size there is a bitmap in every chunk; the worst case would be a bitmap every stripe_width, which would put all the bitmaps on a single disk.
- Stripe-width lets the FS calculate the number of disks, allowing parallel IOs
stride (64k chunk) = raid_chunk_size / ext4_block_size = 64k / 4k = 16
stripe_width (4 disks) = raid_data_disks * stride = (4-2) * 16 = 32
- Resizing an array
# tune2fs -E stripe-width=$NEW_STRIPE_WIDTH
# resize2fs
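For example, growing to a hypothetical 5-disk RAID6 (3 data disks) would mean stripe-width = 3 * 16 = 48 (assumes the PV/LV have already been grown; pvresize/lvextend not shown):
# tune2fs -E stripe-width=48 /dev/vg-md6/media
# resize2fs /dev/vg-md6/media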
Create Home
- No reserved space. Stride and stripe-width for 2 data disks and 64k chunk.
# mkfs.ext4 -m 0 -E stride=16,stripe-width=32 /dev/vg-md6/home
Create Media
- No reserved space. 64k per inode. Stride and stripe-width for 2 data disks and 64k chunk.
# mkfs.ext4 -m 0 -i 65536 -E stride=16,stripe-width=32 /dev/vg-md6/media
fstab
/dev/vg-md6/home /mnt/md6-home ext4 defaults,noatime,nosuid,noauto,acl 0 3
/dev/vg-md6/media /mnt/md6-media ext4 defaults,noatime,nosuid,noauto,acl 0 3
/dev/vg-md6/home /home ext4 defaults,noatime,nosuid,noauto 0 3
crypttab
crypt-md6 /dev/md6 none luks
Copy Data
Start new array in 3-of-4 disk degraded state
```
# mdadm --assemble --run /dev/md6 /dev/sd{h,i,j}
# cryptsetup luksOpen /dev/md6 crypt-md6
# vgchange -a y vg-md6
# mount /mnt/md6-media
# mount /mnt/md6-home
```
Start old array
```
# mdadm --assemble /dev/md2
# mdadm --assemble /dev/md3
# cryptsetup luksOpen /dev/md2 crypt-md2
# cryptsetup luksOpen /dev/md3 crypt-md3
# mount /dev/mapper/crypt-md2
# mount /dev/mapper/crypt-md3
```
Copy data
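Presumably rsync along these lines, preserving hardlinks, ACLs, and xattrs (the old mount points aren't recorded here, so the source paths are placeholders):
# rsync -aHAX /mnt/old-media/ /mnt/md6-media/
# rsync -aHAX /mnt/old-home/ /mnt/md6-home/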
Reconfigure
rc.local
- Clear out old script
Samba
# sed -i -e 's%md3/media%md6-media%g' /etc/samba/smb.conf
- Replace valid users lists with groups:
[mediasys]
...
valid users = @group-name
SNMP
- Add disk space graphs in cacti
- Had to restart snmpd to update GetMountedPartitions query
Mediatomb
# vim /etc/mediatomb/config.xml
<home>/mnt/md6-media/metadata/mediatomb</home>
mtdaapd
# sed -i -e 's%md3/media%md6-media%g' /etc/mtdaapd.conf
Switch Services
mediasys
# for mntpt in /export/mediasys/media/{movies,videos,music,tvshows,photos}/{library,meta} ; do umount $mntpt ; done
# for mntpt in $(grep '^[^#]*md6-media.*bind' /etc/fstab | cut -d' ' -f1) ; do mount $mntpt ; done
Sync RAID
Stop old array
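Roughly the reverse of the assembly above (old mount points assumed):
# umount /dev/mapper/crypt-md2 /dev/mapper/crypt-md3
# cryptsetup luksClose crypt-md2
# cryptsetup luksClose crypt-md3
# mdadm --stop /dev/md2
# mdadm --stop /dev/md3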
Add 4th new disk
# mdadm --manage /dev/md6 --add /dev/sdb
Sync new array
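Recovery progress shows up in /proc/mdstat:
# watch -n 60 cat /proc/mdstat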
Benchmark
- Post sync. Read performance doesn't look right
# bonnie++ -q -f -x 3 -s 3750 -n 0 -u root -d /mnt/md6-media
name,file_size,putc,putc_cpu,put_block,put_block_cpu,rewrite,rewrite_cpu,getc,getc_cpu,get_block,get_block_cpu,seeks,seeks_cpu,num_files,seq_create,seq_create_cpu,seq_stat,seq_stat_cpu,seq_del,seq_del_cpu,ran_create,ran_create_cpu,ran_stat,ran_stat_cpu,ran_del,ran_del_cpu
hastur,3750M,,,80818,11,24548,4,,,56356,5,305.6,0,,,,,,,,,,,,,
hastur,3750M,,,86100,12,24631,4,,,59027,5,305.8,0,,,,,,,,,,,,,
hastur,3750M,,,87435,11,24239,4,,,59217,5,312.1,0,,,,,,,,,,,,,
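Given the earlier readahead findings, the first things to check (devices per this setup):
# blockdev --getra /dev/md6 /dev/mapper/crypt-md6 /dev/vg-md6/media
# cat /sys/block/md6/md/stripe_cache_size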