Index: linux-2.6/fs/ext4/ext4.h =================================================================== --- linux-2.6.orig/fs/ext4/ext4.h 2008-06-05 13:44:20.521046407 -0500 +++ linux-2.6/fs/ext4/ext4.h 2008-07-05 12:51:24.698292740 -0500 @@ -22,7 +22,7 @@ #include "ext4_i.h" /* - * The second extended filesystem constants/structures + * The fourth extended filesystem constants/structures */ /* @@ -74,6 +74,9 @@ #define EXT4_MB_HINT_GOAL_ONLY 256 /* goal is meaningful */ #define EXT4_MB_HINT_TRY_GOAL 512 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 1024 + struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -94,6 +97,11 @@ struct ext4_allocation_request { unsigned long len; /* flags. see above EXT4_MB_HINT_* */ unsigned long flags; + /* + * for ext4 online defrag: + * the block group which is excepted from allocation target + */ + long long excepted_group; }; /* @@ -170,6 +178,15 @@ struct ext4_group_desc __u32 bg_reserved2[3]; }; +/* + * Structure of a flex block group info + */ + +struct flex_groups { + __u32 free_inodes; + __u32 free_blocks; +}; + #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ @@ -289,6 +306,14 @@ struct ext4_new_group_data { #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) #define EXT4_IOC_MIGRATE _IO('f', 7) +#define EXT4_IOC_FIBMAP _IOW('f', 9, ext4_fsblk_t) +#define EXT4_IOC_DEFRAG _IOW('f', 10, struct ext4_ext_defrag_data) +#define EXT4_IOC_GROUP_INFO _IOW('f', 11, struct ext4_group_data_info) +#define EXT4_IOC_FREE_BLOCKS_INFO _IOW('f', 12, struct ext4_extents_info) +#define EXT4_IOC_EXTENTS_INFO _IOW('f', 13, struct ext4_extents_info) +#define EXT4_IOC_RESERVE_BLOCK _IOW('f', 14, struct ext4_extents_info) +#define EXT4_IOC_MOVE_VICTIM _IOW('f', 15, struct ext4_extents_info) +#define EXT4_IOC_BLOCK_RELEASE _IO('f', 8) /* * ioctl commands in 32 bit emulation @@ -306,6 +331,57 @@ struct ext4_new_group_data { #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +/* + * Will go away. + * ext4 online defrag supports only 4KB block size. + */ +#define DEFRAG_BLOCK_SIZE 4096 + +/* + * The following four macros are used for the defrag force mode. 
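To make the force-mode flow concrete, here is how a user-space defragmenter might drive these phases. This is only an illustrative sketch: the ioctl number is the one defined above, the structure and DEFRAG_FORCE_* values are the ones defined just below, and the helper itself (its name, arguments, and the exact call sequence) is an assumption for illustration, not part of this patch.

	#include <string.h>
	#include <sys/ioctl.h>

	/* hypothetical helper: fd is an open file on an ext4 filesystem */
	static int defrag_force(int fd, unsigned int nblocks,
				unsigned long long goal)
	{
		struct ext4_ext_defrag_data d;

		memset(&d, 0, sizeof(d));
		d.start_offset = 0;		/* defragment from the file's start */
		d.defrag_size = nblocks;	/* length to move, in blocks */
		d.goal = goal;			/* preferred destination block */

		d.flag = DEFRAG_FORCE_TRY;	/* phase 1: probe for usable free space */
		if (ioctl(fd, EXT4_IOC_DEFRAG, &d) == 0)
			return 0;

		d.flag = DEFRAG_FORCE_VICTIM;	/* phase 2: move victim extents away */
		if (ioctl(fd, EXT4_IOC_DEFRAG, &d) < 0)
			return -1;

		d.flag = DEFRAG_FORCE_GATHER;	/* phase 3: pull the file into the hole */
		return ioctl(fd, EXT4_IOC_DEFRAG, &d);
	}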
+ * + * DEFRAG_MAX_ENT: the maximum number of extents for exchanging between + * kernel-space and user-space per an ioctl + * DEFRAG_FORCE_TRY: check whether we have free space fragmentation or not + * DEFRAG_FORCE_VICTIM: move victim extents to make sufficient space + * DEFRAG_FORCE_GATHER: move the target file into the free space made in the + * DEFRAG_FORCE_VICTIM phase + */ +#define DEFRAG_MAX_ENT 32 +#define DEFRAG_FORCE_TRY 1 +#define DEFRAG_FORCE_VICTIM 2 +#define DEFRAG_FORCE_GATHER 3 + +struct ext4_extent_data { + ext4_lblk_t block; /* start logical block number */ + ext4_fsblk_t start; /* start physical block number */ + int len; /* blocks count */ +}; + +struct ext4_ext_defrag_data { + ext4_lblk_t start_offset; /* start offset to defrag in blocks */ + ext4_lblk_t defrag_size; /* size of defrag in blocks */ + ext4_fsblk_t goal; /* block offset for allocation */ + int flag; /* free space mode flag */ + struct ext4_extent_data ext; +}; + +struct ext4_group_data_info { + int s_blocks_per_group; /* blocks per group */ + int s_inodes_per_group; /* inodes per group */ +}; + +struct ext4_extents_info { + unsigned long long ino; /* inode number */ + int max_entries; /* maximum extents count */ + int entries; /* extent number/count */ + ext4_lblk_t f_offset; /* file offset */ + ext4_grpblk_t g_offset; /* group offset */ + ext4_fsblk_t goal; /* block offset for allocation */ + struct ext4_extent_data ext[DEFRAG_MAX_ENT]; +}; + +#define EXT4_TRANS_META_BLOCKS 4 /* bitmap + group desc + sb + inode */ /* * Mount options @@ -383,6 +459,8 @@ struct ext4_inode { __le32 i_version_hi; /* high 32 bits for 64-bit version */ }; +#define EXT4_FIEMAP_FLAG_INCOMPAT_UNSUPP (FIEMAP_FLAG_INCOMPAT & \ + ~(FIEMAP_FLAG_LUN_OFFSET)) #define EXT4_EPOCH_BITS 2 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) @@ -527,6 +605,7 @@ do { \ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt @@ -647,7 +726,10 @@ struct ext4_super_block { __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ - __u32 s_reserved[163]; /* Padding to the end of the block */ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_reserved_char_pad2; + __le16 s_reserved_pad; + __u32 s_reserved[162]; /* Padding to the end of the block */ }; #ifdef __KERNEL__ @@ -958,12 +1040,17 @@ extern ext4_grpblk_t ext4_block_group_of extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); -extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode, +extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, int *errp); -extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode, +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); -extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, +extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t 
iblock, ext4_fsblk_t goal, + unsigned long *count, int *errp); +extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); +extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, + ext4_fsblk_t nblocks); extern void ext4_free_blocks (handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int metadata); extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, @@ -977,6 +1064,17 @@ extern struct ext4_group_desc * ext4_get extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); extern void ext4_init_block_alloc_info(struct inode *); extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv); +extern void try_to_extend_reservation(struct ext4_reserve_window_node *, + struct super_block *, int); +extern int alloc_new_reservation(struct ext4_reserve_window_node *, + ext4_grpblk_t, struct super_block *, + ext4_group_t, struct buffer_head *); +extern ext4_grpblk_t bitmap_search_next_usable_block(ext4_grpblk_t, + struct buffer_head *, ext4_grpblk_t); +extern int rsv_is_empty(struct ext4_reserve_window *rsv); +extern int goal_in_my_reservation(struct ext4_reserve_window *rsv, + ext4_grpblk_t grp_goal, ext4_group_t group, + struct super_block *sb); /* dir.c */ extern int ext4_check_dir_entry(const char *, struct inode *, @@ -986,6 +1084,7 @@ extern int ext4_htree_store_dirent(struc __u32 minor_hash, struct ext4_dir_entry_2 *dirent); extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern sector_t ext4_bmap(struct address_space *mapping, sector_t block); /* fsync.c */ extern int ext4_sync_file (struct file *, struct dentry *, int); @@ -1016,9 +1115,14 @@ extern int __init init_ext4_mballoc(void extern void exit_ext4_mballoc(void); extern void ext4_mb_free_blocks(handle_t *, struct inode *, unsigned long, unsigned long, int, unsigned long *); +extern int ext4_mb_add_more_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern void ext4_mb_update_group_info(struct ext4_group_info *grp, + ext4_grpblk_t add); /* inode.c */ +void ext4_da_release_space(struct inode *inode, int used, int to_free); int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); struct buffer_head *ext4_getblk(handle_t *, struct inode *, @@ -1044,8 +1148,11 @@ extern void ext4_set_inode_flags(struct extern void ext4_get_inode_flags(struct ext4_inode_info *); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); -extern int ext4_block_truncate_page(handle_t *handle, struct page *page, +extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); +extern int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); @@ -1094,6 +1201,14 @@ extern void ext4_inode_bitmap_set(struct struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); +/* extents.c */ +extern int ext4_ext_journal_restart(handle_t *handle, int needed); +/* defrag.c */ +extern int ext4_defrag(struct file *filp, ext4_lblk_t block_start, + ext4_lblk_t defrag_size, ext4_fsblk_t goal, + int flag, struct 
ext4_extent_data *ext); +extern int ext4_defrag_ioctl(struct inode *, struct file *, unsigned int, + unsigned long); static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { @@ -1159,6 +1274,17 @@ struct ext4_group_info *ext4_get_group_i } +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ @@ -1191,7 +1317,7 @@ extern int ext4_ext_get_blocks(handle_t ext4_lblk_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, int create, int extend_disksize); -extern void ext4_ext_truncate(struct inode *, struct page *); +extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, @@ -1199,7 +1325,8 @@ extern long ext4_fallocate(struct inode extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned long max_blocks, struct buffer_head *bh, int create, - int extend_disksize); + int extend_disksize, int flag); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start); #endif /* __KERNEL__ */ #endif /* _EXT4_H */ Index: linux-2.6/fs/ext4/ext4_i.h =================================================================== --- linux-2.6.orig/fs/ext4/ext4_i.h 2008-06-05 13:44:20.522046081 -0500 +++ linux-2.6/fs/ext4/ext4_i.h 2008-07-05 12:51:23.558290927 -0500 @@ -79,7 +79,7 @@ struct ext4_ext_cache { }; /* - * third extended file system inode data in memory + * fourth extended file system inode data in memory */ struct ext4_inode_info { __le32 i_data[15]; /* unconverted */ @@ -150,6 +150,7 @@ struct ext4_inode_info { */ struct rw_semaphore i_data_sem; struct inode vfs_inode; + struct jbd2_inode jinode; unsigned long i_ext_generation; struct ext4_ext_cache i_cached_extent; @@ -162,6 +163,13 @@ struct ext4_inode_info { /* mballoc */ struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; + + /* allocation reservation info for delalloc */ + unsigned long i_reserved_data_blocks; + unsigned long i_reserved_meta_blocks; + unsigned long i_allocated_meta_blocks; + unsigned short i_delalloc_reserved_flag; + spinlock_t i_block_reservation_lock; }; #endif /* _EXT4_I */ Index: linux-2.6/fs/ext4/ext4_sb.h =================================================================== --- linux-2.6.orig/fs/ext4/ext4_sb.h 2008-06-05 13:44:20.522046081 -0500 +++ linux-2.6/fs/ext4/ext4_sb.h 2008-07-05 12:51:22.196357153 -0500 @@ -25,7 +25,7 @@ #include /* - * third extended-fs super-block data in memory + * fourth extended-fs super-block data in memory */ struct ext4_sb_info { unsigned long s_desc_size; /* Size of a group descriptor in bytes */ @@ -143,6 +143,9 @@ struct ext4_sb_info { /* locality groups */ struct ext4_locality_group *s_locality_groups; + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; }; #endif /* _EXT4_SB */ Index: linux-2.6/fs/ext4/mballoc.c =================================================================== --- linux-2.6.orig/fs/ext4/mballoc.c 2008-06-16 11:34:11.869008051 -0500 +++ linux-2.6/fs/ext4/mballoc.c 2008-07-05 12:51:24.558291107 -0500 @@ -803,6 +803,7 @@ static int ext4_mb_init_cache(struct pag if (!buffer_uptodate(bh[i])) goto out; + err = 0; first_block = 
page->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { int group; @@ -883,6 +884,7 @@ ext4_mb_load_buddy(struct super_block *s int pnum; int poff; struct page *page; + int ret; mb_debug("load group %lu\n", group); @@ -914,15 +916,21 @@ ext4_mb_load_buddy(struct super_block *s if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { - ext4_mb_init_cache(page, NULL); + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } mb_cmp_bitmaps(e4b, page_address(page) + (poff * sb->s_blocksize)); } unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; goto err; + } e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); mark_page_accessed(page); @@ -938,14 +946,20 @@ ext4_mb_load_buddy(struct super_block *s page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (page) { BUG_ON(page->mapping != inode->i_mapping); - if (!PageUptodate(page)) - ext4_mb_init_cache(page, e4b->bd_bitmap); - + if (!PageUptodate(page)) { + ret = ext4_mb_init_cache(page, e4b->bd_bitmap); + if (ret) { + unlock_page(page); + goto err; + } + } unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; goto err; + } e4b->bd_buddy_page = page; e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); mark_page_accessed(page); @@ -962,7 +976,7 @@ err: page_cache_release(e4b->bd_buddy_page); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; - return -EIO; + return ret; } static void ext4_mb_release_desc(struct ext4_buddy *e4b) @@ -1730,10 +1744,6 @@ ext4_mb_regular_allocator(struct ext4_al ac->ac_g_ex.fe_start = sbi->s_mb_last_start; spin_unlock(&sbi->s_md_lock); } - - /* searching for the right group start from the goal value specified */ - group = ac->ac_g_ex.fe_group; - /* Let's just scan groups to find more-less suitable blocks */ cr = ac->ac_2order ? 0 : 1; /* @@ -1743,6 +1753,12 @@ ext4_mb_regular_allocator(struct ext4_al repeat: for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { ac->ac_criteria = cr; + /* + * searching for the right group start + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; + for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { struct ext4_group_info *grp; struct ext4_group_desc *desc; @@ -1750,6 +1766,10 @@ repeat: if (group == EXT4_SB(sb)->s_groups_count) group = 0; + if (ac->ac_excepted_group != -1 && + group == ac->ac_excepted_group) + continue; + /* quick check to skip empty groups */ grp = ext4_get_group_info(ac->ac_sb, group); if (grp->bb_free == 0) @@ -1963,6 +1983,8 @@ static int ext4_mb_seq_history_open(stru int rc; int size; + if (unlikely(sbi->s_mb_history == NULL)) + return -ENOMEM; s = kmalloc(sizeof(*s), GFP_KERNEL); if (s == NULL) return -ENOMEM; @@ -2165,9 +2187,7 @@ static void ext4_mb_history_init(struct sbi->s_mb_history_cur = 0; spin_lock_init(&sbi->s_mb_history_lock); i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); - sbi->s_mb_history = kmalloc(i, GFP_KERNEL); - if (likely(sbi->s_mb_history != NULL)) - memset(sbi->s_mb_history, 0, i); + sbi->s_mb_history = kzalloc(i, GFP_KERNEL); /* if we can't allocate history, then we simple won't use it */ } @@ -2215,21 +2235,192 @@ ext4_mb_store_history(struct ext4_alloca #define ext4_mb_history_init(sb) #endif + +/* Create and initialize ext4_group_info data for the given group. 
*/ +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *desc) +{ + int i, len; + int metalen = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info **meta_group_info; + + /* + * First check if this group is the first of a reserved block. + * If it's true, we have to allocate a new table of pointers + * to ext4_group_info structures + */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + metalen = sizeof(*meta_group_info) << + EXT4_DESC_PER_BLOCK_BITS(sb); + meta_group_info = kmalloc(metalen, GFP_KERNEL); + if (meta_group_info == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate mem for a " + "buddy group\n"); + goto exit_meta_group_info; + } + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = + meta_group_info; + } + + /* + * calculate needed size. if change bb_counters size, + * don't forget about ext4_mb_generate_buddy() + */ + len = offsetof(typeof(**meta_group_info), + bb_counters[sb->s_blocksize_bits + 2]); + + meta_group_info = + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); + + meta_group_info[i] = kzalloc(len, GFP_KERNEL); + if (meta_group_info[i] == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); + goto exit_group_info; + } + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, + &(meta_group_info[i]->bb_state)); + + /* + * initialize bb_free to be able to skip + * empty groups without initialization + */ + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + meta_group_info[i]->bb_free = + ext4_free_blocks_after_init(sb, group, desc); + } else { + meta_group_info[i]->bb_free = + le16_to_cpu(desc->bg_free_blocks_count); + } + + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + +#ifdef DOUBLE_CHECK + { + struct buffer_head *bh; + meta_group_info[i]->bb_bitmap = + kmalloc(sb->s_blocksize, GFP_KERNEL); + BUG_ON(meta_group_info[i]->bb_bitmap == NULL); + bh = ext4_read_block_bitmap(sb, group); + BUG_ON(bh == NULL); + memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, + sb->s_blocksize); + put_bh(bh); + } +#endif + + return 0; + +exit_group_info: + /* If a meta_group_info table has been allocated, release it now */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) + kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); +exit_meta_group_info: + return -ENOMEM; +} /* ext4_mb_add_groupinfo */ + +/* + * Add a group to the existing groups. 
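+ *
+ * (For orientation: the two-level lookup that ext4_mb_add_groupinfo() above
+ * fills in is the same one ext4_get_group_info() performs, roughly
+ *	sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]
+ *			 [group & (EXT4_DESC_PER_BLOCK(sb) - 1)]
+ * i.e. one table of ext4_group_info pointers per descriptor block, then an
+ * index within that table.)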
+ * This function is used for online resize.
+ */
+int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
+			       struct ext4_group_desc *desc)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct inode *inode = sbi->s_buddy_cache;
+	int blocks_per_page;
+	int block;
+	int pnum;
+	struct page *page;
+	int err;
+
+	/* Add group based on group descriptor */
+	err = ext4_mb_add_groupinfo(sb, group, desc);
+	if (err)
+		return err;
+
+	/*
+	 * Cache pages containing dynamic mb_alloc data (buddy and bitmap
+	 * data) are set not up to date so that they will be re-initialized
+	 * during the next call to ext4_mb_load_buddy
+	 */
+
+	/* Set buddy page as not up to date */
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	page = find_get_page(inode->i_mapping, pnum);
+	if (page != NULL) {
+		ClearPageUptodate(page);
+		page_cache_release(page);
+	}
+
+	/* Set bitmap page as not up to date */
+	block++;
+	pnum = block / blocks_per_page;
+	page = find_get_page(inode->i_mapping, pnum);
+	if (page != NULL) {
+		ClearPageUptodate(page);
+		page_cache_release(page);
+	}
+
+	return 0;
+}
+
+/*
+ * Update an existing group.
+ * This function is used for online resize.
+ */
+void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
+{
+	grp->bb_free += add;
+}
+
 static int ext4_mb_init_backend(struct super_block *sb)
 {
 	ext4_group_t i;
-	int j, len, metalen;
+	int metalen;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	int num_meta_group_infos =
-		(sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
-			EXT4_DESC_PER_BLOCK_BITS(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int num_meta_group_infos;
+	int num_meta_group_infos_max;
+	int array_size;
 	struct ext4_group_info **meta_group_info;
+	struct ext4_group_desc *desc;
+
+	/* This is the number of blocks used by GDT */
+	num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+				1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
+
+	/*
+	 * This is the total number of blocks used by GDT including
+	 * the number of reserved blocks for GDT.
+	 * The s_group_info array is allocated with this value
+	 * to allow a clean online resize without a complex
+	 * manipulation of pointers.
+	 * The drawback is the unused memory when no resize
+	 * occurs, but it's very low in terms of pages
+	 * (see comments below).
+	 * Need to handle this properly when META_BG resizing is allowed
+	 */
+	num_meta_group_infos_max = num_meta_group_infos +
+		le16_to_cpu(es->s_reserved_gdt_blocks);
+
+	/*
+	 * array_size is the size of s_group_info array. We round it
+	 * to the next power of two because this approximation is done
+	 * internally by kmalloc so we can have some more memory
+	 * for free here (e.g. may be used for META_BG resize).
+	 */
+	array_size = 1;
+	while (array_size < sizeof(*sbi->s_group_info) *
+	       num_meta_group_infos_max)
+		array_size = array_size << 1;
 	/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
 	 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
 	 * So a two level scheme suffices for now. */
-	sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
-				    num_meta_group_infos, GFP_KERNEL);
+	sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
 	if (sbi->s_group_info == NULL) {
 		printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
 		return -ENOMEM;
@@ -2256,63 +2447,15 @@ static int ext4_mb_init_backend(struct s
 		sbi->s_group_info[i] = meta_group_info;
 	}
 
-	/*
-	 * calculate needed size.
if change bb_counters size, - * don't forget about ext4_mb_generate_buddy() - */ - len = sizeof(struct ext4_group_info); - len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); for (i = 0; i < sbi->s_groups_count; i++) { - struct ext4_group_desc *desc; - - meta_group_info = - sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)]; - j = i & (EXT4_DESC_PER_BLOCK(sb) - 1); - - meta_group_info[j] = kzalloc(len, GFP_KERNEL); - if (meta_group_info[j] == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); - goto err_freebuddy; - } desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { printk(KERN_ERR "EXT4-fs: can't read descriptor %lu\n", i); - i++; goto err_freebuddy; } - memset(meta_group_info[j], 0, len); - set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, - &(meta_group_info[j]->bb_state)); - - /* - * initialize bb_free to be able to skip - * empty groups without initialization - */ - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - meta_group_info[j]->bb_free = - ext4_free_blocks_after_init(sb, i, desc); - } else { - meta_group_info[j]->bb_free = - le16_to_cpu(desc->bg_free_blocks_count); - } - - INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list); - -#ifdef DOUBLE_CHECK - { - struct buffer_head *bh; - meta_group_info[j]->bb_bitmap = - kmalloc(sb->s_blocksize, GFP_KERNEL); - BUG_ON(meta_group_info[j]->bb_bitmap == NULL); - bh = read_block_bitmap(sb, i); - BUG_ON(bh == NULL); - memcpy(meta_group_info[j]->bb_bitmap, bh->b_data, - sb->s_blocksize); - put_bh(bh); - } -#endif - + if (ext4_mb_add_groupinfo(sb, i, desc) != 0) + goto err_freebuddy; } return 0; @@ -2336,6 +2479,7 @@ int ext4_mb_init(struct super_block *sb, unsigned i; unsigned offset; unsigned max; + int ret; if (!test_opt(sb, MBALLOC)) return 0; @@ -2370,12 +2514,12 @@ int ext4_mb_init(struct super_block *sb, } while (i <= sb->s_blocksize_bits + 1); /* init file for buddy data */ - i = ext4_mb_init_backend(sb); - if (i) { + ret = ext4_mb_init_backend(sb); + if (ret != 0) { clear_opt(sbi->s_mount_opt, MBALLOC); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); - return i; + return ret; } spin_lock_init(&sbi->s_md_lock); @@ -2575,25 +2719,24 @@ ext4_mb_free_committed_blocks(struct sup -#define MB_PROC_VALUE_READ(name) \ -static int ext4_mb_read_##name(char *page, char **start, \ - off_t off, int count, int *eof, void *data) \ +#define MB_PROC_FOPS(name) \ +static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \ { \ - struct ext4_sb_info *sbi = data; \ - int len; \ - *eof = 1; \ - if (off != 0) \ - return 0; \ - len = sprintf(page, "%ld\n", sbi->s_mb_##name); \ - *start = page; \ - return len; \ -} - -#define MB_PROC_VALUE_WRITE(name) \ -static int ext4_mb_write_##name(struct file *file, \ - const char __user *buf, unsigned long cnt, void *data) \ + struct ext4_sb_info *sbi = m->private; \ + \ + seq_printf(m, "%ld\n", sbi->s_mb_##name); \ + return 0; \ +} \ + \ +static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\ +{ \ + return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\ +} \ + \ +static ssize_t ext4_mb_##name##_proc_write(struct file *file, \ + const char __user *buf, size_t cnt, loff_t *ppos) \ { \ - struct ext4_sb_info *sbi = data; \ + struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\ char str[32]; \ long value; \ if (cnt >= sizeof(str)) \ @@ -2605,31 +2748,32 @@ static int ext4_mb_write_##name(struct f return -ERANGE; \ sbi->s_mb_##name = value; \ return cnt; \ -} +} \ + \ +static const struct file_operations 
ext4_mb_##name##_proc_fops = { \ + .owner = THIS_MODULE, \ + .open = ext4_mb_##name##_proc_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ + .write = ext4_mb_##name##_proc_write, \ +}; -MB_PROC_VALUE_READ(stats); -MB_PROC_VALUE_WRITE(stats); -MB_PROC_VALUE_READ(max_to_scan); -MB_PROC_VALUE_WRITE(max_to_scan); -MB_PROC_VALUE_READ(min_to_scan); -MB_PROC_VALUE_WRITE(min_to_scan); -MB_PROC_VALUE_READ(order2_reqs); -MB_PROC_VALUE_WRITE(order2_reqs); -MB_PROC_VALUE_READ(stream_request); -MB_PROC_VALUE_WRITE(stream_request); -MB_PROC_VALUE_READ(group_prealloc); -MB_PROC_VALUE_WRITE(group_prealloc); +MB_PROC_FOPS(stats); +MB_PROC_FOPS(max_to_scan); +MB_PROC_FOPS(min_to_scan); +MB_PROC_FOPS(order2_reqs); +MB_PROC_FOPS(stream_request); +MB_PROC_FOPS(group_prealloc); #define MB_PROC_HANDLER(name, var) \ do { \ - proc = create_proc_entry(name, mode, sbi->s_mb_proc); \ + proc = proc_create_data(name, mode, sbi->s_mb_proc, \ + &ext4_mb_##var##_proc_fops, sbi); \ if (proc == NULL) { \ printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \ goto err_out; \ } \ - proc->data = sbi; \ - proc->read_proc = ext4_mb_read_##var ; \ - proc->write_proc = ext4_mb_write_##var; \ } while (0) static int ext4_mb_init_per_dev_proc(struct super_block *sb) @@ -2747,7 +2891,7 @@ ext4_mb_mark_diskspace_used(struct ext4_ err = -EIO; - bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group); + bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); if (!bitmap_bh) goto out_err; @@ -2816,7 +2960,23 @@ ext4_mb_mark_diskspace_used(struct ext4_ le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); - percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); + + /* + * free blocks account has already be reduced/reserved + * at write_begin() time for delayed allocation + * do not double accounting + */ + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) + percpu_counter_sub(&sbi->s_freeblocks_counter, + ac->ac_b_ex.fe_len); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, + ac->ac_b_ex.fe_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } err = ext4_journal_dirty_metadata(handle, bitmap_bh); if (err) @@ -3569,7 +3729,7 @@ ext4_mb_discard_group_preallocations(str if (list_empty(&grp->bb_prealloc_list)) return 0; - bitmap_bh = read_block_bitmap(sb, group); + bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { /* error handling here */ ext4_mb_release_desc(&e4b); @@ -3743,7 +3903,7 @@ repeat: err = ext4_mb_load_buddy(sb, group, &e4b); BUG_ON(err != 0); /* error handling here */ - bitmap_bh = read_block_bitmap(sb, group); + bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { /* error handling here */ ext4_mb_release_desc(&e4b); @@ -3934,6 +4094,7 @@ ext4_mb_initialize_context(struct ext4_a ac->ac_bitmap_page = NULL; ac->ac_buddy_page = NULL; ac->ac_lg = NULL; + ac->ac_excepted_group = ar->excepted_group; /* we have to define context: we'll we work with a file or * locality group. 
this is a policy, actually */ @@ -4011,10 +4172,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t sbi = EXT4_SB(sb); if (!test_opt(sb, MBALLOC)) { - block = ext4_new_blocks_old(handle, ar->inode, ar->goal, + block = ext4_old_new_blocks(handle, ar->inode, ar->goal, &(ar->len), errp); return block; } + if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { + /* + * With delalloc we already reserved the blocks + */ + ar->len = ext4_has_free_blocks(sbi, ar->len); + } + + if (ar->len == 0) { + *errp = -ENOSPC; + return 0; + } while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; @@ -4026,10 +4198,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t } inquota = ar->len; + if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) + ar->flags |= EXT4_MB_DELALLOC_RESERVED; + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { + ar->len = 0; *errp = -ENOMEM; - return 0; + goto out1; } ext4_mb_poll_new_transaction(sb, handle); @@ -4037,12 +4213,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; - goto out; + goto out2; } ac->ac_op = EXT4_MB_HISTORY_PREALLOC; if (!ext4_mb_use_preallocated(ac)) { - ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); repeat: @@ -4085,11 +4260,12 @@ repeat: ext4_mb_release_context(ac); -out: +out2: + kmem_cache_free(ext4_ac_cachep, ac); +out1: if (ar->len < inquota) DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); - kmem_cache_free(ext4_ac_cachep, ac); return block; } static void ext4_mb_poll_new_transaction(struct super_block *sb, @@ -4242,7 +4418,7 @@ do_more: overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; } - bitmap_bh = read_block_bitmap(sb, block_group); + bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) goto error_return; gdp = ext4_get_group_desc(sb, block_group, &gd_bh); @@ -4321,6 +4497,13 @@ do_more: spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_add(&sbi->s_freeblocks_counter, count); + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks += count; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + ext4_mb_release_desc(&e4b); *freed += count; Index: linux-2.6/fs/ext4/balloc.c =================================================================== --- linux-2.6.orig/fs/ext4/balloc.c 2008-06-16 11:34:11.860451934 -0500 +++ linux-2.6/fs/ext4/balloc.c 2008-07-05 12:51:24.428291692 -0500 @@ -47,7 +47,7 @@ static int ext4_block_in_group(struct su ext4_group_t block_group) { ext4_group_t actual_group; - ext4_get_group_no_and_offset(sb, block, &actual_group, 0); + ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); if (actual_group == block_group) return 1; return 0; @@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct s le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); } } else { /* For META_BG_BLOCK_GROUPS */ - int group_rel = (block_group - - le32_to_cpu(sbi->s_es->s_first_meta_bg)) % - EXT4_DESC_PER_BLOCK(sb); - if (group_rel == 0 || group_rel == 1 || - (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1)) - bit_max += 1; + bit_max += ext4_bg_num_gdb(sb, block_group); } if (block_group == sbi->s_groups_count - 1) { @@ -295,7 +290,7 @@ err_out: return 0; } /** - * read_block_bitmap() + * ext4_read_block_bitmap() * @sb: super block * @block_group: given block group * @@ -305,7 +300,7 @@ err_out: * Return buffer_head on success or NULL in case of failure. 
 */
struct buffer_head *
-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
	struct ext4_group_desc * desc;
	struct buffer_head * bh = NULL;
@@ -409,8 +404,7 @@ restart:
		prev = rsv;
	}
	printk("Window map complete.\n");
-	if (bad)
-		BUG();
+	BUG_ON(bad);
}
#define rsv_window_dump(root, verbose) \
	__rsv_window_dump((root), (verbose), __func__)
@@ -434,7 +428,7 @@ restart:
 * If the goal block is within the reservation window, return 1;
 * otherwise, return 0;
 */
-static int
+int
goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
			ext4_group_t group, struct super_block *sb)
{
@@ -539,7 +533,7 @@ void ext4_rsv_window_add(struct super_bl
 * from the filesystem reservation window rb tree. Must be called with
 * rsv_lock hold.
 */
-static void rsv_window_remove(struct super_block *sb,
+void rsv_window_remove(struct super_block *sb,
		struct ext4_reserve_window_node *rsv)
{
	rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
@@ -554,7 +548,7 @@ static void rsv_window_remove(struct sup
 *
 * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
 */
-static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
+inline int rsv_is_empty(struct ext4_reserve_window *rsv)
{
	/* a valid reservation end block could not be 0 */
	return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
@@ -694,7 +688,7 @@ do_more:
		count -= overflow;
	}
	brelse(bitmap_bh);
-	bitmap_bh = read_block_bitmap(sb, block_group);
+	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
	if (!bitmap_bh)
		goto error_return;
	desc = ext4_get_group_desc (sb, block_group, &gd_bh);
@@ -810,6 +804,13 @@ do_more:
	spin_unlock(sb_bgl_lock(sbi, block_group));
	percpu_counter_add(&sbi->s_freeblocks_counter, count);

+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+		spin_lock(sb_bgl_lock(sbi, flex_group));
+		sbi->s_flex_groups[flex_group].free_blocks += count;
+		spin_unlock(sb_bgl_lock(sbi, flex_group));
+	}
+
	/* We dirtied the bitmap block */
	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -913,7 +914,7 @@ static int ext4_test_allocatable(ext4_gr
 * bitmap on disk and the last-committed copy in journal, until we find a
 * bit free in both bitmaps.
 */
-static ext4_grpblk_t
+ext4_grpblk_t
bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
					ext4_grpblk_t maxblocks)
{
@@ -1283,7 +1284,7 @@ static int find_next_reservable_window(
 * @bitmap_bh: the block group block bitmap
 *
 */
-static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
+int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
		ext4_grpblk_t grp_goal, struct super_block *sb,
		ext4_group_t group, struct buffer_head *bitmap_bh)
{
@@ -1427,7 +1428,7 @@ retry:
 * expand the reservation window size if necessary on a best-effort
 * basis before ext4_new_blocks() tries to allocate blocks,
 */
-static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
+void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
			struct super_block *sb, int size)
{
	struct ext4_reserve_window_node *next_rsv;
@@ -1598,23 +1599,35 @@ out:

/**
 * ext4_has_free_blocks()
- * @sbi:	in-core super block structure.
+ * @sbi:	in-core super block structure.
+ * @nblocks:	number of needed blocks
 *
- * Check if filesystem has at least 1 free block available for allocation.
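A note on the rewritten check below: ext4_has_free_blocks() no longer answers yes/no but clamps the request to the number of blocks actually available, paying for an exact percpu-counter sum only when the cheap read leaves less than FBC_BATCH of headroom (on UP the cheap read is already exact, hence the CONFIG_SMP guard). Both allocators then share the same calling pattern, skipping the check when delayed allocation already reserved the blocks at write_begin() time; schematically:

	if (!EXT4_I(inode)->i_delalloc_reserved_flag)
		*count = ext4_has_free_blocks(sbi, *count);	/* clamp the request */
	if (*count == 0) {
		*errp = -ENOSPC;
		return 0;
	}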
+ * Check if filesystem has free blocks available for allocation.
+ * Return the number of blocks available for allocation for this request.
+ * On success, return nblocks.
 */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
+ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+						ext4_fsblk_t nblocks)
{
-	ext4_fsblk_t free_blocks, root_blocks;
+	ext4_fsblk_t free_blocks;
+	ext4_fsblk_t root_blocks = 0;

	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
-	root_blocks = ext4_r_blocks_count(sbi->s_es);
-	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+
+	if (!capable(CAP_SYS_RESOURCE) &&
	    sbi->s_resuid != current->fsuid &&
-	    (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
-		return 0;
-	}
-	return 1;
-}
+	    (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+		root_blocks = ext4_r_blocks_count(sbi->s_es);
+#ifdef CONFIG_SMP
+	if (free_blocks - root_blocks < FBC_BATCH)
+		free_blocks =
+			percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
+#endif
+	if (free_blocks - root_blocks < nblocks)
+		return free_blocks - root_blocks;
+	return nblocks;
+}
+
 /**
 * ext4_should_retry_alloc()
@@ -1630,7 +1643,7 @@ static int ext4_has_free_blocks(struct e
 */
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
-	if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
		return 0;

	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1639,20 +1652,24 @@ int ext4_should_retry_alloc(struct super
}

/**
- * ext4_new_blocks_old() -- core block(s) allocation function
+ * ext4_old_new_blocks() -- core block bitmap based block allocation function
+ *
+ * @handle:		handle to this transaction
+ * @inode:		file inode
+ * @goal:		given target block(filesystem wide)
+ * @count:		target number of blocks to allocate
+ * @errp:		error code
+ *
- * ext4_new_blocks uses a goal block to assist allocation.  It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
+ * ext4_old_new_blocks uses a goal block to assist allocation and looks up
+ * the block bitmap directly to do block allocation.  It tries to
+ * allocate block(s) from the block group that contains the goal block first.
+ * If that fails, it will try to allocate block(s) from other block groups
+ * without any specific goal block.
+ *
+ * This function is called when the -o nomballoc mount option is enabled.
+ *
 */
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
			ext4_fsblk_t goal, unsigned long *count, int *errp)
{
	struct buffer_head *bitmap_bh = NULL;
@@ -1676,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_
	ext4_group_t ngroups;
	unsigned long num = *count;

-	*errp = -ENOSPC;
	sb = inode->i_sb;
	if (!sb) {
+		*errp = -ENODEV;
		printk("ext4_new_block: nonexistent device");
		return 0;
	}

+	sbi = EXT4_SB(sb);
+	if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
+		/*
+		 * With delalloc we already reserved the blocks
+		 */
+		*count = ext4_has_free_blocks(sbi, *count);
+	}
+	if (*count == 0) {
+		*errp = -ENOSPC;
+		return 0;	/* return with ENOSPC error */
+	}
+	num = *count;
+
	/*
	 * Check quota for allocation of this block.
	 */
@@ -1706,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_
	if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
		my_rsv = &block_i->rsv_window_node;

-	if (!ext4_has_free_blocks(sbi)) {
-		*errp = -ENOSPC;
-		goto out;
-	}
-
	/*
	 * First, test whether the goal block is free.
	 */
@@ -1734,7 +1759,7 @@ retry_alloc:
		my_rsv = NULL;

	if (free_blocks > 0) {
-		bitmap_bh = read_block_bitmap(sb, group_no);
+		bitmap_bh = ext4_read_block_bitmap(sb, group_no);
		if (!bitmap_bh)
			goto io_error;
		grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
@@ -1770,7 +1795,7 @@ retry_alloc:
			continue;

		brelse(bitmap_bh);
-		bitmap_bh = read_block_bitmap(sb, group_no);
+		bitmap_bh = ext4_read_block_bitmap(sb, group_no);
		if (!bitmap_bh)
			goto io_error;
		/*
@@ -1882,7 +1907,15 @@ allocated:
	le16_add_cpu(&gdp->bg_free_blocks_count, -num);
	gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
	spin_unlock(sb_bgl_lock(sbi, group_no));
-	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+	if (!EXT4_I(inode)->i_delalloc_reserved_flag)
+		percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
+		spin_lock(sb_bgl_lock(sbi, flex_group));
+		sbi->s_flex_groups[flex_group].free_blocks -= num;
+		spin_unlock(sb_bgl_lock(sbi, flex_group));
+	}

	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
	err = ext4_journal_dirty_metadata(handle, gdp_bh);
@@ -1915,46 +1948,104 @@ out:
	return 0;
}

-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, int *errp)
+#define EXT4_META_BLOCK	0x1
+
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
+			ext4_lblk_t iblock, ext4_fsblk_t goal,
+			unsigned long *count, int *errp, int flags)
{
	struct ext4_allocation_request ar;
	ext4_fsblk_t ret;

	if (!test_opt(inode->i_sb, MBALLOC)) {
-		unsigned long count = 1;
-		ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
-		return ret;
+		return ext4_old_new_blocks(handle, inode, goal, count, errp);
	}

	memset(&ar, 0, sizeof(ar));
+	/* Fill with neighbour allocated blocks */
+
	ar.inode = inode;
	ar.goal = goal;
-	ar.len = 1;
+	ar.len = *count;
+	ar.logical = iblock;
+
+	if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
+		/* enable in-core preallocation for data block allocation */
+		ar.flags = EXT4_MB_HINT_DATA;
+	else
+		/* disable in-core preallocation for non-regular files */
+		ar.flags = 0;
+
	ret = ext4_mb_new_blocks(handle, &ar, errp);
+	*count = ar.len;
	return ret;
}

-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+/*
+ * ext4_new_meta_blocks() -- allocate blocks for meta data (indexing)
+ *
+ * @handle:		handle to this transaction
+ * @inode:		file inode
+ * @goal:		given target block(filesystem wide)
+ * @count:		total number of blocks needed
+ * @errp:		error code
+ *
+ * Return the first allocated block number on success; *count stores the
+ * total number of blocks allocated, and any error is stored in *errp.
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
			ext4_fsblk_t goal, unsigned long *count, int *errp)
{
-	struct ext4_allocation_request ar;
	ext4_fsblk_t ret;
-
-	if (!test_opt(inode->i_sb, MBALLOC)) {
-		ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
-		return ret;
+	ret = do_blk_alloc(handle, inode, 0, goal,
+				count, errp, EXT4_META_BLOCK);
+	/*
+	 * Account for the allocated meta blocks
+	 */
+	if (!(*errp)) {
+		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+		EXT4_I(inode)->i_allocated_meta_blocks += *count;
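+		/*
+		 * Unlike data blocks, these metadata blocks were not
+		 * individually reserved by delayed allocation, so keep a
+		 * running count (under i_block_reservation_lock) that the
+		 * reservation code can use to square i_reserved_meta_blocks
+		 * with what was actually allocated.
+		 */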
+		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 	}
 	return ret;
 }

+/*
+ * ext4_new_meta_block() -- allocate a block for meta data (indexing)
+ *
+ * @handle:		handle to this transaction
+ * @inode:		file inode
+ * @goal:		given target block(filesystem wide)
+ * @errp:		error code
+ *
+ * Return allocated block number on success
+ */
+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
+			ext4_fsblk_t goal, int *errp)
+{
+	unsigned long count = 1;
+	return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
+}
+
+/*
+ * ext4_new_blocks() -- allocate data blocks
+ *
+ * @handle:		handle to this transaction
+ * @inode:		file inode
+ * @goal:		given target block(filesystem wide)
+ * @count:		total number of blocks needed
+ * @errp:		error code
+ *
+ * Return the first allocated block number on success; *count stores the
+ * total number of blocks allocated, and any error is stored in *errp.
+ */
+
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+			ext4_lblk_t iblock, ext4_fsblk_t goal,
+			unsigned long *count, int *errp)
+{
+	return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
+}

 /**
 * ext4_count_free_blocks() -- count filesystem free blocks
@@ -1986,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(stru
			continue;
		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
		brelse(bitmap_bh);
-		bitmap_bh = read_block_bitmap(sb, i);
+		bitmap_bh = ext4_read_block_bitmap(sb, i);
		if (bitmap_bh == NULL)
			continue;
Index: linux-2.6/fs/ext4/dir.c
===================================================================
--- linux-2.6.orig/fs/ext4/dir.c	2008-06-05 13:44:20.521046407 -0500
+++ linux-2.6/fs/ext4/dir.c	2008-07-05 12:51:23.713291581 -0500
@@ -129,7 +129,8 @@ static int ext4_readdir(struct file * fi
		struct buffer_head *bh = NULL;

		map_bh.b_state = 0;
-		err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+		err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
+						0, 0, 0);
		if (err > 0) {
			pgoff_t index = map_bh.b_blocknr >>
					(PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb

	while (n) {
		/* Do the node's children first */
-		if ((n)->rb_left) {
+		if (n->rb_left) {
			n = n->rb_left;
			continue;
		}
@@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb
			parent->rb_right = NULL;
		n = parent;
	}
-	root->rb_node = NULL;
}

-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
{
	struct dir_private_info *p;

-	p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+	p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
	if (!p)
		return NULL;
-	p->root.rb_node = NULL;
-	p->curr_node = NULL;
-	p->extra_fname = NULL;
-	p->last_pos = 0;
	p->curr_hash = pos2maj_hash(pos);
	p->curr_minor_hash = pos2min_hash(pos);
-	p->next_hash = 0;
	return p;
}

@@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file *
	int	ret;

	if (!info) {
-		info = create_dir_info(filp->f_pos);
+		info = ext4_htree_create_dir_info(filp->f_pos);
		if (!info)
			return -ENOMEM;
		filp->private_data = info;
Index: linux-2.6/fs/ext4/xattr_trusted.c
===================================================================
--- linux-2.6.orig/fs/ext4/xattr_trusted.c	2008-06-05 13:44:20.539983023 -0500
+++ linux-2.6/fs/ext4/xattr_trusted.c	2008-07-05 12:51:21.985654007 -0500
@@ -13,13 +13,11 @@
 #include "ext4.h"
 #include "xattr.h"

-#define XATTR_TRUSTED_PREFIX
"trusted." - static size_t ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!capable(CAP_SYS_ADMIN)) Index: linux-2.6/fs/ext4/xattr_user.c =================================================================== --- linux-2.6.orig/fs/ext4/xattr_user.c 2008-06-05 13:44:20.539983023 -0500 +++ linux-2.6/fs/ext4/xattr_user.c 2008-07-05 12:51:21.993609879 -0500 @@ -12,13 +12,11 @@ #include "ext4.h" #include "xattr.h" -#define XATTR_USER_PREFIX "user." - static size_t ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; + const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!test_opt(inode->i_sb, XATTR_USER)) Index: linux-2.6/fs/ext4/group.h =================================================================== --- linux-2.6.orig/fs/ext4/group.h 2008-06-05 13:44:20.524045910 -0500 +++ linux-2.6/fs/ext4/group.h 2008-07-05 12:51:22.039290556 -0500 @@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struc struct ext4_group_desc *gdp); extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, struct ext4_group_desc *gdp); -struct buffer_head *read_block_bitmap(struct super_block *sb, +struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group); extern unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, Index: linux-2.6/fs/ext4/ialloc.c =================================================================== --- linux-2.6.orig/fs/ext4/ialloc.c 2008-06-05 13:44:20.524045910 -0500 +++ linux-2.6/fs/ext4/ialloc.c 2008-07-05 12:51:22.214291112 -0500 @@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct ext4_super_block * es; struct ext4_sb_info *sbi; int fatal = 0, err; + ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { printk ("ext4_free_inode: inode has count=%d\n", @@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, if (is_directory) percpu_counter_dec(&sbi->s_dirs_counter); + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes++; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } } BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); err = ext4_journal_dirty_metadata(handle, bh2); @@ -286,6 +293,80 @@ static int find_group_dir(struct super_b return ret; } +#define free_block_ratio 10 + +static int find_group_flex(struct super_block *sb, struct inode *parent, + ext4_group_t *best_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *desc; + struct buffer_head *bh; + struct flex_groups *flex_group = sbi->s_flex_groups; + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); + ext4_group_t ngroups = sbi->s_groups_count; + int flex_size = ext4_flex_bg_size(sbi); + ext4_group_t best_flex = parent_fbg_group; + int blocks_per_flex = sbi->s_blocks_per_group * flex_size; + int flexbg_free_blocks; + int flex_freeb_ratio; + ext4_group_t n_fbg_groups; + ext4_group_t i; + + n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> + sbi->s_log_groups_per_flex; + +find_close_to_parent: + flexbg_free_blocks 
= flex_group[best_flex].free_blocks; + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + if (flex_group[best_flex].free_inodes && + flex_freeb_ratio > free_block_ratio) + goto found_flexbg; + + if (best_flex && best_flex == parent_fbg_group) { + best_flex--; + goto find_close_to_parent; + } + + for (i = 0; i < n_fbg_groups; i++) { + if (i == parent_fbg_group || i == parent_fbg_group - 1) + continue; + + flexbg_free_blocks = flex_group[i].free_blocks; + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + + if (flex_freeb_ratio > free_block_ratio && + flex_group[i].free_inodes) { + best_flex = i; + goto found_flexbg; + } + + if (best_flex < 0 || + (flex_group[i].free_blocks > + flex_group[best_flex].free_blocks && + flex_group[i].free_inodes)) + best_flex = i; + } + + if (!flex_group[best_flex].free_inodes || + !flex_group[best_flex].free_blocks) + return -1; + +found_flexbg: + for (i = best_flex * flex_size; i < ngroups && + i < (best_flex + 1) * flex_size; i++) { + desc = ext4_get_group_desc(sb, i, &bh); + if (le16_to_cpu(desc->bg_free_inodes_count)) { + *best_group = i; + goto out; + } + } + + return -1; +out: + return 0; +} + /* * Orlov's allocator for directories. * @@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *h struct inode *ret; ext4_group_t i; int free = 0; + ext4_group_t flex_group; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *h sbi = EXT4_SB(sb); es = sbi->s_es; + + if (sbi->s_log_groups_per_flex) { + ret2 = find_group_flex(sb, dir, &group); + goto got_group; + } + if (S_ISDIR(mode)) { if (test_opt (sb, OLDALLOC)) ret2 = find_group_dir(sb, dir, &group); @@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *h } else ret2 = find_group_other(sb, dir, &group); +got_group: err = -ENOSPC; if (ret2 == -1) goto out; @@ -600,7 +689,7 @@ got: /* We may have to initialize the block bitmap if it isn't already */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - struct buffer_head *block_bh = read_block_bitmap(sb, group); + struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); BUFFER_TRACE(block_bh, "get block bitmap access"); err = ext4_journal_get_write_access(handle, block_bh); @@ -676,6 +765,13 @@ got: percpu_counter_inc(&sbi->s_dirs_counter); sb->s_dirt = 1; + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes--; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + inode->i_uid = current->fsuid; if (test_opt (sb, GRPID)) inode->i_gid = dir->i_gid; Index: linux-2.6/fs/ext4/super.c =================================================================== --- linux-2.6.orig/fs/ext4/super.c 2008-06-16 11:34:11.903810444 -0500 +++ linux-2.6/fs/ext4/super.c 2008-07-05 12:51:23.998291006 -0500 @@ -506,6 +506,7 @@ static void ext4_put_super (struct super ext4_ext_release(sb); ext4_xattr_put_super(sb); jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); @@ -517,6 +518,7 @@ static void ext4_put_super (struct super for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); + kfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeblocks_counter); 
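The s_flex_groups array freed above is the in-memory summary this patch introduces: one {free_inodes, free_blocks} pair per group of 1 << s_log_groups_per_flex block groups. Every path that adjusts a group descriptor's free counts mirrors the delta into the summary under the flex group's bg-lock, always in the same shape (distilled from the alloc/free hunks above):

	if (sbi->s_log_groups_per_flex) {
		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);

		spin_lock(sb_bgl_lock(sbi, flex_group));
		sbi->s_flex_groups[flex_group].free_blocks += count; /* or -= */
		spin_unlock(sb_bgl_lock(sbi, flex_group));
	}

Keeping the summary coherent this way is what lets find_group_flex() choose an inode's flex group without walking every group descriptor.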
percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -571,6 +573,12 @@ static struct inode *ext4_alloc_inode(st memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); + jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); + ei->i_reserved_data_blocks = 0; + ei->i_reserved_meta_blocks = 0; + ei->i_allocated_meta_blocks = 0; + ei->i_delalloc_reserved_flag = 0; + spin_lock_init(&(ei->i_block_reservation_lock)); return &ei->vfs_inode; } @@ -635,6 +643,8 @@ static void ext4_clear_inode(struct inod EXT4_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) kfree(rsv); + jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, + &EXT4_I(inode)->jinode); } static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) @@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_ unsigned long def_mount_opts; struct super_block *sb = vfs->mnt_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - journal_t *journal = sbi->s_journal; struct ext4_super_block *es = sbi->s_es; def_mount_opts = le32_to_cpu(es->s_default_mount_opts); @@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_ seq_puts(seq, ",nomballoc"); if (test_opt(sb, I_VERSION)) seq_puts(seq, ",i_version"); + if (!test_opt(sb, DELALLOC)) + seq_puts(seq, ",nodelalloc"); + if (sbi->s_stripe) seq_printf(seq, ",stripe=%lu", sbi->s_stripe); @@ -894,7 +906,7 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, - Opt_mballoc, Opt_nomballoc, Opt_stripe, + Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, }; static match_table_t tokens = { @@ -953,6 +965,8 @@ static match_table_t tokens = { {Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, + {Opt_delalloc, "delalloc"}, + {Opt_nodelalloc, "nodelalloc"}, {Opt_err, NULL}, }; @@ -990,6 +1004,7 @@ static int parse_options (char *options, int qtype, qfmt; char *qname; #endif + ext4_fsblk_t last_block; if (!options) return 1; @@ -1312,12 +1327,29 @@ set_qf_format: set_opt (sbi->s_mount_opt, EXTENTS); break; case Opt_noextents: + /* + * When e2fsprogs support resizing an already existing + * ext3 file system to greater than 2**32 we need to + * add support to block allocator to handle growing + * already existing block mapped inode so that blocks + * allocated for them fall within 2**32 + */ + last_block = ext4_blocks_count(sbi->s_es) - 1; + if (last_block > 0xffffffffULL) { + printk(KERN_ERR "EXT4-fs: Filesystem too " + "large to mount with " + "-o noextents options\n"); + return 0; + } clear_opt (sbi->s_mount_opt, EXTENTS); break; case Opt_i_version: set_opt(sbi->s_mount_opt, I_VERSION); sb->s_flags |= MS_I_VERSION; break; + case Opt_nodelalloc: + clear_opt(sbi->s_mount_opt, DELALLOC); + break; case Opt_mballoc: set_opt(sbi->s_mount_opt, MBALLOC); break; @@ -1331,6 +1363,9 @@ set_qf_format: return 0; sbi->s_stripe = option; break; + case Opt_delalloc: + set_opt(sbi->s_mount_opt, DELALLOC); + break; default: printk (KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " @@ -1443,6 +1478,54 @@ static int ext4_setup_super(struct super return res; } +static int ext4_fill_flex_info(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + struct buffer_head *bh; + ext4_group_t flex_group_count; + ext4_group_t 
flex_group; + int groups_per_flex = 0; + __u64 block_bitmap = 0; + int i; + + if (!sbi->s_es->s_log_groups_per_flex) { + sbi->s_log_groups_per_flex = 0; + return 1; + } + + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; + groups_per_flex = 1 << sbi->s_log_groups_per_flex; + + flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) / + groups_per_flex; + sbi->s_flex_groups = kmalloc(flex_group_count * + sizeof(struct flex_groups), GFP_KERNEL); + if (sbi->s_flex_groups == NULL) { + printk(KERN_ERR "EXT4-fs: not enough memory\n"); + goto failed; + } + memset(sbi->s_flex_groups, 0, flex_group_count * + sizeof(struct flex_groups)); + + gdp = ext4_get_group_desc(sb, 1, &bh); + block_bitmap = ext4_block_bitmap(sb, gdp) - 1; + + for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext4_get_group_desc(sb, i, &bh); + + flex_group = ext4_flex_group(sbi, i); + sbi->s_flex_groups[flex_group].free_inodes += + le16_to_cpu(gdp->bg_free_inodes_count); + sbi->s_flex_groups[flex_group].free_blocks += + le16_to_cpu(gdp->bg_free_blocks_count); + } + + return 1; +failed: + return 0; +} + __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, struct ext4_group_desc *gdp) { @@ -1810,8 +1893,8 @@ static unsigned long ext4_get_stripe_siz } static int ext4_fill_super (struct super_block *sb, void *data, int silent) - __releases(kernel_sem) - __acquires(kernel_sem) + __releases(kernel_lock) + __acquires(kernel_lock) { struct buffer_head * bh; @@ -1851,11 +1934,6 @@ static int ext4_fill_super (struct super goto out_fail; } - if (!sb_set_blocksize(sb, blocksize)) { - printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize); - goto out_fail; - } - /* * The ext4 superblock will not be buffer aligned for other than 1kB * block sizes. We need to calculate the offset from buffer start. @@ -1928,6 +2006,13 @@ static int ext4_fill_super (struct super */ set_opt(sbi->s_mount_opt, MBALLOC); + /* + * enable delayed allocation by default + * Use -o nodelalloc to turn it off + */ + set_opt(sbi->s_mount_opt, DELALLOC); + + if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, NULL, 0)) goto failed_mount; @@ -2138,6 +2223,14 @@ static int ext4_fill_super (struct super printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); goto failed_mount2; } + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (!ext4_fill_flex_info(sb)) { + printk(KERN_ERR + "EXT4-fs: unable to initialize " + "flex_bg meta info!\n"); + goto failed_mount2; + } + sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); @@ -2358,6 +2451,13 @@ static int ext4_fill_super (struct super test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? 
"ordered": "writeback"); + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " + "requested data journaling mode\n"); + clear_opt(sbi->s_mount_opt, DELALLOC); + } else if (test_opt(sb, DELALLOC)) + printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); + ext4_ext_init(sb); ext4_mb_init(sb, needs_recovery); @@ -2372,6 +2472,7 @@ cantfind_ext4: failed_mount4: jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; failed_mount3: percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); @@ -3325,7 +3426,7 @@ static ssize_t ext4_quota_write(struct s err = ext4_journal_dirty_metadata(handle, bh); else { /* Always do at least ordered writes for quotas */ - err = ext4_journal_dirty_data(handle, bh); + err = ext4_jbd2_file_inode(handle, inode); mark_buffer_dirty(bh); } brelse(bh); @@ -3337,8 +3438,10 @@ static ssize_t ext4_quota_write(struct s blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off+len-towrite) { i_size_write(inode, off+len-towrite); EXT4_I(inode)->i_disksize = inode->i_size; Index: linux-2.6/fs/jbd2/commit.c =================================================================== --- linux-2.6.orig/fs/jbd2/commit.c 2008-06-16 11:34:12.070629463 -0500 +++ linux-2.6/fs/jbd2/commit.c 2008-07-05 12:51:23.906290812 -0500 @@ -22,6 +22,8 @@ #include #include #include +#include +#include /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(s } /* - * When an ext3-ordered file is truncated, it is possible that many pages are - * not sucessfully freed, because they are attached to a committing transaction. + * When an ext4 file is truncated, it is possible that some pages are not + * successfully freed, because they are attached to a committing transaction. * After the transaction commits, these pages are left on the LRU, with no * ->mapping, and with attached buffers. These pages are trivially reclaimable * by the VM, but their apparent absence upsets the VM accounting, and it makes @@ -80,21 +82,6 @@ nope: } /* - * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is - * held. For ranking reasons we must trylock. If we lose, schedule away and - * return 0. j_list_lock is dropped in this case. - */ -static int inverted_lock(journal_t *journal, struct buffer_head *bh) -{ - if (!jbd_trylock_bh_state(bh)) { - spin_unlock(&journal->j_list_lock); - schedule(); - return 0; - } - return 1; -} - -/* * Done it all: now submit the commit record. We should have * cleaned up our previous buffers by now, so if we are in abort * mode we can now just skip the rest of the journal write @@ -112,6 +99,7 @@ static int journal_submit_commit_record( struct buffer_head *bh; int ret; int barrier_done = 0; + struct timespec now = current_kernel_time(); if (is_journal_aborted(journal)) return 0; @@ -126,6 +114,8 @@ static int journal_submit_commit_record( tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + tmp->h_commit_sec = cpu_to_be64(now.tv_sec); + tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { @@ -197,159 +187,104 @@ static int journal_wait_on_commit_record } /* - * Wait for all submitted IO to complete. 
+ * Write the filemap data using writepage() address_space_operations.
+ * We don't do block allocation here even for delalloc. We don't
+ * use writepages() because with delayed allocation we may be doing
+ * block allocation in writepages().
 */
-static int journal_wait_on_locked_list(journal_t *journal,
-				       transaction_t *commit_transaction)
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
 {
-	int ret = 0;
-	struct journal_head *jh;
-
-	while (commit_transaction->t_locked_list) {
-		struct buffer_head *bh;
+	int ret;
+	struct writeback_control wbc = {
+		.sync_mode =  WB_SYNC_ALL,
+		.nr_to_write = mapping->nrpages * 2,
+		.range_start = 0,
+		.range_end = i_size_read(mapping->host),
+		.for_writepages = 1,
+	};
 
-		jh = commit_transaction->t_locked_list->b_tprev;
-		bh = jh2bh(jh);
-		get_bh(bh);
-		if (buffer_locked(bh)) {
-			spin_unlock(&journal->j_list_lock);
-			wait_on_buffer(bh);
-			if (unlikely(!buffer_uptodate(bh)))
-				ret = -EIO;
-			spin_lock(&journal->j_list_lock);
-		}
-		if (!inverted_lock(journal, bh)) {
-			put_bh(bh);
-			spin_lock(&journal->j_list_lock);
-			continue;
-		}
-		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
-			__jbd2_journal_unfile_buffer(jh);
-			jbd_unlock_bh_state(bh);
-			jbd2_journal_remove_journal_head(bh);
-			put_bh(bh);
-		} else {
-			jbd_unlock_bh_state(bh);
-		}
-		put_bh(bh);
-		cond_resched_lock(&journal->j_list_lock);
-	}
+	ret = generic_writepages(mapping, &wbc);
 	return ret;
- }
+}
 
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+/*
+ * Submit the data buffers of all inodes associated with the transaction
+ * to disk.
+ *
+ * We are in a committing transaction. Therefore no new inode can be added to
+ * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
+ * currently operate on from being released while we write out its pages.
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+		transaction_t *commit_transaction)
 {
-	int i;
+	struct jbd2_inode *jinode;
+	int err, ret = 0;
+	struct address_space *mapping;
 
-	for (i = 0; i < bufs; i++) {
-		wbuf[i]->b_end_io = end_buffer_write_sync;
-		/* We use-up our safety reference in submit_bh() */
-		submit_bh(WRITE, wbuf[i]);
+	spin_lock(&journal->j_list_lock);
+	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+		mapping = jinode->i_vfs_inode->i_mapping;
+		jinode->i_flags |= JI_COMMIT_RUNNING;
+		spin_unlock(&journal->j_list_lock);
+		/*
+		 * Submit the inode's data buffers. We use writepage
+		 * instead of writepages because writepages can do
+		 * block allocation with delalloc; we need to write
+		 * only already-allocated blocks here.
+		 */
+		err = journal_submit_inode_data_buffers(mapping);
+		if (!ret)
+			ret = err;
+		spin_lock(&journal->j_list_lock);
+		J_ASSERT(jinode->i_transaction == commit_transaction);
+		jinode->i_flags &= ~JI_COMMIT_RUNNING;
+		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 	}
+	spin_unlock(&journal->j_list_lock);
+	return ret;
 }
 
 /*
- * Submit all the data buffers to disk
+ * Wait for data submitted for writeout, refile inodes to proper
+ * transaction if needed.
+ * */ -static void journal_submit_data_buffers(journal_t *journal, - transaction_t *commit_transaction) +static int journal_finish_inode_data_buffers(journal_t *journal, + transaction_t *commit_transaction) { - struct journal_head *jh; - struct buffer_head *bh; - int locked; - int bufs = 0; - struct buffer_head **wbuf = journal->j_wbuf; + struct jbd2_inode *jinode, *next_i; + int err, ret = 0; - /* - * Whenever we unlock the journal and sleep, things can get added - * onto ->t_sync_datalist, so we have to keep looping back to - * write_out_data until we *know* that the list is empty. - * - * Cleanup any flushed data buffers from the data list. Even in - * abort mode, we want to flush this out as soon as possible. - */ -write_out_data: - cond_resched(); + /* For locking, see the comment in journal_submit_data_buffers() */ spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + jinode->i_flags |= JI_COMMIT_RUNNING; + spin_unlock(&journal->j_list_lock); + err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); + if (!ret) + ret = err; + spin_lock(&journal->j_list_lock); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } - while (commit_transaction->t_sync_datalist) { - jh = commit_transaction->t_sync_datalist; - bh = jh2bh(jh); - locked = 0; - - /* Get reference just to make sure buffer does not disappear - * when we are forced to drop various locks */ - get_bh(bh); - /* If the buffer is dirty, we need to submit IO and hence - * we need the buffer lock. We try to lock the buffer without - * blocking. If we fail, we need to drop j_list_lock and do - * blocking lock_buffer(). - */ - if (buffer_dirty(bh)) { - if (test_set_buffer_locked(bh)) { - BUFFER_TRACE(bh, "needs blocking lock"); - spin_unlock(&journal->j_list_lock); - /* Write out all data to prevent deadlocks */ - journal_do_submit_data(wbuf, bufs); - bufs = 0; - lock_buffer(bh); - spin_lock(&journal->j_list_lock); - } - locked = 1; - } - /* We have to get bh_state lock. Again out of order, sigh. */ - if (!inverted_lock(journal, bh)) { - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - } - /* Someone already cleaned up the buffer? 
*/ - if (!buffer_jbd(bh) - || jh->b_transaction != commit_transaction - || jh->b_jlist != BJ_SyncData) { - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - BUFFER_TRACE(bh, "already cleaned up"); - put_bh(bh); - continue; - } - if (locked && test_clear_buffer_dirty(bh)) { - BUFFER_TRACE(bh, "needs writeout, adding to array"); - wbuf[bufs++] = bh; - __jbd2_journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - if (bufs == journal->j_wbufsize) { - spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); - bufs = 0; - goto write_out_data; - } - } else if (!locked && buffer_locked(bh)) { - __jbd2_journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - put_bh(bh); + /* Now refile inode to proper lists */ + list_for_each_entry_safe(jinode, next_i, + &commit_transaction->t_inode_list, i_list) { + list_del(&jinode->i_list); + if (jinode->i_next_transaction) { + jinode->i_transaction = jinode->i_next_transaction; + jinode->i_next_transaction = NULL; + list_add(&jinode->i_list, + &jinode->i_transaction->t_inode_list); } else { - BUFFER_TRACE(bh, "writeout complete: unfile"); - __jbd2_journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - jbd2_journal_remove_journal_head(bh); - /* Once for our safety reference, once for - * jbd2_journal_remove_journal_head() */ - put_bh(bh); - put_bh(bh); - } - - if (need_resched() || spin_needbreak(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); - goto write_out_data; + jinode->i_transaction = NULL; } } spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); + + return ret; } static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) @@ -524,21 +459,7 @@ void jbd2_journal_commit_transaction(jou * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = 0; - journal_submit_data_buffers(journal, commit_transaction); - - /* - * Wait for all previously submitted IO to complete if commit - * record is to be written synchronously. - */ - spin_lock(&journal->j_list_lock); - if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) - err = journal_wait_on_locked_list(journal, - commit_transaction); - - spin_unlock(&journal->j_list_lock); - + err = journal_submit_data_buffers(journal, commit_transaction); if (err) jbd2_journal_abort(journal, err); @@ -547,16 +468,6 @@ void jbd2_journal_commit_transaction(jou jbd_debug(3, "JBD: commit phase 2\n"); /* - * If we found any dirty or locked buffers, then we should have - * looped back up to the write_out_data label. If there weren't - * any then journal_clean_data_list should have wiped the list - * clean by now, so check that it is in fact empty. - */ - J_ASSERT (commit_transaction->t_sync_datalist == NULL); - - jbd_debug (3, "JBD: commit phase 3\n"); - - /* * Way to go: we have now written out all of the data for a * transaction! Now comes the tricky part: we need to write out * metadata. 
Loop over the transaction's entire buffer list: @@ -574,6 +485,7 @@ void jbd2_journal_commit_transaction(jou J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); + err = 0; descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { @@ -748,15 +660,19 @@ start_journal_io: &cbh, crc32_sum); if (err) __jbd2_journal_abort_hard(journal); - - spin_lock(&journal->j_list_lock); - err = journal_wait_on_locked_list(journal, - commit_transaction); - spin_unlock(&journal->j_list_lock); - if (err) - __jbd2_journal_abort_hard(journal); } + /* + * This is the right place to wait for data buffers both for ASYNC + * and !ASYNC commit. If commit is ASYNC, we need to wait only after + * the commit block went to disk (which happens above). If commit is + * SYNC, we need to wait for data buffers before we start writing + * commit block, which happens below in such setting. + */ + err = journal_finish_inode_data_buffers(journal, commit_transaction); + if (err) + jbd2_journal_abort(journal, err); + /* Lo and behold: we have just managed to send a transaction to the log. Before we can commit it, wait for the IO so far to complete. Control buffers being written are on the @@ -768,7 +684,7 @@ start_journal_io: so we incur less scheduling load. */ - jbd_debug(3, "JBD: commit phase 4\n"); + jbd_debug(3, "JBD: commit phase 3\n"); /* * akpm: these are BJ_IO, and j_list_lock is not needed. @@ -827,7 +743,7 @@ wait_for_iobuf: J_ASSERT (commit_transaction->t_shadow_list == NULL); - jbd_debug(3, "JBD: commit phase 5\n"); + jbd_debug(3, "JBD: commit phase 4\n"); /* Here we wait for the revoke record and descriptor record buffers */ wait_for_ctlbuf: @@ -854,7 +770,7 @@ wait_for_iobuf: /* AKPM: bforget here */ } - jbd_debug(3, "JBD: commit phase 6\n"); + jbd_debug(3, "JBD: commit phase 5\n"); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { @@ -874,9 +790,9 @@ wait_for_iobuf: transaction can be removed from any checkpoint list it was on before. */ - jbd_debug(3, "JBD: commit phase 7\n"); + jbd_debug(3, "JBD: commit phase 6\n"); - J_ASSERT(commit_transaction->t_sync_datalist == NULL); + J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); J_ASSERT(commit_transaction->t_iobuf_list == NULL); @@ -997,7 +913,7 @@ restart_loop: /* Done with this transaction! */ - jbd_debug(3, "JBD: commit phase 8\n"); + jbd_debug(3, "JBD: commit phase 7\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT); Index: linux-2.6/include/linux/jbd2.h =================================================================== --- linux-2.6.orig/include/linux/jbd2.h 2008-06-16 11:34:12.762013789 -0500 +++ linux-2.6/include/linux/jbd2.h 2008-07-05 12:51:23.276619947 -0500 @@ -168,6 +168,8 @@ struct commit_header { unsigned char h_chksum_size; unsigned char h_padding[2]; __be32 h_chksum[JBD2_CHECKSUM_BYTES]; + __be64 h_commit_sec; + __be32 h_commit_nsec; }; /* @@ -379,6 +381,38 @@ static inline void jbd_unlock_bh_journal bit_spin_unlock(BH_JournalHead, &bh->b_state); } +/* Flags in jbd_inode->i_flags */ +#define __JI_COMMIT_RUNNING 0 +/* Commit of the inode data in progress. We use this flag to protect us from + * concurrent deletion of inode. 
We cannot use reference to inode for this
+ * since we cannot afford doing last iput() on behalf of kjournald
+ */
+#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
+
+/**
+ * struct jbd2_inode is the structure linking inodes in ordered mode
+ * present in a transaction so that we can sync them during commit.
+ */
+struct jbd2_inode {
+	/* Which transaction does this inode belong to? Either the running
+	 * transaction or the committing one. [j_list_lock] */
+	transaction_t *i_transaction;
+
+	/* Pointer to the running transaction modifying inode's data in case
+	 * there is already a committing transaction touching it. [j_list_lock] */
+	transaction_t *i_next_transaction;
+
+	/* List of inodes in the i_transaction [j_list_lock] */
+	struct list_head i_list;
+
+	/* VFS inode this inode belongs to [constant during the lifetime
+	 * of the structure] */
+	struct inode *i_vfs_inode;
+
+	/* Flags of inode [j_list_lock] */
+	unsigned int i_flags;
+};
+
 struct jbd2_revoke_table_s;
 
 /**
@@ -509,24 +543,12 @@ struct transaction_s
 	struct journal_head *t_reserved_list;
 
 	/*
-	 * Doubly-linked circular list of all buffers under writeout during
-	 * commit [j_list_lock]
-	 */
-	struct journal_head *t_locked_list;
-
-	/*
 	 * Doubly-linked circular list of all metadata buffers owned by this
 	 * transaction [j_list_lock]
 	 */
 	struct journal_head *t_buffers;
 
 	/*
-	 * Doubly-linked circular list of all data buffers still to be
-	 * flushed before this transaction can be committed [j_list_lock]
-	 */
-	struct journal_head *t_sync_datalist;
-
-	/*
 	 * Doubly-linked circular list of all forget buffers (superseded
 	 * buffers which we can un-checkpoint once this transaction commits)
 	 * [j_list_lock]
@@ -565,6 +587,12 @@ struct transaction_s
 	struct journal_head *t_log_list;
 
 	/*
+	 * List of inodes whose data we've modified in data=ordered mode.
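
To make the intended lifecycle of this structure concrete: a client filesystem initializes the jbd2_inode once per in-core inode, files it against the running transaction whenever it dirties ordered data, and releases it at inode teardown (the release path waits on JI_COMMIT_RUNNING). A minimal sketch under those assumptions -- my_inode_info is hypothetical, the helpers are the ones declared further down; ext4 wires this up the same way in its alloc_inode/clear_inode paths:

struct my_inode_info {
        struct jbd2_inode       jinode;
        struct inode            vfs_inode;
};

/* once, when the in-core inode is created */
static void my_inode_init_once(struct my_inode_info *ei)
{
        jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
}

/* each time ordered data is dirtied under a running handle */
static int my_order_data(handle_t *handle, struct my_inode_info *ei)
{
        return jbd2_journal_file_inode(handle, &ei->jinode);
}

/* at inode teardown; waits for any commit still using the inode */
static void my_inode_release(journal_t *journal, struct my_inode_info *ei)
{
        jbd2_journal_release_jbd_inode(journal, &ei->jinode);
}
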
+ * [j_list_lock] + */ + struct list_head t_inode_list; + + /* * Protects info related to handles */ spinlock_t t_handle_lock; @@ -1004,7 +1032,6 @@ extern int jbd2_journal_extend (handle_ extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); -extern int jbd2_journal_dirty_data (handle_t *, struct buffer_head *); extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); @@ -1044,6 +1071,10 @@ extern void jbd2_journal_ack_err ( extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); +extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); +extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size); +extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); +extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); /* * journal_head management @@ -1179,15 +1210,13 @@ static inline int jbd_space_needed(journ /* journaling buffer types */ #define BJ_None 0 /* Not journaled */ -#define BJ_SyncData 1 /* Normal data: flush before commit */ -#define BJ_Metadata 2 /* Normal journaled metadata */ -#define BJ_Forget 3 /* Buffer superseded by this transaction */ -#define BJ_IO 4 /* Buffer is for temporary IO use */ -#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */ -#define BJ_LogCtl 6 /* Buffer contains log descriptors */ -#define BJ_Reserved 7 /* Buffer is reserved for access by journal */ -#define BJ_Locked 8 /* Locked for I/O during commit */ -#define BJ_Types 9 +#define BJ_Metadata 1 /* Normal journaled metadata */ +#define BJ_Forget 2 /* Buffer superseded by this transaction */ +#define BJ_IO 3 /* Buffer is for temporary IO use */ +#define BJ_Shadow 4 /* Buffer contents being shadowed to the log */ +#define BJ_LogCtl 5 /* Buffer contains log descriptors */ +#define BJ_Reserved 6 /* Buffer is reserved for access by journal */ +#define BJ_Types 7 extern int jbd_blocks_per_page(struct inode *inode); Index: linux-2.6/fs/jbd2/transaction.c =================================================================== --- linux-2.6.orig/fs/jbd2/transaction.c 2008-06-05 13:44:21.036983179 -0500 +++ linux-2.6/fs/jbd2/transaction.c 2008-07-05 12:51:23.260291032 -0500 @@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_b * new transaction and we can't block without protecting against other * processes trying to touch the journal while it is in transition. * - * Called under j_state_lock */ static transaction_t * @@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); + INIT_LIST_HEAD(&transaction->t_inode_list); /* Set up the commit timer for the new transaction. */ journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); @@ -943,183 +943,6 @@ out: } /** - * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which - * needs to be flushed before we can commit the - * current transaction. 
- * @handle: transaction - * @bh: bufferhead to mark - * - * The buffer is placed on the transaction's data list and is marked as - * belonging to the transaction. - * - * Returns error number or 0 on success. - * - * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage - * by kswapd. - */ -int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - journal_t *journal = handle->h_transaction->t_journal; - int need_brelse = 0; - struct journal_head *jh; - - if (is_handle_aborted(handle)) - return 0; - - jh = jbd2_journal_add_journal_head(bh); - JBUFFER_TRACE(jh, "entry"); - - /* - * The buffer could *already* be dirty. Writeout can start - * at any time. - */ - jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); - - /* - * What if the buffer is already part of a running transaction? - * - * There are two cases: - * 1) It is part of the current running transaction. Refile it, - * just in case we have allocated it as metadata, deallocated - * it, then reallocated it as data. - * 2) It is part of the previous, still-committing transaction. - * If all we want to do is to guarantee that the buffer will be - * written to disk before this new transaction commits, then - * being sure that the *previous* transaction has this same - * property is sufficient for us! Just leave it on its old - * transaction. - * - * In case (2), the buffer must not already exist as metadata - * --- that would violate write ordering (a transaction is free - * to write its data at any point, even before the previous - * committing transaction has committed). The caller must - * never, ever allow this to happen: there's nothing we can do - * about it in this layer. - */ - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - - /* Now that we have bh_state locked, are we really still mapped? */ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); - goto no_journal; - } - - if (jh->b_transaction) { - JBUFFER_TRACE(jh, "has transaction"); - if (jh->b_transaction != handle->h_transaction) { - JBUFFER_TRACE(jh, "belongs to older transaction"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - - /* @@@ IS THIS TRUE ? */ - /* - * Not any more. Scenario: someone does a write() - * in data=journal mode. The buffer's transaction has - * moved into commit. Then someone does another - * write() to the file. We do the frozen data copyout - * and set b_next_transaction to point to j_running_t. - * And while we're in that state, someone does a - * writepage() in an attempt to pageout the same area - * of the file via a shared mapping. At present that - * calls jbd2_journal_dirty_data(), and we get right here. - * It may be too late to journal the data. Simply - * falling through to the next test will suffice: the - * data will be dirty and wil be checkpointed. The - * ordering comments in the next comment block still - * apply. - */ - //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - - /* - * If we're journalling data, and this buffer was - * subject to a write(), it could be metadata, forget - * or shadow against the committing transaction. Now, - * someone has dirtied the same darn page via a mapping - * and it is being writepage()'d. - * We *could* just steal the page from commit, with some - * fancy locking there. Instead, we just skip it - - * don't tie the page's buffers to the new transaction - * at all. 
- * Implication: if we crash before the writepage() data - * is written into the filesystem, recovery will replay - * the write() data. - */ - if (jh->b_jlist != BJ_None && - jh->b_jlist != BJ_SyncData && - jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "Not stealing"); - goto no_journal; - } - - /* - * This buffer may be undergoing writeout in commit. We - * can't return from here and let the caller dirty it - * again because that can cause the write-out loop in - * commit to never terminate. - */ - if (buffer_dirty(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - need_brelse = 1; - sync_dirty_buffer(bh); - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - /* Since we dropped the lock... */ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "buffer got unmapped"); - goto no_journal; - } - /* The buffer may become locked again at any - time if it is redirtied */ - } - - /* journal_clean_data_list() may have got there first */ - if (jh->b_transaction != NULL) { - JBUFFER_TRACE(jh, "unfile from commit"); - __jbd2_journal_temp_unlink_buffer(jh); - /* It still points to the committing - * transaction; move it to this one so - * that the refile assert checks are - * happy. */ - jh->b_transaction = handle->h_transaction; - } - /* The buffer will be refiled below */ - - } - /* - * Special case --- the buffer might actually have been - * allocated and then immediately deallocated in the previous, - * committing transaction, so might still be left on that - * transaction's metadata lists. - */ - if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "not on correct data list: unfile"); - J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); - __jbd2_journal_temp_unlink_buffer(jh); - jh->b_transaction = handle->h_transaction; - JBUFFER_TRACE(jh, "file as data"); - __jbd2_journal_file_buffer(jh, handle->h_transaction, - BJ_SyncData); - } - } else { - JBUFFER_TRACE(jh, "not on a transaction"); - __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); - } -no_journal: - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - if (need_brelse) { - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - } - JBUFFER_TRACE(jh, "exit"); - jbd2_journal_put_journal_head(jh); - return 0; -} - -/** * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. * @bh: buffer to mark @@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head * * Remove a buffer from the appropriate transaction list. * * Note that this function can *change* the value of - * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, - * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller - * is holding onto a copy of one of thee pointers, it could go bad. - * Generally the caller needs to re-read the pointer from the transaction_t. + * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, + * t_log_list or t_reserved_list. If the caller is holding onto a copy of one + * of these pointers, it could go bad. Generally the caller needs to re-read + * the pointer from the transaction_t. * * Called under j_list_lock. The journal may not be locked. 
 */
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(s
 	switch (jh->b_jlist) {
 	case BJ_None:
 		return;
-	case BJ_SyncData:
-		list = &transaction->t_sync_datalist;
-		break;
 	case BJ_Metadata:
 		transaction->t_nr_buffers--;
 		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(s
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
-	case BJ_Locked:
-		list = &transaction->t_locked_list;
-		break;
 	}
 
 	__blist_del_buffer(list, jh);
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *
 		goto out;
 
 	spin_lock(&journal->j_list_lock);
-	if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
-		if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
-			/* A written-back ordered data buffer */
-			JBUFFER_TRACE(jh, "release data");
-			__jbd2_journal_unfile_buffer(jh);
-			jbd2_journal_remove_journal_head(bh);
-			__brelse(bh);
-		}
-	} else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
+	if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
 		/* written-back checkpointed metadata buffer */
 		if (jh->b_jlist == BJ_None) {
 			JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1656,12 +1465,43 @@ out:
 	return;
 }
 
+/*
+ * jbd2_journal_try_to_free_buffers() could race with
+ * jbd2_journal_commit_transaction(). The latter might still hold a
+ * reference count to the buffers while inspecting them on
+ * t_sync_datalist or t_locked_list.
+ *
+ * jbd2_journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers,
+ * before trying to free that buffer.
+ *
+ * Takes and releases journal->j_state_lock internally.
+ */
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+	transaction_t *transaction = NULL;
+	tid_t tid;
+
+	spin_lock(&journal->j_state_lock);
+	transaction = journal->j_committing_transaction;
+
+	if (!transaction) {
+		spin_unlock(&journal->j_state_lock);
+		return;
+	}
+
+	tid = transaction->t_tid;
+	spin_unlock(&journal->j_state_lock);
+	jbd2_log_wait_commit(journal, tid);
+}
 
 /**
  * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
  * @journal: journal for operation
  * @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard we should try to release
+ * buffers. If __GFP_WAIT and __GFP_FS are set, we wait for commit code to
+ * release the buffers.
  *
  *
  * For all the buffers on this page,
@@ -1690,9 +1530,11 @@ out:
  * journal_try_to_free_buffer() is changing its state. But that
  * cannot happen because we never reallocate freed data as metadata
  * while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
  */
 int jbd2_journal_try_to_free_buffers(journal_t *journal,
-				struct page *page, gfp_t unused_gfp_mask)
+				struct page *page, gfp_t gfp_mask)
 {
 	struct buffer_head *head;
 	struct buffer_head *bh;
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(jou
 		/*
 		 * We take our own ref against the journal_head here to avoid
 		 * having to add tons of locking around each instance of
-		 * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
+		 * jbd2_journal_remove_journal_head() and
+		 * jbd2_journal_put_journal_head().
 */
 		jh = jbd2_journal_grab_journal_head(bh);
 		if (!jh)
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(jou
 		if (buffer_jbd(bh))
 			goto busy;
 	} while ((bh = bh->b_this_page) != head);
+
 	ret = try_to_free_buffers(page);
+
+	/*
+	 * There are a number of places where jbd2_journal_try_to_free_buffers()
+	 * could race with jbd2_journal_commit_transaction(); the latter may
+	 * still hold a reference to the buffers while processing them, in
+	 * which case try_to_free_buffers() fails to free them. Some callers
+	 * of releasepage() request that page buffers be dropped, and
+	 * otherwise treat a failure to free them as an error (such as
+	 * generic_file_direct_IO()).
+	 *
+	 * So, if the caller of try_to_release_page() wants the synchronous
+	 * behaviour (i.e. make sure buffers are dropped upon return),
+	 * let's wait for the current transaction to finish flush of
+	 * dirty data buffers, then try to free those buffers again,
+	 * with the journal locked.
+	 */
+	if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+		jbd2_journal_wait_for_transaction_sync_data(journal);
+		ret = try_to_free_buffers(page);
+	}
+
 busy:
 	return ret;
 }
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_
 	if (!buffer_jbd(bh))
 		goto zap_buffer_unlocked;
 
+	/* OK, we have a data buffer in journaled mode */
 	spin_lock(&journal->j_state_lock);
 	jbd_lock_bh_state(bh);
 	spin_lock(&journal->j_list_lock);
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_
 		}
 	} else if (transaction == journal->j_committing_transaction) {
 		JBUFFER_TRACE(jh, "on committing transaction");
-		if (jh->b_jlist == BJ_Locked) {
-			/*
-			 * The buffer is on the committing transaction's locked
-			 * list. We have the buffer locked, so I/O has
-			 * completed. So we can nail the buffer now.
-			 */
-			may_free = __dispose_buffer(jh, transaction);
-			goto zap_buffer;
-		}
 		/*
 		 * If it is committing, we simply cannot touch it. We
 		 * can remove it's next_transaction pointer from the
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct j
 		J_ASSERT_JH(jh, !jh->b_committed_data);
 		J_ASSERT_JH(jh, !jh->b_frozen_data);
 		return;
-	case BJ_SyncData:
-		list = &transaction->t_sync_datalist;
-		break;
 	case BJ_Metadata:
 		transaction->t_nr_buffers++;
 		list = &transaction->t_buffers;
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct j
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
-	case BJ_Locked:
-		list = &transaction->t_locked_list;
-		break;
 	}
 
 	__blist_add_buffer(list, jh);
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_
 	spin_unlock(&journal->j_list_lock);
 	__brelse(bh);
 }
+
+/*
+ * File inode in the inode list of the handle's transaction
+ */
+int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+
+	if (is_handle_aborted(handle))
+		return -EIO;
+
+	jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+			transaction->t_tid);
+
+	/*
+	 * First check whether inode isn't already on the transaction's
+	 * lists without taking the lock. Note that this check is safe
+	 * without the lock as we cannot race with somebody removing inode
+	 * from the transaction. The reason is that we remove inode from the
+	 * transaction only in journal_release_jbd_inode() and when we commit
+	 * the transaction. We are guarded from the first case by holding
+	 * a reference to the inode. We are safe against the second case
+	 * because if jinode->i_transaction == transaction, commit code
+	 * cannot touch the transaction because we hold a reference to it,
+	 * and if jinode->i_next_transaction == transaction, commit code
+	 * will only file the inode where we want it.
+	 */
+	if (jinode->i_transaction == transaction ||
+	    jinode->i_next_transaction == transaction)
+		return 0;
+
+	spin_lock(&journal->j_list_lock);
+
+	if (jinode->i_transaction == transaction ||
+	    jinode->i_next_transaction == transaction)
+		goto done;
+
+	/* On some different transaction's list - should be
+	 * the committing one */
+	if (jinode->i_transaction) {
+		J_ASSERT(jinode->i_next_transaction == NULL);
+		J_ASSERT(jinode->i_transaction ==
+					journal->j_committing_transaction);
+		jinode->i_next_transaction = transaction;
+		goto done;
+	}
+	/* Not on any transaction list... */
+	J_ASSERT(!jinode->i_next_transaction);
+	jinode->i_transaction = transaction;
+	list_add(&jinode->i_list, &transaction->t_inode_list);
+done:
+	spin_unlock(&journal->j_list_lock);
+
+	return 0;
+}
+
+/*
+ * This function must be called when inode is journaled in ordered mode
+ * before truncation happens. It starts writeout of the truncated part in
+ * case it is in the committing transaction, so that we uphold the ordered
+ * mode consistency guarantees.
+ */
+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+					loff_t new_size)
+{
+	journal_t *journal;
+	transaction_t *commit_trans;
+	int ret = 0;
+
+	if (!inode->i_transaction && !inode->i_next_transaction)
+		goto out;
+	journal = inode->i_transaction->t_journal;
+	spin_lock(&journal->j_state_lock);
+	commit_trans = journal->j_committing_transaction;
+	spin_unlock(&journal->j_state_lock);
+	if (inode->i_transaction == commit_trans) {
+		ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+			new_size, LLONG_MAX);
+		if (ret)
+			jbd2_journal_abort(journal, ret);
+	}
+out:
+	return ret;
+}
Index: linux-2.6/fs/ext4/extents.c
===================================================================
--- linux-2.6.orig/fs/ext4/extents.c	2008-06-16 11:32:42.791046936 -0500
+++ linux-2.6/fs/ext4/extents.c	2008-07-05 12:51:24.673291650 -0500
@@ -40,6 +40,7 @@
 #include
 #include
 #include
+#include
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
@@ -48,7 +49,7 @@
  * ext_pblock:
  * combine low and high parts of physical block number into ext4_fsblk_t
  */
-static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
+ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
 {
 	ext4_fsblk_t block;
 
@@ -92,17 +93,16 @@ static void ext4_idx_store_pblock(struct
 	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
 }
 
-static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
+int ext4_ext_journal_restart(handle_t *handle, int needed)
 {
 	int err;
 
 	if (handle->h_buffer_credits > needed)
-		return handle;
-	if (!ext4_journal_extend(handle, needed))
-		return handle;
-	err = ext4_journal_restart(handle, needed);
-
-	return handle;
+		return 0;
+	err = ext4_journal_extend(handle, needed);
+	if (err)
+		return err;
+	return ext4_journal_restart(handle, needed);
 }
 
 /*
@@ -142,7 +142,7 @@ static int ext4_ext_dirty(handle_t *hand
 	return err;
 }
 
-static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
+ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 			      struct ext4_ext_path *path,
 			      ext4_lblk_t block)
 {
@@ -180,15 +180,24 @@ static ext4_fsblk_t ext4_ext_find_goal(s
 	return bg_start + colour + block;
 }
 
+/*
+ * Allocation for a metadata block
+ */
 static ext4_fsblk_t
-ext4_ext_new_block(handle_t *handle, struct inode *inode,
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 			struct ext4_ext_path *path,
-			struct ext4_extent *ex, int *err)
+			struct ext4_extent *ex, int *err,
+			ext4_fsblk_t defrag_goal)
 {
 	ext4_fsblk_t goal, newblock;
 
-	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-	newblock = ext4_new_block(handle, inode, goal, err);
+	if (defrag_goal)
+		goal = defrag_goal;
+	else
+		goal = ext4_ext_find_goal(inode, path,
+					  le32_to_cpu(ex->ee_block));
+
+	newblock = ext4_new_meta_block(handle, inode, goal, err);
 	return newblock;
 }
 
@@ -246,6 +255,36 @@ static int ext4_ext_space_root_idx(struc
 	return size;
 }
 
+/*
+ * Calculate the number of metadata blocks needed
+ * to allocate @blocks.
+ * Worst case is one block per extent.
+ */
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+{
+	int lcap, icap, rcap, leafs, idxs, num;
+	int newextents = blocks;
+
+	rcap = ext4_ext_space_root_idx(inode);
+	lcap = ext4_ext_space_block(inode);
+	icap = ext4_ext_space_block_idx(inode);
+
+	/* number of new leaf blocks needed */
+	num = leafs = (newextents + lcap - 1) / lcap;
+
+	/*
+	 * Worst case, we need separate index block(s)
+	 * to link all new leaf blocks
+	 */
+	idxs = (leafs + icap - 1) / icap;
+	do {
+		num += idxs;
+		idxs = (idxs + icap - 1) / icap;
+	} while (idxs > rcap);
+
+	return num;
+}
+
 static int ext4_ext_max_entries(struct inode *inode, int depth)
 {
@@ -524,6 +563,7 @@ ext4_ext_find_extent(struct inode *inode
 		alloc = 1;
 	}
 	path[0].p_hdr = eh;
+	path[0].p_bh = NULL;
 
 	i = depth;
 	/* walk through the tree */
@@ -552,12 +592,14 @@ ext4_ext_find_extent(struct inode *inode
 	}
 
 	path[ppos].p_depth = i;
 	path[ppos].p_ext = NULL;
 	path[ppos].p_idx = NULL;
 
 	/* find extent */
 	ext4_ext_binsearch(inode, path + ppos, block);
+	/* if not an empty leaf */
+	if (path[ppos].p_ext)
+		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
 
 	ext4_ext_show_path(inode, path);
 
@@ -638,7 +680,8 @@ static int ext4_ext_insert_index(handle_
  */
 static int ext4_ext_split(handle_t *handle, struct inode *inode,
 				struct ext4_ext_path *path,
-				struct ext4_extent *newext, int at)
+				struct ext4_extent *newext, int at,
+				ext4_fsblk_t defrag_goal)
 {
 	struct buffer_head *bh = NULL;
 	int depth = ext_depth(inode);
@@ -688,7 +731,8 @@ static int ext4_ext_split(handle_t *hand
 	/* allocate all needed blocks */
 	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
 	for (a = 0; a < depth - at; a++) {
-		newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+		newblock = ext4_ext_new_meta_block(handle, inode, path,
+						   newext, &err, defrag_goal);
 		if (newblock == 0)
 			goto cleanup;
 		ablocks[a] = newblock;
@@ -875,7 +919,8 @@ cleanup:
  */
 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 					struct ext4_ext_path *path,
-					struct ext4_extent *newext)
+					struct ext4_extent *newext,
+					ext4_fsblk_t defrag_goal)
 {
 	struct ext4_ext_path *curp = path;
 	struct ext4_extent_header *neh;
@@ -884,7 +929,8 @@ static int ext4_ext_grow_indepth(handle_
 	ext4_fsblk_t newblock;
 	int err = 0;
 
-	newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+	newblock = ext4_ext_new_meta_block(handle, inode, path,
+					   newext, &err, defrag_goal);
 	if (newblock == 0)
 		return err;
 
@@ -960,7 +1006,8 @@ out:
  */
 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
 					struct ext4_ext_path *path,
-					struct ext4_extent *newext)
+					struct ext4_extent *newext,
+					ext4_fsblk_t defrag_goal)
 {
 	struct ext4_ext_path *curp;
 	int depth, i, err = 0;
 
@@ -980,7 +1027,10 @@
repeat: if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ - err = ext4_ext_split(handle, inode, path, newext, i); + err = ext4_ext_split(handle, inode, path, newext, i, + defrag_goal); + if (err) + goto out; /* refill path */ ext4_ext_drop_refs(path); @@ -991,7 +1041,8 @@ repeat: err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, path, newext); + err = ext4_ext_grow_indepth(handle, inode, path, + newext, defrag_goal); if (err) goto out; @@ -1171,7 +1222,7 @@ ext4_ext_search_right(struct inode *inod * allocated block. Thus, index entries have to be consistent * with leaves. */ -static ext4_lblk_t +ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path) { int depth; @@ -1437,6 +1488,19 @@ int ext4_ext_insert_extent(handle_t *han struct ext4_ext_path *path, struct ext4_extent *newext) { + return ext4_ext_insert_extent_defrag(handle, inode, path, newext, 0); +} + +/* + * ext4_ext_insert_extent_defrag: + * The difference from ext4_ext_insert_extent is to use the first block + * in newext as the goal of the new index block. + */ +int +ext4_ext_insert_extent_defrag(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int defrag) +{ struct ext4_extent_header * eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ @@ -1444,6 +1508,7 @@ int ext4_ext_insert_extent(handle_t *han int depth, len, err; ext4_lblk_t next; unsigned uninitialized = 0; + ext4_fsblk_t defrag_goal; BUG_ON(ext4_ext_get_actual_len(newext) == 0); depth = ext_depth(inode); @@ -1504,11 +1569,16 @@ repeat: le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); } + if (defrag) + defrag_goal = ext_pblock(newext); + else + defrag_goal = 0; /* * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. 
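
A worked example for ext4_ext_calc_metadata_amount() above, assuming a 4KB block size: a leaf or index block then holds (4096 - 12) / 12 = 340 entries and the in-inode root holds 4, so the worst-case arithmetic comes out as in this sketch with those numbers hard-coded:

static int metadata_amount_4k(int blocks)
{
        int leafs = (blocks + 340 - 1) / 340;   /* new leaf blocks */
        int idxs, num = leafs;

        /* index blocks needed to link the new leaves */
        idxs = (leafs + 340 - 1) / 340;
        do {
                num += idxs;
                idxs = (idxs + 340 - 1) / 340;
        } while (idxs > 4);

        /* e.g. blocks == 1000: 3 leaves + 1 index -> returns 4 */
        return num;
}
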
*/ - err = ext4_ext_create_new_leaf(handle, inode, path, newext); + err = ext4_ext_create_new_leaf(handle, inode, path, + newext, defrag_goal); if (err) goto cleanup; depth = ext_depth(inode); @@ -1587,6 +1657,112 @@ cleanup: return err; } +int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, + ext4_lblk_t num, ext_prepare_callback func, + void *cbdata) +{ + struct ext4_ext_path *path = NULL; + struct ext4_ext_cache cbex; + struct ext4_extent *ex; + ext4_lblk_t next, start = 0, end = 0; + ext4_lblk_t last = block + num; + int depth, exists, err = 0; + + BUG_ON(func == NULL); + BUG_ON(inode == NULL); + + while (block < last && block != EXT_MAX_BLOCK) { + num = last - block; + /* find extent for this block */ + path = ext4_ext_find_extent(inode, block, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + break; + } + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + ex = path[depth].p_ext; + next = ext4_ext_next_allocated_block(path); + + exists = 0; + if (!ex) { + /* there is no extent yet, so try to allocate + * all requested space */ + start = block; + end = block + num; + } else if (le32_to_cpu(ex->ee_block) > block) { + /* need to allocate space before found extent */ + start = block; + end = le32_to_cpu(ex->ee_block); + if (block + num < end) + end = block + num; + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + /* need to allocate space after found extent */ + start = block; + end = block + num; + if (end >= next) + end = next; + } else if (block >= le32_to_cpu(ex->ee_block)) { + /* + * some part of requested space is covered + * by found extent + */ + start = block; + end = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); + if (block + num < end) + end = block + num; + exists = 1; + } else { + BUG(); + } + BUG_ON(end <= start); + + if (!exists) { + cbex.ec_block = start; + cbex.ec_len = end - start; + cbex.ec_start = 0; + cbex.ec_type = EXT4_EXT_CACHE_GAP; + } else { + cbex.ec_block = le32_to_cpu(ex->ee_block); + cbex.ec_len = ext4_ext_get_actual_len(ex); + cbex.ec_start = ext_pblock(ex); + cbex.ec_type = EXT4_EXT_CACHE_EXTENT; + } + + BUG_ON(cbex.ec_len == 0); + err = func(inode, path, &cbex, ex, cbdata); + ext4_ext_drop_refs(path); + + if (err < 0) + break; + if (err == EXT_REPEAT) + continue; + else if (err == EXT_BREAK) { + err = 0; + break; + } + + if (ext_depth(inode) != depth) { + /* depth was changed. 
we have to realloc path */ + kfree(path); + path = NULL; + } + + block = cbex.ec_block + cbex.ec_len; + } + + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + + return err; +} + static void ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, __u32 len, ext4_fsblk_t start, int type) @@ -1883,11 +2059,9 @@ ext4_ext_rm_leaf(handle_t *handle, struc credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); #endif - handle = ext4_ext_journal_restart(handle, credits); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); + err = ext4_ext_journal_restart(handle, credits); + if (err) goto out; - } err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -1956,7 +2130,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) +int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); @@ -2529,6 +2703,7 @@ int ext4_ext_get_blocks(handle_t *handle int err = 0, depth, ret; unsigned long allocated = 0; struct ext4_allocation_request ar; + loff_t disksize; __clear_bit(BH_New, &bh_result->b_state); ext_debug("blocks %u/%lu requested for inode %u\n", @@ -2616,8 +2791,7 @@ int ext4_ext_get_blocks(handle_t *handle */ if (allocated > max_blocks) allocated = max_blocks; - /* mark the buffer unwritten */ - __set_bit(BH_Unwritten, &bh_result->b_state); + set_buffer_unwritten(bh_result); goto out2; } @@ -2716,14 +2890,19 @@ int ext4_ext_get_blocks(handle_t *handle goto out2; } - if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = inode->i_size; - /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); outnew: - __set_bit(BH_New, &bh_result->b_state); + if (extend_disksize) { + disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + } + + set_buffer_new(bh_result); /* Cache only when it is _not_ an uninitialized extent */ if (create != EXT4_CREATE_UNINITIALIZED_EXT) @@ -2733,7 +2912,7 @@ out: if (allocated > max_blocks) allocated = max_blocks; ext4_ext_show_leaf(inode, path); - __set_bit(BH_Mapped, &bh_result->b_state); + set_buffer_mapped(bh_result); bh_result->b_bdev = inode->i_sb->s_bdev; bh_result->b_blocknr = newblock; out2: @@ -2744,7 +2923,7 @@ out2: return err ? err : allocated; } -void ext4_ext_truncate(struct inode * inode, struct page *page) +void ext4_ext_truncate(struct inode *inode) { struct address_space *mapping = inode->i_mapping; struct super_block *sb = inode->i_sb; @@ -2757,18 +2936,14 @@ void ext4_ext_truncate(struct inode * in */ err = ext4_writepage_trans_blocks(inode) + 3; handle = ext4_journal_start(inode, err); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) return; - } - if (page) - ext4_block_truncate_page(handle, page, mapping, inode->i_size); + if (inode->i_size & (sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, mapping, inode->i_size); + + if (ext4_orphan_add(handle, inode)) + goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); @@ -2780,8 +2955,6 @@ void ext4_ext_truncate(struct inode * in * Probably we need not scan at all, * because page truncation is enough. 
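
ext4_ext_walk_space() above drives a callback over every extent and hole, and the callback steers the walk through its return value (EXT_CONTINUE, EXT_BREAK, EXT_REPEAT). A minimal sketch of a callback that merely counts allocated extents -- the counter struct is hypothetical; ext4_ext_fiemap_cb() below is the real consumer:

struct count_cb_data {
        int nr_extents;
};

static int count_extents_cb(struct inode *inode, struct ext4_ext_path *path,
                            struct ext4_ext_cache *cex,
                            struct ext4_extent *ex, void *data)
{
        struct count_cb_data *cnt = data;

        /* unallocated ranges come through as EXT4_EXT_CACHE_GAP */
        if (cex->ec_type == EXT4_EXT_CACHE_EXTENT)
                cnt->nr_extents++;
        return EXT_CONTINUE;
}
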
 */
-	if (ext4_orphan_add(handle, inode))
-		goto out_stop;
 
 	/* we have to know where to truncate from in crash case */
 	EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2911,7 +3084,7 @@ retry:
 		}
 		ret = ext4_get_blocks_wrap(handle, inode, block,
 					  max_blocks, &map_bh,
-					  EXT4_CREATE_UNINITIALIZED_EXT, 0);
+					  EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
 		if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
 			WARN_ON(ret <= 0);
@@ -2945,3 +3118,183 @@ retry:
 	mutex_unlock(&inode->i_mutex);
 	return ret > 0 ? ret2 : ret;
 }
+
+struct fiemap_internal {
+	struct fiemap		*fiemap_s;
+	struct fiemap_extent	fm_extent;
+	size_t			tot_mapping_len;
+	char			*cur_ext_ptr;
+	int			current_extent;
+	int			err;
+};
+
+/*
+ * Callback function called for each extent to gather FIEMAP information.
+ */
+int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
+		       void *data)
+{
+	struct fiemap_internal *fiemap_i = data;
+	struct fiemap *fiemap_s = fiemap_i->fiemap_s;
+	struct fiemap_extent *fm_extent = &fiemap_i->fm_extent;
+	int current_extent = fiemap_i->current_extent;
+	unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
+
+	/*
+	 * ext4_ext_walk_space returns a hole for extents that have not been
+	 * allocated yet.
+	 */
+	if (((u64)(newex->ec_block + newex->ec_len) << blksize_bits >=
+	     inode->i_size) && newex->ec_type == EXT4_EXT_CACHE_GAP) {
+		if (((u64)newex->ec_block << blksize_bits) < inode->i_size)
+			newex->ec_len = (inode->i_size - ((u64)newex->ec_block<<
+					 blksize_bits)) >> blksize_bits;
+		else
+			return EXT_BREAK;
+	}
+
+	/*
+	 * We only need to return number of extents and total length of mapping
+	 */
+	if (fiemap_s->fm_flags & FIEMAP_FLAG_NUM_EXTENTS) {
+		fiemap_i->tot_mapping_len += ((__u64)newex->ec_len <<
+					      blksize_bits);
+		goto count_extents;
+	}
+
+	if (current_extent >= fiemap_s->fm_extent_count)
+		return EXT_BREAK;
+
+	/* the caller's fm_start should be set to the start of the first
+	 * extent (or hole) */
+	if (newex->ec_block << blksize_bits < fiemap_s->fm_start)
+		fiemap_s->fm_start = newex->ec_block << blksize_bits;
+
+	memset(fm_extent, 0, sizeof(*fm_extent));
+	fm_extent->fe_offset = (__u64)newex->ec_start << blksize_bits;
+	fm_extent->fe_length = (__u64)newex->ec_len << blksize_bits;
+	/* XXX: perhaps move this above the goto */
+	fiemap_i->tot_mapping_len += fm_extent->fe_length;
+
+	if (newex->ec_type == EXT4_EXT_CACHE_GAP)
+		fm_extent->fe_flags |= FIEMAP_EXTENT_HOLE;
+	else if (ex && ext4_ext_is_uninitialized(ex))
+		fm_extent->fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
+
+	/*
+	 * Mark this fiemap_extent as FIEMAP_EXTENT_EOF if it's past the end
+	 * of file.
+	 * XXX: block + len converted to bytes >= i_size; check for an
+	 * off-by-one here.
+	 */
+	if ((u64)(newex->ec_block + newex->ec_len) << blksize_bits >=
+	    inode->i_size)
+		fm_extent->fe_flags |= FIEMAP_EXTENT_EOF;
+	/* XXX ERS: hack around the FIEMAP_EXTENT_LAST problem: */
+	/* fm_extent->fe_flags |= (FIEMAP_EXTENT_EOF|FIEMAP_EXTENT_LAST); */
+
+	if (!copy_to_user(fiemap_i->cur_ext_ptr, fm_extent,
+			  sizeof(struct fiemap_extent))) {
+		/* copy_to_user succeeded, advance to the next extent slot */
+		fiemap_i->cur_ext_ptr += sizeof(struct fiemap_extent);
+	} else {
+		fiemap_i->err = -EFAULT;
+		return EXT_BREAK;
+	}
+
+count_extents:
+	/*
+	 * Don't count holes when only returning the number of extents.
+	 * XXX ERS: ok if that's how FIEMAP_FLAG_NUM_EXTENTS is defined.
+	 */
+	if (!((fiemap_s->fm_flags & FIEMAP_FLAG_NUM_EXTENTS) &&
+	      (newex->ec_type == EXT4_EXT_CACHE_GAP)))
+		fiemap_i->current_extent++;	/* advance the extent count */
+
+	/*
+	 * Stop if we are beyond the requested mapping size but return the
+	 * complete last extent.
+	 * XXX: is this extent's last byte >= length of mapping? Or should
+	 * it be fm_start + fm_length?
+	 */
+	if ((u64)(newex->ec_block + newex->ec_len) << blksize_bits >=
+	    fiemap_s->fm_length)
+		return EXT_BREAK;
+
+	return EXT_CONTINUE;
+}
+
+int ext4_fiemap(struct inode *inode, unsigned long arg)
+{
+	struct fiemap *fiemap_s;
+	struct fiemap_internal fiemap_i;
+	struct fiemap_extent *last_extent;
+	ext4_lblk_t start_blk;
+	int fm_extent_size = sizeof(struct fiemap_extent);
+	int err = 0;
+
+	/* XXX: could use getblock here for non-extent files */
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+		return -EOPNOTSUPP;
+
+	fiemap_s = kmalloc(sizeof(*fiemap_s), GFP_KERNEL);
+	if (fiemap_s == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(fiemap_s, (struct fiemap __user *)arg,
+			   sizeof(*fiemap_s))) {
+		err = -EFAULT;
+		goto out_free;
+	}
+
+	/* bail on unsupported flags for this fs */
+	if (fiemap_s->fm_flags & EXT4_FIEMAP_FLAG_INCOMPAT_UNSUPP) {
+		err = -EOPNOTSUPP;
+		goto out_free;
+	}
+
+	start_blk = fiemap_s->fm_start >> inode->i_sb->s_blocksize_bits;
+	fiemap_i.fiemap_s = fiemap_s;
+	fiemap_i.tot_mapping_len = 0;
+	fiemap_i.cur_ext_ptr = (char *)(arg + sizeof(*fiemap_s));
+	fiemap_i.current_extent = 0;
+	fiemap_i.err = 0;
+
+	/*
+	 * Walk the extent tree gathering extent information
+	 */
+	down_write(&EXT4_I(inode)->i_data_sem);
+	err = ext4_ext_walk_space(inode, start_blk, EXT_MAX_BLOCK - start_blk,
+				  ext4_ext_fiemap_cb, &fiemap_i);
+	up_write(&EXT4_I(inode)->i_data_sem);
+	if (err)
+		goto out_free;
+
+	fiemap_s->fm_extent_count = fiemap_i.current_extent;
+	fiemap_s->fm_length = fiemap_i.tot_mapping_len;
+	/*
+	 * Mark the last extent as FIEMAP_EXTENT_LAST and copy it to userspace.
+	 * XXX ERS: FIXME, this isn't always working.
+	 */
+	if (fiemap_i.current_extent != 0 &&
+	    fiemap_i.current_extent < fiemap_s->fm_extent_count &&
+	    !(fiemap_s->fm_flags & FIEMAP_FLAG_NUM_EXTENTS)) {
+		char *dest;
+
+		last_extent = &fiemap_i.fm_extent;
+		last_extent->fe_flags |= FIEMAP_EXTENT_LAST;
+		dest = (char *)arg + sizeof(*fiemap_s) + fm_extent_size *
+			(fiemap_s->fm_extent_count - 1);
+		if (copy_to_user(dest, last_extent, fm_extent_size)) {
+			err = -EFAULT;
+			goto out_free;
+		}
+	}
+
+	if (copy_to_user((void __user *)arg, fiemap_s, sizeof(*fiemap_s)))
+		err = -EFAULT;
+
+out_free:
+	kfree(fiemap_s);
+	return err;
+}
+
Index: linux-2.6/fs/ext4/namei.c
===================================================================
--- linux-2.6.orig/fs/ext4/namei.c	2008-06-05 13:44:20.537983099 -0500
+++ linux-2.6/fs/ext4/namei.c	2008-07-05 12:51:22.457290720 -0500
@@ -231,13 +231,13 @@ static inline unsigned dx_root_limit (st
 {
 	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
 		EXT4_DIR_REC_LEN(2) - infosize;
-	return 0? 20: entry_space / sizeof(struct dx_entry);
+	return entry_space / sizeof(struct dx_entry);
 }
 
 static inline unsigned dx_node_limit (struct inode *dir)
 {
 	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
-	return 0? 
22: entry_space / sizeof(struct dx_entry); + return entry_space / sizeof(struct dx_entry); } /* Index: linux-2.6/fs/ext4/inode.c =================================================================== --- linux-2.6.orig/fs/ext4/inode.c 2008-06-05 13:44:20.526046257 -0500 +++ linux-2.6/fs/ext4/inode.c 2008-07-05 12:51:24.322353933 -0500 @@ -38,6 +38,16 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "ext4_extents.h" + +static inline int ext4_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, + new_size); +} + +static void ext4_invalidatepage(struct page *page, unsigned long offset); /* * Test whether an inode is a fast symlink. @@ -181,6 +191,8 @@ void ext4_delete_inode (struct inode * i { handle_t *handle; + if (ext4_should_order_data(inode)) + ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -508,11 +520,12 @@ static int ext4_blks_to_allocate(Indirec * direct blocks */ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) + ext4_lblk_t iblock, ext4_fsblk_t goal, + int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) { int target, i; - unsigned long count = 0; + unsigned long count = 0, blk_allocated = 0; int index = 0; ext4_fsblk_t current_block = 0; int ret = 0; @@ -525,12 +538,13 @@ static int ext4_alloc_blocks(handle_t *h * the first direct block of this branch. That's the * minimum number of blocks need to allocate(required) */ - target = blks + indirect_blks; - - while (1) { + /* first we try to allocate the indirect blocks */ + target = indirect_blks; + while (target > 0) { count = target; /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_blocks(handle,inode,goal,&count,err); + current_block = ext4_new_meta_blocks(handle, inode, + goal, &count, err); if (*err) goto failed_out; @@ -540,16 +554,48 @@ static int ext4_alloc_blocks(handle_t *h new_blocks[index++] = current_block++; count--; } - - if (count > 0) + if (count > 0) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + printk(KERN_INFO "%s returned more blocks than " + "requested\n", __func__); + WARN_ON(1); break; + } } - /* save the new block number for the first direct block */ - new_blocks[index] = current_block; - + target = blks - count ; + blk_allocated = count; + if (!target) + goto allocated; + /* Now allocate data blocks */ + count = target; + /* allocating blocks for data blocks */ + current_block = ext4_new_blocks(handle, inode, iblock, + goal, &count, err); + if (*err && (target == blks)) { + /* + * if the allocation failed and we didn't allocate + * any blocks before + */ + goto failed_out; + } + if (!*err) { + if (target == blks) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + } + blk_allocated += count; + } +allocated: /* total number of blocks allocated for direct blocks */ - ret = count; + ret = blk_allocated; *err = 0; return ret; failed_out: @@ -584,8 +630,9 @@ failed_out: * as described above and return 0. 
*/ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - int indirect_blks, int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) { int blocksize = inode->i_sb->s_blocksize; int i, n = 0; @@ -595,7 +642,7 @@ static int ext4_alloc_branch(handle_t *h ext4_fsblk_t new_blocks[4]; ext4_fsblk_t current_block; - num = ext4_alloc_blocks(handle, inode, goal, indirect_blks, + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, *blks, new_blocks, &err); if (err) return err; @@ -799,6 +846,7 @@ int ext4_get_blocks_handle(handle_t *han struct ext4_inode_info *ei = EXT4_I(inode); int count = 0; ext4_fsblk_t first_block = 0; + loff_t disksize; J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); @@ -855,8 +903,9 @@ int ext4_get_blocks_handle(handle_t *han /* * Block out ext4_truncate while we alter the tree */ - err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal, - offsets + (partial - chain), partial); + err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, + &count, goal, + offsets + (partial - chain), partial); /* * The ext4_splice_branch call will free and forget any buffers @@ -873,8 +922,13 @@ int ext4_get_blocks_handle(handle_t *han * protect it if you're about to implement concurrent * ext4_get_block() -bzzz */ - if (!err && extend_disksize && inode->i_size > ei->i_disksize) - ei->i_disksize = inode->i_size; + if (!err && extend_disksize) { + disksize = ((loff_t) iblock + count) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > ei->i_disksize) + ei->i_disksize = disksize; + } if (err) goto cleanup; @@ -934,7 +988,7 @@ out: */ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned long max_blocks, struct buffer_head *bh, - int create, int extend_disksize) + int create, int extend_disksize, int flag) { int retval; @@ -975,6 +1029,15 @@ int ext4_get_blocks_wrap(handle_t *handl * with create == 1 flag. */ down_write((&EXT4_I(inode)->i_data_sem)); + + /* + * if the caller is from delayed allocation writeout path + * we have already reserved fs blocks for allocation + * let the underlying get_block() function know to + * avoid double accounting + */ + if (flag) + EXT4_I(inode)->i_delalloc_reserved_flag = 1; /* * We need to check for EXT4 here because migrate * could have changed the inode type in between @@ -996,11 +1059,12 @@ int ext4_get_blocks_wrap(handle_t *handl ~EXT4_EXT_MIGRATE; } } + if (flag) + EXT4_I(inode)->i_delalloc_reserved_flag = 0; up_write((&EXT4_I(inode)->i_data_sem)); return retval; } - -static int ext4_get_block(struct inode *inode, sector_t iblock, +int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { handle_t *handle = ext4_journal_current_handle(); @@ -1021,7 +1085,7 @@ static int ext4_get_block(struct inode * } ret = ext4_get_blocks_wrap(handle, inode, iblock, - max_blocks, bh_result, create, 0); + max_blocks, bh_result, create, 0, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1047,7 +1111,7 @@ struct buffer_head *ext4_getblk(handle_t dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); err = ext4_get_blocks_wrap(handle, inode, block, 1, - &dummy, create, 1); + &dummy, create, 1, 0); /* * ext4_get_blocks_handle() returns number of blocks * mapped. 0 in case of a HOLE. 
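The extend_disksize arithmetic in the hunk above is easy to sanity-check outside the kernel. A minimal user-space sketch: only the shift-and-clamp scheme is taken from the patch, while the constants (4KB blocks, a 50000-byte file) are made up for illustration.

#include <stdio.h>

/* Mirrors the i_disksize update in ext4_get_blocks_handle() above:
 * disksize = (iblock + count) << i_blkbits, clamped to i_size. */
int main(void)
{
	unsigned int i_blkbits = 12;		/* 4KB blocks, illustrative */
	unsigned long long i_size = 50000;	/* current in-core file size */
	unsigned long long iblock = 10;		/* first logical block mapped */
	unsigned long long count = 3;		/* blocks mapped by this call */

	unsigned long long disksize = (iblock + count) << i_blkbits;
	if (disksize > i_size)			/* never extend past i_size */
		disksize = i_size;

	/* (10 + 3) << 12 = 53248, clamped back to 50000 here */
	printf("i_disksize candidate: %llu\n", disksize);
	return 0;
}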
@@ -1203,19 +1267,20 @@ static int ext4_write_begin(struct file to = from + len; retry: - page = __grab_cache_page(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; - handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { - unlock_page(page); - page_cache_release(page); ret = PTR_ERR(handle); goto out; } + page = __grab_cache_page(mapping, index); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, ext4_get_block); @@ -1225,8 +1290,8 @@ retry: } if (ret) { - ext4_journal_stop(handle); unlock_page(page); + ext4_journal_stop(handle); page_cache_release(page); } @@ -1236,15 +1301,6 @@ out: return ret; } -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - int err = jbd2_journal_dirty_data(handle, bh); - if (err) - ext4_journal_abort_handle(__func__, __func__, - bh, handle, err); - return err; -} - /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct buffer_head *bh) { @@ -1255,29 +1311,6 @@ static int write_end_fn(handle_t *handle } /* - * Generic write_end handler for ordered and writeback ext4 journal modes. - * We can't use generic_write_end, because that unlocks the page and we need to - * unlock the page after ext4_journal_stop, but ext4_journal_stop must run - * after block_write_end. - */ -static int ext4_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = file->f_mapping->host; - - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); - mark_inode_dirty(inode); - } - - return copied; -} - -/* * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * @@ -1290,15 +1323,14 @@ static int ext4_ordered_write_end(struct struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; unsigned from, to; int ret = 0, ret2; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, ext4_journal_dirty_data); + ret = ext4_jbd2_file_inode(handle, inode); if (ret == 0) { /* @@ -1311,7 +1343,7 @@ static int ext4_ordered_write_end(struct new_i_size = pos + copied; if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; if (ret2 < 0) @@ -1320,8 +1352,6 @@ static int ext4_ordered_write_end(struct ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); - page_cache_release(page); return ret ? 
ret : copied; } @@ -1332,7 +1362,7 @@ static int ext4_writeback_write_end(stru struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; int ret = 0, ret2; loff_t new_i_size; @@ -1340,7 +1370,7 @@ static int ext4_writeback_write_end(stru if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; if (ret2 < 0) @@ -1349,8 +1379,6 @@ static int ext4_writeback_write_end(stru ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); - page_cache_release(page); return ret ? ret : copied; } @@ -1389,14 +1417,569 @@ static int ext4_journalled_write_end(str ret = ret2; } + unlock_page(page); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); page_cache_release(page); return ret ? ret : copied; } +/* + * Calculate the number of metadata blocks that need to be reserved + * to allocate @blocks for a non-extent-based file + */ +static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) +{ + int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ind_blks, dind_blks, tind_blks; + + /* number of new indirect blocks needed */ + ind_blks = (blocks + icap - 1) / icap; + + dind_blks = (ind_blks + icap - 1) / icap; + + tind_blks = 1; + + return ind_blks + dind_blks + tind_blks; +} + +/* + * Calculate the number of metadata blocks that need to be reserved + * to allocate the given number of blocks + */ +static int ext4_calc_metadata_amount(struct inode *inode, int blocks) +{ + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) + return ext4_ext_calc_metadata_amount(inode, blocks); + + return ext4_indirect_calc_metadata_amount(inode, blocks); +} + +static int ext4_da_reserve_space(struct inode *inode, int nrblocks) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned long md_needed, mdblocks, total = 0; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks; + * worst case is one extent per block + */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; + mdblocks = ext4_calc_metadata_amount(inode, total); + BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); + + md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; + total = md_needed + nrblocks; + + if (ext4_has_free_blocks(sbi, total) < total) { + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return -ENOSPC; + } + + /* reduce fs free blocks counter */ + percpu_counter_sub(&sbi->s_freeblocks_counter, total); + + EXT4_I(inode)->i_reserved_data_blocks += nrblocks; + EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; + + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return 0; /* success */ +} + +void ext4_da_release_space(struct inode *inode, int used, int to_free) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int total, mdb, mdb_free, release; + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + /* recalculate the number of metablocks that still need to be reserved */ + total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free; + mdb = ext4_calc_metadata_amount(inode, total); + + /* figure out how many metablocks to release */ + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; +
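+ /* For scale: with 4KB blocks (icap = 1024 in the helpers above) an + * indirect-mapped file pays at most 1 + 1 + 1 = 3 metadata blocks + * for a 100-block reservation. */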
+ /* Account for allocated meta_blocks */ + mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; + + release = to_free + mdb_free; + + /* update fs free blocks counter for truncate case */ + percpu_counter_add(&sbi->s_freeblocks_counter, release); + + /* update per-inode reservations */ + BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks); + EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free); + + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + EXT4_I(inode)->i_reserved_meta_blocks = mdb; + EXT4_I(inode)->i_allocated_meta_blocks = 0; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +} + +static void ext4_da_page_release_reservation(struct page *page, + unsigned long offset) +{ + int to_release = 0; + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + + if ((offset <= curr_off) && (buffer_delay(bh))) { + to_release++; + clear_buffer_delay(bh); + } + curr_off = next_off; + } while ((bh = bh->b_this_page) != head); + ext4_da_release_space(page->mapping->host, 0, to_release); +} + +/* + * this is a special callback for ->write_begin() only; + * its intention is to return a mapped block or reserve space + */ +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + + BUG_ON(create == 0); + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + + /* + * first, we need to know whether the block is allocated already; + * preallocated blocks are unmapped but should be treated + * the same as allocated blocks. + */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); + if ((ret == 0) && !buffer_delay(bh_result)) { + /* the block isn't (pre)allocated yet, let's reserve space */ + /* + * XXX: __block_prepare_write() unmaps passed block, + * is it OK?
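+ * Once the reservation below succeeds we re-mark the buffer + * ourselves: mapped to block 0 with BH_New and BH_Delay set, which + * is the state the delayed allocation writeout path looks for.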
+ */ + ret = ext4_da_reserve_space(inode, 1); + if (ret) + /* not enough space to reserve */ + return ret; + + map_bh(bh_result, inode->i_sb, 0); + set_buffer_new(bh_result); + set_buffer_delay(bh_result); + } else if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + + return ret; +} +#define EXT4_DELALLOC_RSVED 1 +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + loff_t disksize = EXT4_I(inode)->i_disksize; + handle_t *handle = NULL; + + handle = ext4_journal_current_handle(); + if (!handle) { + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, 0, 0, 0); + BUG_ON(!ret); + } else { + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, create, 0, EXT4_DELALLOC_RSVED); + } + + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + + /* release reserved-but-unused meta blocks */ + if (buffer_delay(bh_result)) + ext4_da_release_space(inode, ret, 0); + + /* + * Update on-disk size along with block allocation + * we don't use 'extend_disksize' as size may change + * within already allocated block -bzzz + */ + disksize = ((loff_t) iblock + ret) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) { + /* + * XXX: replace with spinlock if seen contended -bzzz + */ + down_write(&EXT4_I(inode)->i_data_sem); + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + up_write(&EXT4_I(inode)->i_data_sem); + + if (EXT4_I(inode)->i_disksize == disksize) { + ret = ext4_mark_inode_dirty(handle, inode); + return ret; + } + } + ret = 0; + } + return ret; +} + +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) +{ + /* + * unmapped buffer is possible for holes. 
+ * delay buffer is possible with delayed allocation + */ + return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); +} + +static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + + /* + * we don't want to do block allocation in writepage + * so call get_block_wrap with create = 0 + */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, + bh_result, 0, 0, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + return ret; +} + +/* + * gets called via ext4_da_writepages after taking page lock (have journal handle) + * gets called via journal_submit_inode_data_buffers (no journal handle) + * gets called via shrink_page_list via pdflush (no journal handle) + * or via grab_cache_page when doing write_begin (have journal handle) + */ +static int ext4_da_writepage(struct page *page, + struct writeback_control *wbc) +{ + int ret = 0; + loff_t size; + unsigned long len; + struct buffer_head *page_bufs; + struct inode *inode = page->mapping->host; + + size = i_size_read(inode); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + page_bufs = page_buffers(page); + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_unmapped_or_delay)) { + /* + * We don't want to do block allocation, + * so redirty the page and return. + * We may reach here when we do a journal commit + * via journal_submit_inode_data_buffers. + * If we don't have a mapped block we just ignore + * them. We can also reach here via shrink_page_list + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } else { + /* + * The test for page_has_buffers() is subtle: + * We know the page is dirty but it lost buffers. That means + * that at some moment in time after write_begin()/write_end() + * has been called all buffers have been clean and thus they + * must have been written at least once. So they are all + * mapped and we can happily proceed with mapping them + * and writing the page. + * + * Try to initialize the buffer_heads and check whether + * all are mapped and non-delay. We don't want to + * do block allocation here. + */ + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ext4_normal_get_block_write); + if (!ret) { + page_bufs = page_buffers(page); + /* check whether all are mapped and non-delay */ + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_unmapped_or_delay)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } else { + /* + * We can't do block allocation here, + * so just redirty the page, unlock it + * and return + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } + + if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) + ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); + else + ret = block_write_full_page(page, + ext4_normal_get_block_write, + wbc); + + return ret; +} + +/* + * For now just follow the DIO way to estimate the max credits + * needed to write out EXT4_MAX_WRITEBACK_PAGES. + * todo: we need to calculate the max credits needed for + * extent-based files; currently the DIO credits are based on the + * indirect-blocks mapping.
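+ * For extent-mapped files that indirect-based figure is presumably an + * overestimate, so the reserved credits should still be sufficient.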
+ * + * Probably should have a generic way to calculate credits + * for DIO, writepages, and truncate + */ +#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS +#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS + +static int ext4_da_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + handle_t *handle = NULL; + int needed_blocks; + int ret = 0; + long to_write; + loff_t range_start = 0; + + /* + * No pages to write? This is mainly a kludge to avoid starting + * a transaction for special inodes like journal inode on last iput() + * because that could violate lock ordering on umount + */ + if (!mapping->nrpages) + return 0; + + /* + * Estimate the worst-case credits needed to write out + * EXT4_MAX_WRITEBACK_PAGES pages + */ + needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; + + to_write = wbc->nr_to_write; + if (!wbc->range_cyclic) { + /* + * If range_cyclic is not set force range_cont + * and save the old writeback_index + */ + wbc->range_cont = 1; + range_start = wbc->range_start; + } + + while (!ret && to_write) { + /* start a new transaction*/ + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_writepages; + } + if (ext4_should_order_data(inode)) { + /* + * With ordered mode we need to add + * the inode to the journal handle + * when we do block allocation. + */ + ret = ext4_jbd2_file_inode(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out_writepages; + } + + } + /* + * cap the number of dirty pages that may be written at a time + * to fit into the reserved transaction credits + */ + if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) + wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; + + to_write -= wbc->nr_to_write; + ret = mpage_da_writepages(mapping, wbc, + ext4_da_get_block_write); + ext4_journal_stop(handle); + if (wbc->nr_to_write) { + /* + * There is no more writeout needed, + * or we requested a non-blocking writeout + * and we found the device congested + */ + to_write += wbc->nr_to_write; + break; + } + wbc->nr_to_write = to_write; + } + +out_writepages: + wbc->nr_to_write = to_write; + if (range_start) + wbc->range_start = range_start; + return ret; +} + +static int ext4_da_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) + { + int ret, retries = 0; + struct page *page; + pgoff_t index; + unsigned from, to; + struct inode *inode = mapping->host; + handle_t *handle; + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + +retry: + /* + * With delayed allocation, we don't log the i_disksize update + * if there is delayed block allocation. But we still need + * to journal the i_disksize update when a write extends the + * end of file over an already mapped buffer.
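+ * That update only dirties the inode itself, which is why a single + * credit is passed to ext4_journal_start() below.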
+ */ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + page = __grab_cache_page(mapping, index); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + ext4_da_get_block_prep); + if (ret < 0) { + unlock_page(page); + ext4_journal_stop(handle); + page_cache_release(page); + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} + +/* + * Check if we should update i_disksize + * when writing to the end of file without requiring block allocation + */ +static int ext4_da_should_update_i_disksize(struct page *page, + unsigned long offset) +{ + struct buffer_head *bh; + struct inode *inode = page->mapping->host; + unsigned int idx; + int i; + + bh = page_buffers(page); + idx = offset >> inode->i_blkbits; + + for (i = 0; i < idx; i++) + bh = bh->b_this_page; + + if (!buffer_mapped(bh) || (buffer_delay(bh))) + return 0; + return 1; +} + +static int ext4_da_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int ret = 0, ret2; + handle_t *handle = ext4_journal_current_handle(); + loff_t new_i_size; + unsigned long start, end; + + start = pos & (PAGE_CACHE_SIZE - 1); + end = start + copied - 1; + + /* + * generic_write_end() will run mark_inode_dirty() if i_size + * changes. So let's piggyback the i_disksize mark_inode_dirty + * into that. + */ + + new_i_size = pos + copied; + if (new_i_size > EXT4_I(inode)->i_disksize) { + if (ext4_da_should_update_i_disksize(page, end)) { + down_write(&EXT4_I(inode)->i_data_sem); + if (new_i_size > EXT4_I(inode)->i_disksize) { + /* + * Updating i_disksize when extending file + * without needing block allocation + */ + if (ext4_should_order_data(inode)) + ret = ext4_jbd2_file_inode(handle, + inode); + + EXT4_I(inode)->i_disksize = new_i_size; + } + up_write(&EXT4_I(inode)->i_data_sem); + } + } + ret2 = generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (ret2 < 0) + ret = ret2; + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + return ret ? ret : copied; +} + +static void ext4_da_invalidatepage(struct page *page, unsigned long offset) +{ + /* + * Drop reserved blocks + */ + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + ext4_da_page_release_reservation(page, offset); + +out: + ext4_invalidatepage(page, offset); + + return; +} + /* * bmap() is special. It gets used by applications such as lilo and by @@ -1412,12 +1995,22 @@ static int ext4_journalled_write_end(str * So, if we see any bmap calls here on a modified, data-journaled file, * take extra steps to flush any blocks which might be in the cache.
*/ -static sector_t ext4_bmap(struct address_space *mapping, sector_t block) +sector_t ext4_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; journal_t *journal; int err; + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + test_opt(inode->i_sb, DELALLOC)) { + /* + * With delalloc we want to sync the file + * so that we can make sure we allocate + * blocks for the file + */ + filemap_write_and_wait(mapping); + } + if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { /* * This is a REALLY heavyweight approach, but the use of @@ -1462,21 +2055,17 @@ static int bput_one(handle_t *handle, st return 0; } -static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) -{ - if (buffer_mapped(bh)) - return ext4_journal_dirty_data(handle, bh); - return 0; -} - /* - * Note that we always start a transaction even if we're not journalling - * data. This is to preserve ordering: any hole instantiation within - * __block_write_full_page -> ext4_get_block() should be journalled - * along with the data so we don't crash and then get metadata which - * refers to old data. + * Note that we don't need to start a transaction unless we're journaling data + * because we should have holes filled from ext4_page_mkwrite(). We don't even + * need to add the inode to the transaction's list in ordered mode because if + * we are writing back data added by write(), the inode is already there and if + * we are writing back data modified via mmap(), no one guarantees in which + * transaction the data will hit the disk. In case we are journaling data, we + * cannot start a transaction directly because transaction start ranks above + * page lock so we have to do some magic. * - * In all journalling modes block_write_full_page() will start the I/O. + * In all journaling modes block_write_full_page() will start the I/O. * * Problem: * @@ -1518,105 +2107,103 @@ static int jbd2_journal_dirty_data_fn(ha * disastrous. Any write() or metadata operation will sync the fs for * us. * - * AKPM2: if all the page's buffers are mapped to disk and !data=journal, - * we don't need to open a transaction here. */ -static int ext4_ordered_writepage(struct page *page, +static int __ext4_normal_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - struct buffer_head *page_bufs; - handle_t *handle = NULL; - int ret = 0; - int err; - - J_ASSERT(PageLocked(page)); - /* - * We give up here if we're reentered, because it might be for a - * different filesystem. - */ - if (ext4_journal_current_handle()) - goto out_fail; + if (test_opt(inode->i_sb, NOBH)) + return nobh_writepage(page, + ext4_normal_get_block_write, wbc); + else + return block_write_full_page(page, + ext4_normal_get_block_write, + wbc); +} - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); +static int ext4_normal_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + loff_t size = i_size_read(inode); + loff_t len; - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_fail; - } + J_ASSERT(PageLocked(page)); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); + if (page_has_buffers(page)) { + /* if the page has buffers, they should all be mapped + * and allocated.
If there are not buffers attached + * to the page we know the page is dirty but it lost + * buffers. That means that at some moment in time + * after write_begin() / write_end() has been called + * all buffers have been clean and thus they must have been + * written at least once. So they are all mapped and we can + * happily proceed with mapping them and writing the page. + */ + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped_or_delay)); } - page_bufs = page_buffers(page); - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bget_one); - - ret = block_write_full_page(page, ext4_get_block, wbc); - /* - * The page can become unlocked at any point now, and - * truncate can then come in and change things. So we - * can't touch *page from now on. But *page_bufs is - * safe due to elevated refcount. - */ - - /* - * And attach them to the current transaction. But only if - * block_write_full_page() succeeded. Otherwise they are unmapped, - * and generally junk. - */ - if (ret == 0) { - err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, - NULL, jbd2_journal_dirty_data_fn); - if (!ret) - ret = err; - } - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bput_one); - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - return ret; + if (!ext4_journal_current_handle()) + return __ext4_normal_writepage(page, wbc); -out_fail: redirty_page_for_writepage(wbc, page); unlock_page(page); - return ret; + return 0; } -static int ext4_writeback_writepage(struct page *page, +static int __ext4_journalled_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct buffer_head *page_bufs; handle_t *handle = NULL; int ret = 0; int err; - if (ext4_journal_current_handle()) - goto out_fail; + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ext4_normal_get_block_write); + if (ret != 0) + goto out_unlock; + + page_bufs = page_buffers(page); + walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, + bget_one); + /* As soon as we unlock the page, it can go away, but we have + * references to buffers so we are safe */ + unlock_page(page); handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_fail; + goto out; } - if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, ext4_get_block, wbc); - else - ret = block_write_full_page(page, ext4_get_block, wbc); + ret = walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); + err = walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, write_end_fn); + if (ret == 0) + ret = err; err = ext4_journal_stop(handle); if (!ret) ret = err; - return ret; -out_fail: - redirty_page_for_writepage(wbc, page); + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bput_one); + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; + goto out; + +out_unlock: unlock_page(page); +out: return ret; } @@ -1624,59 +2211,53 @@ static int ext4_journalled_writepage(str struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - handle_t *handle = NULL; - int ret = 0; - int err; + loff_t size = i_size_read(inode); + loff_t len; - if (ext4_journal_current_handle()) - goto no_write; + J_ASSERT(PageLocked(page)); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & 
~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto no_write; + if (page_has_buffers(page)) { + /* if page has buffers it should all be mapped + * and allocated. If there are not buffers attached + * to the page we know the page is dirty but it lost + * buffers. That means that at some moment in time + * after write_begin() / write_end() has been called + * all buffers have been clean and thus they must have been + * written at least once. So they are all mapped and we can + * happily proceed with mapping them and writing the page. + */ + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped_or_delay)); } - if (!page_has_buffers(page) || PageChecked(page)) { + if (ext4_journal_current_handle()) + goto no_write; + + if (PageChecked(page)) { /* * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. */ ClearPageChecked(page); - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext4_get_block); - if (ret != 0) { - ext4_journal_stop(handle); - goto out_unlock; - } - ret = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); - - err = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, write_end_fn); - if (ret == 0) - ret = err; - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; - unlock_page(page); + return __ext4_journalled_writepage(page, wbc); } else { /* * It may be a page full of checkpoint-mode buffers. We don't * really know unless we go poke around in the buffer_heads. * But block_write_full_page will do the right thing. */ - ret = block_write_full_page(page, ext4_get_block, wbc); + return block_write_full_page(page, + ext4_normal_get_block_write, + wbc); } - err = ext4_journal_stop(handle); - if (!ret) - ret = err; -out: - return ret; - no_write: redirty_page_for_writepage(wbc, page); -out_unlock: unlock_page(page); - goto out; + return 0; } static int ext4_readpage(struct file *file, struct page *page) @@ -1819,7 +2400,7 @@ static int ext4_journalled_set_page_dirt static const struct address_space_operations ext4_ordered_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_ordered_writepage, + .writepage = ext4_normal_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_ordered_write_end, @@ -1833,7 +2414,7 @@ static const struct address_space_operat static const struct address_space_operations ext4_writeback_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_writeback_writepage, + .writepage = ext4_normal_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_writeback_write_end, @@ -1857,10 +2438,31 @@ static const struct address_space_operat .releasepage = ext4_releasepage, }; +static const struct address_space_operations ext4_da_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_da_writepage, + .writepages = ext4_da_writepages, + .sync_page = block_sync_page, + .write_begin = ext4_da_write_begin, + .write_end = ext4_da_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_da_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, +}; + void ext4_set_aops(struct inode *inode) { - if (ext4_should_order_data(inode)) + if (ext4_should_order_data(inode) && + 
test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else if (ext4_should_order_data(inode)) inode->i_mapping->a_ops = &ext4_ordered_aops; + else if (ext4_should_writeback_data(inode) && + test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; else if (ext4_should_writeback_data(inode)) inode->i_mapping->a_ops = &ext4_writeback_aops; else @@ -1873,7 +2475,7 @@ void ext4_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -int ext4_block_truncate_page(handle_t *handle, struct page *page, +int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; @@ -1882,8 +2484,13 @@ int ext4_block_truncate_page(handle_t *h ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; + struct page *page; int err = 0; + page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); + if (!page) + return -EINVAL; + blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -1956,7 +2563,7 @@ int ext4_block_truncate_page(handle_t *h err = ext4_journal_dirty_metadata(handle, bh); } else { if (ext4_should_order_data(inode)) - err = ext4_journal_dirty_data(handle, bh); + err = ext4_jbd2_file_inode(handle, inode); mark_buffer_dirty(bh); } @@ -2347,7 +2954,6 @@ void ext4_truncate(struct inode *inode) int n; ext4_lblk_t last_block; unsigned blocksize = inode->i_sb->s_blocksize; - struct page *page; if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) @@ -2357,41 +2963,21 @@ void ext4_truncate(struct inode *inode) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; - /* - * We have to lock the EOF page here, because lock_page() nests - * outside jbd2_journal_start(). - */ - if ((inode->i_size & (blocksize - 1)) == 0) { - /* Block boundary? Nothing to do */ - page = NULL; - } else { - page = grab_cache_page(mapping, - inode->i_size >> PAGE_CACHE_SHIFT); - if (!page) - return; - } - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { - ext4_ext_truncate(inode, page); + ext4_ext_truncate(inode); return; } handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) return; /* AKPM: return what? */ - } last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - if (page) - ext4_block_truncate_page(handle, page, mapping, inode->i_size); + if (inode->i_size & (blocksize - 1)) + if (ext4_block_truncate_page(handle, mapping, inode->i_size)) + goto out_stop; n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) @@ -2410,6 +2996,11 @@ void ext4_truncate(struct inode *inode) goto out_stop; /* + * From here we block out all ext4_get_block() callers who want to + * modify the block allocation tree. + */ + down_write(&ei->i_data_sem); + /* * The orphan list entry will now protect us from any crash which * occurs before the truncate completes, so it is now safe to propagate * the new, shorter inode size (held for now in i_size) into the @@ -2418,12 +3009,6 @@ void ext4_truncate(struct inode *inode) */ ei->i_disksize = inode->i_size; - /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. 
- */ - down_write(&ei->i_data_sem); - if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); goto do_indirects; } @@ -3107,7 +3692,14 @@ int ext4_write_inode(struct inode *inode * be freed, so we have a strong guarantee that no future commit will * leave these blocks visible to the user.) * - * Called with inode->sem down. + * Another thing we have to ensure is that if we are in ordered mode + * and inode is still attached to the committing transaction, we must + * start writeout of all the dirty pages which are being truncated. + * This way we are sure that all the data written in the previous + * transaction are already on disk (truncate waits for pages under + * writeback). + * + * Called with inode->i_mutex down. */ int ext4_setattr(struct dentry *dentry, struct iattr *attr) { @@ -3173,6 +3765,22 @@ int ext4_setattr(struct dentry *dentry, if (!error) error = rc; ext4_journal_stop(handle); + + if (ext4_should_order_data(inode)) { + error = ext4_begin_ordered_truncate(inode, + attr->ia_size); + if (error) { + /* Do as much error cleanup as possible */ + handle = ext4_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + goto err_out; + } + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + goto err_out; + } + } } rc = inode_setattr(inode, attr); @@ -3506,3 +4114,64 @@ int ext4_change_inode_journal_flag(struc return err; } + +static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh); +} + +int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + loff_t size; + unsigned long len; + int ret = -EINVAL; + struct file *file = vma->vm_file; + struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + + /* + * Get i_alloc_sem to stop truncates messing with the inode. We cannot + * get i_mutex because we are already holding mmap_sem. + */ + down_read(&inode->i_alloc_sem); + size = i_size_read(inode); + if (page->mapping != mapping || size <= page_offset(page) + || !PageUptodate(page)) { + /* page got truncated from under us? */ + goto out_unlock; + } + ret = 0; + if (PageMappedToDisk(page)) + goto out_unlock; + + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + /* return if we have all the buffers mapped */ + if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped)) + goto out_unlock; + } + /* + * OK, we need to fill the hole... Do write_begin/write_end + * to do block allocation/reservation. We are not holding + * inode->i_mutex here, which allows parallel write_begin and + * write_end calls.
lock_page prevent this from happening + * on the same page though + */ + ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), + len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); + if (ret < 0) + goto out_unlock; + ret = mapping->a_ops->write_end(file, mapping, page_offset(page), + len, len, page, NULL); + if (ret < 0) + goto out_unlock; + ret = 0; +out_unlock: + up_read(&inode->i_alloc_sem); + return ret; +} Index: linux-2.6/fs/ext4/xattr.c =================================================================== --- linux-2.6.orig/fs/ext4/xattr.c 2008-06-05 13:44:20.539983023 -0500 +++ linux-2.6/fs/ext4/xattr.c 2008-07-05 12:51:22.613291454 -0500 @@ -810,7 +810,7 @@ inserted: /* We need to allocate a new block */ ext4_fsblk_t goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); - ext4_fsblk_t block = ext4_new_block(handle, inode, + ext4_fsblk_t block = ext4_new_meta_block(handle, inode, goal, &error); if (error) goto cleanup; Index: linux-2.6/fs/ext4/fsync.c =================================================================== --- linux-2.6.orig/fs/ext4/fsync.c 2008-06-05 13:44:20.524045910 -0500 +++ linux-2.6/fs/ext4/fsync.c 2008-07-05 12:51:22.702290943 -0500 @@ -27,6 +27,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" @@ -45,6 +46,7 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; int ret = 0; J_ASSERT(ext4_journal_current_handle() == NULL); @@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, s .nr_to_write = 0, /* sys_fsync did this */ }; ret = sync_inode(inode, &wbc); + if (journal && (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); } out: return ret; Index: linux-2.6/fs/ext4/resize.c =================================================================== --- linux-2.6.orig/fs/ext4/resize.c 2008-06-24 14:23:40.900193882 -0500 +++ linux-2.6/fs/ext4/resize.c 2008-07-05 12:51:22.820291009 -0500 @@ -866,6 +866,15 @@ int ext4_group_add(struct super_block *s gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); /* + * We can allocate memory for mb_alloc based on the new group + * descriptor + */ + if (test_opt(sb, MBALLOC)) { + err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); + if (err) + goto exit_journal; + } + /* * Make the new blocks and inodes valid next. We do this before * increasing the group count so that once the group is enabled, * all of its blocks and inodes are already valid. @@ -957,6 +966,8 @@ int ext4_group_extend(struct super_block handle_t *handle; int err; unsigned long freed_blocks; + ext4_group_t group; + struct ext4_group_info *grp; /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after @@ -988,7 +999,7 @@ int ext4_group_extend(struct super_block } /* Handle the remaining blocks in the last group only. */ - ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last); + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); if (last == 0) { ext4_warning(sb, __func__, @@ -1060,6 +1071,45 @@ int ext4_group_extend(struct super_block o_blocks_count + add); if ((err = ext4_journal_stop(handle))) goto exit_put; + + /* + * Mark mballoc pages as not up to date so that they will be updated + * next time they are loaded by ext4_mb_load_buddy. 
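+ * Each group owns two consecutive blocks in the s_buddy_cache inode + * (group * 2 and group * 2 + 1), so both backing pages must be + * invalidated.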
+ */ + if (test_opt(sb, MBALLOC)) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + int blocks_per_page; + int block; + int pnum; + struct page *page; + + /* Set buddy page as not up to date */ + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + block = group * 2; + pnum = block / blocks_per_page; + page = find_get_page(inode->i_mapping, pnum); + if (page != NULL) { + ClearPageUptodate(page); + page_cache_release(page); + } + + /* Set bitmap page as not up to date */ + block++; + pnum = block / blocks_per_page; + page = find_get_page(inode->i_mapping, pnum); + if (page != NULL) { + ClearPageUptodate(page); + page_cache_release(page); + } + + /* Get the info on the last group */ + grp = ext4_get_group_info(sb, group); + + /* Update free blocks in group info */ + ext4_mb_update_group_info(grp, add); + } + if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", ext4_blocks_count(es)); Index: linux-2.6/Documentation/filesystems/ext4.txt =================================================================== --- linux-2.6.orig/Documentation/filesystems/ext4.txt 2008-06-16 11:34:05.825983154 -0500 +++ linux-2.6/Documentation/filesystems/ext4.txt 2008-07-05 12:51:24.014291691 -0500 @@ -13,72 +13,89 @@ Mailing list: linux-ext4@vger.kernel.org 1. Quick usage instructions: =========================== - - Grab updated e2fsprogs from + - Compile and install the latest version of e2fsprogs (at least + 1.41-WIP-0617) from: + ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/ - This is a patchset on top of e2fsprogs-1.39, which can be found at - ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ - - It's still mke2fs -j /dev/hda1 + or grab the latest git repository from: + + git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git + + - Create a new filesystem using the ext4dev filesystem type: + + # mke2fs -t ext4dev /dev/hda1 + + Or configure an existing ext3 filesystem to support extents and set + the test_fs flag to indicate that it's ok for an in-development + filesystem to touch this filesystem: + + # tune2fs -O extents -E test_fs /dev/hda1 - mount /dev/hda1 /wherever -t ext4dev + If the filesystem was created with 128 byte inodes, it can be + converted to use 256 byte inodes for greater efficiency via: - - To enable extents, + # tune2fs -I 256 /dev/hda1 - mount /dev/hda1 /wherever -t ext4dev -o extents + (Note: we currently do not have tools to convert an ext4dev + filesystem back to ext3; so please do not try this on production + filesystems.) - - The filesystem is compatible with the ext3 driver until you add a file - which has extents (ie: `mount -o extents', then create a file). + - Mounting: - NOTE: The "extents" mount flag is temporary. It will soon go away and - extents will be enabled by the "-o extents" flag to mke2fs or tune2fs + # mount -t ext4dev /dev/hda1 /wherever - When comparing performance with other filesystems, remember that - ext3/4 by default offers higher data integrity guarantees than most. So - when comparing with a metadata-only journalling filesystem, use `mount -o - data=writeback'. And you might as well use `mount -o nobh' too along - with it. Making the journal larger than the mke2fs default often helps - performance with metadata-intensive workloads. + ext3/4 by default offers higher data integrity guarantees than most. + So when comparing with a metadata-only journalling filesystem, such + as ext3, use `mount -o data=writeback'.
And you might as well use + `mount -o nobh' too along with it. Making the journal larger than + the mke2fs default often helps performance with metadata-intensive + workloads. 2. Features =========== 2.1 Currently available -* ability to use filesystems > 16TB +* ability to use filesystems > 16TB (e2fsprogs support not available yet) * extent format reduces metadata overhead (RAM, IO for access, transactions) * extent format more robust in face of on-disk corruption due to magics, * internal redunancy in tree - -2.1 Previously available, soon to be enabled by default by "mkefs.ext4": - -* dir_index and resize inode will be on by default -* large inodes will be used by default for fast EAs, nsec timestamps, etc +* improved file allocation (multi-block alloc) +* fix 32000 subdirectory limit +* nsec timestamps for mtime, atime, ctime, create time +* inode version field on disk (NFSv4, Lustre) +* reduced e2fsck time via uninit_bg feature +* journal checksumming for robustness, performance +* persistent file preallocation (e.g. for streaming media, databases) +* ability to pack bitmaps and inode tables into larger virtual groups via the + flex_bg feature +* large file support +* Inode allocation using large virtual block groups via flex_bg +* delayed allocation +* large block (up to pagesize) support +* efficient new ordered mode in JBD2 and ext4 (avoids using buffer heads to + force the ordering) 2.2 Candidate features for future inclusion +* Online defrag (patches available but not well tested) +* reduced mke2fs time via lazy itable initialization in conjunction with + the uninit_bg feature (capability to do this is available in e2fsprogs + but a kernel thread to do lazy zeroing of unused inode table blocks + after filesystem is first mounted is required for safety) + +There are several others under discussion; whether they all make it in is +partly a function of how much time everyone has to work on them. Features like +metadata checksumming have been discussed and planned for a bit but no patches +exist yet so I'm not sure they're in the near-term roadmap. -* improved file allocation (multi-block alloc, delayed alloc; basically done) -* fix 32000 subdirectory limit (patch exists, needs some e2fsck work) -* nsec timestamps for mtime, atime, ctime, create time (patch exists, - needs some e2fsck work) -* inode version field on disk (NFSv4, Lustre; prototype exists) -* reduced mke2fs/e2fsck time via uninitialized groups (prototype exists) -* journal checksumming for robustness, performance (prototype exists) -* persistent file preallocation (e.g for streaming media, databases) +The big performance win will come with mballoc, delalloc and flex_bg +grouping of bitmaps and inode tables. Some test results available here: -Features like metadata checksumming have been discussed and planned for -a bit but no patches exist yet so I'm not sure they're in the near-term -roadmap. - -The big performance win will come with mballoc and delalloc. CFS has -been using mballoc for a few years already with Lustre, and IBM + Bull -did a lot of benchmarking on it. The reason it isn't in the first set of -patches is partly a manageability issue, and partly because it doesn't -directly affect the on-disk format (outside of much better allocation) -so it isn't critical to get into the first round of changes. I believe -Alex is working on a new set of patches right now.
- http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html + - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html 3. Options ========== @@ -222,9 +239,11 @@ stripe=n Number of filesystem blocks th to use for allocation size and alignment. For RAID5/6 systems this should be the number of data disks * RAID chunk size in file system blocks. - +delalloc (*) Defer block allocation until write-out time. +nodelalloc Disable delayed allocation. Blocks are allocated + when data is copied from user space to the page cache. Data Mode ---------- +========= There are 3 different data modes: * writeback mode @@ -236,10 +255,10 @@ typically provide the best ext4 performa * ordered mode In data=ordered mode, ext4 only officially journals metadata, but it logically -groups metadata and data blocks into a single unit called a transaction. When -it's time to write the new metadata out to disk, the associated data blocks -are written first. In general, this mode performs slightly slower than -writeback but significantly faster than journal mode. +groups metadata information related to data changes with the data blocks into a +single unit called a transaction. When it's time to write the new metadata +out to disk, the associated data blocks are written first. In general, +this mode performs slightly slower than writeback but significantly faster +than journal mode. * journal mode data=journal mode provides full data and metadata journaling. All new data is @@ -247,7 +266,8 @@ written to the journal first, and then t In the event of a crash, the journal can be replayed, bringing both data and metadata into a consistent state. This mode is the slowest except when data needs to be read from and written to disk at the same time where it -outperforms all others modes. +outperforms all other modes. Currently ext4 does not have delayed +allocation support if this data journalling mode is selected. References ========== @@ -256,7 +276,8 @@ kernel source: programs: http://e2fsprogs.sourceforge.net/ - http://ext2resize.sourceforge.net useful links: http://fedoraproject.org/wiki/ext3-devel http://www.bullopensource.org/ext4/ + http://ext4.wiki.kernel.org/index.php/Main_Page + http://fedoraproject.org/wiki/Features/Ext4 Index: linux-2.6/fs/ext4/file.c =================================================================== --- linux-2.6.orig/fs/ext4/file.c 2008-06-05 13:44:20.524045910 -0500 +++ linux-2.6/fs/ext4/file.c 2008-07-05 12:51:24.681390891 -0500 @@ -123,6 +123,26 @@ force_commit: return ret; } +static struct vm_operations_struct ext4_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = ext4_page_mkwrite, +}; + +static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &ext4_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + +/* XXX ERS should this go into this file?
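+ * (a declaration in ext4.h next to the other prototypes would probably + * be tidier than this local extern)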
*/ +extern int ext4_fiemap(struct inode *inode, unsigned long arg); + const struct file_operations ext4_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -133,7 +153,7 @@ const struct file_operations ext4_file_o #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ext4_file_mmap, .open = generic_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, @@ -152,5 +172,6 @@ const struct inode_operations ext4_file_ #endif .permission = ext4_permission, .fallocate = ext4_fallocate, + .fiemap = ext4_fiemap, }; Index: linux-2.6/fs/buffer.c =================================================================== --- linux-2.6.orig/fs/buffer.c 2008-06-05 13:44:20.004983761 -0500 +++ linux-2.6/fs/buffer.c 2008-07-05 12:51:23.293291194 -0500 @@ -1691,11 +1691,13 @@ static int __block_write_full_page(struc */ clear_buffer_dirty(bh); set_buffer_uptodate(bh); - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { + } else if ((!buffer_mapped(bh) || buffer_delay(bh)) + && buffer_dirty(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) goto recover; + clear_buffer_delay(bh); if (buffer_new(bh)) { /* blockdev mappings never come here */ clear_buffer_new(bh); @@ -1774,7 +1776,8 @@ recover: bh = head; /* Recovery: lock and submit the mapped buffers */ do { - if (buffer_mapped(bh) && buffer_dirty(bh)) { + if (buffer_mapped(bh) && buffer_dirty(bh) + && !buffer_delay(bh)) { lock_buffer(bh); mark_buffer_async_write(bh); } else { @@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct page *page, void *fsdata) { struct inode *inode = mapping->host; + int i_size_changed = 0; copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, */ if (pos+copied > inode->i_size) { i_size_write(inode, pos+copied); - mark_inode_dirty(inode); + i_size_changed = 1; } unlock_page(page); page_cache_release(page); + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. 
+ */ + if (i_size_changed) + mark_inode_dirty(inode); + return copied; } EXPORT_SYMBOL(generic_write_end); Index: linux-2.6/include/linux/fs.h =================================================================== --- linux-2.6.orig/include/linux/fs.h 2008-06-16 11:34:12.718481621 -0500 +++ linux-2.6/include/linux/fs.h 2008-07-05 12:51:24.606302244 -0500 @@ -228,6 +228,7 @@ extern int dir_notify_enable; #define FS_IOC_SETFLAGS _IOW('f', 2, long) #define FS_IOC_GETVERSION _IOR('v', 1, long) #define FS_IOC_SETVERSION _IOW('v', 2, long) +#define FS_IOC_FIEMAP _IOWR('f', 10, struct fiemap) #define FS_IOC32_GETFLAGS _IOR('f', 1, int) #define FS_IOC32_SETFLAGS _IOW('f', 2, int) #define FS_IOC32_GETVERSION _IOR('v', 1, int) @@ -288,6 +289,7 @@ extern int dir_notify_enable; #include #include #include +#include #include #include @@ -1273,6 +1275,7 @@ struct inode_operations { void (*truncate_range)(struct inode *, loff_t, loff_t); long (*fallocate)(struct inode *inode, int mode, loff_t offset, loff_t len); + int (*fiemap) (struct inode *, unsigned long arg); }; struct seq_file; @@ -1741,6 +1744,8 @@ extern int wait_on_page_writeback_range( pgoff_t start, pgoff_t end); extern int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode); +extern int filemap_fdatawrite_range(struct address_space *mapping, + loff_t start, loff_t end); extern long do_fsync(struct file *file, int datasync); extern void sync_supers(void); Index: linux-2.6/mm/filemap.c =================================================================== --- linux-2.6.orig/mm/filemap.c 2008-06-05 13:44:44.127982824 -0500 +++ linux-2.6/mm/filemap.c 2008-07-05 12:51:23.065702996 -0500 @@ -236,11 +236,12 @@ int filemap_fdatawrite(struct address_sp } EXPORT_SYMBOL(filemap_fdatawrite); -static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, +int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } +EXPORT_SYMBOL(filemap_fdatawrite_range); /** * filemap_flush - mostly a non-blocking flush Index: linux-2.6/fs/jbd2/journal.c =================================================================== --- linux-2.6.orig/fs/jbd2/journal.c 2008-06-05 13:44:21.035983743 -0500 +++ linux-2.6/fs/jbd2/journal.c 2008-07-05 12:51:23.236291490 -0500 @@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_update EXPORT_SYMBOL(jbd2_journal_get_write_access); EXPORT_SYMBOL(jbd2_journal_get_create_access); EXPORT_SYMBOL(jbd2_journal_get_undo_access); -EXPORT_SYMBOL(jbd2_journal_dirty_data); EXPORT_SYMBOL(jbd2_journal_dirty_metadata); EXPORT_SYMBOL(jbd2_journal_release_buffer); EXPORT_SYMBOL(jbd2_journal_forget); @@ -82,6 +81,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_pa EXPORT_SYMBOL(jbd2_journal_invalidatepage); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); +EXPORT_SYMBOL(jbd2_journal_file_inode); +EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); @@ -2195,6 +2198,54 @@ void jbd2_journal_put_journal_head(struc } /* + * Initialize jbd inode head + */ +void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) +{ + jinode->i_transaction = NULL; + jinode->i_next_transaction = NULL; + jinode->i_vfs_inode = inode; 
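+	/*
+	 * Typical usage (sketch, not part of this patch): a filesystem
+	 * embeds a struct jbd2_inode in its per-inode info and calls
+	 *
+	 *	jbd2_journal_init_jbd_inode(&EXT4_I(inode)->jinode, inode);
+	 *
+	 * once when the in-core inode is set up; only then can the inode
+	 * be filed on a transaction's list with jbd2_journal_file_inode().
+	 */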
+ jinode->i_flags = 0; + INIT_LIST_HEAD(&jinode->i_list); +} + +/* + * Function to be called before we start removing inode from memory (i.e., + * clear_inode() is a fine place to be called from). It removes inode from + * transaction's lists. + */ +void jbd2_journal_release_jbd_inode(journal_t *journal, + struct jbd2_inode *jinode) +{ + int writeout = 0; + + if (!journal) + return; +restart: + spin_lock(&journal->j_list_lock); + /* Is commit writing out inode - we have to wait */ + if (jinode->i_flags & JI_COMMIT_RUNNING) { + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); + wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&journal->j_list_lock); + schedule(); + finish_wait(wq, &wait.wait); + goto restart; + } + + /* Do we need to wait for data writeback? */ + if (journal->j_committing_transaction == jinode->i_transaction) + writeout = 1; + if (jinode->i_transaction) { + list_del(&jinode->i_list); + jinode->i_transaction = NULL; + } + spin_unlock(&journal->j_list_lock); +} + +/* * debugfs tunables */ #ifdef CONFIG_JBD2_DEBUG Index: linux-2.6/fs/ext4/ext4_jbd2.h =================================================================== --- linux-2.6.orig/fs/ext4/ext4_jbd2.h 2008-06-05 13:44:20.522046081 -0500 +++ linux-2.6/fs/ext4/ext4_jbd2.h 2008-07-05 12:51:23.184353602 -0500 @@ -154,8 +154,6 @@ int __ext4_journal_dirty_metadata(const #define ext4_journal_forget(handle, bh) \ __ext4_journal_forget(__FUNCTION__, (handle), (bh)) -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh); - handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); int __ext4_journal_stop(const char *where, handle_t *handle); @@ -192,6 +190,11 @@ static inline int ext4_journal_force_com return jbd2_journal_force_commit(journal); } +static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); +} + /* super.c */ int ext4_force_commit(struct super_block *sb); Index: linux-2.6/fs/jbd2/checkpoint.c =================================================================== --- linux-2.6.orig/fs/jbd2/checkpoint.c 2008-06-05 13:44:21.034983059 -0500 +++ linux-2.6/fs/jbd2/checkpoint.c 2008-07-05 12:51:23.226291217 -0500 @@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(jou J_ASSERT(transaction->t_state == T_FINISHED); J_ASSERT(transaction->t_buffers == NULL); - J_ASSERT(transaction->t_sync_datalist == NULL); J_ASSERT(transaction->t_forget == NULL); J_ASSERT(transaction->t_iobuf_list == NULL); J_ASSERT(transaction->t_shadow_list == NULL); Index: linux-2.6/fs/mpage.c =================================================================== --- linux-2.6.orig/fs/mpage.c 2008-06-05 13:44:21.442983821 -0500 +++ linux-2.6/fs/mpage.c 2008-07-05 12:51:23.871291021 -0500 @@ -10,6 +10,8 @@ * Initial version * 27Jun2002 axboe@suse.de * use bio_add_page() to build bio's just the right size + * 26Jul2007 alex@clusterfs.com AKA bzzz + * basic delayed allocation support */ #include @@ -710,3 +712,404 @@ int mpage_writepage(struct page *page, g return ret; } EXPORT_SYMBOL(mpage_writepage); + +/* + * Delayed allocation stuff + */ + +struct mpage_da_data { + struct inode *inode; + struct buffer_head lbh; /* extent of blocks */ + unsigned long first_page, next_page; /* extent of pages */ + get_block_t *get_block; + struct writeback_control *wbc; +}; + + +/* + * mpage_da_submit_io - walks through extent of pages and try to 
write + * them with __mpage_writepage() + * + * @mpd->inode: inode + * @mpd->first_page: first page of the extent + * @mpd->next_page: page after the last page of the extent + * @mpd->get_block: the filesystem's block mapper function + * + * By the time mpage_da_submit_io() is called we expect all blocks + * to be allocated. this may be wrong if allocation failed. + * + * As pages are already locked by write_cache_pages(), we can't use it + */ +static int mpage_da_submit_io(struct mpage_da_data *mpd) +{ + struct address_space *mapping = mpd->inode->i_mapping; + struct mpage_data mpd_pp = { + .bio = NULL, + .last_block_in_bio = 0, + .get_block = mpd->get_block, + .use_writepage = 1, + }; + int ret = 0, err, nr_pages, i; + unsigned long index, end; + struct pagevec pvec; + + BUG_ON(mpd->next_page <= mpd->first_page); + + pagevec_init(&pvec, 0); + index = mpd->first_page; + end = mpd->next_page - 1; + + while (index <= end) { + /* XXX: optimize tail */ + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + index++; + + err = __mpage_writepage(page, mpd->wbc, &mpd_pp); + + /* + * In error case, we have to continue because + * remaining pages are still locked + * XXX: unlock and re-dirty them? + */ + if (ret == 0) + ret = err; + } + pagevec_release(&pvec); + } + if (mpd_pp.bio) + mpage_bio_submit(WRITE, mpd_pp.bio); + + return ret; +} + +/* + * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers + * + * @mpd->inode - inode to walk through + * @exbh->b_blocknr - first block on a disk + * @exbh->b_size - amount of space in bytes + * @logical - first logical block to start assignment with + * + * the function goes through all passed space and put actual disk + * block numbers into buffer heads, dropping BH_Delay + */ +static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, + struct buffer_head *exbh) +{ + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + int blocks = exbh->b_size >> inode->i_blkbits; + sector_t pblock = exbh->b_blocknr, cur_logical; + struct buffer_head *head, *bh; + unsigned long index, end; + struct pagevec pvec; + int nr_pages, i; + + index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + + while (index <= end) { + /* XXX: optimize tail */ + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + index++; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + BUG_ON(!page_has_buffers(page)); + + bh = page_buffers(page); + head = bh; + + /* skip blocks out of the range */ + do { + if (cur_logical >= logical) + break; + cur_logical++; + } while ((bh = bh->b_this_page) != head); + + do { + if (cur_logical >= logical + blocks) + break; + if (buffer_delay(bh)) { + bh->b_blocknr = pblock; + clear_buffer_delay(bh); + } else if (buffer_mapped(bh)) + BUG_ON(bh->b_blocknr != pblock); + + cur_logical++; + pblock++; + } while ((bh = bh->b_this_page) != head); + } + pagevec_release(&pvec); + } +} + + +/* + * __unmap_underlying_blocks - just a helper function to unmap + * set of blocks described by @bh + */ +static 
inline void __unmap_underlying_blocks(struct inode *inode, + struct buffer_head *bh) +{ + struct block_device *bdev = inode->i_sb->s_bdev; + int blocks, i; + + blocks = bh->b_size >> inode->i_blkbits; + for (i = 0; i < blocks; i++) + unmap_underlying_metadata(bdev, bh->b_blocknr + i); +} + +/* + * mpage_da_map_blocks - go through given space + * + * @mpd->lbh - bh describing space + * @mpd->get_block - the filesystem's block mapper function + * + * The function skips space we know is already mapped to disk blocks. + * + * The function ignores errors ->get_block() returns, thus real + * error handling is postponed to __mpage_writepage() + */ +static void mpage_da_map_blocks(struct mpage_da_data *mpd) +{ + struct buffer_head *lbh = &mpd->lbh; + int err = 0, remain = lbh->b_size; + sector_t next = lbh->b_blocknr; + struct buffer_head new; + + /* + * We consider only non-mapped and non-allocated blocks + */ + if (buffer_mapped(lbh) && !buffer_delay(lbh)) + return; + + while (remain) { + new.b_state = lbh->b_state; + new.b_blocknr = 0; + new.b_size = remain; + err = mpd->get_block(mpd->inode, next, &new, 1); + if (err) { + /* + * Rather than implement own error handling + * here, we just leave remaining blocks + * unallocated and try again with ->writepage() + */ + break; + } + BUG_ON(new.b_size == 0); + + if (buffer_new(&new)) + __unmap_underlying_blocks(mpd->inode, &new); + + /* + * If blocks are delayed marked, we need to + * put actual blocknr and drop delayed bit + */ + if (buffer_delay(lbh)) + mpage_put_bnr_to_bhs(mpd, next, &new); + + /* go for the remaining blocks */ + next += new.b_size >> mpd->inode->i_blkbits; + remain -= new.b_size; + } +} + +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) + +/* + * mpage_add_bh_to_extent - try to add one more block to extent of blocks + * + * @mpd->lbh - extent of blocks + * @logical - logical number of the block in the file + * @bh - bh of the block (used to access block's state) + * + * the function is used to collect contig. blocks in same state + */ +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, + sector_t logical, struct buffer_head *bh) +{ + struct buffer_head *lbh = &mpd->lbh; + sector_t next; + + next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); + + /* + * First block in the extent + */ + if (lbh->b_size == 0) { + lbh->b_blocknr = logical; + lbh->b_size = bh->b_size; + lbh->b_state = bh->b_state & BH_FLAGS; + return; + } + + /* + * Can we merge the block to our big extent? + */ + if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { + lbh->b_size += bh->b_size; + return; + } + + /* + * We couldn't merge the block to our extent, so we + * need to flush current extent and start new one + */ + mpage_da_map_blocks(mpd); + + /* + * Now start a new extent + */ + lbh->b_size = bh->b_size; + lbh->b_state = bh->b_state & BH_FLAGS; + lbh->b_blocknr = logical; +} + +/* + * __mpage_da_writepage - finds extent of pages and blocks + * + * @page: page to consider + * @wbc: not used, we just follow rules + * @data: context + * + * The function finds extents of pages and scan them for all blocks. + */ +static int __mpage_da_writepage(struct page *page, + struct writeback_control *wbc, void *data) +{ + struct mpage_da_data *mpd = data; + struct inode *inode = mpd->inode; + struct buffer_head *bh, *head, fake; + sector_t logical; + + /* + * Can we merge this page to current extent? + */ + if (mpd->next_page != page->index) { + /* + * Nope, we can't. 
So, we map non-allocated blocks
+		 * and start IO on them using __mpage_writepage()
+		 */
+		if (mpd->next_page != mpd->first_page) {
+			mpage_da_map_blocks(mpd);
+			mpage_da_submit_io(mpd);
+		}
+
+		/*
+		 * Start next extent of pages ...
+		 */
+		mpd->first_page = page->index;
+
+		/*
+		 * ... and blocks
+		 */
+		mpd->lbh.b_size = 0;
+		mpd->lbh.b_state = 0;
+		mpd->lbh.b_blocknr = 0;
+	}
+
+	mpd->next_page = page->index + 1;
+	logical = (sector_t) page->index <<
+		  (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	if (!page_has_buffers(page)) {
+		/*
+		 * There are no buffer heads attached to the page yet
+		 * (mmap?), so we treat the page as full of dirty blocks
+		 */
+		bh = &fake;
+		bh->b_size = PAGE_CACHE_SIZE;
+		bh->b_state = 0;
+		set_buffer_dirty(bh);
+		set_buffer_uptodate(bh);
+		mpage_add_bh_to_extent(mpd, logical, bh);
+	} else {
+		/*
+		 * Page with regular buffer heads, just add all dirty ones
+		 */
+		head = page_buffers(page);
+		bh = head;
+		do {
+			BUG_ON(buffer_locked(bh));
+			if (buffer_dirty(bh))
+				mpage_add_bh_to_extent(mpd, logical, bh);
+			logical++;
+		} while ((bh = bh->b_this_page) != head);
+	}
+
+	return 0;
+}
+
+/*
+ * mpage_da_writepages - walk the list of dirty pages of the given
+ * address space, allocate non-allocated blocks, map newly-allocated
+ * blocks to existing bhs and issue IO on them
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @get_block: the filesystem's block mapper function.
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * In order to avoid duplication of logic that deals with partial pages,
+ * multiple bios per page, etc., we find non-allocated blocks, allocate
+ * them with minimal calls to ->get_block() and re-use __mpage_writepage()
+ *
+ * It's important that we call __mpage_writepage() only once for each
+ * involved page, otherwise we'd have to implement more complicated logic
+ * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
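+ *
+ * A hypothetical caller (the filesystem-side hookup is outside this
+ * file; the names below are illustrative only):
+ *
+ *	static int myfs_writepages(struct address_space *mapping,
+ *				   struct writeback_control *wbc)
+ *	{
+ *		return mpage_da_writepages(mapping, wbc,
+ *					   myfs_da_get_block);
+ *	}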
+ * + * See comments to mpage_writepages() + */ +int mpage_da_writepages(struct address_space *mapping, + struct writeback_control *wbc, get_block_t get_block) +{ + struct mpage_da_data mpd; + int ret; + + if (!get_block) + return generic_writepages(mapping, wbc); + + mpd.wbc = wbc; + mpd.inode = mapping->host; + mpd.lbh.b_size = 0; + mpd.lbh.b_state = 0; + mpd.lbh.b_blocknr = 0; + mpd.first_page = 0; + mpd.next_page = 0; + mpd.get_block = get_block; + + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); + + /* + * Handle last extent of pages + */ + if (mpd.next_page != mpd.first_page) { + mpage_da_map_blocks(&mpd); + mpage_da_submit_io(&mpd); + } + + return ret; +} +EXPORT_SYMBOL(mpage_da_writepages); Index: linux-2.6/include/linux/mpage.h =================================================================== --- linux-2.6.orig/include/linux/mpage.h 2008-06-05 13:44:40.006046000 -0500 +++ linux-2.6/include/linux/mpage.h 2008-07-05 12:51:23.376568065 -0500 @@ -18,6 +18,8 @@ int mpage_readpages(struct address_space int mpage_readpage(struct page *page, get_block_t get_block); int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block); +int mpage_da_writepages(struct address_space *mapping, + struct writeback_control *wbc, get_block_t get_block); int mpage_writepage(struct page *page, get_block_t *get_block, struct writeback_control *wbc); Index: linux-2.6/include/linux/percpu_counter.h =================================================================== --- linux-2.6.orig/include/linux/percpu_counter.h 2008-06-05 13:44:40.321982718 -0500 +++ linux-2.6/include/linux/percpu_counter.h 2008-07-05 12:51:23.460291452 -0500 @@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percp void percpu_counter_destroy(struct percpu_counter *fbc); void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch); -s64 __percpu_counter_sum(struct percpu_counter *fbc); +s64 __percpu_counter_sum(struct percpu_counter *fbc, int set); static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { @@ -44,13 +44,19 @@ static inline void percpu_counter_add(st static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { - s64 ret = __percpu_counter_sum(fbc); + s64 ret = __percpu_counter_sum(fbc, 0); return ret < 0 ? 0 : ret; } +static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc) +{ + return __percpu_counter_sum(fbc, 1); +} + + static inline s64 percpu_counter_sum(struct percpu_counter *fbc) { - return __percpu_counter_sum(fbc); + return __percpu_counter_sum(fbc, 0); } static inline s64 percpu_counter_read(struct percpu_counter *fbc) Index: linux-2.6/lib/percpu_counter.c =================================================================== --- linux-2.6.orig/lib/percpu_counter.c 2008-06-05 13:44:44.008985568 -0500 +++ linux-2.6/lib/percpu_counter.c 2008-07-05 12:51:23.540290552 -0500 @@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add); * Add up all the per-cpu counts, return the result. 
This is a more accurate * but much slower version of percpu_counter_read_positive() */ -s64 __percpu_counter_sum(struct percpu_counter *fbc) +s64 __percpu_counter_sum(struct percpu_counter *fbc, int set) { s64 ret; int cpu; @@ -62,7 +62,12 @@ s64 __percpu_counter_sum(struct percpu_c for_each_online_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; + if (set) + *pcount = 0; } + if (set) + fbc->count = ret; + spin_unlock(&fbc->lock); return ret; } Index: linux-2.6/fs/ext4/ext4_extents.h =================================================================== --- linux-2.6.orig/fs/ext4/ext4_extents.h 2008-06-05 13:44:20.522046081 -0500 +++ linux-2.6/fs/ext4/ext4_extents.h 2008-07-05 12:51:24.715294680 -0500 @@ -124,6 +124,19 @@ struct ext4_ext_path { #define EXT4_EXT_CACHE_GAP 1 #define EXT4_EXT_CACHE_EXTENT 2 +/* + * to be called by ext4_ext_walk_space() + * negative retcode - error + * positive retcode - signal for ext4_ext_walk_space(), see below + * callback must return valid extent (passed or newly created) + */ +typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, + struct ext4_ext_cache *, + struct ext4_extent *, void *); + +#define EXT_CONTINUE 0 +#define EXT_BREAK 1 +#define EXT_REPEAT 2 #define EXT_MAX_BLOCK 0xffffffff @@ -212,6 +225,7 @@ static inline int ext4_ext_get_actual_le (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); } +extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); extern int ext4_extent_tree_init(handle_t *, struct inode *); @@ -221,6 +235,7 @@ extern int ext4_ext_try_to_merge(struct struct ext4_extent *); extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); +extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, ext_prepare_callback, void *); extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *); extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, @@ -228,5 +243,15 @@ extern int ext4_ext_search_left(struct i extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, ext4_lblk_t *, ext4_fsblk_t *); extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); +extern void ext4_ext_drop_refs(struct ext4_ext_path *path); +extern ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t block); +extern int ext4_ext_insert_extent_defrag(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int defrag); +extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); + #endif /* _EXT4_EXTENTS */ Index: linux-2.6/include/linux/writeback.h =================================================================== --- linux-2.6.orig/include/linux/writeback.h 2008-06-05 13:44:41.026983805 -0500 +++ linux-2.6/include/linux/writeback.h 2008-07-05 12:51:23.813471334 -0500 @@ -63,6 +63,7 @@ struct writeback_control { unsigned for_writepages:1; /* This is a writepages() call */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned more_io:1; /* more io to be dispatched */ + unsigned range_cont:1; }; /* Index: linux-2.6/mm/page-writeback.c 
=================================================================== --- linux-2.6.orig/mm/page-writeback.c 2008-06-05 13:44:44.152982933 -0500 +++ linux-2.6/mm/page-writeback.c 2008-07-05 12:51:23.846292034 -0500 @@ -956,6 +956,9 @@ retry: } if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = index; + + if (wbc->range_cont) + wbc->range_start = index << PAGE_CACHE_SHIFT; return ret; } EXPORT_SYMBOL(write_cache_pages); Index: linux-2.6/fs/ext4/Makefile =================================================================== --- linux-2.6.orig/fs/ext4/Makefile 2008-06-05 13:44:20.520046428 -0500 +++ linux-2.6/fs/ext4/Makefile 2008-07-05 12:51:24.081329189 -0500 @@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o + ext4_jbd2.o migrate.o mballoc.o defrag.o ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o Index: linux-2.6/fs/ext4/defrag.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6/fs/ext4/defrag.c 2008-07-05 12:51:24.454291246 -0500 @@ -0,0 +1,2120 @@ +/* + * Copyright (c) 2008, NEC Software Tohoku, Ltd. + * Written by Takashi Sato + * Akira Fujita + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +/* Online defragmentation for EXT4 */ + +#include +#include "ext4_jbd2.h" +#include "ext4_extents.h" +#include "group.h" + +#define EXT_SET_EXTENT_DATA(src, dest) do { \ + dest.block = le32_to_cpu(src->ee_block); \ + dest.start = ext_pblock(src); \ + dest.len = le16_to_cpu(src->ee_len); \ + } while (0) + +/** + * ext4_defrag_next_extent - Search for the next extent and set it to "extent" + * + * @inode: inode which is searched + * @path: this will obtain data for the next extent + * @extent: pointer to the next extent we have just gotten + * + * This function returns 0 or 1(last entry) if succeed, otherwise + * returns -EIO. 
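+ *
+ * Typical iteration over a file's extent tree (sketch of how this
+ * helper is used later in this file; error handling omitted):
+ *
+ *	path = ext4_ext_find_extent(inode, offset, NULL);
+ *	ext = path[ext_depth(inode)].p_ext;
+ *	while (ext4_defrag_next_extent(inode, path, &ext) == 0)
+ *		use(ext);	(use() is a placeholder)
+ *
+ * where a return of 1 means `ext' was already the last extent.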
+ */ +static int +ext4_defrag_next_extent(struct inode *inode, struct ext4_ext_path *path, + struct ext4_extent **extent) +{ + int ppos, leaf_ppos = path->p_depth; + + ppos = leaf_ppos; + if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { + /* leaf block */ + *extent = ++path[ppos].p_ext; + return 0; + } + + while (--ppos >= 0) { + if (EXT_LAST_INDEX(path[ppos].p_hdr) > + path[ppos].p_idx) { + int cur_ppos = ppos; + + /* index block */ + path[ppos].p_idx++; + path[ppos].p_block = idx_pblock(path[ppos].p_idx); + if (path[ppos+1].p_bh) + brelse(path[ppos+1].p_bh); + path[ppos+1].p_bh = + sb_bread(inode->i_sb, path[ppos].p_block); + if (!path[ppos+1].p_bh) + goto err; + path[ppos+1].p_hdr = + ext_block_hdr(path[ppos+1].p_bh); + + /* Halfway index block */ + while (++cur_ppos < leaf_ppos) { + path[cur_ppos].p_idx = + EXT_FIRST_INDEX(path[cur_ppos].p_hdr); + path[cur_ppos].p_block = + idx_pblock(path[cur_ppos].p_idx); + if (path[cur_ppos+1].p_bh) + brelse(path[cur_ppos+1].p_bh); + path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, + path[cur_ppos].p_block); + if (!path[cur_ppos+1].p_bh) + goto err; + path[cur_ppos+1].p_hdr = + ext_block_hdr(path[cur_ppos+1].p_bh); + } + + /* leaf block */ + path[leaf_ppos].p_ext = *extent = + EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); + return 0; + } + } + /* We found the last extent */ + return 1; +err: + if (path) + ext4_ext_drop_refs(path); + return -EIO; +} + +/** + * ext4_defrag_extents_info - Get extents information + * + * @sb: for ext4_iget() + * @ext_info: pointer to ext4_extents_info + * @ext_info->ino: describe an inode which is used to get + * extent information + * @ext_info->max_entries: defined by DEFRAG_MAX_ENT + * @ext_info->entries: amount of extents (output) + * @ext_info->ext[]: array of extent (output) + * @ext_info->offset: starting block offset of targeted extent + * (file relative) + * + * This function returns 0 if the next extent(s) exists, + * or returns 1 if the next extent doesn't exist, + * otherwise returns error value. + */ +static int +ext4_defrag_extents_info(struct super_block *sb, + struct ext4_extents_info *ext_info) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ext = NULL; + struct inode *inode = NULL; + ext4_lblk_t offset = ext_info->f_offset; + int max_entries = ext_info->max_entries; + int depth, entries = 0; + int err = 0; + int ret = 0; + + inode = ext4_iget(sb, ext_info->ino); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + down_write(&EXT4_I(inode)->i_data_sem); + + /* Return -ENOENT if a file does not exist */ + if (!inode->i_nlink || inode->i_ino < EXT4_GOOD_OLD_FIRST_INO || + !S_ISREG(inode->i_mode)) { + ext_info->entries = 0; + err = -ENOENT; + goto out; + } + + path = ext4_ext_find_extent(inode, offset, NULL); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out; + } + depth = ext_depth(inode); + + /* Skip the 0 size file */ + if (path[depth].p_ext == NULL) { + ext_info->entries = 0; + goto out; + } + ext = path[depth].p_ext; + EXT_SET_EXTENT_DATA(ext, ext_info->ext[entries]); + entries = 1; + + /* + * The ioctl repeats this loop 'max_entries' times. + * So we have to call this function again if @inode had + * more the number of extents than 'max_entries'. 
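+ *
+ * From user space this becomes a simple loop (illustrative sketch,
+ * not shipped with this patch; error handling trimmed):
+ *
+ *	struct ext4_extents_info info = { .ino = ino,
+ *					  .max_entries = DEFRAG_MAX_ENT };
+ *
+ *	do {
+ *		err = ioctl(fd, EXT4_IOC_EXTENTS_INFO, &info);
+ *		for (i = 0; i < info.entries && err >= 0; i++)
+ *			use(&info.ext[i]);	(use() is a placeholder)
+ *	} while (err == 0);	(err == 1: the last extent was returned)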
+ */ + while (entries < max_entries) { + ret = ext4_defrag_next_extent(inode, path, &ext); + if (ret == 0) { + /* Found the next extent (it means not the last one) */ + EXT_SET_EXTENT_DATA(ext, ext_info->ext[entries]); + entries++; + + /* + * In case @inode has > 'max_entries' extents, + * we must call this function again and restart from + * 'max_entries * n + 1'th extent. + * 'n' is the number of calling this function + * at the same @inode. + */ + if (entries == max_entries) { + ext_info->f_offset = + le32_to_cpu(ext->ee_block) + + le16_to_cpu(ext->ee_len); + /* Check the extent is the last one or not */ + ret = + ext4_defrag_next_extent(inode, path, &ext); + if (ret == 1) { + err = ret; + } else if (ret < 0) { + /* Failed to get the next extent */ + err = ret; + goto out; + } + break; + } + + } else if (ret == 1) { + /* The extent is the last one */ + ext_info->f_offset = 0; + err = ret; + break; + } else { + /* Failed to get the next extent */ + err = ret; + goto out; + } + } + + ext_info->entries = entries; + +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + up_write(&EXT4_I(inode)->i_data_sem); + iput(inode); + return err; +} + +/** + * ext4_defrag_reserve_blocks - Reserve blocks for defrag + * + * @org_inode: original inode + * @goal: the goal offset of the block reservation + * @len: blocks count we need to reserve + * + * This function returns 0 if succeed, otherwise returns error value. + */ + +static int +ext4_defrag_reserve_blocks(struct inode *org_inode, ext4_fsblk_t goal, int len) +{ + struct super_block *sb = NULL; + handle_t *handle; + struct buffer_head *bitmap_bh = NULL; + struct ext4_block_alloc_info *block_i; + struct ext4_reserve_window_node *my_rsv = NULL; + unsigned short windowsz = 0; + ext4_group_t group_no; + ext4_grpblk_t grp_target_blk; + int err = 0; + + down_write(&EXT4_I(org_inode)->i_data_sem); + + handle = ext4_journal_start(org_inode, EXT4_RESERVE_TRANS_BLOCKS); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + handle = NULL; + goto out; + } + + if (S_ISREG(org_inode->i_mode) && + !EXT4_I(org_inode)->i_block_alloc_info) { + ext4_init_block_alloc_info(org_inode); + } else if (!S_ISREG(org_inode->i_mode)) { + printk(KERN_ERR "ext4 defrag: Invalid file type\n"); + err = -EINVAL; + goto out; + } + + sb = org_inode->i_sb; + if (!sb) { + printk(KERN_ERR "ext4 defrag: Non-existent device\n"); + err = -ENXIO; + goto out; + } + ext4_get_group_no_and_offset(sb, goal, &group_no, + &grp_target_blk); + + block_i = EXT4_I(org_inode)->i_block_alloc_info; + /* Block reservation should be enabled */ + BUG_ON(!block_i); + + windowsz = block_i->rsv_window_node.rsv_goal_size; + /* Goal size should be set */ + BUG_ON(!windowsz); + + my_rsv = &block_i->rsv_window_node; + + bitmap_bh = ext4_read_block_bitmap(sb, group_no); + if (!bitmap_bh) { + err = -ENOSPC; + goto out; + } + + BUFFER_TRACE(bitmap_bh, "get undo access for new block"); + err = ext4_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto out; + + err = alloc_new_reservation(my_rsv, grp_target_blk, sb, + group_no, bitmap_bh); + if (err < 0) { + printk(KERN_ERR "ext4 defrag: Block reservation failed." 
+ "offset [%d], bg[%lu]\n", grp_target_blk, group_no); + ext4_discard_reservation(org_inode); + goto out; + } else if (len > EXT4_DEFAULT_RESERVE_BLOCKS) { + try_to_extend_reservation(my_rsv, sb, + len - EXT4_DEFAULT_RESERVE_BLOCKS); + } + +out: + up_write(&EXT4_I(org_inode)->i_data_sem); + ext4_journal_release_buffer(handle, bitmap_bh); + brelse(bitmap_bh); + + if (handle) + ext4_journal_stop(handle); + + return err; +} + +/** + * ext4_defrag_block_within_rsv - Is target extent reserved ? + * + * @org_inode: original inode + * @ex_start: physical block offset of the extent which already moved + * @ex_len: block length of the extent + * + * This function returns 0 if succeed, otherwise returns error value. + */ +static int +ext4_defrag_block_within_rsv(struct inode *org_inode, ext4_fsblk_t ex_start, + int ex_len) +{ + struct super_block *sb = org_inode->i_sb; + struct ext4_block_alloc_info *block_i; + ext4_group_t group_no; + ext4_grpblk_t grp_blk; + struct ext4_reserve_window_node *rsv; + + block_i = EXT4_I(org_inode)->i_block_alloc_info; + /* Block reservation should be enabled */ + BUG_ON(!block_i); + + /* Goal size should be set */ + BUG_ON(!block_i->rsv_window_node.rsv_goal_size); + + rsv = &block_i->rsv_window_node; + if (rsv_is_empty(&rsv->rsv_window)) { + printk(KERN_ERR "ext4 defrag: Reservation window is empty\n"); + return -ENOSPC; + } + + ext4_get_group_no_and_offset(sb, ex_start, &group_no, &grp_blk); + + if (!goal_in_my_reservation(&rsv->rsv_window, grp_blk, group_no, sb) + || !goal_in_my_reservation(&rsv->rsv_window, + grp_blk + ex_len - 1, group_no, sb)){ + /* Goal blocks are not in the reservation window */ + printk(KERN_ERR "ext4 defrag: %d or %d in bg %lu is " + "not in rsv_window\n", grp_blk, + grp_blk + ex_len - 1, group_no); + return -ENOSPC; + } + return 0; +} + +/* + * ext4_defrag_reserve_fblocks - + * Reserve free blocks with ext4_defrag_reserve_blocks + * + * @org_inode: original inode to get a block group number + * @ext_info: freeblocks distribution which stored extent-like style + * @ext_info->ext[]: an array of struct ext4_extents_data + * + * This function returns 0 if succeed, otherwise returns error value. + */ +static int +ext4_defrag_reserve_fblocks(struct inode *org_inode, + struct ext4_extents_info *ext_info) +{ + ext4_fsblk_t ex_start = 0; + int i, len, ret; + + for (i = 0; i < ext_info->entries; i++) { + ex_start = ext_info->ext[i].start; + len = ext_info->ext[i].len; + + ret = ext4_defrag_reserve_blocks(org_inode, ex_start, len); + if (ret < 0) { + printk(KERN_ERR "ext4 defrag: " + "Block reservation failed. offset [%llu], " + "length [%d]\n", ex_start, len); + goto err; + } + + /* Confirm that blocks are in the reservation window */ + ret = ext4_defrag_block_within_rsv(org_inode, ex_start, len); + if (ret < 0) { + printk(KERN_ERR "ext4 defrag: " + "Reservation window is not set. " + "offset [%llu], length [%d]\n", ex_start, len); + goto err; + } + } + return ret; + +err: + down_write(&EXT4_I(org_inode)->i_data_sem); + ext4_discard_reservation(org_inode); + up_write(&EXT4_I(org_inode)->i_data_sem); + return ret; +} + +/** + * ext4_defrag_move_victim - Create free space for defrag + * + * @target_filp: target file + * @ext_info: target extents array to move + * + * This function returns 0 if succeed, otherwise + * returns error value. 
+ */ +static int +ext4_defrag_move_victim(struct file *target_filp, + struct ext4_extents_info *ext_info) +{ + struct inode *org_inode = target_filp->f_dentry->d_inode; + struct super_block *sb = org_inode->i_sb; + struct file victim_file; + struct dentry victim_dent; + struct inode *victim_inode; + struct ext4_extent_data ext; + ext4_fsblk_t goal = ext_info->goal; + ext4_group_t group; + ext4_grpblk_t grp_off; + int ret, i; + + /* Setup dummy extent data */ + ext.len = 0; + + /* Get the inode of the victim file */ + victim_inode = ext4_iget(sb, ext_info->ino); + if (IS_ERR(victim_inode)) + return PTR_ERR(victim_inode); + + /* Setup file for the victim file */ + victim_dent.d_inode = victim_inode; + victim_file.f_dentry = &victim_dent; + victim_file.f_mapping = victim_inode->i_mapping; + + /* Set the goal appropriate offset */ + if (goal == -1) { + ext4_get_group_no_and_offset(victim_inode->i_sb, + ext_info->ext[0].start, &group, &grp_off); + goal = ext4_group_first_block_no(sb, group + 1); + } + + for (i = 0; i < ext_info->entries; i++) { + /* Move original blocks to another block group */ + ret = ext4_defrag(&victim_file, ext_info->ext[i].block, + ext_info->ext[i].len, goal, DEFRAG_FORCE_VICTIM, &ext); + if (ret < 0) { + printk(KERN_ERR "ext4 defrag: " + "Moving victim file failed. ino [%llu]\n", + ext_info->ino); + goto err; + } + + /* Sync journal blocks before reservation */ + ret = ext4_force_commit(sb); + if (ret) { + printk(KERN_ERR "ext4 defrag: " + "ext4_force_commit failed(%d)\n", ret); + goto err; + } + } + + iput(victim_inode); + return 0; +err: + down_write(&EXT4_I(org_inode)->i_data_sem); + ext4_discard_reservation(org_inode); + up_write(&EXT4_I(org_inode)->i_data_sem); + iput(victim_inode); + return ret; +} + +/** + * ext4_defrag_fblocks_distribution - Search free blocks distribution + * + * @org_inode: original inode + * @ext_info: ext4_extents_info + * + * This function returns 0 if succeed, otherwise returns error value. + */ +static int +ext4_defrag_fblocks_distribution(struct inode *org_inode, + struct ext4_extents_info *ext_info) +{ + struct buffer_head *bitmap_bh = NULL; + struct super_block *sb = org_inode->i_sb; + handle_t *handle; + ext4_group_t group_no; + ext4_grpblk_t start, end; + ext4_fsblk_t start_block = 0; + int i, err; + int num = 0; + int len = 0; + int block_set = 0; + int extra_block = 0; + + if (!sb) { + printk(KERN_ERR "ext4 defrag: Non-existent device\n"); + return -ENOSPC; + } + + group_no = (org_inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); + start = ext_info->g_offset; + end = EXT4_BLOCKS_PER_GROUP(sb) - 1; + + /* We consider about the boot block if bs = 1k */ + if (sb->s_blocksize == 1024) + extra_block = 1; + + handle = ext4_journal_start(org_inode, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + return err; + } + + bitmap_bh = ext4_read_block_bitmap(sb, group_no); + if (!bitmap_bh) { + err = -EIO; + goto out; + } + + BUFFER_TRACE(bitmap_bh, "get undo access for new block"); + err = ext4_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto out; + + for (i = start; i <= end ; i++) { + if (bitmap_search_next_usable_block(i, bitmap_bh, i + 1) >= 0) { + len++; + /* + * Reset start_block if the free block is + * the head of region. 
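+			 * e.g. for an availability pattern busy, free,
+			 * free, free, busy starting at bit b: start_block
+			 * is latched at b + 1, len counts up to 3, and
+			 * the else-branch below flushes {start_block, 3}
+			 * into ext_info->ext[]. (Illustrative only.)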
+ */ + if (!block_set) { + start_block = + i + group_no * EXT4_BLOCKS_PER_GROUP(sb) + + extra_block; + block_set = 1; + } + } else if (len) { + ext_info->ext[num].start = start_block; + ext_info->ext[num].len = len; + num++; + len = 0; + block_set = 0; + if (num == ext_info->max_entries) { + ext_info->g_offset = i + 1; + break; + } + } + if (i == end && len) { + ext_info->ext[num].start = start_block; + ext_info->ext[num].len = len; + num++; + } + } + + ext_info->entries = num; +out: + ext4_journal_release_buffer(handle, bitmap_bh); + brelse(bitmap_bh); + + if (handle) + ext4_journal_stop(handle); + + return err; +} + +int ext4_defrag_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int err = 0; + + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL || + cmd == EXT4_IOC_FIBMAP)) { + printk(KERN_ERR "ext4 defrag: ino[%lu] is not extents " + "based file\n", inode->i_ino); + return -EOPNOTSUPP; + } + + if (cmd == EXT4_IOC_FIBMAP) { + ext4_fsblk_t __user *p = (ext4_fsblk_t __user *)arg; + ext4_fsblk_t block = 0; + struct address_space *mapping = filp->f_mapping; + + if (copy_from_user(&block, (ext4_fsblk_t __user *)arg, + sizeof(block))) + return -EFAULT; + + block = ext4_bmap(mapping, block); + + return put_user(block, p); + } else if (cmd == EXT4_IOC_GROUP_INFO) { + struct ext4_group_data_info grp_data; + + if (copy_from_user(&grp_data, + (struct ext4_group_data_info __user *)arg, + sizeof(grp_data))) + return -EFAULT; + + grp_data.s_blocks_per_group = + EXT4_BLOCKS_PER_GROUP(inode->i_sb); + grp_data.s_inodes_per_group = + EXT4_INODES_PER_GROUP(inode->i_sb); + + if (copy_to_user((struct ext4_group_data_info __user *)arg, + &grp_data, sizeof(grp_data))) + return -EFAULT; + } else if (cmd == EXT4_IOC_FREE_BLOCKS_INFO) { + struct ext4_extents_info ext_info; + + if (copy_from_user(&ext_info, + (struct ext4_extents_info __user *)arg, + sizeof(ext_info))) + return -EFAULT; + + BUG_ON(ext_info.ino != inode->i_ino); + + err = ext4_defrag_fblocks_distribution(inode, &ext_info); + + if (!err) + err = copy_to_user( + (struct ext4_extents_info __user *)arg, + &ext_info, sizeof(ext_info)); + } else if (cmd == EXT4_IOC_EXTENTS_INFO) { + struct ext4_extents_info ext_info; + + if (copy_from_user(&ext_info, + (struct ext4_extents_info __user *)arg, + sizeof(ext_info))) + return -EFAULT; + + err = ext4_defrag_extents_info(inode->i_sb, &ext_info); + if (err >= 0) { + if (copy_to_user((struct ext4_extents_info __user *)arg, + &ext_info, sizeof(ext_info))) + return -EFAULT; + } + } else if (cmd == EXT4_IOC_RESERVE_BLOCK) { + struct ext4_extents_info ext_info; + + if (copy_from_user(&ext_info, + (struct ext4_extents_info __user *)arg, + sizeof(ext_info))) + return -EFAULT; + + err = ext4_defrag_reserve_fblocks(inode, &ext_info); + } else if (cmd == EXT4_IOC_MOVE_VICTIM) { + struct ext4_extents_info ext_info; + + if (copy_from_user(&ext_info, + (struct ext4_extents_info __user *)arg, + sizeof(ext_info))) + return -EFAULT; + + err = ext4_defrag_move_victim(filp, &ext_info); + + } else if (cmd == EXT4_IOC_BLOCK_RELEASE) { + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_reservation(inode); + up_write(&EXT4_I(inode)->i_data_sem); + } else if (cmd == EXT4_IOC_DEFRAG) { + struct ext4_ext_defrag_data defrag; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + + if (!capable(CAP_DAC_OVERRIDE)) { + if ((inode->i_mode & S_IRUSR) != S_IRUSR) + return -EACCES; + if (current->fsuid != inode->i_uid) + return -EACCES; + } + + if (copy_from_user(&defrag, + (struct 
ext4_ext_defrag_data __user *)arg, + sizeof(defrag))) + return -EFAULT; + + /* Check goal offset if goal offset was given from userspace */ + if (defrag.goal != -1 && + ext4_blocks_count(es) <= defrag.goal) { + printk(KERN_ERR "ext4 defrag: Invalid goal offset" + " %llu, you can set goal offset up to %llu\n", + defrag.goal, ext4_blocks_count(es) - 1); + return -EINVAL; + } + + err = ext4_defrag(filp, defrag.start_offset, + defrag.defrag_size, defrag.goal, defrag.flag, + &defrag.ext); + } + + return err; +} + +/** + * ext4_defrag_merge_across_blocks - Merge extents across leaf block + * + * @handle: journal handle + * @org_inode: original inode + * @o_start: first original extent to be defraged + * @o_end: last original extent to be defraged + * @start_ext: first new extent to be merged + * @new_ext: middle of new extent to be merged + * @end_ext: last new extent to be merged + * @phase: phase of the force defrag mode + * + * This function returns 0 if succeed, otherwise returns error value. + */ +static int +ext4_defrag_merge_across_blocks(handle_t *handle, struct inode *org_inode, + struct ext4_extent *o_start, struct ext4_extent *o_end, + struct ext4_extent *start_ext, struct ext4_extent *new_ext, + struct ext4_extent *end_ext, int phase) +{ + struct ext4_ext_path *org_path = NULL; + ext4_lblk_t eblock = 0; + int new_flag = 0; + int end_flag = 0; + int defrag_flag; + int err; + + if (phase == DEFRAG_FORCE_VICTIM) + defrag_flag = 1; + else + defrag_flag = 0; + + if (le16_to_cpu(start_ext->ee_len) && + le16_to_cpu(new_ext->ee_len) && + le16_to_cpu(end_ext->ee_len)) { + + if (o_start == o_end) { + + /* start_ext new_ext end_ext + * dest |---------|-----------|--------| + * org |------------------------------| + */ + + end_flag = 1; + } else { + + /* start_ext new_ext end_ext + * dest |---------|----------|---------| + * org |---------------|--------------| + */ + + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); + } + + o_start->ee_len = start_ext->ee_len; + new_flag = 1; + + } else if (le16_to_cpu(start_ext->ee_len) && + le16_to_cpu(new_ext->ee_len) && + !le16_to_cpu(end_ext->ee_len) && + o_start == o_end) { + + /* start_ext new_ext + * dest |--------------|---------------| + * org |------------------------------| + */ + + o_start->ee_len = start_ext->ee_len; + new_flag = 1; + + } else if (!le16_to_cpu(start_ext->ee_len) && + le16_to_cpu(new_ext->ee_len) && + le16_to_cpu(end_ext->ee_len) && + o_start == o_end) { + + /* new_ext end_ext + * dest |--------------|---------------| + * org |------------------------------| + */ + + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); + + /* + * Set 0 to the extent block if new_ext was + * the first block. 
+ */ + if (!new_ext->ee_block) + eblock = 0; + else + eblock = le32_to_cpu(new_ext->ee_block); + + new_flag = 1; + } else { + printk(KERN_ERR "ext4 defrag: Unexpected merge case\n"); + return -EIO; + } + + if (new_flag) { + org_path = ext4_ext_find_extent(org_inode, eblock, NULL); + if (IS_ERR(org_path)) { + err = PTR_ERR(org_path); + org_path = NULL; + goto out; + } + err = ext4_ext_insert_extent_defrag(handle, org_inode, + org_path, new_ext, defrag_flag); + if (err) + goto out; + } + + if (end_flag) { + org_path = ext4_ext_find_extent(org_inode, + le32_to_cpu(end_ext->ee_block) - 1, org_path); + if (IS_ERR(org_path)) { + err = PTR_ERR(org_path); + org_path = NULL; + goto out; + } + err = ext4_ext_insert_extent_defrag(handle, org_inode, + org_path, end_ext, defrag_flag); + if (err) + goto out; + } +out: + if (org_path) { + ext4_ext_drop_refs(org_path); + kfree(org_path); + } + + return err; + +} + +/** + * ext4_defrag_merge_inside_block - Merge new extent to the extent block + * + * @o_start: first original extent to be merged + * @o_end: last original extent to be merged + * @start_ext: first new extent to be merged + * @new_ext: middle of new extent to be merged + * @end_ext: last new extent to be merged + * @eh: extent header of target leaf block + * @replaced: the number of blocks which will be replaced with new_ext + * @range_to_move: used to decide how to merge + * + * This function always returns 0. + */ +static int +ext4_defrag_merge_inside_block(struct ext4_extent *o_start, + struct ext4_extent *o_end, struct ext4_extent *start_ext, + struct ext4_extent *new_ext, struct ext4_extent *end_ext, + struct ext4_extent_header *eh, ext4_fsblk_t replaced, + int range_to_move) +{ + int i = 0; + unsigned len; + + /* Move the existing extents */ + if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) { + len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) - + (unsigned long)(o_end + 1); + memmove(o_end + 1 + range_to_move, o_end + 1, len); + } + + /* Insert start entry */ + if (le16_to_cpu(start_ext->ee_len)) + o_start[i++].ee_len = start_ext->ee_len; + + /* Insert new entry */ + if (le16_to_cpu(new_ext->ee_len)) { + o_start[i].ee_block = new_ext->ee_block; + o_start[i].ee_len = cpu_to_le16(replaced); + ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext)); + } + + /* Insert end entry */ + if (end_ext->ee_len) + o_start[i] = *end_ext; + + /* Increment the total entries counter on the extent block */ + le16_add_cpu(&eh->eh_entries, range_to_move); + + return 0; +} + +/** + * ext4_defrag_merge_extents - Merge new extent + * + * @handle: journal handle + * @org_inode: original inode + * @org_path: path indicates first extent to be defraged + * @o_start: first original extent to be defraged + * @o_end: last original extent to be defraged + * @start_ext: first new extent to be merged + * @new_ext: middle of new extent to be merged + * @end_ext: last new extent to be merged + * @replaced: the number of blocks which will be replaced with new_ext + * @phase: phase of the force defrag mode + * + * This function returns 0 if succeed, otherwise returns error value. 
+ */ +static int +ext4_defrag_merge_extents(handle_t *handle, struct inode *org_inode, + struct ext4_ext_path *org_path, + struct ext4_extent *o_start, struct ext4_extent *o_end, + struct ext4_extent *start_ext, struct ext4_extent *new_ext, + struct ext4_extent *end_ext, ext4_fsblk_t replaced, int phase) +{ + struct ext4_extent_header *eh; + unsigned need_slots, slots_range; + int range_to_move, depth, ret; + + /* + * The extents need to be inserted + * start_extent + new_extent + end_extent. + */ + need_slots = (le16_to_cpu(start_ext->ee_len) ? 1 : 0) + + (le16_to_cpu(end_ext->ee_len) ? 1 : 0) + + (le16_to_cpu(new_ext->ee_len) ? 1 : 0); + + /* The number of slots between start and end */ + slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1) + / sizeof(struct ext4_extent); + + /* Range to move the end of extent */ + range_to_move = need_slots - slots_range; + depth = org_path->p_depth; + org_path += depth; + eh = org_path->p_hdr; + + if (depth) { + /* Register to journal */ + ret = ext4_journal_get_write_access(handle, org_path->p_bh); + if (ret) + return ret; + } + + /* Expansion */ + if (range_to_move > 0 && + (range_to_move > le16_to_cpu(eh->eh_max) + - le16_to_cpu(eh->eh_entries))) { + + ret = ext4_defrag_merge_across_blocks(handle, org_inode, + o_start, o_end, start_ext, new_ext, + end_ext, phase); + if (ret < 0) + return ret; + } else { + ret = ext4_defrag_merge_inside_block(o_start, o_end, + start_ext, new_ext, end_ext, eh, + replaced, range_to_move); + if (ret < 0) + return ret; + } + + if (depth) { + ret = ext4_journal_dirty_metadata(handle, org_path->p_bh); + if (ret) + return ret; + } else { + ret = ext4_mark_inode_dirty(handle, org_inode); + if (ret < 0) + return ret; + } + + return 0; + +} + +/** + * ext4_defrag_leaf_block - Defragmentation for one leaf extent block + * + * @handle: journal handle + * @org_inode: original inode + * @org_path: path indicates first extent to be defraged + * @dext: destination extent + * @from: start offset on the target file + * @phase: phase of the force defrag mode + * + * This function returns 0 if succeed, otherwise returns error value. 
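+ *
+ * Example of the split performed here (illustrative numbers): if the
+ * original extent covers logical blocks 0-99 and @dext replaces
+ * blocks 30-69, the originals are carved into
+ *
+ *	start_ext: logical 0,  len 30	(head kept from the original)
+ *	new_ext:   logical 30, len 40	(the destination blocks)
+ *	end_ext:   logical 70, len 30	(tail kept from the original)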
+ */ +static int +ext4_defrag_leaf_block(handle_t *handle, struct inode *org_inode, + struct ext4_ext_path *org_path, struct ext4_extent *dext, + ext4_lblk_t *from, int phase) +{ + struct ext4_extent *oext, *o_start = NULL, *o_end = NULL, *prev_ext; + struct ext4_extent new_ext, start_ext, end_ext; + ext4_fsblk_t replaced = 0; + ext4_lblk_t new_end, lblock; + unsigned long depth; + unsigned short len; + ext4_fsblk_t new_phys_end; + int ret; + + depth = ext_depth(org_inode); + start_ext.ee_len = end_ext.ee_len = 0; + o_start = o_end = oext = org_path[depth].p_ext; + ext4_ext_store_pblock(&new_ext, ext_pblock(dext)); + new_ext.ee_len = dext->ee_len; + len = le16_to_cpu(new_ext.ee_len); + new_ext.ee_block = cpu_to_le32(*from); + lblock = le32_to_cpu(oext->ee_block); + new_end = le32_to_cpu(new_ext.ee_block) + + le16_to_cpu(new_ext.ee_len) - 1; + new_phys_end = ext_pblock(&new_ext) + + le16_to_cpu(new_ext.ee_len) - 1; + + /* + * First original extent + * dest |---------------| + * org |---------------| + */ + if (le32_to_cpu(new_ext.ee_block) > + le32_to_cpu(oext->ee_block) && + le32_to_cpu(new_ext.ee_block) < + le32_to_cpu(oext->ee_block) + + le16_to_cpu(oext->ee_len)) { + start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) + - le32_to_cpu(oext->ee_block)); + replaced += le16_to_cpu(oext->ee_len) + - le16_to_cpu(start_ext.ee_len); + } else if (oext > EXT_FIRST_EXTENT(org_path[depth].p_hdr)) { + /* We can merge previous extent. */ + prev_ext = oext - 1; + if (((ext_pblock(prev_ext) + le16_to_cpu(prev_ext->ee_len)) + == ext_pblock(&new_ext)) + && (le32_to_cpu(prev_ext->ee_block) + + le16_to_cpu(prev_ext->ee_len) + == le32_to_cpu(new_ext.ee_block))) { + o_start = prev_ext; + start_ext.ee_len = cpu_to_le16( + le16_to_cpu(prev_ext->ee_len) + + le16_to_cpu(new_ext.ee_len)); + new_ext.ee_len = 0; + } + } + + for (;;) { + /* The extent for destination must be found. */ + BUG_ON(!oext || lblock != le32_to_cpu(oext->ee_block)); + lblock += le16_to_cpu(oext->ee_len); + + /* + * Middle of original extent + * dest |-------------------| + * org |-----------------| + */ + if (le32_to_cpu(new_ext.ee_block) <= + le32_to_cpu(oext->ee_block) && + new_end >= le32_to_cpu(oext->ee_block) + + le16_to_cpu(oext->ee_len) - 1) + replaced += le16_to_cpu(oext->ee_len); + + /* + * Last original extent + * dest |----------------| + * org |---------------| + */ + if (new_end >= le32_to_cpu(oext->ee_block) && + new_end < le32_to_cpu(oext->ee_block) + + le16_to_cpu(oext->ee_len) - 1) { + end_ext.ee_len + = cpu_to_le16(le32_to_cpu(oext->ee_block) + + le16_to_cpu(oext->ee_len) - 1 - new_end); + ext4_ext_store_pblock(&end_ext, (ext_pblock(o_end) + + le16_to_cpu(oext->ee_len) + - le16_to_cpu(end_ext.ee_len))); + end_ext.ee_block + = cpu_to_le32(le32_to_cpu(o_end->ee_block) + + le16_to_cpu(oext->ee_len) + - le16_to_cpu(end_ext.ee_len)); + replaced += le16_to_cpu(oext->ee_len) + - le16_to_cpu(end_ext.ee_len); + } + + /* + * Detected the block end, reached the number of replaced + * blocks to dext->ee_len. Then merge the extent. 
+ */ + if (oext == EXT_LAST_EXTENT(org_path[depth].p_hdr) || + new_end <= le32_to_cpu(oext->ee_block) + + le16_to_cpu(oext->ee_len) - 1) { + ret = ext4_defrag_merge_extents(handle, org_inode, + org_path, o_start, o_end, &start_ext, + &new_ext, &end_ext, replaced, phase); + if (ret < 0) + return ret; + + /* All expected blocks are replaced */ + if (le16_to_cpu(new_ext.ee_len) <= 0) + return 0; + + /* Re-calculate new_ext */ + le16_add_cpu(&new_ext.ee_len, -replaced); + le32_add_cpu(&new_ext.ee_block, replaced); + ext4_ext_store_pblock(&new_ext, ext_pblock(&new_ext) + + replaced); + replaced = 0; + start_ext.ee_len = end_ext.ee_len = 0; + o_start = NULL; + + /* All expected blocks are replaced. */ + if (le16_to_cpu(new_ext.ee_len) <= 0) + return 0; + } + + /* Get the next extent for original. */ + if (org_path) + ext4_ext_drop_refs(org_path); + org_path = ext4_ext_find_extent(org_inode, lblock, org_path); + if (IS_ERR(org_path)) { + ret = PTR_ERR(org_path); + org_path = NULL; + return ret; + } + depth = ext_depth(org_inode); + oext = org_path[depth].p_ext; + if (le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len) + <= lblock) + return -ENOENT; + + o_end = oext; + if (!o_start) + o_start = oext; + } +} + +/** + * ext4_defrag_replace_branches - Replace original extents with new extents + * + * @handle: journal handle + * @org_inode: original inode + * @dest_inode: temporary inode + * @from_page: page offset of org_inode + * @dest_from_page: page offset of dest_inode + * @count_page: page count to be replaced + * @phase: phase of the force defrag mode + * + * This function returns 0 if succeed, otherwise returns error value. + * Replace extents for blocks from "from" to "from + count - 1". + */ +static int +ext4_defrag_replace_branches(handle_t *handle, struct inode *org_inode, + struct inode *dest_inode, pgoff_t from_page, + pgoff_t dest_from_page, pgoff_t count_page, int phase) +{ + struct ext4_ext_path *org_path = NULL; + struct ext4_ext_path *dest_path = NULL; + struct ext4_extent *oext, *dext, *swap_ext; + struct ext4_extent tmp_ext, tmp_ext2; + ext4_lblk_t from, count, dest_off, diff, org_diff; + int err = 0; + int depth; + int replaced_count = 0; + + from = (ext4_lblk_t)from_page << + (PAGE_CACHE_SHIFT - dest_inode->i_blkbits); + count = (ext4_lblk_t)count_page << + (PAGE_CACHE_SHIFT - dest_inode->i_blkbits); + dest_off = (ext4_lblk_t)dest_from_page << + (PAGE_CACHE_SHIFT - dest_inode->i_blkbits); + + /* Get the original extent for the block "from" */ + org_path = ext4_ext_find_extent(org_inode, from, NULL); + if (IS_ERR(org_path)) { + err = PTR_ERR(org_path); + org_path = NULL; + goto out; + } + + /* Get the destination extent for the head */ + dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL); + if (IS_ERR(dest_path)) { + err = PTR_ERR(dest_path); + dest_path = NULL; + goto out; + } + depth = ext_depth(dest_inode); + dext = dest_path[depth].p_ext; + /* When dext is too large, pick up the target range. 
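+	 * e.g. if dext covers logical blocks 100-199 but dest_off is
+	 * 130, diff is 30 and tmp_ext becomes a 70-block extent starting
+	 * at block 130, capped to `count' just below (illustrative
+	 * numbers).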
*/ + diff = dest_off - le32_to_cpu(dext->ee_block); + ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff); + tmp_ext.ee_block = cpu_to_le32(le32_to_cpu(dext->ee_block) + diff); + tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff); + if (count < le16_to_cpu(tmp_ext.ee_len)) + tmp_ext.ee_len = cpu_to_le16(count); + dext = &tmp_ext; + + depth = ext_depth(org_inode); + oext = org_path[depth].p_ext; + org_diff = from - le32_to_cpu(oext->ee_block); + ext4_ext_store_pblock(&tmp_ext2, ext_pblock(oext) + org_diff); + tmp_ext2.ee_block = tmp_ext.ee_block; + + /* Adjust extent length when blocksize != pagesize */ + if (le16_to_cpu(tmp_ext.ee_len) <= + le16_to_cpu(oext->ee_len) - org_diff) { + tmp_ext2.ee_len = tmp_ext.ee_len; + } else { + tmp_ext2.ee_len = cpu_to_le16(le16_to_cpu(oext->ee_len) + - org_diff); + tmp_ext.ee_len = tmp_ext2.ee_len; + } + swap_ext = &tmp_ext2; + + /* Loop for the destination extents */ + while (1) { + /* The extent for destination must be found. */ + BUG_ON(!dext || dest_off != le32_to_cpu(dext->ee_block)); + + /* Loop for the original extent blocks */ + err = ext4_defrag_leaf_block(handle, org_inode, + org_path, dext, &from, phase); + if (err < 0) + goto out; + + /* + * We need the function which fixes extent information for + * inserting. + * e.g. ext4_defrag_merge_extents() + */ + err = ext4_defrag_leaf_block(handle, dest_inode, + dest_path, swap_ext, &dest_off, -1); + if (err < 0) + goto out; + + replaced_count += le16_to_cpu(dext->ee_len); + dest_off += le16_to_cpu(dext->ee_len); + from += le16_to_cpu(dext->ee_len); + + /* Already moved the expected blocks */ + if (replaced_count >= count) + break; + + if (org_path) + ext4_ext_drop_refs(org_path); + org_path = ext4_ext_find_extent(org_inode, from, NULL); + if (IS_ERR(org_path)) { + err = PTR_ERR(org_path); + org_path = NULL; + goto out; + } + depth = ext_depth(org_inode); + oext = org_path[depth].p_ext; + if (le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len) + <= from) { + err = 0; + goto out; + } + + if (dest_path) + ext4_ext_drop_refs(dest_path); + dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL); + if (IS_ERR(dest_path)) { + err = PTR_ERR(dest_path); + dest_path = NULL; + goto out; + } + depth = ext_depth(dest_inode); + dext = dest_path[depth].p_ext; + if (le32_to_cpu(dext->ee_block) + le16_to_cpu(dext->ee_len) + <= dest_off) { + err = 0; + goto out; + } + + /* When dext is too large, pick up the target range. 
+		diff = dest_off - le32_to_cpu(dext->ee_block);
+		ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
+		tmp_ext.ee_block =
+			cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
+		tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
+
+		if (count - replaced_count < le16_to_cpu(tmp_ext.ee_len))
+			tmp_ext.ee_len = cpu_to_le16(count - replaced_count);
+
+		dext = &tmp_ext;
+
+		org_diff = from - le32_to_cpu(oext->ee_block);
+		ext4_ext_store_pblock(&tmp_ext2, ext_pblock(oext) + org_diff);
+		tmp_ext2.ee_block = tmp_ext.ee_block;
+
+		/* Adjust extent length when blocksize != pagesize */
+		if (le16_to_cpu(tmp_ext.ee_len) <=
+		    le16_to_cpu(oext->ee_len) - org_diff) {
+			tmp_ext2.ee_len = tmp_ext.ee_len;
+		} else {
+			tmp_ext2.ee_len = cpu_to_le16(le16_to_cpu(oext->ee_len)
+						- org_diff);
+			tmp_ext.ee_len = tmp_ext2.ee_len;
+		}
+		swap_ext = &tmp_ext2;
+	}
+
+out:
+	if (org_path) {
+		ext4_ext_drop_refs(org_path);
+		kfree(org_path);
+	}
+	if (dest_path) {
+		ext4_ext_drop_refs(dest_path);
+		kfree(dest_path);
+	}
+
+	return err;
+}
+
+/**
+ * ext4_defrag_fill_ar - Prepare a multiple block allocation request for tmp inode
+ *
+ * @org_inode: original inode
+ * @dest_inode: temporary inode
+ * @ar: allocation request for multiple block allocation
+ * @org_path: path indicating the original inode's extent
+ * @dest_path: path indicating the temporary inode's extent
+ * @req_blocks: contiguous blocks count we need
+ * @iblock: target file offset
+ * @goal: goal offset
+ * @phase: phase of the force defrag mode
+ */
+static void
+ext4_defrag_fill_ar(struct inode *org_inode, struct inode *dest_inode,
+			struct ext4_allocation_request *ar,
+			struct ext4_ext_path *org_path,
+			struct ext4_ext_path *dest_path,
+			ext4_fsblk_t req_blocks, ext4_lblk_t iblock,
+			ext4_fsblk_t goal, int phase)
+{
+	ext4_group_t org_grp_no;
+	ext4_grpblk_t org_blk_off;
+	int org_depth = ext_depth(org_inode);
+
+	if (phase == DEFRAG_FORCE_VICTIM) {
+		ext4_get_group_no_and_offset(org_inode->i_sb,
+				ext_pblock(org_path[org_depth].p_ext),
+				&org_grp_no, &org_blk_off);
+		ar->excepted_group = org_grp_no;
+	} else {
+		/* Allocate contiguous blocks in any block group */
+		ar->excepted_group = -1;
+	}
+
+	ar->inode = dest_inode;
+	ar->len = req_blocks;
+	ar->logical = iblock;
+	ar->flags = EXT4_MB_HINT_DATA | EXT4_MB_HINT_RESERVED
+			| EXT4_MB_HINT_NOPREALLOC;
+	ar->lleft = 0;
+	ar->pleft = 0;
+	ar->lright = 0;
+	ar->pright = 0;
+
+	if (goal)
+		ar->goal = goal;
+	else
+		ar->goal = ext4_ext_find_goal(dest_inode, dest_path, iblock);
+}
+
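+/*
+ * For reference, a request filled in by ext4_defrag_fill_ar() for the
+ * normal (non-victim) case looks roughly like this (illustrative values;
+ * req_blocks and iblock come from the caller):
+ *
+ *	ar.inode          = dest_inode;
+ *	ar.len            = req_blocks;	// e.g. 2048 blocks
+ *	ar.logical        = iblock;
+ *	ar.excepted_group = -1;		// no block group is excluded
+ *	ar.flags          = EXT4_MB_HINT_DATA | EXT4_MB_HINT_RESERVED |
+ *			    EXT4_MB_HINT_NOPREALLOC;
+ *	ar.goal           = goal ?: ext4_ext_find_goal(dest_inode,
+ *						       dest_path, iblock);
+ */
+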
+/**
+ * ext4_defrag_alloc_blocks - Allocate contiguous blocks to temporary inode
+ *
+ * @handle: journal handle
+ * @org_inode: original inode
+ * @dest_inode: temporary inode for multiple block allocation
+ * @ar: allocation request for multiple block allocation
+ * @dest_path: path indicating the temporary inode's extent
+ * @newblock: start offset of contiguous blocks
+ *
+ * This function returns 0 if succeeded, otherwise returns an error value.
+ */
+static int
+ext4_defrag_alloc_blocks(handle_t *handle, struct inode *org_inode,
+		struct inode *dest_inode, struct ext4_allocation_request *ar,
+		struct ext4_ext_path *dest_path, ext4_fsblk_t *newblock)
+{
+	struct super_block *sb = org_inode->i_sb;
+	struct buffer_head *bh = NULL;
+	int err, i, credits = 0;
+
+	credits = ext4_ext_calc_credits_for_insert(dest_inode, dest_path);
+	err = ext4_ext_journal_restart(handle,
+			credits + EXT4_TRANS_META_BLOCKS);
+	if (err)
+		return err;
+
+	*newblock = ext4_mb_new_blocks(handle, ar, &err);
+	if (err)
+		return err;
+
+	/*
+	 * A dirty buffer_head would cause an overwrite
+	 * if ext4_mb_new_blocks() allocates a block
+	 * which used to be a metadata block.
+	 * We should call unmap_underlying_metadata()
+	 * to clear the dirty flag.
+	 */
+	for (i = 0; i < ar->len; i++) {
+		bh = sb_find_get_block(sb, *newblock + i);
+		unmap_underlying_metadata(sb->s_bdev, *newblock + i);
+	}
+
+	return err;
+}
+
+/**
+ * ext4_defrag_check_phase
+ *	- Check the condition of allocated blocks (force defrag mode only)
+ *
+ * @ar: allocation request for multiple block allocation
+ * @dest_grp_no: block group number of the allocated blocks
+ * @goal_grp_no: block group number of the block allocation destination
+ * @alloc_total: sum total of the allocated blocks
+ * @req_blocks: contiguous blocks count we need
+ * @phase: phase of the force defrag mode
+ *
+ * This function returns 0 if succeeded, otherwise returns an error value.
+ */
+static int
+ext4_defrag_check_phase(struct ext4_allocation_request *ar,
+		ext4_group_t dest_grp_no, ext4_group_t goal_grp_no,
+		ext4_fsblk_t alloc_total, ext4_lblk_t req_blocks,
+		int phase)
+{
+	int err = 0;
+
+	switch (phase) {
+	case DEFRAG_FORCE_TRY:
+		/* If there is not enough space, return -ENOSPC. */
+		if (ar->len != req_blocks)
+			/* -ENOSPC triggers the DEFRAG_FORCE_VICTIM phase. */
+			err = -ENOSPC;
+		break;
+	case DEFRAG_FORCE_VICTIM:
+		/* We can't allocate new blocks in the same block group. */
+		if (dest_grp_no == ar->excepted_group) {
+			printk(KERN_ERR "ext4 defrag: Failed to allocate"
+				" victim file to other block group\n");
+			err = -ENOSPC;
+		}
+		break;
+	case DEFRAG_FORCE_GATHER:
+		/* The reserved blocks may already be in use by another process. */
+		if (dest_grp_no != goal_grp_no
+			|| alloc_total != req_blocks) {
+			printk(KERN_ERR "ext4 defrag: Reserved blocks are"
+				" already used by other process\n");
+			err = -EIO;
+		}
+		break;
+	}
+
+	return err;
+}
+
+/**
+ * ext4_defrag_partial - Defrag a file per page
+ *
+ * @tmp_inode: temporary inode
+ * @filp: pointer to file
+ * @org_offset: page index on the original file
+ * @dest_offset: page index on the temporary file
+ * @phase: phase of the force defrag mode
+ *
+ * This function returns 0 if succeeded, otherwise returns an error value.
+ */
+static int
+ext4_defrag_partial(struct inode *tmp_inode, struct file *filp,
+			pgoff_t org_offset, pgoff_t dest_offset, int phase)
+{
+	struct inode *org_inode = filp->f_dentry->d_inode;
+	struct address_space *mapping = org_inode->i_mapping;
+	struct buffer_head *bh;
+	struct page *page;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	handle_t *handle;
+	pgoff_t offset_in_page = PAGE_SIZE;
+	int ret, i, jblocks, blocks_per_page;
+	int blocksize = org_inode->i_sb->s_blocksize;
+	long long offs = org_offset << PAGE_CACHE_SHIFT;
+	unsigned long blk_off = 0;
+	unsigned int w_flags = 0;
+	void *fsdata;
+
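+	/*
+	 * Illustrative credit arithmetic: ext4_writepage_trans_blocks()
+	 * estimates the journal credits needed to rewrite one page worth
+	 * of blocks plus the metadata (bitmaps, group descriptors, etc.)
+	 * that may change with them; the factor of two below reserves
+	 * that worst case once for org_inode and once for tmp_inode.
+	 */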
+	/*
+	 * We need twice the amount of ordinary journal buffers because
+	 * inode and tmp_inode may each change different metadata blocks.
+	 */
+	jblocks = ext4_writepage_trans_blocks(org_inode) * 2;
+	handle = ext4_journal_start(org_inode, jblocks);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		return ret;
+	}
+
+	if (segment_eq(get_fs(), KERNEL_DS))
+		w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+	if (org_offset == ((org_inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
+		offset_in_page = (org_inode->i_size & (PAGE_CACHE_SIZE - 1));
+		/*
+		 * Set offset_in_page to PAGE_CACHE_SIZE so that it is not 0
+		 * when org_offset is the last page and i_size is a
+		 * multiple of PAGE_CACHE_SIZE.
+		 */
+		if (offset_in_page == 0)
+			offset_in_page = PAGE_CACHE_SIZE;
+	}
+
+	up_write(&EXT4_I(org_inode)->i_data_sem);
+	ret = a_ops->write_begin(filp, mapping, offs,
+				offset_in_page, w_flags, &page, &fsdata);
+	down_write(&EXT4_I(org_inode)->i_data_sem);
+
+	if (unlikely(ret < 0))
+		goto out;
+
+	if (!PageUptodate(page)) {
+		mapping->a_ops->readpage(filp, page);
+		lock_page(page);
+	}
+
+	/*
+	 * try_to_release_page() doesn't call releasepage in writeback mode.
+	 * We should care about the order of writing to the same file
+	 * by multiple defrag processes.
+	 * We need to call wait_on_page_writeback() to wait for the
+	 * writeback of the page.
+	 */
+	if (PageWriteback(page))
+		wait_on_page_writeback(page);
+
+	/* Release old bh and drop refs */
+	try_to_release_page(page, 0);
+	ret = ext4_defrag_replace_branches(handle, org_inode, tmp_inode,
+					org_offset, dest_offset, 1, phase);
+	if (ret < 0)
+		goto out;
+
+	/* Clear the inode cache so it does not refer to the old data */
+	ext4_ext_invalidate_cache(org_inode);
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << org_inode->i_blkbits, 0);
+
+	blocks_per_page = PAGE_SIZE / blocksize;
+	blk_off = org_offset * blocks_per_page;
+
+	bh = page_buffers(page);
+	for (i = 0; i < blocks_per_page; i++) {
+		up_write(&EXT4_I(org_inode)->i_data_sem);
+		ret = ext4_get_block(org_inode, blk_off++, bh, 0);
+		down_write(&EXT4_I(org_inode)->i_data_sem);
+
+		if (ret < 0)
+			goto out;
+
+		if (bh->b_this_page != NULL)
+			bh = bh->b_this_page;
+	}
+
+	ret = a_ops->write_end(filp, mapping, offs, offset_in_page,
+				offset_in_page, page, fsdata);
+
+	if (unlikely(ret < 0))
+		goto out;
+out:
+	ext4_journal_stop(handle);
+
+	return (ret < 0 ? ret : 0);
+}
+
+/**
+ * ext4_defrag_comp_ext_count - Check whether fragmentation is improved or not
+ *
+ * @org_inode: original inode
+ * @org_path: the structure holding some info about the
+ *	      original extent tree
+ * @tar_end: the last block number of the allocated blocks
+ * @sum_tmp: the extents count in the allocated blocks
+ * @goal: block offset for allocation
+ * @phase: phase of the force defrag mode
+ *
+ * This function returns the values as below.
+ *	0 (improved)
+ *	1 (not improved)
+ *	negative value (error case)
+ */
+static int
+ext4_defrag_comp_ext_count(struct inode *org_inode,
+			struct ext4_ext_path *org_path, ext4_lblk_t tar_end,
+			int sum_tmp, ext4_fsblk_t goal, int phase)
+{
+	struct ext4_extent *ext = NULL;
+	int depth = ext_depth(org_inode);
+	int last_extent = 0;
+	int sum_org = 0;
+	int ret = 0;
+
+	ext = org_path[depth].p_ext;
+
+	/*
+	 * Compare the number of newly allocated extents to
+	 * the number of existing ones.
+	 */
+	while (1) {
+		if (!last_extent)
+			++sum_org;
+		if (tar_end <= (le32_to_cpu(ext->ee_block) +
+				le16_to_cpu(ext->ee_len) - 1) ||
+				last_extent) {
+			/*
+			 * Fail if goal is not set and the fragmentation
+			 * is not improved.
+			 */
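+			/*
+			 * E.g. (hypothetical): a range that had sum_org = 5
+			 * extents rebuilt into sum_tmp = 2 extents is an
+			 * improvement (returns 0); 5 rebuilt into 5 with no
+			 * goal set returns 1; 5 into 6 returns -ENOSPC.
+			 */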
+			if (sum_org == sum_tmp && !goal) {
+				/* Not improved */
+				ret = 1;
+			} else if (sum_org < sum_tmp &&
+					phase != DEFRAG_FORCE_VICTIM) {
+				/* Fragmentation increased */
+				ret = -ENOSPC;
+				printk(KERN_ERR "ext4 defrag: "
+					"Insufficient free blocks\n");
+			}
+			break;
+		}
+		last_extent =
+			ext4_defrag_next_extent(org_inode, org_path, &ext);
+		if (last_extent < 0) {
+			ret = last_extent;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * ext4_defrag_new_extent_tree - Get contiguous blocks and build an extent tree
+ *
+ * @org_inode: original inode
+ * @tmp_inode: temporary inode
+ * @org_path: path indicating the original inode's extent
+ * @tar_start: starting offset to allocate in blocks
+ * @tar_blocks: the number of blocks to allocate
+ * @iblock: file-related offset
+ * @goal: block offset for allocation
+ * @phase: phase of the force defrag mode
+ *
+ * This function returns the values as below:
+ *	0 (succeeded)
+ *	1 (not improved)
+ *	negative value (error case)
+ */
+static int
+ext4_defrag_new_extent_tree(struct inode *org_inode, struct inode *tmp_inode,
+			struct ext4_ext_path *org_path, ext4_lblk_t tar_start,
+			ext4_lblk_t tar_blocks, ext4_lblk_t iblock,
+			ext4_fsblk_t goal, int phase)
+{
+	handle_t *handle;
+	struct ext4_extent_header *eh = NULL;
+	struct ext4_allocation_request ar;
+	struct ext4_ext_path *dest_path = NULL;
+	struct ext4_extent newex;
+	ext4_fsblk_t alloc_total = 0;
+	ext4_fsblk_t newblock = 0;
+	ext4_lblk_t tar_end = tar_start + tar_blocks - 1;
+	ext4_group_t dest_group_no, goal_group_no;
+	ext4_grpblk_t dest_blk_off, goal_blk_off;
+	int sum_tmp = 0;
+	int metadata = 1;
+	int ret, ret2;
+
+	eh = ext_inode_hdr(tmp_inode);
+	eh->eh_depth = 0;
+
+	dest_path = ext4_ext_find_extent(tmp_inode, iblock, NULL);
+	if (IS_ERR(dest_path)) {
+		ret = PTR_ERR(dest_path);
+		dest_path = NULL;
+		goto out2;
+	}
+
+	/* Fill struct ext4_allocation_request with necessary info */
+	ext4_defrag_fill_ar(org_inode, tmp_inode, &ar, org_path,
+				dest_path, tar_blocks, iblock, goal, phase);
+
+	handle = ext4_journal_start(tmp_inode, 0);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out2;
+	}
+
+	ext4_get_group_no_and_offset(tmp_inode->i_sb, goal,
+				&goal_group_no, &goal_blk_off);
+
+	while (alloc_total != tar_blocks) {
+		/* Allocate blocks */
+		ret = ext4_defrag_alloc_blocks(handle, org_inode, tmp_inode,
+						&ar, dest_path, &newblock);
+		if (ret < 0)
+			goto out;
+
+		ext4_get_group_no_and_offset(tmp_inode->i_sb, newblock,
+					&dest_group_no, &dest_blk_off);
+
+		alloc_total += ar.len;
+
+		/* The checks done only in force mode */
+		if (phase) {
+			ret = ext4_defrag_check_phase(&ar, dest_group_no,
+						goal_group_no, alloc_total,
+						tar_blocks, phase);
+			if (ret < 0)
+				goto out;
+		}
+
+		newex.ee_block = cpu_to_le32(alloc_total - ar.len);
+		ext4_ext_store_pblock(&newex, newblock);
+		newex.ee_len = cpu_to_le16(ar.len);
+
+		ret = ext4_ext_insert_extent(handle, tmp_inode,
+						dest_path, &newex);
+		if (ret < 0)
+			goto out;
+
+		if (!phase)
+			ar.goal = newblock + ar.len;
+		ar.len = tar_blocks - alloc_total;
+		sum_tmp++;
+	}
+
+	ret = ext4_defrag_comp_ext_count(org_inode, org_path, tar_end,
+					sum_tmp, goal, phase);
+
+out:
+	if (ret < 0 || ret == 1) {
+		if (ar.len)
+			ext4_free_blocks(handle, tmp_inode, newblock, ar.len,
+						metadata);
+		/* Failed case: we have to remove the halfway-allocated blocks */
+		ret2 = ext4_ext_remove_space(tmp_inode, 0);
+		if (ret2) {
+			printk(KERN_ERR "ext4 defrag: "
+				"Failed to remove temporary inode blocks\n");
+			ret = ret2;
+		}
+	}
+
+	ext4_journal_stop(handle);
+
+out2:
+	if (dest_path) {
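+		/* Drop the references taken by ext4_ext_find_extent() */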
+		ext4_ext_drop_refs(dest_path);
+		kfree(dest_path);
+	}
+
+	return ret;
+}
+
+/**
+ * ext4_defrag_check - Check the environment whether a defrag can be done
+ *
+ * @org_inode: original inode
+ * @ext: extent to be moved (force defrag mode only)
+ * @defrag_size: size of defrag in blocks
+ * @goal: pointer to block offset for allocation
+ * @phase: phase of the force defrag mode
+ *
+ * This function returns 0 if succeeded, otherwise returns an error value.
+ */
+static int
+ext4_defrag_check(struct inode *org_inode, struct ext4_extent_data *ext,
+		ext4_lblk_t defrag_size, ext4_fsblk_t *goal, int *phase)
+{
+	/* ext4 online defrag supports only 4KB block size */
+	if (org_inode->i_sb->s_blocksize != DEFRAG_BLOCK_SIZE) {
+		printk(KERN_ERR "ext4 defrag: ext4 online defrag supports "
+			"only 4KB block size for the moment.\n");
+		return -EOPNOTSUPP;
+	}
+
+	/* ext4 online defrag needs the mballoc mount option. */
+	if (!test_opt(org_inode->i_sb, MBALLOC)) {
+		printk(KERN_ERR "ext4 defrag: multiblock allocation "
+			"is disabled\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (ext->len) {
+		/* Setup for the force defrag mode */
+		if (ext->len < defrag_size) {
+			printk(KERN_ERR "ext4 defrag: "
+				"Invalid length of extent\n");
+			return -EINVAL;
+		}
+		*phase = DEFRAG_FORCE_GATHER;
+		*goal = ext->start;
+	}
+
+	return 0;
+}
+
+/**
+ * ext4_defrag_init_tmp_inode - Create a temporary inode
+ *
+ * @org_inode: original inode
+ *
+ * This function returns a pointer to the struct inode if succeeded,
+ * otherwise returns an error value.
+ */
+static struct inode *
+ext4_defrag_init_tmp_inode(struct inode *org_inode)
+{
+	handle_t *handle;
+	struct inode *tmp_inode;
+
+	handle = ext4_journal_start(org_inode,
+		EXT4_DATA_TRANS_BLOCKS(org_inode->i_sb) +
+		EXT4_INDEX_EXTRA_TRANS_BLOCKS + 4 +
+		2 * EXT4_QUOTA_INIT_BLOCKS(org_inode->i_sb));
+	if (IS_ERR(handle))
+		/* Return error code */
+		return (struct inode *)handle;
+
+	tmp_inode = ext4_new_inode(handle,
+			org_inode->i_sb->s_root->d_inode, S_IFREG);
+	if (IS_ERR(tmp_inode))
+		goto out;
+
+	i_size_write(tmp_inode, i_size_read(org_inode));
+	tmp_inode->i_nlink = 0;
+	ext4_ext_tree_init(handle, tmp_inode);
+	ext4_orphan_add(handle, tmp_inode);
+
+out:
+	ext4_journal_stop(handle);
+
+	return tmp_inode;
+}
+
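+/*
+ * A userspace defragmenter is expected to drive the entry point below
+ * through EXT4_IOC_DEFRAG, roughly as follows (illustrative sketch only;
+ * the file descriptor and sizes are hypothetical):
+ *
+ *	struct ext4_ext_defrag_data df;
+ *
+ *	memset(&df, 0, sizeof(df));
+ *	df.start_offset = 0;		// first block to defrag
+ *	df.defrag_size = 16384;		// 64MB worth of 4KB blocks
+ *	df.goal = 0;			// no explicit goal: normal mode
+ *	if (ioctl(fd, EXT4_IOC_DEFRAG, &df) < 0)
+ *		perror("EXT4_IOC_DEFRAG");
+ */
+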
+/**
+ * ext4_defrag - Defrag the specified range of a file
+ *
+ * If no option is specified, ext4_defrag() proceeds in the following order:
+ * 1. ext4_defrag() calculates the block number where the defrag terminates
+ *    from the start block number (block_start) and the size of the
+ *    defragged data (defrag_size) given as arguments.
+ *    If block_start points into a hole, the extent start offsets pointed
+ *    to by ext_cur (the current extent), holecheck_path and org_path are
+ *    set to the first extent behind the hole.
+ * 2. Repeat step 3 to step 5 until holecheck_path points to the last
+ *    extent or ext_cur exceeds block_end, the last logical block number.
+ * 3. To get the length of a contiguous area, call ext4_defrag_next_extent()
+ *    repeatedly with ext_cur (initially taken from holecheck_path) until
+ *    a discontiguous extent is found, the start logical block number
+ *    exceeds block_end, or the extent is the last extent.
+ * 4. After determining the length of the contiguous area, allocate that
+ *    many contiguous blocks to a temporary inode
+ *    with ext4_defrag_new_extent_tree().
+ * 5. Exchange the original inode data with the temporary inode data,
+ *    page by page, from page_offset to seq_end_page.
+ *    The start page indexes of the data are given as arguments:
+ *    page_offset for the original inode, dest_offset for the temporary one.
+ * 6. Update holecheck_path and org_path to point to the next extent to
+ *    process, and release the temporary inode holding the original
+ *    fragmented data. Then return to step 2.
+ * 7. Release holecheck_path, org_path and the temporary inode, and return
+ *    defrag_size, the size of the defragged data.
+ *    The defrag_size is used by the defrag command to calculate the file
+ *    offset where the next defrag processing starts.
+ *    (Since the defrag command calls defrag_ioctl() in 64MB units,
+ *    a file bigger than 64MB calls defrag_ioctl() multiple times.)
+ *
+ * @filp: pointer to file
+ * @block_start: starting offset to defrag in blocks
+ * @defrag_size: size of defrag in blocks
+ * @goal: block offset for allocation
+ * @phase: phase of the force defrag mode
+ * @ext: extent to be moved (force defrag mode only)
+ *
+ * This function returns the number of blocks if succeeded, otherwise
+ * returns an error value.
+ */
+int
+ext4_defrag(struct file *filp, ext4_lblk_t block_start,
+		ext4_lblk_t defrag_size, ext4_fsblk_t goal, int phase,
+		struct ext4_extent_data *ext)
+{
+	struct inode *org_inode = filp->f_dentry->d_inode, *tmp_inode = NULL;
+	struct ext4_ext_path *org_path = NULL, *holecheck_path = NULL;
+	struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
+	ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
+	pgoff_t page_offset, seq_end_page, dest_offset;
+	int ret, depth, seq_extents, last_extent = 0;
+
+	/* Check the filesystem environment whether defrag can be done */
+	ret = ext4_defrag_check(org_inode, ext, defrag_size, &goal, &phase);
+	if (ret < 0)
+		return ret;
+
+	file_end = (org_inode->i_size - 1) >> org_inode->i_blkbits;
+	block_end = block_start + defrag_size - 1;
+	if (file_end < block_end)
+		defrag_size -= block_end - file_end;
+
+	mutex_lock(&org_inode->i_mutex);
+	down_write(&EXT4_I(org_inode)->i_data_sem);
+
+	org_path = ext4_ext_find_extent(org_inode, block_start, NULL);
+	if (IS_ERR(org_path)) {
+		ret = PTR_ERR(org_path);
+		org_path = NULL;
+		goto out;
+	}
+
+	/* Get path structure to check the hole */
+	holecheck_path = ext4_ext_find_extent(org_inode, block_start, NULL);
+	if (IS_ERR(holecheck_path)) {
+		ret = PTR_ERR(holecheck_path);
+		holecheck_path = NULL;
+		goto out;
+	}
+
+	depth = ext_depth(org_inode);
+	ext_cur = holecheck_path[depth].p_ext;
+	if (ext_cur == NULL)
+		goto out;
+
+	/*
+	 * Get the proper extent whose ee_block is beyond block_start
+	 * if block_start was within a hole.
+	 */
+	if (le32_to_cpu(ext_cur->ee_block) +
+	    le16_to_cpu(ext_cur->ee_len) - 1 < block_start) {
+		last_extent = ext4_defrag_next_extent(org_inode,
+					holecheck_path, &ext_cur);
+		if (last_extent < 0) {
+			ret = last_extent;
+			goto out;
+		}
+		last_extent = ext4_defrag_next_extent(org_inode, org_path,
+							&ext_dummy);
+		if (last_extent < 0) {
+			ret = last_extent;
+			goto out;
+		}
+	}
+	seq_extents = 1;
+	seq_start = le32_to_cpu(ext_cur->ee_block);
+
+	/* No blocks within the specified range. */
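+	/*
+	 * (i.e. the first extent at or after block_start already starts
+	 * beyond block_end, so the requested range maps to a hole.)
+	 */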
+	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
+		printk(KERN_INFO "ext4 defrag: The specified range of file"
+			" may be the hole\n");
+		goto out;
+	}
+
+	/* Adjust start blocks */
+	add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
+			 le16_to_cpu(ext_cur->ee_len), block_end + 1) -
+		     max(le32_to_cpu(ext_cur->ee_block), block_start);
+
+	while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
+		seq_blocks += add_blocks;
+
+		/* Create a temporary inode to exchange data blocks with */
+		tmp_inode = ext4_defrag_init_tmp_inode(org_inode);
+		if (IS_ERR(tmp_inode)) {
+			ret = PTR_ERR(tmp_inode);
+			tmp_inode = NULL;
+			goto out;
+		}
+
+		/* Adjust tail blocks */
+		if (seq_start + seq_blocks - 1 > block_end)
+			seq_blocks = block_end - seq_start + 1;
+
+		ext_prev = ext_cur;
+		last_extent = ext4_defrag_next_extent(org_inode,
+					holecheck_path, &ext_cur);
+		if (last_extent < 0) {
+			ret = last_extent;
+			break;
+		}
+		if (!last_extent)
+			seq_extents++;
+		add_blocks = le16_to_cpu(ext_cur->ee_len);
+
+		/*
+		 * Extend the length of the contiguous area (seq_blocks)
+		 * if the extents are contiguous.
+		 */
+		if (le32_to_cpu(ext_prev->ee_block) +
+		    le16_to_cpu(ext_prev->ee_len) ==
+		    le32_to_cpu(ext_cur->ee_block) &&
+		    block_end >= le32_to_cpu(ext_cur->ee_block) &&
+		    !last_extent) {
+			if (tmp_inode) {
+				iput(tmp_inode);
+				tmp_inode = NULL;
+			}
+			continue;
+		}
+
+		/* Found an isolated block */
+		if (seq_extents == 1 && !goal) {
+			seq_start = le32_to_cpu(ext_cur->ee_block);
+			goto CLEANUP;
+		}
+
+		ret = ext4_defrag_new_extent_tree(org_inode, tmp_inode,
+					org_path, seq_start, seq_blocks,
+					block_start, goal, phase);
+
+		if (ret < 0) {
+			break;
+		} else if (ret == 1 && (!goal || (goal && !phase))) {
+			ret = 0;
+			seq_start = le32_to_cpu(ext_cur->ee_block);
+			goto CLEANUP;
+		}
+
+		page_offset = seq_start >>
+				(PAGE_CACHE_SHIFT - org_inode->i_blkbits);
+		dest_offset = 0;
+		seq_end_page = (seq_start + seq_blocks - 1) >>
+				(PAGE_CACHE_SHIFT - org_inode->i_blkbits);
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+
+		/*
+		 * Discard all preallocations.
+		 * This is a provisional solution.
+		 * When a true ext4_mb_return_to_preallocation() is
+		 * implemented, this will be removed.
+		 */
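+		/*
+		 * For the page-by-page exchange below: with 4KB blocks and
+		 * 4KB pages (the only supported geometry for now), block
+		 * and page indexes coincide, so e.g. seq_start = 100 and
+		 * seq_blocks = 16 walk pages 100 through 115 (hypothetical
+		 * numbers).
+		 */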
+		ext4_mb_discard_inode_preallocations(org_inode);
+
+		while (page_offset <= seq_end_page) {
+			/* Swap original branches with new branches */
+			ret = ext4_defrag_partial(tmp_inode, filp,
+					page_offset, dest_offset, phase);
+			if (ret < 0)
+				goto out;
+
+			page_offset++;
+			dest_offset++;
+		}
+
+		/* Decrease buffer counter */
+		if (holecheck_path)
+			ext4_ext_drop_refs(holecheck_path);
+		holecheck_path = ext4_ext_find_extent(org_inode,
+					seq_start, holecheck_path);
+		if (IS_ERR(holecheck_path)) {
+			ret = PTR_ERR(holecheck_path);
+			holecheck_path = NULL;
+			break;
+		}
+		depth = holecheck_path->p_depth;
+
+CLEANUP:
+		/* Decrease buffer counter */
+		if (org_path)
+			ext4_ext_drop_refs(org_path);
+		org_path = ext4_ext_find_extent(org_inode, seq_start,
+					org_path);
+		if (IS_ERR(org_path)) {
+			ret = PTR_ERR(org_path);
+			org_path = NULL;
+			break;
+		}
+
+		ext_cur = holecheck_path[depth].p_ext;
+		add_blocks = le16_to_cpu(ext_cur->ee_len);
+		seq_blocks = 0;
+		dest_offset = 0;
+		seq_extents = 1;
+
+		if (tmp_inode) {
+			iput(tmp_inode);
+			tmp_inode = NULL;
+		}
+	}
+
+out:
+	if (org_path) {
+		ext4_ext_drop_refs(org_path);
+		kfree(org_path);
+	}
+	if (holecheck_path) {
+		ext4_ext_drop_refs(holecheck_path);
+		kfree(holecheck_path);
+	}
+
+	if (phase == DEFRAG_FORCE_GATHER)
+		/* Release reserved block in force mode */
+		ext4_discard_reservation(org_inode);
+
+	up_write(&EXT4_I(org_inode)->i_data_sem);
+	mutex_unlock(&org_inode->i_mutex);
+
+	if (tmp_inode)
+		iput(tmp_inode);
+
+	return (ret ? ret : defrag_size);
+}
Index: linux-2.6/fs/ext4/ioctl.c
===================================================================
--- linux-2.6.orig/fs/ext4/ioctl.c	2008-06-05 13:44:20.526046257 -0500
+++ linux-2.6/fs/ext4/ioctl.c	2008-07-05 12:51:24.553290999 -0500
@@ -241,6 +241,16 @@ setversion_out:
 		return err;
 	}
+	case EXT4_IOC_FIBMAP:
+	case EXT4_IOC_DEFRAG:
+	case EXT4_IOC_GROUP_INFO:
+	case EXT4_IOC_FREE_BLOCKS_INFO:
+	case EXT4_IOC_EXTENTS_INFO:
+	case EXT4_IOC_RESERVE_BLOCK:
+	case EXT4_IOC_MOVE_VICTIM:
+	case EXT4_IOC_BLOCK_RELEASE: {
+		return ext4_defrag_ioctl(inode, filp, cmd, arg);
+	}
 	case EXT4_IOC_GROUP_ADD: {
 		struct ext4_new_group_data input;
 		struct super_block *sb = inode->i_sb;
Index: linux-2.6/fs/ext4/mballoc.h
===================================================================
--- linux-2.6.orig/fs/ext4/mballoc.h	2008-06-05 13:44:20.529046073 -0500
+++ linux-2.6/fs/ext4/mballoc.h	2008-07-05 12:51:24.579300477 -0500
@@ -205,6 +205,7 @@ struct ext4_allocation_context {
 	struct page *ac_buddy_page;
 	struct ext4_prealloc_space *ac_pa;
 	struct ext4_locality_group *ac_lg;
+	long long ac_excepted_group;
 };
 
 #define AC_STATUS_CONTINUE	1
Index: linux-2.6/include/linux/fiemap.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/include/linux/fiemap.h	2008-07-05 12:51:24.595291345 -0500
@@ -0,0 +1,49 @@
+/*
+ * FIEMAP ioctl infrastructure.
+ *
+ * Copyright (C) 2007 Cluster File Systems, Inc
+ *
+ * Author: Kalpak Shah
+ *	   Andreas Dilger
+ */
+
+#ifndef _LINUX_FIEMAP_H
+#define _LINUX_FIEMAP_H
+
+struct fiemap_extent {
+	__u64 fe_offset; /* offset in bytes for the start of the extent */
+	__u64 fe_length; /* length in bytes for the extent */
+	__u32 fe_flags;  /* returned FIEMAP_EXTENT_* flags for the extent */
+	__u32 fe_lun;    /* logical device number for extent (starting at 0) */
+};
+
+struct fiemap {
+	__u64 fm_start;		/* logical starting byte offset (in/out) */
+	__u64 fm_length;	/* logical length of map (in/out) */
+	__u32 fm_flags;		/* FIEMAP_FLAG_* flags for request (in/out) */
+	__u32 fm_extent_count;	/* number of extents in fm_extents (in/out) */
+	__u64 fm_end_offset;	/* logical offset of end of mapping in last ioctl (out) */
+	struct fiemap_extent fm_extents[0];
+};
+
+#define FIEMAP_FLAG_SYNC	0x00000001 /* sync file data before map */
+#define FIEMAP_FLAG_HSM_READ	0x00000002 /* get data from HSM before map */
+#define FIEMAP_FLAG_NUM_EXTENTS	0x00000004 /* return only number of extents */
+#define FIEMAP_FLAG_INCOMPAT	0xff000000 /* error for unknown flags in here */
+
+#define FIEMAP_FLAG_LUN_OFFSET	0x01000000 /* use lun offsets, instead of
+					    * logical file offsets */
+
+#define FIEMAP_EXTENT_HOLE	0x00000001 /* has no data or space allocation */
+#define FIEMAP_EXTENT_UNWRITTEN	0x00000002 /* space allocated, but no data */
+#define FIEMAP_EXTENT_UNKNOWN	0x00000004 /* in use, location unknown */
+#define FIEMAP_EXTENT_ERROR	0x00000008 /* mapping error, errno in fe_start */
+#define FIEMAP_EXTENT_NO_DIRECT	0x00000010 /* cannot access data directly */
+#define FIEMAP_EXTENT_LAST	0x00000020 /* last extent in the file */
+#define FIEMAP_EXTENT_DELALLOC	0x00000040 /* has data but not yet written,
+					    * must have EXTENT_UNKNOWN set */
+#define FIEMAP_EXTENT_SECONDARY	0x00000080 /* data (also) in secondary storage,
+					    * not in primary if EXTENT_UNKNOWN */
+#define FIEMAP_EXTENT_EOF	0x00000100 /* if fm_start+fm_len is beyond EOF */
+
+#endif /* _LINUX_FIEMAP_H */
Index: linux-2.6/fs/ioctl.c
===================================================================
--- linux-2.6.orig/fs/ioctl.c	2008-06-05 13:44:21.026983204 -0500
+++ linux-2.6/fs/ioctl.c	2008-07-05 12:51:24.629290501 -0500
@@ -71,6 +71,34 @@ static int ioctl_fibmap(struct file *fil
 	return put_user(res, p);
 }
 
+static int ioctl_fiemap(struct file *filp, unsigned long arg)
+{
+	struct fiemap fiemap_s;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	int error = 0;
+
+	if (!inode->i_op->fiemap)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&fiemap_s, (struct fiemap __user *)arg,
+			   sizeof(struct fiemap)))
+		return -EFAULT;
+
+	/* Need arg sanity checking:
+	 * start >= 0?  Must be; unsigned.
+	 * length > 0? (or is -1 valid?)
+	 * extent count non-zero if not FLAG_NUM_EXTENTS
+	 */
+
+	/* Should fs do this under a lock? */
+	if (fiemap_s.fm_flags & FIEMAP_FLAG_SYNC)
+		filemap_write_and_wait(inode->i_mapping);
+
+	error = inode->i_op->fiemap(inode, arg);
+
+	return error;
+}
+
 static int file_ioctl(struct file *filp, unsigned int cmd,
 		unsigned long arg)
 {
@@ -80,6 +108,8 @@ static int file_ioctl(struct file *filp,
 	switch (cmd) {
 	case FIBMAP:
 		return ioctl_fibmap(filp, p);
+	case FS_IOC_FIEMAP:
+		return ioctl_fiemap(filp, arg);
 	case FIGETBSZ:
 		return put_user(inode->i_sb->s_blocksize, p);
 	case FIONREAD: