Index: linux-2.6/fs/ext4/ext4.h =================================================================== --- linux-2.6.orig/fs/ext4/ext4.h 2008-06-05 13:44:20.521046407 -0500 +++ linux-2.6/fs/ext4/ext4.h 2008-07-05 12:51:24.698292740 -0500 @@ -22,7 +22,7 @@ #include "ext4_i.h" /* - * The second extended filesystem constants/structures + * The fourth extended filesystem constants/structures */ /* @@ -74,6 +74,9 @@ #define EXT4_MB_HINT_GOAL_ONLY 256 /* goal is meaningful */ #define EXT4_MB_HINT_TRY_GOAL 512 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 1024 + struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -94,6 +97,11 @@ struct ext4_allocation_request { unsigned long len; /* flags. see above EXT4_MB_HINT_* */ unsigned long flags; + /* + * for ext4 online defrag: + * the block group which is excepted from allocation target + */ + long long excepted_group; }; /* @@ -170,6 +178,15 @@ struct ext4_group_desc __u32 bg_reserved2[3]; }; +/* + * Structure of a flex block group info + */ + +struct flex_groups { + __u32 free_inodes; + __u32 free_blocks; +}; + #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ @@ -289,6 +306,14 @@ struct ext4_new_group_data { #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) #define EXT4_IOC_MIGRATE _IO('f', 7) +#define EXT4_IOC_FIBMAP _IOW('f', 9, ext4_fsblk_t) +#define EXT4_IOC_DEFRAG _IOW('f', 10, struct ext4_ext_defrag_data) +#define EXT4_IOC_GROUP_INFO _IOW('f', 11, struct ext4_group_data_info) +#define EXT4_IOC_FREE_BLOCKS_INFO _IOW('f', 12, struct ext4_extents_info) +#define EXT4_IOC_EXTENTS_INFO _IOW('f', 13, struct ext4_extents_info) +#define EXT4_IOC_RESERVE_BLOCK _IOW('f', 14, struct ext4_extents_info) +#define EXT4_IOC_MOVE_VICTIM _IOW('f', 15, struct ext4_extents_info) +#define EXT4_IOC_BLOCK_RELEASE _IO('f', 8) /* * ioctl commands in 32 bit emulation @@ -306,6 +331,57 @@ struct ext4_new_group_data { #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +/* + * Will go away. + * ext4 online defrag supports only 4KB block size. + */ +#define DEFRAG_BLOCK_SIZE 4096 + +/* + * The following four macros are used for the defrag force mode. 
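To make the force-mode flow concrete, here is how a user-space defragmenter might drive these phases. This is only an illustrative sketch: the ioctl number is the one defined above, the structure and DEFRAG_FORCE_* values are the ones defined just below, and the helper itself (its name, arguments, and the exact call sequence) is an assumption for illustration, not part of this patch.

	#include <string.h>
	#include <sys/ioctl.h>

	/* hypothetical helper: fd is an open file on an ext4 filesystem */
	static int defrag_force(int fd, unsigned int nblocks,
				unsigned long long goal)
	{
		struct ext4_ext_defrag_data d;

		memset(&d, 0, sizeof(d));
		d.start_offset = 0;		/* defragment from the file's start */
		d.defrag_size = nblocks;	/* length to move, in blocks */
		d.goal = goal;			/* preferred destination block */

		d.flag = DEFRAG_FORCE_TRY;	/* phase 1: probe for usable free space */
		if (ioctl(fd, EXT4_IOC_DEFRAG, &d) == 0)
			return 0;

		d.flag = DEFRAG_FORCE_VICTIM;	/* phase 2: move victim extents away */
		if (ioctl(fd, EXT4_IOC_DEFRAG, &d) < 0)
			return -1;

		d.flag = DEFRAG_FORCE_GATHER;	/* phase 3: pull the file into the hole */
		return ioctl(fd, EXT4_IOC_DEFRAG, &d);
	}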
+ * + * DEFRAG_MAX_ENT: the maximum number of extents for exchanging between + * kernel-space and user-space per an ioctl + * DEFRAG_FORCE_TRY: check whether we have free space fragmentation or not + * DEFRAG_FORCE_VICTIM: move victim extents to make sufficient space + * DEFRAG_FORCE_GATHER: move the target file into the free space made in the + * DEFRAG_FORCE_VICTIM phase + */ +#define DEFRAG_MAX_ENT 32 +#define DEFRAG_FORCE_TRY 1 +#define DEFRAG_FORCE_VICTIM 2 +#define DEFRAG_FORCE_GATHER 3 + +struct ext4_extent_data { + ext4_lblk_t block; /* start logical block number */ + ext4_fsblk_t start; /* start physical block number */ + int len; /* blocks count */ +}; + +struct ext4_ext_defrag_data { + ext4_lblk_t start_offset; /* start offset to defrag in blocks */ + ext4_lblk_t defrag_size; /* size of defrag in blocks */ + ext4_fsblk_t goal; /* block offset for allocation */ + int flag; /* free space mode flag */ + struct ext4_extent_data ext; +}; + +struct ext4_group_data_info { + int s_blocks_per_group; /* blocks per group */ + int s_inodes_per_group; /* inodes per group */ +}; + +struct ext4_extents_info { + unsigned long long ino; /* inode number */ + int max_entries; /* maximum extents count */ + int entries; /* extent number/count */ + ext4_lblk_t f_offset; /* file offset */ + ext4_grpblk_t g_offset; /* group offset */ + ext4_fsblk_t goal; /* block offset for allocation */ + struct ext4_extent_data ext[DEFRAG_MAX_ENT]; +}; + +#define EXT4_TRANS_META_BLOCKS 4 /* bitmap + group desc + sb + inode */ /* * Mount options @@ -383,6 +459,8 @@ struct ext4_inode { __le32 i_version_hi; /* high 32 bits for 64-bit version */ }; +#define EXT4_FIEMAP_FLAG_INCOMPAT_UNSUPP (FIEMAP_FLAG_INCOMPAT & \ + ~(FIEMAP_FLAG_LUN_OFFSET)) #define EXT4_EPOCH_BITS 2 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) @@ -527,6 +605,7 @@ do { \ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt @@ -647,7 +726,10 @@ struct ext4_super_block { __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ - __u32 s_reserved[163]; /* Padding to the end of the block */ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_reserved_char_pad2; + __le16 s_reserved_pad; + __u32 s_reserved[162]; /* Padding to the end of the block */ }; #ifdef __KERNEL__ @@ -958,12 +1040,17 @@ extern ext4_grpblk_t ext4_block_group_of extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); -extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode, +extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, int *errp); -extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode, +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); -extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, +extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t 
iblock, ext4_fsblk_t goal, + unsigned long *count, int *errp); +extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); +extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, + ext4_fsblk_t nblocks); extern void ext4_free_blocks (handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int metadata); extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, @@ -977,6 +1064,17 @@ extern struct ext4_group_desc * ext4_get extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); extern void ext4_init_block_alloc_info(struct inode *); extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv); +extern void try_to_extend_reservation(struct ext4_reserve_window_node *, + struct super_block *, int); +extern int alloc_new_reservation(struct ext4_reserve_window_node *, + ext4_grpblk_t, struct super_block *, + ext4_group_t, struct buffer_head *); +extern ext4_grpblk_t bitmap_search_next_usable_block(ext4_grpblk_t, + struct buffer_head *, ext4_grpblk_t); +extern int rsv_is_empty(struct ext4_reserve_window *rsv); +extern int goal_in_my_reservation(struct ext4_reserve_window *rsv, + ext4_grpblk_t grp_goal, ext4_group_t group, + struct super_block *sb); /* dir.c */ extern int ext4_check_dir_entry(const char *, struct inode *, @@ -986,6 +1084,7 @@ extern int ext4_htree_store_dirent(struc __u32 minor_hash, struct ext4_dir_entry_2 *dirent); extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern sector_t ext4_bmap(struct address_space *mapping, sector_t block); /* fsync.c */ extern int ext4_sync_file (struct file *, struct dentry *, int); @@ -1016,9 +1115,14 @@ extern int __init init_ext4_mballoc(void extern void exit_ext4_mballoc(void); extern void ext4_mb_free_blocks(handle_t *, struct inode *, unsigned long, unsigned long, int, unsigned long *); +extern int ext4_mb_add_more_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern void ext4_mb_update_group_info(struct ext4_group_info *grp, + ext4_grpblk_t add); /* inode.c */ +void ext4_da_release_space(struct inode *inode, int used, int to_free); int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); struct buffer_head *ext4_getblk(handle_t *, struct inode *, @@ -1044,8 +1148,11 @@ extern void ext4_set_inode_flags(struct extern void ext4_get_inode_flags(struct ext4_inode_info *); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); -extern int ext4_block_truncate_page(handle_t *handle, struct page *page, +extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); +extern int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); @@ -1094,6 +1201,14 @@ extern void ext4_inode_bitmap_set(struct struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); +/* extents.c */ +extern int ext4_ext_journal_restart(handle_t *handle, int needed); +/* defrag.c */ +extern int ext4_defrag(struct file *filp, ext4_lblk_t block_start, + ext4_lblk_t defrag_size, ext4_fsblk_t goal, + int flag, struct 
ext4_extent_data *ext); +extern int ext4_defrag_ioctl(struct inode *, struct file *, unsigned int, + unsigned long); static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { @@ -1159,6 +1274,17 @@ struct ext4_group_info *ext4_get_group_i } +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ @@ -1191,7 +1317,7 @@ extern int ext4_ext_get_blocks(handle_t ext4_lblk_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, int create, int extend_disksize); -extern void ext4_ext_truncate(struct inode *, struct page *); +extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, @@ -1199,7 +1325,8 @@ extern long ext4_fallocate(struct inode extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned long max_blocks, struct buffer_head *bh, int create, - int extend_disksize); + int extend_disksize, int flag); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start); #endif /* __KERNEL__ */ #endif /* _EXT4_H */ Index: linux-2.6/fs/ext4/ext4_i.h =================================================================== --- linux-2.6.orig/fs/ext4/ext4_i.h 2008-06-05 13:44:20.522046081 -0500 +++ linux-2.6/fs/ext4/ext4_i.h 2008-07-05 12:51:23.558290927 -0500 @@ -79,7 +79,7 @@ struct ext4_ext_cache { }; /* - * third extended file system inode data in memory + * fourth extended file system inode data in memory */ struct ext4_inode_info { __le32 i_data[15]; /* unconverted */ @@ -150,6 +150,7 @@ struct ext4_inode_info { */ struct rw_semaphore i_data_sem; struct inode vfs_inode; + struct jbd2_inode jinode; unsigned long i_ext_generation; struct ext4_ext_cache i_cached_extent; @@ -162,6 +163,13 @@ struct ext4_inode_info { /* mballoc */ struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; + + /* allocation reservation info for delalloc */ + unsigned long i_reserved_data_blocks; + unsigned long i_reserved_meta_blocks; + unsigned long i_allocated_meta_blocks; + unsigned short i_delalloc_reserved_flag; + spinlock_t i_block_reservation_lock; }; #endif /* _EXT4_I */ Index: linux-2.6/fs/ext4/ext4_sb.h =================================================================== --- linux-2.6.orig/fs/ext4/ext4_sb.h 2008-06-05 13:44:20.522046081 -0500 +++ linux-2.6/fs/ext4/ext4_sb.h 2008-07-05 12:51:22.196357153 -0500 @@ -25,7 +25,7 @@ #include /* - * third extended-fs super-block data in memory + * fourth extended-fs super-block data in memory */ struct ext4_sb_info { unsigned long s_desc_size; /* Size of a group descriptor in bytes */ @@ -143,6 +143,9 @@ struct ext4_sb_info { /* locality groups */ struct ext4_locality_group *s_locality_groups; + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; }; #endif /* _EXT4_SB */ Index: linux-2.6/fs/ext4/mballoc.c =================================================================== --- linux-2.6.orig/fs/ext4/mballoc.c 2008-06-16 11:34:11.869008051 -0500 +++ linux-2.6/fs/ext4/mballoc.c 2008-07-05 12:51:24.558291107 -0500 @@ -803,6 +803,7 @@ static int ext4_mb_init_cache(struct pag if (!buffer_uptodate(bh[i])) goto out; + err = 0; first_block = 
page->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { int group; @@ -883,6 +884,7 @@ ext4_mb_load_buddy(struct super_block *s int pnum; int poff; struct page *page; + int ret; mb_debug("load group %lu\n", group); @@ -914,15 +916,21 @@ ext4_mb_load_buddy(struct super_block *s if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { - ext4_mb_init_cache(page, NULL); + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } mb_cmp_bitmaps(e4b, page_address(page) + (poff * sb->s_blocksize)); } unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; goto err; + } e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); mark_page_accessed(page); @@ -938,14 +946,20 @@ ext4_mb_load_buddy(struct super_block *s page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (page) { BUG_ON(page->mapping != inode->i_mapping); - if (!PageUptodate(page)) - ext4_mb_init_cache(page, e4b->bd_bitmap); - + if (!PageUptodate(page)) { + ret = ext4_mb_init_cache(page, e4b->bd_bitmap); + if (ret) { + unlock_page(page); + goto err; + } + } unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; goto err; + } e4b->bd_buddy_page = page; e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); mark_page_accessed(page); @@ -962,7 +976,7 @@ err: page_cache_release(e4b->bd_buddy_page); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; - return -EIO; + return ret; } static void ext4_mb_release_desc(struct ext4_buddy *e4b) @@ -1730,10 +1744,6 @@ ext4_mb_regular_allocator(struct ext4_al ac->ac_g_ex.fe_start = sbi->s_mb_last_start; spin_unlock(&sbi->s_md_lock); } - - /* searching for the right group start from the goal value specified */ - group = ac->ac_g_ex.fe_group; - /* Let's just scan groups to find more-less suitable blocks */ cr = ac->ac_2order ? 0 : 1; /* @@ -1743,6 +1753,12 @@ ext4_mb_regular_allocator(struct ext4_al repeat: for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { ac->ac_criteria = cr; + /* + * searching for the right group start + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; + for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { struct ext4_group_info *grp; struct ext4_group_desc *desc; @@ -1750,6 +1766,10 @@ repeat: if (group == EXT4_SB(sb)->s_groups_count) group = 0; + if (ac->ac_excepted_group != -1 && + group == ac->ac_excepted_group) + continue; + /* quick check to skip empty groups */ grp = ext4_get_group_info(ac->ac_sb, group); if (grp->bb_free == 0) @@ -1963,6 +1983,8 @@ static int ext4_mb_seq_history_open(stru int rc; int size; + if (unlikely(sbi->s_mb_history == NULL)) + return -ENOMEM; s = kmalloc(sizeof(*s), GFP_KERNEL); if (s == NULL) return -ENOMEM; @@ -2165,9 +2187,7 @@ static void ext4_mb_history_init(struct sbi->s_mb_history_cur = 0; spin_lock_init(&sbi->s_mb_history_lock); i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); - sbi->s_mb_history = kmalloc(i, GFP_KERNEL); - if (likely(sbi->s_mb_history != NULL)) - memset(sbi->s_mb_history, 0, i); + sbi->s_mb_history = kzalloc(i, GFP_KERNEL); /* if we can't allocate history, then we simple won't use it */ } @@ -2215,21 +2235,192 @@ ext4_mb_store_history(struct ext4_alloca #define ext4_mb_history_init(sb) #endif + +/* Create and initialize ext4_group_info data for the given group. 
*/ +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *desc) +{ + int i, len; + int metalen = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info **meta_group_info; + + /* + * First check if this group is the first of a reserved block. + * If it's true, we have to allocate a new table of pointers + * to ext4_group_info structures + */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + metalen = sizeof(*meta_group_info) << + EXT4_DESC_PER_BLOCK_BITS(sb); + meta_group_info = kmalloc(metalen, GFP_KERNEL); + if (meta_group_info == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate mem for a " + "buddy group\n"); + goto exit_meta_group_info; + } + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = + meta_group_info; + } + + /* + * calculate needed size. if change bb_counters size, + * don't forget about ext4_mb_generate_buddy() + */ + len = offsetof(typeof(**meta_group_info), + bb_counters[sb->s_blocksize_bits + 2]); + + meta_group_info = + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); + + meta_group_info[i] = kzalloc(len, GFP_KERNEL); + if (meta_group_info[i] == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); + goto exit_group_info; + } + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, + &(meta_group_info[i]->bb_state)); + + /* + * initialize bb_free to be able to skip + * empty groups without initialization + */ + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + meta_group_info[i]->bb_free = + ext4_free_blocks_after_init(sb, group, desc); + } else { + meta_group_info[i]->bb_free = + le16_to_cpu(desc->bg_free_blocks_count); + } + + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + +#ifdef DOUBLE_CHECK + { + struct buffer_head *bh; + meta_group_info[i]->bb_bitmap = + kmalloc(sb->s_blocksize, GFP_KERNEL); + BUG_ON(meta_group_info[i]->bb_bitmap == NULL); + bh = ext4_read_block_bitmap(sb, group); + BUG_ON(bh == NULL); + memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, + sb->s_blocksize); + put_bh(bh); + } +#endif + + return 0; + +exit_group_info: + /* If a meta_group_info table has been allocated, release it now */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) + kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); +exit_meta_group_info: + return -ENOMEM; +} /* ext4_mb_add_groupinfo */ + +/* + * Add a group to the existing groups. 
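+ *
+ * (For orientation: the two-level lookup that ext4_mb_add_groupinfo() above
+ * fills in is the same one ext4_get_group_info() performs, roughly
+ *	sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]
+ *			 [group & (EXT4_DESC_PER_BLOCK(sb) - 1)]
+ * i.e. one table of ext4_group_info pointers per descriptor block, then an
+ * index within that table.)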
+ * This function is used for online resize.
+ */
+int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
+			       struct ext4_group_desc *desc)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct inode *inode = sbi->s_buddy_cache;
+	int blocks_per_page;
+	int block;
+	int pnum;
+	struct page *page;
+	int err;
+
+	/* Add group based on group descriptor */
+	err = ext4_mb_add_groupinfo(sb, group, desc);
+	if (err)
+		return err;
+
+	/*
+	 * Cache pages containing dynamic mb_alloc data (buddy and bitmap
+	 * data) are set not up to date so that they will be re-initialized
+	 * during the next call to ext4_mb_load_buddy
+	 */
+
+	/* Set buddy page as not up to date */
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	page = find_get_page(inode->i_mapping, pnum);
+	if (page != NULL) {
+		ClearPageUptodate(page);
+		page_cache_release(page);
+	}
+
+	/* Set bitmap page as not up to date */
+	block++;
+	pnum = block / blocks_per_page;
+	page = find_get_page(inode->i_mapping, pnum);
+	if (page != NULL) {
+		ClearPageUptodate(page);
+		page_cache_release(page);
+	}
+
+	return 0;
+}
+
+/*
+ * Update an existing group.
+ * This function is used for online resize.
+ */
+void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
+{
+	grp->bb_free += add;
+}
+
 static int ext4_mb_init_backend(struct super_block *sb)
 {
 	ext4_group_t i;
-	int j, len, metalen;
+	int metalen;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	int num_meta_group_infos =
-		(sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
-			EXT4_DESC_PER_BLOCK_BITS(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int num_meta_group_infos;
+	int num_meta_group_infos_max;
+	int array_size;
 	struct ext4_group_info **meta_group_info;
+	struct ext4_group_desc *desc;
+
+	/* This is the number of blocks used by GDT */
+	num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+				1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
+
+	/*
+	 * This is the total number of blocks used by GDT including
+	 * the number of reserved blocks for GDT.
+	 * The s_group_info array is allocated with this value
+	 * to allow a clean online resize without a complex
+	 * manipulation of pointers.
+	 * The drawback is the unused memory when no resize
+	 * occurs, but it's very low in terms of pages
+	 * (see comments below).
+	 * Need to handle this properly when META_BG resizing is allowed
+	 */
+	num_meta_group_infos_max = num_meta_group_infos +
+		le16_to_cpu(es->s_reserved_gdt_blocks);
+
+	/*
+	 * array_size is the size of s_group_info array. We round it
+	 * to the next power of two because this approximation is done
+	 * internally by kmalloc so we can have some more memory
+	 * for free here (e.g. may be used for META_BG resize).
+	 */
+	array_size = 1;
+	while (array_size < sizeof(*sbi->s_group_info) *
+	       num_meta_group_infos_max)
+		array_size = array_size << 1;
 	/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
 	 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
 	 * So a two level scheme suffices for now. */
-	sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
-				    num_meta_group_infos, GFP_KERNEL);
+	sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
 	if (sbi->s_group_info == NULL) {
 		printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
 		return -ENOMEM;
@@ -2256,63 +2447,15 @@ static int ext4_mb_init_backend(struct s
 		sbi->s_group_info[i] = meta_group_info;
 	}
 
-	/*
-	 * calculate needed size.
if change bb_counters size, - * don't forget about ext4_mb_generate_buddy() - */ - len = sizeof(struct ext4_group_info); - len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); for (i = 0; i < sbi->s_groups_count; i++) { - struct ext4_group_desc *desc; - - meta_group_info = - sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)]; - j = i & (EXT4_DESC_PER_BLOCK(sb) - 1); - - meta_group_info[j] = kzalloc(len, GFP_KERNEL); - if (meta_group_info[j] == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); - goto err_freebuddy; - } desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { printk(KERN_ERR "EXT4-fs: can't read descriptor %lu\n", i); - i++; goto err_freebuddy; } - memset(meta_group_info[j], 0, len); - set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, - &(meta_group_info[j]->bb_state)); - - /* - * initialize bb_free to be able to skip - * empty groups without initialization - */ - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - meta_group_info[j]->bb_free = - ext4_free_blocks_after_init(sb, i, desc); - } else { - meta_group_info[j]->bb_free = - le16_to_cpu(desc->bg_free_blocks_count); - } - - INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list); - -#ifdef DOUBLE_CHECK - { - struct buffer_head *bh; - meta_group_info[j]->bb_bitmap = - kmalloc(sb->s_blocksize, GFP_KERNEL); - BUG_ON(meta_group_info[j]->bb_bitmap == NULL); - bh = read_block_bitmap(sb, i); - BUG_ON(bh == NULL); - memcpy(meta_group_info[j]->bb_bitmap, bh->b_data, - sb->s_blocksize); - put_bh(bh); - } -#endif - + if (ext4_mb_add_groupinfo(sb, i, desc) != 0) + goto err_freebuddy; } return 0; @@ -2336,6 +2479,7 @@ int ext4_mb_init(struct super_block *sb, unsigned i; unsigned offset; unsigned max; + int ret; if (!test_opt(sb, MBALLOC)) return 0; @@ -2370,12 +2514,12 @@ int ext4_mb_init(struct super_block *sb, } while (i <= sb->s_blocksize_bits + 1); /* init file for buddy data */ - i = ext4_mb_init_backend(sb); - if (i) { + ret = ext4_mb_init_backend(sb); + if (ret != 0) { clear_opt(sbi->s_mount_opt, MBALLOC); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); - return i; + return ret; } spin_lock_init(&sbi->s_md_lock); @@ -2575,25 +2719,24 @@ ext4_mb_free_committed_blocks(struct sup -#define MB_PROC_VALUE_READ(name) \ -static int ext4_mb_read_##name(char *page, char **start, \ - off_t off, int count, int *eof, void *data) \ +#define MB_PROC_FOPS(name) \ +static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \ { \ - struct ext4_sb_info *sbi = data; \ - int len; \ - *eof = 1; \ - if (off != 0) \ - return 0; \ - len = sprintf(page, "%ld\n", sbi->s_mb_##name); \ - *start = page; \ - return len; \ -} - -#define MB_PROC_VALUE_WRITE(name) \ -static int ext4_mb_write_##name(struct file *file, \ - const char __user *buf, unsigned long cnt, void *data) \ + struct ext4_sb_info *sbi = m->private; \ + \ + seq_printf(m, "%ld\n", sbi->s_mb_##name); \ + return 0; \ +} \ + \ +static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\ +{ \ + return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\ +} \ + \ +static ssize_t ext4_mb_##name##_proc_write(struct file *file, \ + const char __user *buf, size_t cnt, loff_t *ppos) \ { \ - struct ext4_sb_info *sbi = data; \ + struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\ char str[32]; \ long value; \ if (cnt >= sizeof(str)) \ @@ -2605,31 +2748,32 @@ static int ext4_mb_write_##name(struct f return -ERANGE; \ sbi->s_mb_##name = value; \ return cnt; \ -} +} \ + \ +static const struct file_operations 
ext4_mb_##name##_proc_fops = { \ + .owner = THIS_MODULE, \ + .open = ext4_mb_##name##_proc_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ + .write = ext4_mb_##name##_proc_write, \ +}; -MB_PROC_VALUE_READ(stats); -MB_PROC_VALUE_WRITE(stats); -MB_PROC_VALUE_READ(max_to_scan); -MB_PROC_VALUE_WRITE(max_to_scan); -MB_PROC_VALUE_READ(min_to_scan); -MB_PROC_VALUE_WRITE(min_to_scan); -MB_PROC_VALUE_READ(order2_reqs); -MB_PROC_VALUE_WRITE(order2_reqs); -MB_PROC_VALUE_READ(stream_request); -MB_PROC_VALUE_WRITE(stream_request); -MB_PROC_VALUE_READ(group_prealloc); -MB_PROC_VALUE_WRITE(group_prealloc); +MB_PROC_FOPS(stats); +MB_PROC_FOPS(max_to_scan); +MB_PROC_FOPS(min_to_scan); +MB_PROC_FOPS(order2_reqs); +MB_PROC_FOPS(stream_request); +MB_PROC_FOPS(group_prealloc); #define MB_PROC_HANDLER(name, var) \ do { \ - proc = create_proc_entry(name, mode, sbi->s_mb_proc); \ + proc = proc_create_data(name, mode, sbi->s_mb_proc, \ + &ext4_mb_##var##_proc_fops, sbi); \ if (proc == NULL) { \ printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \ goto err_out; \ } \ - proc->data = sbi; \ - proc->read_proc = ext4_mb_read_##var ; \ - proc->write_proc = ext4_mb_write_##var; \ } while (0) static int ext4_mb_init_per_dev_proc(struct super_block *sb) @@ -2747,7 +2891,7 @@ ext4_mb_mark_diskspace_used(struct ext4_ err = -EIO; - bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group); + bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); if (!bitmap_bh) goto out_err; @@ -2816,7 +2960,23 @@ ext4_mb_mark_diskspace_used(struct ext4_ le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); - percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); + + /* + * free blocks account has already be reduced/reserved + * at write_begin() time for delayed allocation + * do not double accounting + */ + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) + percpu_counter_sub(&sbi->s_freeblocks_counter, + ac->ac_b_ex.fe_len); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, + ac->ac_b_ex.fe_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } err = ext4_journal_dirty_metadata(handle, bitmap_bh); if (err) @@ -3569,7 +3729,7 @@ ext4_mb_discard_group_preallocations(str if (list_empty(&grp->bb_prealloc_list)) return 0; - bitmap_bh = read_block_bitmap(sb, group); + bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { /* error handling here */ ext4_mb_release_desc(&e4b); @@ -3743,7 +3903,7 @@ repeat: err = ext4_mb_load_buddy(sb, group, &e4b); BUG_ON(err != 0); /* error handling here */ - bitmap_bh = read_block_bitmap(sb, group); + bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { /* error handling here */ ext4_mb_release_desc(&e4b); @@ -3934,6 +4094,7 @@ ext4_mb_initialize_context(struct ext4_a ac->ac_bitmap_page = NULL; ac->ac_buddy_page = NULL; ac->ac_lg = NULL; + ac->ac_excepted_group = ar->excepted_group; /* we have to define context: we'll we work with a file or * locality group. 
this is a policy, actually */ @@ -4011,10 +4172,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t sbi = EXT4_SB(sb); if (!test_opt(sb, MBALLOC)) { - block = ext4_new_blocks_old(handle, ar->inode, ar->goal, + block = ext4_old_new_blocks(handle, ar->inode, ar->goal, &(ar->len), errp); return block; } + if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { + /* + * With delalloc we already reserved the blocks + */ + ar->len = ext4_has_free_blocks(sbi, ar->len); + } + + if (ar->len == 0) { + *errp = -ENOSPC; + return 0; + } while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; @@ -4026,10 +4198,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t } inquota = ar->len; + if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) + ar->flags |= EXT4_MB_DELALLOC_RESERVED; + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { + ar->len = 0; *errp = -ENOMEM; - return 0; + goto out1; } ext4_mb_poll_new_transaction(sb, handle); @@ -4037,12 +4213,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; - goto out; + goto out2; } ac->ac_op = EXT4_MB_HISTORY_PREALLOC; if (!ext4_mb_use_preallocated(ac)) { - ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); repeat: @@ -4085,11 +4260,12 @@ repeat: ext4_mb_release_context(ac); -out: +out2: + kmem_cache_free(ext4_ac_cachep, ac); +out1: if (ar->len < inquota) DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); - kmem_cache_free(ext4_ac_cachep, ac); return block; } static void ext4_mb_poll_new_transaction(struct super_block *sb, @@ -4242,7 +4418,7 @@ do_more: overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; } - bitmap_bh = read_block_bitmap(sb, block_group); + bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) goto error_return; gdp = ext4_get_group_desc(sb, block_group, &gd_bh); @@ -4321,6 +4497,13 @@ do_more: spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_add(&sbi->s_freeblocks_counter, count); + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks += count; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + ext4_mb_release_desc(&e4b); *freed += count; Index: linux-2.6/fs/ext4/balloc.c =================================================================== --- linux-2.6.orig/fs/ext4/balloc.c 2008-06-16 11:34:11.860451934 -0500 +++ linux-2.6/fs/ext4/balloc.c 2008-07-05 12:51:24.428291692 -0500 @@ -47,7 +47,7 @@ static int ext4_block_in_group(struct su ext4_group_t block_group) { ext4_group_t actual_group; - ext4_get_group_no_and_offset(sb, block, &actual_group, 0); + ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); if (actual_group == block_group) return 1; return 0; @@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct s le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); } } else { /* For META_BG_BLOCK_GROUPS */ - int group_rel = (block_group - - le32_to_cpu(sbi->s_es->s_first_meta_bg)) % - EXT4_DESC_PER_BLOCK(sb); - if (group_rel == 0 || group_rel == 1 || - (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1)) - bit_max += 1; + bit_max += ext4_bg_num_gdb(sb, block_group); } if (block_group == sbi->s_groups_count - 1) { @@ -295,7 +290,7 @@ err_out: return 0; } /** - * read_block_bitmap() + * ext4_read_block_bitmap() * @sb: super block * @block_group: given block group * @@ -305,7 +300,7 @@ err_out: * Return buffer_head on success or NULL in case of failure. 
 */
struct buffer_head *
-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
	struct ext4_group_desc * desc;
	struct buffer_head * bh = NULL;
@@ -409,8 +404,7 @@ restart:
		prev = rsv;
	}
	printk("Window map complete.\n");
-	if (bad)
-		BUG();
+	BUG_ON(bad);
}
#define rsv_window_dump(root, verbose) \
	__rsv_window_dump((root), (verbose), __func__)
@@ -434,7 +428,7 @@ restart:
 * If the goal block is within the reservation window, return 1;
 * otherwise, return 0;
 */
-static int
+int
goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
			ext4_group_t group, struct super_block *sb)
{
@@ -539,7 +533,7 @@ void ext4_rsv_window_add(struct super_bl
 * from the filesystem reservation window rb tree. Must be called with
 * rsv_lock hold.
 */
-static void rsv_window_remove(struct super_block *sb,
+void rsv_window_remove(struct super_block *sb,
		struct ext4_reserve_window_node *rsv)
{
	rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
@@ -554,7 +548,7 @@ static void rsv_window_remove(struct sup
 *
 * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
 */
-static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
+inline int rsv_is_empty(struct ext4_reserve_window *rsv)
{
	/* a valid reservation end block could not be 0 */
	return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
@@ -694,7 +688,7 @@ do_more:
		count -= overflow;
	}
	brelse(bitmap_bh);
-	bitmap_bh = read_block_bitmap(sb, block_group);
+	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
	if (!bitmap_bh)
		goto error_return;
	desc = ext4_get_group_desc (sb, block_group, &gd_bh);
@@ -810,6 +804,13 @@ do_more:
	spin_unlock(sb_bgl_lock(sbi, block_group));
	percpu_counter_add(&sbi->s_freeblocks_counter, count);

+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+		spin_lock(sb_bgl_lock(sbi, flex_group));
+		sbi->s_flex_groups[flex_group].free_blocks += count;
+		spin_unlock(sb_bgl_lock(sbi, flex_group));
+	}
+
	/* We dirtied the bitmap block */
	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -913,7 +914,7 @@ static int ext4_test_allocatable(ext4_gr
 * bitmap on disk and the last-committed copy in journal, until we find a
 * bit free in both bitmaps.
 */
-static ext4_grpblk_t
+ext4_grpblk_t
bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
					ext4_grpblk_t maxblocks)
{
@@ -1283,7 +1284,7 @@ static int find_next_reservable_window(
 * @bitmap_bh: the block group block bitmap
 *
 */
-static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
+int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
		ext4_grpblk_t grp_goal, struct super_block *sb,
		ext4_group_t group, struct buffer_head *bitmap_bh)
{
@@ -1427,7 +1428,7 @@ retry:
 * expand the reservation window size if necessary on a best-effort
 * basis before ext4_new_blocks() tries to allocate blocks,
 */
-static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
+void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
			struct super_block *sb, int size)
{
	struct ext4_reserve_window_node *next_rsv;
@@ -1598,23 +1599,35 @@ out:

/**
 * ext4_has_free_blocks()
- * @sbi:	in-core super block structure.
+ * @sbi:	in-core super block structure.
+ * @nblocks:	number of needed blocks
 *
- * Check if filesystem has at least 1 free block available for allocation.
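A note on the rewritten check below: ext4_has_free_blocks() no longer answers yes/no but clamps the request to the number of blocks actually available, paying for an exact percpu-counter sum only when the cheap read leaves less than FBC_BATCH of headroom (on UP the cheap read is already exact, hence the CONFIG_SMP guard). Both allocators then share the same calling pattern, skipping the check when delayed allocation already reserved the blocks at write_begin() time; schematically:

	if (!EXT4_I(inode)->i_delalloc_reserved_flag)
		*count = ext4_has_free_blocks(sbi, *count);	/* clamp the request */
	if (*count == 0) {
		*errp = -ENOSPC;
		return 0;
	}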
+ * Check if filesystem has free blocks available for allocation.
+ * Return the number of blocks available for allocation for this request.
+ * On success, return nblocks.
 */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
+ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+						ext4_fsblk_t nblocks)
{
-	ext4_fsblk_t free_blocks, root_blocks;
+	ext4_fsblk_t free_blocks;
+	ext4_fsblk_t root_blocks = 0;

	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
-	root_blocks = ext4_r_blocks_count(sbi->s_es);
-	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+
+	if (!capable(CAP_SYS_RESOURCE) &&
	    sbi->s_resuid != current->fsuid &&
-	    (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
-		return 0;
-	}
-	return 1;
-}
+	    (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+		root_blocks = ext4_r_blocks_count(sbi->s_es);
+#ifdef CONFIG_SMP
+	if (free_blocks - root_blocks < FBC_BATCH)
+		free_blocks =
+			percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
+#endif
+	if (free_blocks - root_blocks < nblocks)
+		return free_blocks - root_blocks;
+	return nblocks;
+}
+
 /**
 * ext4_should_retry_alloc()
@@ -1630,7 +1643,7 @@ static int ext4_has_free_blocks(struct e
 */
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
-	if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
		return 0;

	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1639,20 +1652,24 @@ int ext4_should_retry_alloc(struct super
}

/**
- * ext4_new_blocks_old() -- core block(s) allocation function
+ * ext4_old_new_blocks() -- core block bitmap based block allocation function
+ *
+ * @handle:		handle to this transaction
+ * @inode:		file inode
+ * @goal:		given target block(filesystem wide)
+ * @count:		target number of blocks to allocate
+ * @errp:		error code
+ *
- * ext4_new_blocks uses a goal block to assist allocation.  It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
+ * ext4_old_new_blocks uses a goal block to assist allocation and looks up
+ * the block bitmap directly to do block allocation.  It tries to
+ * allocate block(s) from the block group that contains the goal block first.
+ * If that fails, it will try to allocate block(s) from other block groups
+ * without any specific goal block.
+ *
+ * This function is called when the -o nomballoc mount option is enabled.
+ *
 */
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
			ext4_fsblk_t goal, unsigned long *count, int *errp)
{
	struct buffer_head *bitmap_bh = NULL;
@@ -1676,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_
	ext4_group_t ngroups;
	unsigned long num = *count;

-	*errp = -ENOSPC;
	sb = inode->i_sb;
	if (!sb) {
+		*errp = -ENODEV;
		printk("ext4_new_block: nonexistent device");
		return 0;
	}

+	sbi = EXT4_SB(sb);
+	if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
+		/*
+		 * With delalloc we already reserved the blocks
+		 */
+		*count = ext4_has_free_blocks(sbi, *count);
+	}
+	if (*count == 0) {
+		*errp = -ENOSPC;
+		return 0;	/* return with ENOSPC error */
+	}
+	num = *count;
+
	/*
	 * Check quota for allocation of this block.
	 */
@@ -1706,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_
	if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
		my_rsv = &block_i->rsv_window_node;

-	if (!ext4_has_free_blocks(sbi)) {
-		*errp = -ENOSPC;
-		goto out;
-	}
-
	/*
	 * First, test whether the goal block is free.
	 */
@@ -1734,7 +1759,7 @@ retry_alloc:
		my_rsv = NULL;

	if (free_blocks > 0) {
-		bitmap_bh = read_block_bitmap(sb, group_no);
+		bitmap_bh = ext4_read_block_bitmap(sb, group_no);
		if (!bitmap_bh)
			goto io_error;
		grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
@@ -1770,7 +1795,7 @@ retry_alloc:
			continue;

		brelse(bitmap_bh);
-		bitmap_bh = read_block_bitmap(sb, group_no);
+		bitmap_bh = ext4_read_block_bitmap(sb, group_no);
		if (!bitmap_bh)
			goto io_error;
		/*
@@ -1882,7 +1907,15 @@ allocated:
	le16_add_cpu(&gdp->bg_free_blocks_count, -num);
	gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
	spin_unlock(sb_bgl_lock(sbi, group_no));
-	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+	if (!EXT4_I(inode)->i_delalloc_reserved_flag)
+		percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
+		spin_lock(sb_bgl_lock(sbi, flex_group));
+		sbi->s_flex_groups[flex_group].free_blocks -= num;
+		spin_unlock(sb_bgl_lock(sbi, flex_group));
+	}

	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
	err = ext4_journal_dirty_metadata(handle, gdp_bh);
@@ -1915,46 +1948,104 @@ out:
	return 0;
}

-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, int *errp)
+#define EXT4_META_BLOCK	0x1
+
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
+			ext4_lblk_t iblock, ext4_fsblk_t goal,
+			unsigned long *count, int *errp, int flags)
{
	struct ext4_allocation_request ar;
	ext4_fsblk_t ret;

	if (!test_opt(inode->i_sb, MBALLOC)) {
-		unsigned long count = 1;
-		ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
-		return ret;
+		return ext4_old_new_blocks(handle, inode, goal, count, errp);
	}

	memset(&ar, 0, sizeof(ar));
+	/* Fill with neighbour allocated blocks */
+
	ar.inode = inode;
	ar.goal = goal;
-	ar.len = 1;
+	ar.len = *count;
+	ar.logical = iblock;
+
+	if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
+		/* enable in-core preallocation for data block allocation */
+		ar.flags = EXT4_MB_HINT_DATA;
+	else
+		/* disable in-core preallocation for non-regular files */
+		ar.flags = 0;
+
	ret = ext4_mb_new_blocks(handle, &ar, errp);
+	*count = ar.len;
	return ret;
}

-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+/*
+ * ext4_new_meta_blocks() -- allocate blocks for meta data (indexing)
+ *
+ * @handle:		handle to this transaction
+ * @inode:		file inode
+ * @goal:		given target block(filesystem wide)
+ * @count:		total number of blocks needed
+ * @errp:		error code
+ *
+ * Return the first allocated block number on success; *count stores the
+ * total number of blocks allocated, and any error is stored in *errp.
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
			ext4_fsblk_t goal, unsigned long *count, int *errp)
{
-	struct ext4_allocation_request ar;
	ext4_fsblk_t ret;
-
-	if (!test_opt(inode->i_sb, MBALLOC)) {
-		ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
-		return ret;
+	ret = do_blk_alloc(handle, inode, 0, goal,
+				count, errp, EXT4_META_BLOCK);
+	/*
+	 * Account for the allocated meta blocks
+	 */
+	if (!(*errp)) {
+		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+		EXT4_I(inode)->i_allocated_meta_blocks += *count;
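+		/*
+		 * Unlike data blocks, these metadata blocks were not
+		 * individually reserved by delayed allocation, so keep a
+		 * running count (under i_block_reservation_lock) that the
+		 * reservation code can use to square i_reserved_meta_blocks
+		 * with what was actually allocated.
+		 */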
+		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 	}
 	return ret;
 }

+/*
+ * ext4_new_meta_block() -- allocate a block for meta data (indexing)
+ *
+ * @handle:		handle to this transaction
+ * @inode:		file inode
+ * @goal:		given target block(filesystem wide)
+ * @errp:		error code
+ *
+ * Return allocated block number on success
+ */
+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
+			ext4_fsblk_t goal, int *errp)
+{
+	unsigned long count = 1;
+	return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
+}
+
+/*
+ * ext4_new_blocks() -- allocate data blocks
+ *
+ * @handle:		handle to this transaction
+ * @inode:		file inode
+ * @goal:		given target block(filesystem wide)
+ * @count:		total number of blocks needed
+ * @errp:		error code
+ *
+ * Return the first allocated block number on success; *count stores the
+ * total number of blocks allocated, and any error is stored in *errp.
+ */
+
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+			ext4_lblk_t iblock, ext4_fsblk_t goal,
+			unsigned long *count, int *errp)
+{
+	return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
+}

 /**
 * ext4_count_free_blocks() -- count filesystem free blocks
@@ -1986,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(stru
			continue;
		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
		brelse(bitmap_bh);
-		bitmap_bh = read_block_bitmap(sb, i);
+		bitmap_bh = ext4_read_block_bitmap(sb, i);
		if (bitmap_bh == NULL)
			continue;
Index: linux-2.6/fs/ext4/dir.c
===================================================================
--- linux-2.6.orig/fs/ext4/dir.c	2008-06-05 13:44:20.521046407 -0500
+++ linux-2.6/fs/ext4/dir.c	2008-07-05 12:51:23.713291581 -0500
@@ -129,7 +129,8 @@ static int ext4_readdir(struct file * fi
		struct buffer_head *bh = NULL;

		map_bh.b_state = 0;
-		err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+		err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
+						0, 0, 0);
		if (err > 0) {
			pgoff_t index = map_bh.b_blocknr >>
					(PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb

	while (n) {
		/* Do the node's children first */
-		if ((n)->rb_left) {
+		if (n->rb_left) {
			n = n->rb_left;
			continue;
		}
@@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb
			parent->rb_right = NULL;
		n = parent;
	}
-	root->rb_node = NULL;
}

-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
{
	struct dir_private_info *p;

-	p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+	p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
	if (!p)
		return NULL;
-	p->root.rb_node = NULL;
-	p->curr_node = NULL;
-	p->extra_fname = NULL;
-	p->last_pos = 0;
	p->curr_hash = pos2maj_hash(pos);
	p->curr_minor_hash = pos2min_hash(pos);
-	p->next_hash = 0;
	return p;
}

@@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file *
	int	ret;

	if (!info) {
-		info = create_dir_info(filp->f_pos);
+		info = ext4_htree_create_dir_info(filp->f_pos);
		if (!info)
			return -ENOMEM;
		filp->private_data = info;
Index: linux-2.6/fs/ext4/xattr_trusted.c
===================================================================
--- linux-2.6.orig/fs/ext4/xattr_trusted.c	2008-06-05 13:44:20.539983023 -0500
+++ linux-2.6/fs/ext4/xattr_trusted.c	2008-07-05 12:51:21.985654007 -0500
@@ -13,13 +13,11 @@
 #include "ext4.h"
 #include "xattr.h"

-#define XATTR_TRUSTED_PREFIX
"trusted." - static size_t ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!capable(CAP_SYS_ADMIN)) Index: linux-2.6/fs/ext4/xattr_user.c =================================================================== --- linux-2.6.orig/fs/ext4/xattr_user.c 2008-06-05 13:44:20.539983023 -0500 +++ linux-2.6/fs/ext4/xattr_user.c 2008-07-05 12:51:21.993609879 -0500 @@ -12,13 +12,11 @@ #include "ext4.h" #include "xattr.h" -#define XATTR_USER_PREFIX "user." - static size_t ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; + const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!test_opt(inode->i_sb, XATTR_USER)) Index: linux-2.6/fs/ext4/group.h =================================================================== --- linux-2.6.orig/fs/ext4/group.h 2008-06-05 13:44:20.524045910 -0500 +++ linux-2.6/fs/ext4/group.h 2008-07-05 12:51:22.039290556 -0500 @@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struc struct ext4_group_desc *gdp); extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, struct ext4_group_desc *gdp); -struct buffer_head *read_block_bitmap(struct super_block *sb, +struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group); extern unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, Index: linux-2.6/fs/ext4/ialloc.c =================================================================== --- linux-2.6.orig/fs/ext4/ialloc.c 2008-06-05 13:44:20.524045910 -0500 +++ linux-2.6/fs/ext4/ialloc.c 2008-07-05 12:51:22.214291112 -0500 @@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct ext4_super_block * es; struct ext4_sb_info *sbi; int fatal = 0, err; + ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { printk ("ext4_free_inode: inode has count=%d\n", @@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, if (is_directory) percpu_counter_dec(&sbi->s_dirs_counter); + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes++; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } } BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); err = ext4_journal_dirty_metadata(handle, bh2); @@ -286,6 +293,80 @@ static int find_group_dir(struct super_b return ret; } +#define free_block_ratio 10 + +static int find_group_flex(struct super_block *sb, struct inode *parent, + ext4_group_t *best_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *desc; + struct buffer_head *bh; + struct flex_groups *flex_group = sbi->s_flex_groups; + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); + ext4_group_t ngroups = sbi->s_groups_count; + int flex_size = ext4_flex_bg_size(sbi); + ext4_group_t best_flex = parent_fbg_group; + int blocks_per_flex = sbi->s_blocks_per_group * flex_size; + int flexbg_free_blocks; + int flex_freeb_ratio; + ext4_group_t n_fbg_groups; + ext4_group_t i; + + n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> + sbi->s_log_groups_per_flex; + +find_close_to_parent: + flexbg_free_blocks 
= flex_group[best_flex].free_blocks; + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + if (flex_group[best_flex].free_inodes && + flex_freeb_ratio > free_block_ratio) + goto found_flexbg; + + if (best_flex && best_flex == parent_fbg_group) { + best_flex--; + goto find_close_to_parent; + } + + for (i = 0; i < n_fbg_groups; i++) { + if (i == parent_fbg_group || i == parent_fbg_group - 1) + continue; + + flexbg_free_blocks = flex_group[i].free_blocks; + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + + if (flex_freeb_ratio > free_block_ratio && + flex_group[i].free_inodes) { + best_flex = i; + goto found_flexbg; + } + + if (best_flex < 0 || + (flex_group[i].free_blocks > + flex_group[best_flex].free_blocks && + flex_group[i].free_inodes)) + best_flex = i; + } + + if (!flex_group[best_flex].free_inodes || + !flex_group[best_flex].free_blocks) + return -1; + +found_flexbg: + for (i = best_flex * flex_size; i < ngroups && + i < (best_flex + 1) * flex_size; i++) { + desc = ext4_get_group_desc(sb, i, &bh); + if (le16_to_cpu(desc->bg_free_inodes_count)) { + *best_group = i; + goto out; + } + } + + return -1; +out: + return 0; +} + /* * Orlov's allocator for directories. * @@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *h struct inode *ret; ext4_group_t i; int free = 0; + ext4_group_t flex_group; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *h sbi = EXT4_SB(sb); es = sbi->s_es; + + if (sbi->s_log_groups_per_flex) { + ret2 = find_group_flex(sb, dir, &group); + goto got_group; + } + if (S_ISDIR(mode)) { if (test_opt (sb, OLDALLOC)) ret2 = find_group_dir(sb, dir, &group); @@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *h } else ret2 = find_group_other(sb, dir, &group); +got_group: err = -ENOSPC; if (ret2 == -1) goto out; @@ -600,7 +689,7 @@ got: /* We may have to initialize the block bitmap if it isn't already */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - struct buffer_head *block_bh = read_block_bitmap(sb, group); + struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); BUFFER_TRACE(block_bh, "get block bitmap access"); err = ext4_journal_get_write_access(handle, block_bh); @@ -676,6 +765,13 @@ got: percpu_counter_inc(&sbi->s_dirs_counter); sb->s_dirt = 1; + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes--; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + inode->i_uid = current->fsuid; if (test_opt (sb, GRPID)) inode->i_gid = dir->i_gid; Index: linux-2.6/fs/ext4/super.c =================================================================== --- linux-2.6.orig/fs/ext4/super.c 2008-06-16 11:34:11.903810444 -0500 +++ linux-2.6/fs/ext4/super.c 2008-07-05 12:51:23.998291006 -0500 @@ -506,6 +506,7 @@ static void ext4_put_super (struct super ext4_ext_release(sb); ext4_xattr_put_super(sb); jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); @@ -517,6 +518,7 @@ static void ext4_put_super (struct super for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); + kfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeblocks_counter); 
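The s_flex_groups array freed above is the in-memory summary this patch introduces: one {free_inodes, free_blocks} pair per group of 1 << s_log_groups_per_flex block groups. Every path that adjusts a group descriptor's free counts mirrors the delta into the summary under the flex group's bg-lock, always in the same shape (distilled from the alloc/free hunks above):

	if (sbi->s_log_groups_per_flex) {
		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);

		spin_lock(sb_bgl_lock(sbi, flex_group));
		sbi->s_flex_groups[flex_group].free_blocks += count; /* or -= */
		spin_unlock(sb_bgl_lock(sbi, flex_group));
	}

Keeping the summary coherent this way is what lets find_group_flex() choose an inode's flex group without walking every group descriptor.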
percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -571,6 +573,12 @@ static struct inode *ext4_alloc_inode(st memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); + jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); + ei->i_reserved_data_blocks = 0; + ei->i_reserved_meta_blocks = 0; + ei->i_allocated_meta_blocks = 0; + ei->i_delalloc_reserved_flag = 0; + spin_lock_init(&(ei->i_block_reservation_lock)); return &ei->vfs_inode; } @@ -635,6 +643,8 @@ static void ext4_clear_inode(struct inod EXT4_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) kfree(rsv); + jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, + &EXT4_I(inode)->jinode); } static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) @@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_ unsigned long def_mount_opts; struct super_block *sb = vfs->mnt_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - journal_t *journal = sbi->s_journal; struct ext4_super_block *es = sbi->s_es; def_mount_opts = le32_to_cpu(es->s_default_mount_opts); @@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_ seq_puts(seq, ",nomballoc"); if (test_opt(sb, I_VERSION)) seq_puts(seq, ",i_version"); + if (!test_opt(sb, DELALLOC)) + seq_puts(seq, ",nodelalloc"); + if (sbi->s_stripe) seq_printf(seq, ",stripe=%lu", sbi->s_stripe); @@ -894,7 +906,7 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, - Opt_mballoc, Opt_nomballoc, Opt_stripe, + Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, }; static match_table_t tokens = { @@ -953,6 +965,8 @@ static match_table_t tokens = { {Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, + {Opt_delalloc, "delalloc"}, + {Opt_nodelalloc, "nodelalloc"}, {Opt_err, NULL}, }; @@ -990,6 +1004,7 @@ static int parse_options (char *options, int qtype, qfmt; char *qname; #endif + ext4_fsblk_t last_block; if (!options) return 1; @@ -1312,12 +1327,29 @@ set_qf_format: set_opt (sbi->s_mount_opt, EXTENTS); break; case Opt_noextents: + /* + * When e2fsprogs support resizing an already existing + * ext3 file system to greater than 2**32 we need to + * add support to block allocator to handle growing + * already existing block mapped inode so that blocks + * allocated for them fall within 2**32 + */ + last_block = ext4_blocks_count(sbi->s_es) - 1; + if (last_block > 0xffffffffULL) { + printk(KERN_ERR "EXT4-fs: Filesystem too " + "large to mount with " + "-o noextents options\n"); + return 0; + } clear_opt (sbi->s_mount_opt, EXTENTS); break; case Opt_i_version: set_opt(sbi->s_mount_opt, I_VERSION); sb->s_flags |= MS_I_VERSION; break; + case Opt_nodelalloc: + clear_opt(sbi->s_mount_opt, DELALLOC); + break; case Opt_mballoc: set_opt(sbi->s_mount_opt, MBALLOC); break; @@ -1331,6 +1363,9 @@ set_qf_format: return 0; sbi->s_stripe = option; break; + case Opt_delalloc: + set_opt(sbi->s_mount_opt, DELALLOC); + break; default: printk (KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " @@ -1443,6 +1478,54 @@ static int ext4_setup_super(struct super return res; } +static int ext4_fill_flex_info(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + struct buffer_head *bh; + ext4_group_t flex_group_count; + ext4_group_t 
flex_group; + int groups_per_flex = 0; + __u64 block_bitmap = 0; + int i; + + if (!sbi->s_es->s_log_groups_per_flex) { + sbi->s_log_groups_per_flex = 0; + return 1; + } + + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; + groups_per_flex = 1 << sbi->s_log_groups_per_flex; + + flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) / + groups_per_flex; + sbi->s_flex_groups = kmalloc(flex_group_count * + sizeof(struct flex_groups), GFP_KERNEL); + if (sbi->s_flex_groups == NULL) { + printk(KERN_ERR "EXT4-fs: not enough memory\n"); + goto failed; + } + memset(sbi->s_flex_groups, 0, flex_group_count * + sizeof(struct flex_groups)); + + gdp = ext4_get_group_desc(sb, 1, &bh); + block_bitmap = ext4_block_bitmap(sb, gdp) - 1; + + for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext4_get_group_desc(sb, i, &bh); + + flex_group = ext4_flex_group(sbi, i); + sbi->s_flex_groups[flex_group].free_inodes += + le16_to_cpu(gdp->bg_free_inodes_count); + sbi->s_flex_groups[flex_group].free_blocks += + le16_to_cpu(gdp->bg_free_blocks_count); + } + + return 1; +failed: + return 0; +} + __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, struct ext4_group_desc *gdp) { @@ -1810,8 +1893,8 @@ static unsigned long ext4_get_stripe_siz } static int ext4_fill_super (struct super_block *sb, void *data, int silent) - __releases(kernel_sem) - __acquires(kernel_sem) + __releases(kernel_lock) + __acquires(kernel_lock) { struct buffer_head * bh; @@ -1851,11 +1934,6 @@ static int ext4_fill_super (struct super goto out_fail; } - if (!sb_set_blocksize(sb, blocksize)) { - printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize); - goto out_fail; - } - /* * The ext4 superblock will not be buffer aligned for other than 1kB * block sizes. We need to calculate the offset from buffer start. @@ -1928,6 +2006,13 @@ static int ext4_fill_super (struct super */ set_opt(sbi->s_mount_opt, MBALLOC); + /* + * enable delayed allocation by default + * Use -o nodelalloc to turn it off + */ + set_opt(sbi->s_mount_opt, DELALLOC); + + if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, NULL, 0)) goto failed_mount; @@ -2138,6 +2223,14 @@ static int ext4_fill_super (struct super printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); goto failed_mount2; } + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (!ext4_fill_flex_info(sb)) { + printk(KERN_ERR + "EXT4-fs: unable to initialize " + "flex_bg meta info!\n"); + goto failed_mount2; + } + sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); @@ -2358,6 +2451,13 @@ static int ext4_fill_super (struct super test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? 
"ordered": "writeback"); + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " + "requested data journaling mode\n"); + clear_opt(sbi->s_mount_opt, DELALLOC); + } else if (test_opt(sb, DELALLOC)) + printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); + ext4_ext_init(sb); ext4_mb_init(sb, needs_recovery); @@ -2372,6 +2472,7 @@ cantfind_ext4: failed_mount4: jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; failed_mount3: percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); @@ -3325,7 +3426,7 @@ static ssize_t ext4_quota_write(struct s err = ext4_journal_dirty_metadata(handle, bh); else { /* Always do at least ordered writes for quotas */ - err = ext4_journal_dirty_data(handle, bh); + err = ext4_jbd2_file_inode(handle, inode); mark_buffer_dirty(bh); } brelse(bh); @@ -3337,8 +3438,10 @@ static ssize_t ext4_quota_write(struct s blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off+len-towrite) { i_size_write(inode, off+len-towrite); EXT4_I(inode)->i_disksize = inode->i_size; Index: linux-2.6/fs/jbd2/commit.c =================================================================== --- linux-2.6.orig/fs/jbd2/commit.c 2008-06-16 11:34:12.070629463 -0500 +++ linux-2.6/fs/jbd2/commit.c 2008-07-05 12:51:23.906290812 -0500 @@ -22,6 +22,8 @@ #include #include #include +#include +#include /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(s } /* - * When an ext3-ordered file is truncated, it is possible that many pages are - * not sucessfully freed, because they are attached to a committing transaction. + * When an ext4 file is truncated, it is possible that some pages are not + * successfully freed, because they are attached to a committing transaction. * After the transaction commits, these pages are left on the LRU, with no * ->mapping, and with attached buffers. These pages are trivially reclaimable * by the VM, but their apparent absence upsets the VM accounting, and it makes @@ -80,21 +82,6 @@ nope: } /* - * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is - * held. For ranking reasons we must trylock. If we lose, schedule away and - * return 0. j_list_lock is dropped in this case. - */ -static int inverted_lock(journal_t *journal, struct buffer_head *bh) -{ - if (!jbd_trylock_bh_state(bh)) { - spin_unlock(&journal->j_list_lock); - schedule(); - return 0; - } - return 1; -} - -/* * Done it all: now submit the commit record. We should have * cleaned up our previous buffers by now, so if we are in abort * mode we can now just skip the rest of the journal write @@ -112,6 +99,7 @@ static int journal_submit_commit_record( struct buffer_head *bh; int ret; int barrier_done = 0; + struct timespec now = current_kernel_time(); if (is_journal_aborted(journal)) return 0; @@ -126,6 +114,8 @@ static int journal_submit_commit_record( tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + tmp->h_commit_sec = cpu_to_be64(now.tv_sec); + tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { @@ -197,159 +187,104 @@ static int journal_wait_on_commit_record } /* - * Wait for all submitted IO to complete. 
+ * Write the filemap data using writepage() address_space_operations.
+ * We don't do block allocation here even for delalloc. We don't
+ * use writepages() because with delayed allocation we may be doing
+ * block allocation in writepages().
 */
-static int journal_wait_on_locked_list(journal_t *journal,
-				       transaction_t *commit_transaction)
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
 {
-	int ret = 0;
-	struct journal_head *jh;
-
-	while (commit_transaction->t_locked_list) {
-		struct buffer_head *bh;
+	int ret;
+	struct writeback_control wbc = {
+		.sync_mode =  WB_SYNC_ALL,
+		.nr_to_write = mapping->nrpages * 2,
+		.range_start = 0,
+		.range_end = i_size_read(mapping->host),
+		.for_writepages = 1,
+	};
 
-		jh = commit_transaction->t_locked_list->b_tprev;
-		bh = jh2bh(jh);
-		get_bh(bh);
-		if (buffer_locked(bh)) {
-			spin_unlock(&journal->j_list_lock);
-			wait_on_buffer(bh);
-			if (unlikely(!buffer_uptodate(bh)))
-				ret = -EIO;
-			spin_lock(&journal->j_list_lock);
-		}
-		if (!inverted_lock(journal, bh)) {
-			put_bh(bh);
-			spin_lock(&journal->j_list_lock);
-			continue;
-		}
-		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
-			__jbd2_journal_unfile_buffer(jh);
-			jbd_unlock_bh_state(bh);
-			jbd2_journal_remove_journal_head(bh);
-			put_bh(bh);
-		} else {
-			jbd_unlock_bh_state(bh);
-		}
-		put_bh(bh);
-		cond_resched_lock(&journal->j_list_lock);
-	}
+	ret = generic_writepages(mapping, &wbc);
 	return ret;
- }
+}
 
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+/*
+ * Submit the data buffers of all inodes associated with the transaction
+ * to disk.
+ *
+ * We are in a committing transaction. Therefore no new inode can be added to
+ * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
+ * currently operate on from being released while we write out its pages.
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+		transaction_t *commit_transaction)
 {
-	int i;
+	struct jbd2_inode *jinode;
+	int err, ret = 0;
+	struct address_space *mapping;
 
-	for (i = 0; i < bufs; i++) {
-		wbuf[i]->b_end_io = end_buffer_write_sync;
-		/* We use-up our safety reference in submit_bh() */
-		submit_bh(WRITE, wbuf[i]);
+	spin_lock(&journal->j_list_lock);
+	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+		mapping = jinode->i_vfs_inode->i_mapping;
+		jinode->i_flags |= JI_COMMIT_RUNNING;
+		spin_unlock(&journal->j_list_lock);
+		/*
+		 * Submit the inode's data buffers. We use writepage
+		 * instead of writepages because writepages can do
+		 * block allocation with delalloc; we need to write
+		 * only already-allocated blocks here.
+		 */
+		err = journal_submit_inode_data_buffers(mapping);
+		if (!ret)
+			ret = err;
+		spin_lock(&journal->j_list_lock);
+		J_ASSERT(jinode->i_transaction == commit_transaction);
+		jinode->i_flags &= ~JI_COMMIT_RUNNING;
+		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 	}
+	spin_unlock(&journal->j_list_lock);
+	return ret;
 }
 
 /*
- * Submit all the data buffers to disk
+ * Wait for data submitted for writeout, refile inodes to proper
+ * transaction if needed.
+ * */ -static void journal_submit_data_buffers(journal_t *journal, - transaction_t *commit_transaction) +static int journal_finish_inode_data_buffers(journal_t *journal, + transaction_t *commit_transaction) { - struct journal_head *jh; - struct buffer_head *bh; - int locked; - int bufs = 0; - struct buffer_head **wbuf = journal->j_wbuf; + struct jbd2_inode *jinode, *next_i; + int err, ret = 0; - /* - * Whenever we unlock the journal and sleep, things can get added - * onto ->t_sync_datalist, so we have to keep looping back to - * write_out_data until we *know* that the list is empty. - * - * Cleanup any flushed data buffers from the data list. Even in - * abort mode, we want to flush this out as soon as possible. - */ -write_out_data: - cond_resched(); + /* For locking, see the comment in journal_submit_data_buffers() */ spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + jinode->i_flags |= JI_COMMIT_RUNNING; + spin_unlock(&journal->j_list_lock); + err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); + if (!ret) + ret = err; + spin_lock(&journal->j_list_lock); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } - while (commit_transaction->t_sync_datalist) { - jh = commit_transaction->t_sync_datalist; - bh = jh2bh(jh); - locked = 0; - - /* Get reference just to make sure buffer does not disappear - * when we are forced to drop various locks */ - get_bh(bh); - /* If the buffer is dirty, we need to submit IO and hence - * we need the buffer lock. We try to lock the buffer without - * blocking. If we fail, we need to drop j_list_lock and do - * blocking lock_buffer(). - */ - if (buffer_dirty(bh)) { - if (test_set_buffer_locked(bh)) { - BUFFER_TRACE(bh, "needs blocking lock"); - spin_unlock(&journal->j_list_lock); - /* Write out all data to prevent deadlocks */ - journal_do_submit_data(wbuf, bufs); - bufs = 0; - lock_buffer(bh); - spin_lock(&journal->j_list_lock); - } - locked = 1; - } - /* We have to get bh_state lock. Again out of order, sigh. */ - if (!inverted_lock(journal, bh)) { - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - } - /* Someone already cleaned up the buffer? 
*/ - if (!buffer_jbd(bh) - || jh->b_transaction != commit_transaction - || jh->b_jlist != BJ_SyncData) { - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - BUFFER_TRACE(bh, "already cleaned up"); - put_bh(bh); - continue; - } - if (locked && test_clear_buffer_dirty(bh)) { - BUFFER_TRACE(bh, "needs writeout, adding to array"); - wbuf[bufs++] = bh; - __jbd2_journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - if (bufs == journal->j_wbufsize) { - spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); - bufs = 0; - goto write_out_data; - } - } else if (!locked && buffer_locked(bh)) { - __jbd2_journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - put_bh(bh); + /* Now refile inode to proper lists */ + list_for_each_entry_safe(jinode, next_i, + &commit_transaction->t_inode_list, i_list) { + list_del(&jinode->i_list); + if (jinode->i_next_transaction) { + jinode->i_transaction = jinode->i_next_transaction; + jinode->i_next_transaction = NULL; + list_add(&jinode->i_list, + &jinode->i_transaction->t_inode_list); } else { - BUFFER_TRACE(bh, "writeout complete: unfile"); - __jbd2_journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - jbd2_journal_remove_journal_head(bh); - /* Once for our safety reference, once for - * jbd2_journal_remove_journal_head() */ - put_bh(bh); - put_bh(bh); - } - - if (need_resched() || spin_needbreak(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); - goto write_out_data; + jinode->i_transaction = NULL; } } spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); + + return ret; } static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) @@ -524,21 +459,7 @@ void jbd2_journal_commit_transaction(jou * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = 0; - journal_submit_data_buffers(journal, commit_transaction); - - /* - * Wait for all previously submitted IO to complete if commit - * record is to be written synchronously. - */ - spin_lock(&journal->j_list_lock); - if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) - err = journal_wait_on_locked_list(journal, - commit_transaction); - - spin_unlock(&journal->j_list_lock); - + err = journal_submit_data_buffers(journal, commit_transaction); if (err) jbd2_journal_abort(journal, err); @@ -547,16 +468,6 @@ void jbd2_journal_commit_transaction(jou jbd_debug(3, "JBD: commit phase 2\n"); /* - * If we found any dirty or locked buffers, then we should have - * looped back up to the write_out_data label. If there weren't - * any then journal_clean_data_list should have wiped the list - * clean by now, so check that it is in fact empty. - */ - J_ASSERT (commit_transaction->t_sync_datalist == NULL); - - jbd_debug (3, "JBD: commit phase 3\n"); - - /* * Way to go: we have now written out all of the data for a * transaction! Now comes the tricky part: we need to write out * metadata. 
Loop over the transaction's entire buffer list: @@ -574,6 +485,7 @@ void jbd2_journal_commit_transaction(jou J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); + err = 0; descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { @@ -748,15 +660,19 @@ start_journal_io: &cbh, crc32_sum); if (err) __jbd2_journal_abort_hard(journal); - - spin_lock(&journal->j_list_lock); - err = journal_wait_on_locked_list(journal, - commit_transaction); - spin_unlock(&journal->j_list_lock); - if (err) - __jbd2_journal_abort_hard(journal); } + /* + * This is the right place to wait for data buffers both for ASYNC + * and !ASYNC commit. If commit is ASYNC, we need to wait only after + * the commit block went to disk (which happens above). If commit is + * SYNC, we need to wait for data buffers before we start writing + * commit block, which happens below in such setting. + */ + err = journal_finish_inode_data_buffers(journal, commit_transaction); + if (err) + jbd2_journal_abort(journal, err); + /* Lo and behold: we have just managed to send a transaction to the log. Before we can commit it, wait for the IO so far to complete. Control buffers being written are on the @@ -768,7 +684,7 @@ start_journal_io: so we incur less scheduling load. */ - jbd_debug(3, "JBD: commit phase 4\n"); + jbd_debug(3, "JBD: commit phase 3\n"); /* * akpm: these are BJ_IO, and j_list_lock is not needed. @@ -827,7 +743,7 @@ wait_for_iobuf: J_ASSERT (commit_transaction->t_shadow_list == NULL); - jbd_debug(3, "JBD: commit phase 5\n"); + jbd_debug(3, "JBD: commit phase 4\n"); /* Here we wait for the revoke record and descriptor record buffers */ wait_for_ctlbuf: @@ -854,7 +770,7 @@ wait_for_iobuf: /* AKPM: bforget here */ } - jbd_debug(3, "JBD: commit phase 6\n"); + jbd_debug(3, "JBD: commit phase 5\n"); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { @@ -874,9 +790,9 @@ wait_for_iobuf: transaction can be removed from any checkpoint list it was on before. */ - jbd_debug(3, "JBD: commit phase 7\n"); + jbd_debug(3, "JBD: commit phase 6\n"); - J_ASSERT(commit_transaction->t_sync_datalist == NULL); + J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); J_ASSERT(commit_transaction->t_iobuf_list == NULL); @@ -997,7 +913,7 @@ restart_loop: /* Done with this transaction! */ - jbd_debug(3, "JBD: commit phase 8\n"); + jbd_debug(3, "JBD: commit phase 7\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT); Index: linux-2.6/include/linux/jbd2.h =================================================================== --- linux-2.6.orig/include/linux/jbd2.h 2008-06-16 11:34:12.762013789 -0500 +++ linux-2.6/include/linux/jbd2.h 2008-07-05 12:51:23.276619947 -0500 @@ -168,6 +168,8 @@ struct commit_header { unsigned char h_chksum_size; unsigned char h_padding[2]; __be32 h_chksum[JBD2_CHECKSUM_BYTES]; + __be64 h_commit_sec; + __be32 h_commit_nsec; }; /* @@ -379,6 +381,38 @@ static inline void jbd_unlock_bh_journal bit_spin_unlock(BH_JournalHead, &bh->b_state); } +/* Flags in jbd_inode->i_flags */ +#define __JI_COMMIT_RUNNING 0 +/* Commit of the inode data in progress. We use this flag to protect us from + * concurrent deletion of inode. 
We cannot use reference to inode for this
+ * since we cannot afford doing last iput() on behalf of kjournald
+ */
+#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
+
+/**
+ * struct jbd2_inode is the structure linking inodes in ordered mode
+ * present in a transaction so that we can sync them during commit.
+ */
+struct jbd2_inode {
+	/* Which transaction does this inode belong to? Either the running
+	 * transaction or the committing one. [j_list_lock] */
+	transaction_t *i_transaction;
+
+	/* Pointer to the running transaction modifying inode's data in case
+	 * there is already a committing transaction touching it. [j_list_lock] */
+	transaction_t *i_next_transaction;
+
+	/* List of inodes in the i_transaction [j_list_lock] */
+	struct list_head i_list;
+
+	/* VFS inode this inode belongs to [constant during the lifetime
+	 * of the structure] */
+	struct inode *i_vfs_inode;
+
+	/* Flags of inode [j_list_lock] */
+	unsigned int i_flags;
+};
+
 struct jbd2_revoke_table_s;
 
 /**
@@ -509,24 +543,12 @@ struct transaction_s
 	struct journal_head *t_reserved_list;
 
 	/*
-	 * Doubly-linked circular list of all buffers under writeout during
-	 * commit [j_list_lock]
-	 */
-	struct journal_head *t_locked_list;
-
-	/*
 	 * Doubly-linked circular list of all metadata buffers owned by this
 	 * transaction [j_list_lock]
 	 */
 	struct journal_head *t_buffers;
 
 	/*
-	 * Doubly-linked circular list of all data buffers still to be
-	 * flushed before this transaction can be committed [j_list_lock]
-	 */
-	struct journal_head *t_sync_datalist;
-
-	/*
 	 * Doubly-linked circular list of all forget buffers (superseded
 	 * buffers which we can un-checkpoint once this transaction commits)
 	 * [j_list_lock]
@@ -565,6 +587,12 @@ struct transaction_s
 	struct journal_head *t_log_list;
 
 	/*
+	 * List of inodes whose data we've modified in data=ordered mode.
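
To make the intended lifecycle of this structure concrete: a client filesystem initializes the jbd2_inode once per in-core inode, files it against the running transaction whenever it dirties ordered data, and releases it at inode teardown (the release path waits on JI_COMMIT_RUNNING). A minimal sketch under those assumptions -- my_inode_info is hypothetical, the helpers are the ones declared further down; ext4 wires this up the same way in its alloc_inode/clear_inode paths:

struct my_inode_info {
        struct jbd2_inode       jinode;
        struct inode            vfs_inode;
};

/* once, when the in-core inode is created */
static void my_inode_init_once(struct my_inode_info *ei)
{
        jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
}

/* each time ordered data is dirtied under a running handle */
static int my_order_data(handle_t *handle, struct my_inode_info *ei)
{
        return jbd2_journal_file_inode(handle, &ei->jinode);
}

/* at inode teardown; waits for any commit still using the inode */
static void my_inode_release(journal_t *journal, struct my_inode_info *ei)
{
        jbd2_journal_release_jbd_inode(journal, &ei->jinode);
}
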
+ * [j_list_lock] + */ + struct list_head t_inode_list; + + /* * Protects info related to handles */ spinlock_t t_handle_lock; @@ -1004,7 +1032,6 @@ extern int jbd2_journal_extend (handle_ extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); -extern int jbd2_journal_dirty_data (handle_t *, struct buffer_head *); extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); @@ -1044,6 +1071,10 @@ extern void jbd2_journal_ack_err ( extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); +extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); +extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size); +extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); +extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); /* * journal_head management @@ -1179,15 +1210,13 @@ static inline int jbd_space_needed(journ /* journaling buffer types */ #define BJ_None 0 /* Not journaled */ -#define BJ_SyncData 1 /* Normal data: flush before commit */ -#define BJ_Metadata 2 /* Normal journaled metadata */ -#define BJ_Forget 3 /* Buffer superseded by this transaction */ -#define BJ_IO 4 /* Buffer is for temporary IO use */ -#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */ -#define BJ_LogCtl 6 /* Buffer contains log descriptors */ -#define BJ_Reserved 7 /* Buffer is reserved for access by journal */ -#define BJ_Locked 8 /* Locked for I/O during commit */ -#define BJ_Types 9 +#define BJ_Metadata 1 /* Normal journaled metadata */ +#define BJ_Forget 2 /* Buffer superseded by this transaction */ +#define BJ_IO 3 /* Buffer is for temporary IO use */ +#define BJ_Shadow 4 /* Buffer contents being shadowed to the log */ +#define BJ_LogCtl 5 /* Buffer contains log descriptors */ +#define BJ_Reserved 6 /* Buffer is reserved for access by journal */ +#define BJ_Types 7 extern int jbd_blocks_per_page(struct inode *inode); Index: linux-2.6/fs/jbd2/transaction.c =================================================================== --- linux-2.6.orig/fs/jbd2/transaction.c 2008-06-05 13:44:21.036983179 -0500 +++ linux-2.6/fs/jbd2/transaction.c 2008-07-05 12:51:23.260291032 -0500 @@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_b * new transaction and we can't block without protecting against other * processes trying to touch the journal while it is in transition. * - * Called under j_state_lock */ static transaction_t * @@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); + INIT_LIST_HEAD(&transaction->t_inode_list); /* Set up the commit timer for the new transaction. */ journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); @@ -943,183 +943,6 @@ out: } /** - * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which - * needs to be flushed before we can commit the - * current transaction. 
- * @handle: transaction - * @bh: bufferhead to mark - * - * The buffer is placed on the transaction's data list and is marked as - * belonging to the transaction. - * - * Returns error number or 0 on success. - * - * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage - * by kswapd. - */ -int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - journal_t *journal = handle->h_transaction->t_journal; - int need_brelse = 0; - struct journal_head *jh; - - if (is_handle_aborted(handle)) - return 0; - - jh = jbd2_journal_add_journal_head(bh); - JBUFFER_TRACE(jh, "entry"); - - /* - * The buffer could *already* be dirty. Writeout can start - * at any time. - */ - jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); - - /* - * What if the buffer is already part of a running transaction? - * - * There are two cases: - * 1) It is part of the current running transaction. Refile it, - * just in case we have allocated it as metadata, deallocated - * it, then reallocated it as data. - * 2) It is part of the previous, still-committing transaction. - * If all we want to do is to guarantee that the buffer will be - * written to disk before this new transaction commits, then - * being sure that the *previous* transaction has this same - * property is sufficient for us! Just leave it on its old - * transaction. - * - * In case (2), the buffer must not already exist as metadata - * --- that would violate write ordering (a transaction is free - * to write its data at any point, even before the previous - * committing transaction has committed). The caller must - * never, ever allow this to happen: there's nothing we can do - * about it in this layer. - */ - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - - /* Now that we have bh_state locked, are we really still mapped? */ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); - goto no_journal; - } - - if (jh->b_transaction) { - JBUFFER_TRACE(jh, "has transaction"); - if (jh->b_transaction != handle->h_transaction) { - JBUFFER_TRACE(jh, "belongs to older transaction"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - - /* @@@ IS THIS TRUE ? */ - /* - * Not any more. Scenario: someone does a write() - * in data=journal mode. The buffer's transaction has - * moved into commit. Then someone does another - * write() to the file. We do the frozen data copyout - * and set b_next_transaction to point to j_running_t. - * And while we're in that state, someone does a - * writepage() in an attempt to pageout the same area - * of the file via a shared mapping. At present that - * calls jbd2_journal_dirty_data(), and we get right here. - * It may be too late to journal the data. Simply - * falling through to the next test will suffice: the - * data will be dirty and wil be checkpointed. The - * ordering comments in the next comment block still - * apply. - */ - //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - - /* - * If we're journalling data, and this buffer was - * subject to a write(), it could be metadata, forget - * or shadow against the committing transaction. Now, - * someone has dirtied the same darn page via a mapping - * and it is being writepage()'d. - * We *could* just steal the page from commit, with some - * fancy locking there. Instead, we just skip it - - * don't tie the page's buffers to the new transaction - * at all. 
- * Implication: if we crash before the writepage() data - * is written into the filesystem, recovery will replay - * the write() data. - */ - if (jh->b_jlist != BJ_None && - jh->b_jlist != BJ_SyncData && - jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "Not stealing"); - goto no_journal; - } - - /* - * This buffer may be undergoing writeout in commit. We - * can't return from here and let the caller dirty it - * again because that can cause the write-out loop in - * commit to never terminate. - */ - if (buffer_dirty(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - need_brelse = 1; - sync_dirty_buffer(bh); - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - /* Since we dropped the lock... */ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "buffer got unmapped"); - goto no_journal; - } - /* The buffer may become locked again at any - time if it is redirtied */ - } - - /* journal_clean_data_list() may have got there first */ - if (jh->b_transaction != NULL) { - JBUFFER_TRACE(jh, "unfile from commit"); - __jbd2_journal_temp_unlink_buffer(jh); - /* It still points to the committing - * transaction; move it to this one so - * that the refile assert checks are - * happy. */ - jh->b_transaction = handle->h_transaction; - } - /* The buffer will be refiled below */ - - } - /* - * Special case --- the buffer might actually have been - * allocated and then immediately deallocated in the previous, - * committing transaction, so might still be left on that - * transaction's metadata lists. - */ - if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "not on correct data list: unfile"); - J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); - __jbd2_journal_temp_unlink_buffer(jh); - jh->b_transaction = handle->h_transaction; - JBUFFER_TRACE(jh, "file as data"); - __jbd2_journal_file_buffer(jh, handle->h_transaction, - BJ_SyncData); - } - } else { - JBUFFER_TRACE(jh, "not on a transaction"); - __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); - } -no_journal: - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - if (need_brelse) { - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - } - JBUFFER_TRACE(jh, "exit"); - jbd2_journal_put_journal_head(jh); - return 0; -} - -/** * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. * @bh: buffer to mark @@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head * * Remove a buffer from the appropriate transaction list. * * Note that this function can *change* the value of - * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, - * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller - * is holding onto a copy of one of thee pointers, it could go bad. - * Generally the caller needs to re-read the pointer from the transaction_t. + * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, + * t_log_list or t_reserved_list. If the caller is holding onto a copy of one + * of these pointers, it could go bad. Generally the caller needs to re-read + * the pointer from the transaction_t. * * Called under j_list_lock. The journal may not be locked. 
 */
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(s
 	switch (jh->b_jlist) {
 	case BJ_None:
 		return;
-	case BJ_SyncData:
-		list = &transaction->t_sync_datalist;
-		break;
 	case BJ_Metadata:
 		transaction->t_nr_buffers--;
 		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(s
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
-	case BJ_Locked:
-		list = &transaction->t_locked_list;
-		break;
 	}
 
 	__blist_del_buffer(list, jh);
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *
 		goto out;
 
 	spin_lock(&journal->j_list_lock);
-	if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
-		if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
-			/* A written-back ordered data buffer */
-			JBUFFER_TRACE(jh, "release data");
-			__jbd2_journal_unfile_buffer(jh);
-			jbd2_journal_remove_journal_head(bh);
-			__brelse(bh);
-		}
-	} else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
+	if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
 		/* written-back checkpointed metadata buffer */
 		if (jh->b_jlist == BJ_None) {
 			JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1656,12 +1465,43 @@ out:
 	return;
 }
 
+/*
+ * jbd2_journal_try_to_free_buffers() could race with
+ * jbd2_journal_commit_transaction(). The latter might still hold a
+ * reference count to the buffers while inspecting them on
+ * t_sync_datalist or t_locked_list.
+ *
+ * jbd2_journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers,
+ * before trying to free that buffer.
+ *
+ * Takes and releases journal->j_state_lock internally.
+ */
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+	transaction_t *transaction = NULL;
+	tid_t tid;
+
+	spin_lock(&journal->j_state_lock);
+	transaction = journal->j_committing_transaction;
+
+	if (!transaction) {
+		spin_unlock(&journal->j_state_lock);
+		return;
+	}
+
+	tid = transaction->t_tid;
+	spin_unlock(&journal->j_state_lock);
+	jbd2_log_wait_commit(journal, tid);
+}
 
 /**
  * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
  * @journal: journal for operation
  * @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard we should try to release
+ * buffers. If __GFP_WAIT and __GFP_FS are set, we wait for commit code to
+ * release the buffers.
  *
  *
  * For all the buffers on this page,
@@ -1690,9 +1530,11 @@ out:
  * journal_try_to_free_buffer() is changing its state. But that
  * cannot happen because we never reallocate freed data as metadata
  * while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
  */
 int jbd2_journal_try_to_free_buffers(journal_t *journal,
-				struct page *page, gfp_t unused_gfp_mask)
+				struct page *page, gfp_t gfp_mask)
 {
 	struct buffer_head *head;
 	struct buffer_head *bh;
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(jou
 		/*
 		 * We take our own ref against the journal_head here to avoid
 		 * having to add tons of locking around each instance of
-		 * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
+		 * jbd2_journal_remove_journal_head() and
+		 * jbd2_journal_put_journal_head().
 */
 		jh = jbd2_journal_grab_journal_head(bh);
 		if (!jh)
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(jou
 		if (buffer_jbd(bh))
 			goto busy;
 	} while ((bh = bh->b_this_page) != head);
+
 	ret = try_to_free_buffers(page);
+
+	/*
+	 * There are a number of places where jbd2_journal_try_to_free_buffers()
+	 * could race with jbd2_journal_commit_transaction(); the latter may
+	 * still hold a reference to the buffers while processing them, in
+	 * which case try_to_free_buffers() fails to free them. Some callers
+	 * of releasepage() request that page buffers be dropped, and
+	 * otherwise treat a failure to free them as an error (such as
+	 * generic_file_direct_IO()).
+	 *
+	 * So, if the caller of try_to_release_page() wants the synchronous
+	 * behaviour (i.e. make sure buffers are dropped upon return),
+	 * let's wait for the current transaction to finish flush of
+	 * dirty data buffers, then try to free those buffers again,
+	 * with the journal locked.
+	 */
+	if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+		jbd2_journal_wait_for_transaction_sync_data(journal);
+		ret = try_to_free_buffers(page);
+	}
+
 busy:
 	return ret;
 }
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_
 	if (!buffer_jbd(bh))
 		goto zap_buffer_unlocked;
 
+	/* OK, we have a data buffer in journaled mode */
 	spin_lock(&journal->j_state_lock);
 	jbd_lock_bh_state(bh);
 	spin_lock(&journal->j_list_lock);
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_
 		}
 	} else if (transaction == journal->j_committing_transaction) {
 		JBUFFER_TRACE(jh, "on committing transaction");
-		if (jh->b_jlist == BJ_Locked) {
-			/*
-			 * The buffer is on the committing transaction's locked
-			 * list. We have the buffer locked, so I/O has
-			 * completed. So we can nail the buffer now.
-			 */
-			may_free = __dispose_buffer(jh, transaction);
-			goto zap_buffer;
-		}
 		/*
 		 * If it is committing, we simply cannot touch it. We
 		 * can remove it's next_transaction pointer from the
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct j
 		J_ASSERT_JH(jh, !jh->b_committed_data);
 		J_ASSERT_JH(jh, !jh->b_frozen_data);
 		return;
-	case BJ_SyncData:
-		list = &transaction->t_sync_datalist;
-		break;
 	case BJ_Metadata:
 		transaction->t_nr_buffers++;
 		list = &transaction->t_buffers;
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct j
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
-	case BJ_Locked:
-		list = &transaction->t_locked_list;
-		break;
 	}
 
 	__blist_add_buffer(list, jh);
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_
 	spin_unlock(&journal->j_list_lock);
 	__brelse(bh);
 }
+
+/*
+ * File inode in the inode list of the handle's transaction
+ */
+int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+
+	if (is_handle_aborted(handle))
+		return -EIO;
+
+	jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+			transaction->t_tid);
+
+	/*
+	 * First check whether inode isn't already on the transaction's
+	 * lists without taking the lock. Note that this check is safe
+	 * without the lock as we cannot race with somebody removing inode
+	 * from the transaction. The reason is that we remove inode from the
+	 * transaction only in journal_release_jbd_inode() and when we commit
+	 * the transaction. We are guarded from the first case by holding
+	 * a reference to the inode. We are safe against the second case
+	 * because if jinode->i_transaction == transaction, commit code
+	 * cannot touch the transaction because we hold a reference to it,
+	 * and if jinode->i_next_transaction == transaction, commit code
+	 * will only file the inode where we want it.
+	 */
+	if (jinode->i_transaction == transaction ||
+	    jinode->i_next_transaction == transaction)
+		return 0;
+
+	spin_lock(&journal->j_list_lock);
+
+	if (jinode->i_transaction == transaction ||
+	    jinode->i_next_transaction == transaction)
+		goto done;
+
+	/* On some different transaction's list - should be
+	 * the committing one */
+	if (jinode->i_transaction) {
+		J_ASSERT(jinode->i_next_transaction == NULL);
+		J_ASSERT(jinode->i_transaction ==
+					journal->j_committing_transaction);
+		jinode->i_next_transaction = transaction;
+		goto done;
+	}
+	/* Not on any transaction list... */
+	J_ASSERT(!jinode->i_next_transaction);
+	jinode->i_transaction = transaction;
+	list_add(&jinode->i_list, &transaction->t_inode_list);
+done:
+	spin_unlock(&journal->j_list_lock);
+
+	return 0;
+}
+
+/*
+ * This function must be called when inode is journaled in ordered mode
+ * before truncation happens. It starts writeout of the truncated part in
+ * case it is in the committing transaction, so that we uphold the ordered
+ * mode consistency guarantees.
+ */
+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+					loff_t new_size)
+{
+	journal_t *journal;
+	transaction_t *commit_trans;
+	int ret = 0;
+
+	if (!inode->i_transaction && !inode->i_next_transaction)
+		goto out;
+	journal = inode->i_transaction->t_journal;
+	spin_lock(&journal->j_state_lock);
+	commit_trans = journal->j_committing_transaction;
+	spin_unlock(&journal->j_state_lock);
+	if (inode->i_transaction == commit_trans) {
+		ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+			new_size, LLONG_MAX);
+		if (ret)
+			jbd2_journal_abort(journal, ret);
+	}
+out:
+	return ret;
+}
Index: linux-2.6/fs/ext4/extents.c
===================================================================
--- linux-2.6.orig/fs/ext4/extents.c	2008-06-16 11:32:42.791046936 -0500
+++ linux-2.6/fs/ext4/extents.c	2008-07-05 12:51:24.673291650 -0500
@@ -40,6 +40,7 @@
 #include
 #include
 #include
+#include
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
@@ -48,7 +49,7 @@
  * ext_pblock:
  * combine low and high parts of physical block number into ext4_fsblk_t
  */
-static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
+ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
 {
 	ext4_fsblk_t block;
 
@@ -92,17 +93,16 @@ static void ext4_idx_store_pblock(struct
 	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
 }
 
-static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
+int ext4_ext_journal_restart(handle_t *handle, int needed)
 {
 	int err;
 
 	if (handle->h_buffer_credits > needed)
-		return handle;
-	if (!ext4_journal_extend(handle, needed))
-		return handle;
-	err = ext4_journal_restart(handle, needed);
-
-	return handle;
+		return 0;
+	err = ext4_journal_extend(handle, needed);
+	if (err)
+		return err;
+	return ext4_journal_restart(handle, needed);
 }
 
 /*
@@ -142,7 +142,7 @@ static int ext4_ext_dirty(handle_t *hand
 	return err;
 }
 
-static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
+ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 			      struct ext4_ext_path *path,
 			      ext4_lblk_t block)
 {
@@ -180,15 +180,24 @@ static ext4_fsblk_t ext4_ext_find_goal(s
 	return bg_start + colour + block;
 }
 
+/*
+ * Allocation for a metadata block
+ */
 static ext4_fsblk_t
-ext4_ext_new_block(handle_t *handle, struct inode *inode,
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 			struct ext4_ext_path *path,
-			struct ext4_extent *ex, int *err)
+			struct ext4_extent *ex, int *err,
+			ext4_fsblk_t defrag_goal)
 {
 	ext4_fsblk_t goal, newblock;
 
-	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-	newblock = ext4_new_block(handle, inode, goal, err);
+	if (defrag_goal)
+		goal = defrag_goal;
+	else
+		goal = ext4_ext_find_goal(inode, path,
+					  le32_to_cpu(ex->ee_block));
+
+	newblock = ext4_new_meta_block(handle, inode, goal, err);
 	return newblock;
 }
 
@@ -246,6 +255,36 @@ static int ext4_ext_space_root_idx(struc
 	return size;
 }
 
+/*
+ * Calculate the number of metadata blocks needed
+ * to allocate @blocks.
+ * Worst case is one block per extent.
+ */
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+{
+	int lcap, icap, rcap, leafs, idxs, num;
+	int newextents = blocks;
+
+	rcap = ext4_ext_space_root_idx(inode);
+	lcap = ext4_ext_space_block(inode);
+	icap = ext4_ext_space_block_idx(inode);
+
+	/* number of new leaf blocks needed */
+	num = leafs = (newextents + lcap - 1) / lcap;
+
+	/*
+	 * Worst case, we need separate index block(s)
+	 * to link all new leaf blocks
+	 */
+	idxs = (leafs + icap - 1) / icap;
+	do {
+		num += idxs;
+		idxs = (idxs + icap - 1) / icap;
+	} while (idxs > rcap);
+
+	return num;
+}
+
 static int ext4_ext_max_entries(struct inode *inode, int depth)
 {
@@ -524,6 +563,7 @@ ext4_ext_find_extent(struct inode *inode
 		alloc = 1;
 	}
 	path[0].p_hdr = eh;
+	path[0].p_bh = NULL;
 
 	i = depth;
 	/* walk through the tree */
@@ -552,12 +592,14 @@ ext4_ext_find_extent(struct inode *inode
 	}
 
 	path[ppos].p_depth = i;
 	path[ppos].p_ext = NULL;
 	path[ppos].p_idx = NULL;
 
 	/* find extent */
 	ext4_ext_binsearch(inode, path + ppos, block);
+	/* if not an empty leaf */
+	if (path[ppos].p_ext)
+		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
 
 	ext4_ext_show_path(inode, path);
 
@@ -638,7 +680,8 @@ static int ext4_ext_insert_index(handle_
  */
 static int ext4_ext_split(handle_t *handle, struct inode *inode,
 				struct ext4_ext_path *path,
-				struct ext4_extent *newext, int at)
+				struct ext4_extent *newext, int at,
+				ext4_fsblk_t defrag_goal)
 {
 	struct buffer_head *bh = NULL;
 	int depth = ext_depth(inode);
@@ -688,7 +731,8 @@ static int ext4_ext_split(handle_t *hand
 	/* allocate all needed blocks */
 	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
 	for (a = 0; a < depth - at; a++) {
-		newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+		newblock = ext4_ext_new_meta_block(handle, inode, path,
+						   newext, &err, defrag_goal);
 		if (newblock == 0)
 			goto cleanup;
 		ablocks[a] = newblock;
@@ -875,7 +919,8 @@ cleanup:
  */
 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 					struct ext4_ext_path *path,
-					struct ext4_extent *newext)
+					struct ext4_extent *newext,
+					ext4_fsblk_t defrag_goal)
 {
 	struct ext4_ext_path *curp = path;
 	struct ext4_extent_header *neh;
@@ -884,7 +929,8 @@ static int ext4_ext_grow_indepth(handle_
 	ext4_fsblk_t newblock;
 	int err = 0;
 
-	newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+	newblock = ext4_ext_new_meta_block(handle, inode, path,
+					   newext, &err, defrag_goal);
 	if (newblock == 0)
 		return err;
 
@@ -960,7 +1006,8 @@ out:
  */
 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
 					struct ext4_ext_path *path,
-					struct ext4_extent *newext)
+					struct ext4_extent *newext,
+					ext4_fsblk_t defrag_goal)
 {
 	struct ext4_ext_path *curp;
 	int depth, i, err = 0;
 
@@ -980,7 +1027,10 @@
repeat: if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ - err = ext4_ext_split(handle, inode, path, newext, i); + err = ext4_ext_split(handle, inode, path, newext, i, + defrag_goal); + if (err) + goto out; /* refill path */ ext4_ext_drop_refs(path); @@ -991,7 +1041,8 @@ repeat: err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, path, newext); + err = ext4_ext_grow_indepth(handle, inode, path, + newext, defrag_goal); if (err) goto out; @@ -1171,7 +1222,7 @@ ext4_ext_search_right(struct inode *inod * allocated block. Thus, index entries have to be consistent * with leaves. */ -static ext4_lblk_t +ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path) { int depth; @@ -1437,6 +1488,19 @@ int ext4_ext_insert_extent(handle_t *han struct ext4_ext_path *path, struct ext4_extent *newext) { + return ext4_ext_insert_extent_defrag(handle, inode, path, newext, 0); +} + +/* + * ext4_ext_insert_extent_defrag: + * The difference from ext4_ext_insert_extent is to use the first block + * in newext as the goal of the new index block. + */ +int +ext4_ext_insert_extent_defrag(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int defrag) +{ struct ext4_extent_header * eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ @@ -1444,6 +1508,7 @@ int ext4_ext_insert_extent(handle_t *han int depth, len, err; ext4_lblk_t next; unsigned uninitialized = 0; + ext4_fsblk_t defrag_goal; BUG_ON(ext4_ext_get_actual_len(newext) == 0); depth = ext_depth(inode); @@ -1504,11 +1569,16 @@ repeat: le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); } + if (defrag) + defrag_goal = ext_pblock(newext); + else + defrag_goal = 0; /* * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. 
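
A worked example for ext4_ext_calc_metadata_amount() above, assuming a 4KB block size: a leaf or index block then holds (4096 - 12) / 12 = 340 entries and the in-inode root holds 4, so the worst-case arithmetic comes out as in this sketch with those numbers hard-coded:

static int metadata_amount_4k(int blocks)
{
        int leafs = (blocks + 340 - 1) / 340;   /* new leaf blocks */
        int idxs, num = leafs;

        /* index blocks needed to link the new leaves */
        idxs = (leafs + 340 - 1) / 340;
        do {
                num += idxs;
                idxs = (idxs + 340 - 1) / 340;
        } while (idxs > 4);

        /* e.g. blocks == 1000: 3 leaves + 1 index -> returns 4 */
        return num;
}
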
*/ - err = ext4_ext_create_new_leaf(handle, inode, path, newext); + err = ext4_ext_create_new_leaf(handle, inode, path, + newext, defrag_goal); if (err) goto cleanup; depth = ext_depth(inode); @@ -1587,6 +1657,112 @@ cleanup: return err; } +int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, + ext4_lblk_t num, ext_prepare_callback func, + void *cbdata) +{ + struct ext4_ext_path *path = NULL; + struct ext4_ext_cache cbex; + struct ext4_extent *ex; + ext4_lblk_t next, start = 0, end = 0; + ext4_lblk_t last = block + num; + int depth, exists, err = 0; + + BUG_ON(func == NULL); + BUG_ON(inode == NULL); + + while (block < last && block != EXT_MAX_BLOCK) { + num = last - block; + /* find extent for this block */ + path = ext4_ext_find_extent(inode, block, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + break; + } + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + ex = path[depth].p_ext; + next = ext4_ext_next_allocated_block(path); + + exists = 0; + if (!ex) { + /* there is no extent yet, so try to allocate + * all requested space */ + start = block; + end = block + num; + } else if (le32_to_cpu(ex->ee_block) > block) { + /* need to allocate space before found extent */ + start = block; + end = le32_to_cpu(ex->ee_block); + if (block + num < end) + end = block + num; + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + /* need to allocate space after found extent */ + start = block; + end = block + num; + if (end >= next) + end = next; + } else if (block >= le32_to_cpu(ex->ee_block)) { + /* + * some part of requested space is covered + * by found extent + */ + start = block; + end = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); + if (block + num < end) + end = block + num; + exists = 1; + } else { + BUG(); + } + BUG_ON(end <= start); + + if (!exists) { + cbex.ec_block = start; + cbex.ec_len = end - start; + cbex.ec_start = 0; + cbex.ec_type = EXT4_EXT_CACHE_GAP; + } else { + cbex.ec_block = le32_to_cpu(ex->ee_block); + cbex.ec_len = ext4_ext_get_actual_len(ex); + cbex.ec_start = ext_pblock(ex); + cbex.ec_type = EXT4_EXT_CACHE_EXTENT; + } + + BUG_ON(cbex.ec_len == 0); + err = func(inode, path, &cbex, ex, cbdata); + ext4_ext_drop_refs(path); + + if (err < 0) + break; + if (err == EXT_REPEAT) + continue; + else if (err == EXT_BREAK) { + err = 0; + break; + } + + if (ext_depth(inode) != depth) { + /* depth was changed. 
we have to realloc path */ + kfree(path); + path = NULL; + } + + block = cbex.ec_block + cbex.ec_len; + } + + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + + return err; +} + static void ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, __u32 len, ext4_fsblk_t start, int type) @@ -1883,11 +2059,9 @@ ext4_ext_rm_leaf(handle_t *handle, struc credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); #endif - handle = ext4_ext_journal_restart(handle, credits); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); + err = ext4_ext_journal_restart(handle, credits); + if (err) goto out; - } err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -1956,7 +2130,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) +int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); @@ -2529,6 +2703,7 @@ int ext4_ext_get_blocks(handle_t *handle int err = 0, depth, ret; unsigned long allocated = 0; struct ext4_allocation_request ar; + loff_t disksize; __clear_bit(BH_New, &bh_result->b_state); ext_debug("blocks %u/%lu requested for inode %u\n", @@ -2616,8 +2791,7 @@ int ext4_ext_get_blocks(handle_t *handle */ if (allocated > max_blocks) allocated = max_blocks; - /* mark the buffer unwritten */ - __set_bit(BH_Unwritten, &bh_result->b_state); + set_buffer_unwritten(bh_result); goto out2; } @@ -2716,14 +2890,19 @@ int ext4_ext_get_blocks(handle_t *handle goto out2; } - if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = inode->i_size; - /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); outnew: - __set_bit(BH_New, &bh_result->b_state); + if (extend_disksize) { + disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + } + + set_buffer_new(bh_result); /* Cache only when it is _not_ an uninitialized extent */ if (create != EXT4_CREATE_UNINITIALIZED_EXT) @@ -2733,7 +2912,7 @@ out: if (allocated > max_blocks) allocated = max_blocks; ext4_ext_show_leaf(inode, path); - __set_bit(BH_Mapped, &bh_result->b_state); + set_buffer_mapped(bh_result); bh_result->b_bdev = inode->i_sb->s_bdev; bh_result->b_blocknr = newblock; out2: @@ -2744,7 +2923,7 @@ out2: return err ? err : allocated; } -void ext4_ext_truncate(struct inode * inode, struct page *page) +void ext4_ext_truncate(struct inode *inode) { struct address_space *mapping = inode->i_mapping; struct super_block *sb = inode->i_sb; @@ -2757,18 +2936,14 @@ void ext4_ext_truncate(struct inode * in */ err = ext4_writepage_trans_blocks(inode) + 3; handle = ext4_journal_start(inode, err); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) return; - } - if (page) - ext4_block_truncate_page(handle, page, mapping, inode->i_size); + if (inode->i_size & (sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, mapping, inode->i_size); + + if (ext4_orphan_add(handle, inode)) + goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); @@ -2780,8 +2955,6 @@ void ext4_ext_truncate(struct inode * in * Probably we need not scan at all, * because page truncation is enough. 
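
ext4_ext_walk_space() above drives a callback over every extent and hole, and the callback steers the walk through its return value (EXT_CONTINUE, EXT_BREAK, EXT_REPEAT). A minimal sketch of a callback that merely counts allocated extents -- the counter struct is hypothetical; ext4_ext_fiemap_cb() below is the real consumer:

struct count_cb_data {
        int nr_extents;
};

static int count_extents_cb(struct inode *inode, struct ext4_ext_path *path,
                            struct ext4_ext_cache *cex,
                            struct ext4_extent *ex, void *data)
{
        struct count_cb_data *cnt = data;

        /* unallocated ranges come through as EXT4_EXT_CACHE_GAP */
        if (cex->ec_type == EXT4_EXT_CACHE_EXTENT)
                cnt->nr_extents++;
        return EXT_CONTINUE;
}
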
 */
-	if (ext4_orphan_add(handle, inode))
-		goto out_stop;
 
 	/* we have to know where to truncate from in crash case */
 	EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2911,7 +3084,7 @@ retry:
 		}
 		ret = ext4_get_blocks_wrap(handle, inode, block,
 					  max_blocks, &map_bh,
-					  EXT4_CREATE_UNINITIALIZED_EXT, 0);
+					  EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
 		if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
 			WARN_ON(ret <= 0);
@@ -2945,3 +3118,183 @@ retry:
 	mutex_unlock(&inode->i_mutex);
 	return ret > 0 ? ret2 : ret;
 }
+
+struct fiemap_internal {
+	struct fiemap		*fiemap_s;
+	struct fiemap_extent	fm_extent;
+	size_t			tot_mapping_len;
+	char			*cur_ext_ptr;
+	int			current_extent;
+	int			err;
+};
+
+/*
+ * Callback function called for each extent to gather FIEMAP information.
+ */
+int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
+		       void *data)
+{
+	struct fiemap_internal *fiemap_i = data;
+	struct fiemap *fiemap_s = fiemap_i->fiemap_s;
+	struct fiemap_extent *fm_extent = &fiemap_i->fm_extent;
+	int current_extent = fiemap_i->current_extent;
+	unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
+
+	/*
+	 * ext4_ext_walk_space returns a hole for extents that have not been
+	 * allocated yet.
+	 */
+	if (((u64)(newex->ec_block + newex->ec_len) << blksize_bits >=
+	     inode->i_size) && newex->ec_type == EXT4_EXT_CACHE_GAP) {
+		if (((u64)newex->ec_block << blksize_bits) < inode->i_size)
+			newex->ec_len = (inode->i_size - ((u64)newex->ec_block<<
+					 blksize_bits)) >> blksize_bits;
+		else
+			return EXT_BREAK;
+	}
+
+	/*
+	 * We only need to return number of extents and total length of mapping
+	 */
+	if (fiemap_s->fm_flags & FIEMAP_FLAG_NUM_EXTENTS) {
+		fiemap_i->tot_mapping_len += ((__u64)newex->ec_len <<
+					      blksize_bits);
+		goto count_extents;
+	}
+
+	if (current_extent >= fiemap_s->fm_extent_count)
+		return EXT_BREAK;
+
+	/* the caller's fm_start should be set to the start of the first
+	 * extent (or hole) */
+	if (newex->ec_block << blksize_bits < fiemap_s->fm_start)
+		fiemap_s->fm_start = newex->ec_block << blksize_bits;
+
+	memset(fm_extent, 0, sizeof(*fm_extent));
+	fm_extent->fe_offset = (__u64)newex->ec_start << blksize_bits;
+	fm_extent->fe_length = (__u64)newex->ec_len << blksize_bits;
+	/* XXX: perhaps move this above the goto */
+	fiemap_i->tot_mapping_len += fm_extent->fe_length;
+
+	if (newex->ec_type == EXT4_EXT_CACHE_GAP)
+		fm_extent->fe_flags |= FIEMAP_EXTENT_HOLE;
+	else if (ex && ext4_ext_is_uninitialized(ex))
+		fm_extent->fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
+
+	/*
+	 * Mark this fiemap_extent as FIEMAP_EXTENT_EOF if it's past the end
+	 * of file.
+	 * XXX: block + len converted to bytes >= i_size; check for an
+	 * off-by-one here.
+	 */
+	if ((u64)(newex->ec_block + newex->ec_len) << blksize_bits >=
+	    inode->i_size)
+		fm_extent->fe_flags |= FIEMAP_EXTENT_EOF;
+	/* XXX ERS: hack around the FIEMAP_EXTENT_LAST problem: */
+	/* fm_extent->fe_flags |= (FIEMAP_EXTENT_EOF|FIEMAP_EXTENT_LAST); */
+
+	if (!copy_to_user(fiemap_i->cur_ext_ptr, fm_extent,
+			  sizeof(struct fiemap_extent))) {
+		/* copy_to_user succeeded, advance to the next extent slot */
+		fiemap_i->cur_ext_ptr += sizeof(struct fiemap_extent);
+	} else {
+		fiemap_i->err = -EFAULT;
+		return EXT_BREAK;
+	}
+
+count_extents:
+	/*
+	 * Don't count holes when only returning the number of extents.
+	 * XXX ERS: ok if that's how FIEMAP_FLAG_NUM_EXTENTS is defined.
+	 */
+	if (!((fiemap_s->fm_flags & FIEMAP_FLAG_NUM_EXTENTS) &&
+	      (newex->ec_type == EXT4_EXT_CACHE_GAP)))
+		fiemap_i->current_extent++;	/* advance the extent count */
+
+	/*
+	 * Stop if we are beyond the requested mapping size but return the
+	 * complete last extent.
+	 * XXX: is this extent's last byte >= length of mapping? Or should
+	 * it be fm_start + fm_length?
+	 */
+	if ((u64)(newex->ec_block + newex->ec_len) << blksize_bits >=
+	    fiemap_s->fm_length)
+		return EXT_BREAK;
+
+	return EXT_CONTINUE;
+}
+
+int ext4_fiemap(struct inode *inode, unsigned long arg)
+{
+	struct fiemap *fiemap_s;
+	struct fiemap_internal fiemap_i;
+	struct fiemap_extent *last_extent;
+	ext4_lblk_t start_blk;
+	int fm_extent_size = sizeof(struct fiemap_extent);
+	int err = 0;
+
+	/* XXX: could use getblock here for non-extent files */
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+		return -EOPNOTSUPP;
+
+	fiemap_s = kmalloc(sizeof(*fiemap_s), GFP_KERNEL);
+	if (fiemap_s == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(fiemap_s, (struct fiemap __user *)arg,
+			   sizeof(*fiemap_s))) {
+		err = -EFAULT;
+		goto out_free;
+	}
+
+	/* bail on unsupported flags for this fs */
+	if (fiemap_s->fm_flags & EXT4_FIEMAP_FLAG_INCOMPAT_UNSUPP) {
+		err = -EOPNOTSUPP;
+		goto out_free;
+	}
+
+	start_blk = fiemap_s->fm_start >> inode->i_sb->s_blocksize_bits;
+	fiemap_i.fiemap_s = fiemap_s;
+	fiemap_i.tot_mapping_len = 0;
+	fiemap_i.cur_ext_ptr = (char *)(arg + sizeof(*fiemap_s));
+	fiemap_i.current_extent = 0;
+	fiemap_i.err = 0;
+
+	/*
+	 * Walk the extent tree gathering extent information
+	 */
+	down_write(&EXT4_I(inode)->i_data_sem);
+	err = ext4_ext_walk_space(inode, start_blk, EXT_MAX_BLOCK - start_blk,
+				  ext4_ext_fiemap_cb, &fiemap_i);
+	up_write(&EXT4_I(inode)->i_data_sem);
+	if (err)
+		goto out_free;
+
+	fiemap_s->fm_extent_count = fiemap_i.current_extent;
+	fiemap_s->fm_length = fiemap_i.tot_mapping_len;
+	/*
+	 * Mark the last extent as FIEMAP_EXTENT_LAST and copy it to userspace.
+	 * XXX ERS: FIXME, this isn't always working.
+	 */
+	if (fiemap_i.current_extent != 0 &&
+	    fiemap_i.current_extent < fiemap_s->fm_extent_count &&
+	    !(fiemap_s->fm_flags & FIEMAP_FLAG_NUM_EXTENTS)) {
+		char *dest;
+
+		last_extent = &fiemap_i.fm_extent;
+		last_extent->fe_flags |= FIEMAP_EXTENT_LAST;
+		dest = (char *)arg + sizeof(*fiemap_s) + fm_extent_size *
+			(fiemap_s->fm_extent_count - 1);
+		if (copy_to_user(dest, last_extent, fm_extent_size)) {
+			err = -EFAULT;
+			goto out_free;
+		}
+	}
+
+	if (copy_to_user((void __user *)arg, fiemap_s, sizeof(*fiemap_s)))
+		err = -EFAULT;
+
+out_free:
+	kfree(fiemap_s);
+	return err;
+}
+
Index: linux-2.6/fs/ext4/namei.c
===================================================================
--- linux-2.6.orig/fs/ext4/namei.c	2008-06-05 13:44:20.537983099 -0500
+++ linux-2.6/fs/ext4/namei.c	2008-07-05 12:51:22.457290720 -0500
@@ -231,13 +231,13 @@ static inline unsigned dx_root_limit (st
 {
 	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
 		EXT4_DIR_REC_LEN(2) - infosize;
-	return 0? 20: entry_space / sizeof(struct dx_entry);
+	return entry_space / sizeof(struct dx_entry);
 }
 
 static inline unsigned dx_node_limit (struct inode *dir)
 {
 	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
-	return 0? 
22: entry_space / sizeof(struct dx_entry); + return entry_space / sizeof(struct dx_entry); } /* Index: linux-2.6/fs/ext4/inode.c =================================================================== --- linux-2.6.orig/fs/ext4/inode.c 2008-06-05 13:44:20.526046257 -0500 +++ linux-2.6/fs/ext4/inode.c 2008-07-05 12:51:24.322353933 -0500 @@ -38,6 +38,16 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "ext4_extents.h" + +static inline int ext4_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, + new_size); +} + +static void ext4_invalidatepage(struct page *page, unsigned long offset); /* * Test whether an inode is a fast symlink. @@ -181,6 +191,8 @@ void ext4_delete_inode (struct inode * i { handle_t *handle; + if (ext4_should_order_data(inode)) + ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -508,11 +520,12 @@ static int ext4_blks_to_allocate(Indirec * direct blocks */ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) + ext4_lblk_t iblock, ext4_fsblk_t goal, + int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) { int target, i; - unsigned long count = 0; + unsigned long count = 0, blk_allocated = 0; int index = 0; ext4_fsblk_t current_block = 0; int ret = 0; @@ -525,12 +538,13 @@ static int ext4_alloc_blocks(handle_t *h * the first direct block of this branch. That's the * minimum number of blocks need to allocate(required) */ - target = blks + indirect_blks; - - while (1) { + /* first we try to allocate the indirect blocks */ + target = indirect_blks; + while (target > 0) { count = target; /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_blocks(handle,inode,goal,&count,err); + current_block = ext4_new_meta_blocks(handle, inode, + goal, &count, err); if (*err) goto failed_out; @@ -540,16 +554,48 @@ static int ext4_alloc_blocks(handle_t *h new_blocks[index++] = current_block++; count--; } - - if (count > 0) + if (count > 0) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + printk(KERN_INFO "%s returned more blocks than " + "requested\n", __func__); + WARN_ON(1); break; + } } - /* save the new block number for the first direct block */ - new_blocks[index] = current_block; - + target = blks - count ; + blk_allocated = count; + if (!target) + goto allocated; + /* Now allocate data blocks */ + count = target; + /* allocating blocks for data blocks */ + current_block = ext4_new_blocks(handle, inode, iblock, + goal, &count, err); + if (*err && (target == blks)) { + /* + * if the allocation failed and we didn't allocate + * any blocks before + */ + goto failed_out; + } + if (!*err) { + if (target == blks) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + } + blk_allocated += count; + } +allocated: /* total number of blocks allocated for direct blocks */ - ret = count; + ret = blk_allocated; *err = 0; return ret; failed_out: @@ -584,8 +630,9 @@ failed_out: * as described above and return 0. 
*/ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - int indirect_blks, int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) { int blocksize = inode->i_sb->s_blocksize; int i, n = 0; @@ -595,7 +642,7 @@ static int ext4_alloc_branch(handle_t *h ext4_fsblk_t new_blocks[4]; ext4_fsblk_t current_block; - num = ext4_alloc_blocks(handle, inode, goal, indirect_blks, + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, *blks, new_blocks, &err); if (err) return err; @@ -799,6 +846,7 @@ int ext4_get_blocks_handle(handle_t *han struct ext4_inode_info *ei = EXT4_I(inode); int count = 0; ext4_fsblk_t first_block = 0; + loff_t disksize; J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); @@ -855,8 +903,9 @@ int ext4_get_blocks_handle(handle_t *han /* * Block out ext4_truncate while we alter the tree */ - err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal, - offsets + (partial - chain), partial); + err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, + &count, goal, + offsets + (partial - chain), partial); /* * The ext4_splice_branch call will free and forget any buffers @@ -873,8 +922,13 @@ int ext4_get_blocks_handle(handle_t *han * protect it if you're about to implement concurrent * ext4_get_block() -bzzz */ - if (!err && extend_disksize && inode->i_size > ei->i_disksize) - ei->i_disksize = inode->i_size; + if (!err && extend_disksize) { + disksize = ((loff_t) iblock + count) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > ei->i_disksize) + ei->i_disksize = disksize; + } if (err) goto cleanup; @@ -934,7 +988,7 @@ out: */ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned long max_blocks, struct buffer_head *bh, - int create, int extend_disksize) + int create, int extend_disksize, int flag) { int retval; @@ -975,6 +1029,15 @@ int ext4_get_blocks_wrap(handle_t *handl * with create == 1 flag. */ down_write((&EXT4_I(inode)->i_data_sem)); + + /* + * if the caller is from delayed allocation writeout path + * we have already reserved fs blocks for allocation + * let the underlying get_block() function know to + * avoid double accounting + */ + if (flag) + EXT4_I(inode)->i_delalloc_reserved_flag = 1; /* * We need to check for EXT4 here because migrate * could have changed the inode type in between @@ -996,11 +1059,12 @@ int ext4_get_blocks_wrap(handle_t *handl ~EXT4_EXT_MIGRATE; } } + if (flag) + EXT4_I(inode)->i_delalloc_reserved_flag = 0; up_write((&EXT4_I(inode)->i_data_sem)); return retval; } - -static int ext4_get_block(struct inode *inode, sector_t iblock, +int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { handle_t *handle = ext4_journal_current_handle(); @@ -1021,7 +1085,7 @@ static int ext4_get_block(struct inode * } ret = ext4_get_blocks_wrap(handle, inode, iblock, - max_blocks, bh_result, create, 0); + max_blocks, bh_result, create, 0, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1047,7 +1111,7 @@ struct buffer_head *ext4_getblk(handle_t dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); err = ext4_get_blocks_wrap(handle, inode, block, 1, - &dummy, create, 1); + &dummy, create, 1, 0); /* * ext4_get_blocks_handle() returns number of blocks * mapped. 0 in case of a HOLE. 
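The extend_disksize arithmetic in the hunk above is easy to sanity-check outside the kernel. A minimal user-space sketch: only the shift-and-clamp scheme is taken from the patch, while the constants (4KB blocks, a 50000-byte file) are made up for illustration.

#include <stdio.h>

/* Mirrors the i_disksize update in ext4_get_blocks_handle() above:
 * disksize = (iblock + count) << i_blkbits, clamped to i_size. */
int main(void)
{
	unsigned int i_blkbits = 12;		/* 4KB blocks, illustrative */
	unsigned long long i_size = 50000;	/* current in-core file size */
	unsigned long long iblock = 10;		/* first logical block mapped */
	unsigned long long count = 3;		/* blocks mapped by this call */

	unsigned long long disksize = (iblock + count) << i_blkbits;
	if (disksize > i_size)			/* never extend past i_size */
		disksize = i_size;

	/* (10 + 3) << 12 = 53248, clamped back to 50000 here */
	printf("i_disksize candidate: %llu\n", disksize);
	return 0;
}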
@@ -1203,19 +1267,20 @@ static int ext4_write_begin(struct file to = from + len; retry: - page = __grab_cache_page(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; - handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { - unlock_page(page); - page_cache_release(page); ret = PTR_ERR(handle); goto out; } + page = __grab_cache_page(mapping, index); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, ext4_get_block); @@ -1225,8 +1290,8 @@ retry: } if (ret) { - ext4_journal_stop(handle); unlock_page(page); + ext4_journal_stop(handle); page_cache_release(page); } @@ -1236,15 +1301,6 @@ out: return ret; } -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - int err = jbd2_journal_dirty_data(handle, bh); - if (err) - ext4_journal_abort_handle(__func__, __func__, - bh, handle, err); - return err; -} - /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct buffer_head *bh) { @@ -1255,29 +1311,6 @@ static int write_end_fn(handle_t *handle } /* - * Generic write_end handler for ordered and writeback ext4 journal modes. - * We can't use generic_write_end, because that unlocks the page and we need to - * unlock the page after ext4_journal_stop, but ext4_journal_stop must run - * after block_write_end. - */ -static int ext4_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = file->f_mapping->host; - - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); - mark_inode_dirty(inode); - } - - return copied; -} - -/* * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * @@ -1290,15 +1323,14 @@ static int ext4_ordered_write_end(struct struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; unsigned from, to; int ret = 0, ret2; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, ext4_journal_dirty_data); + ret = ext4_jbd2_file_inode(handle, inode); if (ret == 0) { /* @@ -1311,7 +1343,7 @@ static int ext4_ordered_write_end(struct new_i_size = pos + copied; if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; if (ret2 < 0) @@ -1320,8 +1352,6 @@ static int ext4_ordered_write_end(struct ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); - page_cache_release(page); return ret ? 
ret : copied; } @@ -1332,7 +1362,7 @@ static int ext4_writeback_write_end(stru struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; int ret = 0, ret2; loff_t new_i_size; @@ -1340,7 +1370,7 @@ static int ext4_writeback_write_end(stru if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; if (ret2 < 0) @@ -1349,8 +1379,6 @@ static int ext4_writeback_write_end(stru ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); - page_cache_release(page); return ret ? ret : copied; } @@ -1389,14 +1417,569 @@ static int ext4_journalled_write_end(str ret = ret2; } + unlock_page(page); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); page_cache_release(page); return ret ? ret : copied; } +/* + * Calculate the number of metadata blocks that need to be reserved + * to allocate @blocks for a non-extent-based file + */ +static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) +{ + int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ind_blks, dind_blks, tind_blks; + + /* number of new indirect blocks needed */ + ind_blks = (blocks + icap - 1) / icap; + + dind_blks = (ind_blks + icap - 1) / icap; + + tind_blks = 1; + + return ind_blks + dind_blks + tind_blks; +} + +/* + * Calculate the number of metadata blocks that need to be reserved + * to allocate the given number of blocks + */ +static int ext4_calc_metadata_amount(struct inode *inode, int blocks) +{ + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) + return ext4_ext_calc_metadata_amount(inode, blocks); + + return ext4_indirect_calc_metadata_amount(inode, blocks); +} + +static int ext4_da_reserve_space(struct inode *inode, int nrblocks) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned long md_needed, mdblocks, total = 0; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks; + * worst case is one extent per block + */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; + mdblocks = ext4_calc_metadata_amount(inode, total); + BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); + + md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; + total = md_needed + nrblocks; + + if (ext4_has_free_blocks(sbi, total) < total) { + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return -ENOSPC; + } + + /* reduce fs free blocks counter */ + percpu_counter_sub(&sbi->s_freeblocks_counter, total); + + EXT4_I(inode)->i_reserved_data_blocks += nrblocks; + EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; + + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return 0; /* success */ +} + +void ext4_da_release_space(struct inode *inode, int used, int to_free) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int total, mdb, mdb_free, release; + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + /* recalculate the number of metablocks that still need to be reserved */ + total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free; + mdb = ext4_calc_metadata_amount(inode, total); + + /* figure out how many metablocks to release */ + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; +
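+ /* For scale: with 4KB blocks (icap = 1024 in the helpers above) an + * indirect-mapped file pays at most 1 + 1 + 1 = 3 metadata blocks + * for a 100-block reservation. */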
+ /* Account for allocated meta_blocks */ + mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; + + release = to_free + mdb_free; + + /* update fs free blocks counter for truncate case */ + percpu_counter_add(&sbi->s_freeblocks_counter, release); + + /* update per-inode reservations */ + BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks); + EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free); + + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + EXT4_I(inode)->i_reserved_meta_blocks = mdb; + EXT4_I(inode)->i_allocated_meta_blocks = 0; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +} + +static void ext4_da_page_release_reservation(struct page *page, + unsigned long offset) +{ + int to_release = 0; + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + + if ((offset <= curr_off) && (buffer_delay(bh))) { + to_release++; + clear_buffer_delay(bh); + } + curr_off = next_off; + } while ((bh = bh->b_this_page) != head); + ext4_da_release_space(page->mapping->host, 0, to_release); +} + +/* + * this is a special callback for ->write_begin() only; + * its intention is to return a mapped block or reserve space + */ +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + + BUG_ON(create == 0); + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + + /* + * first, we need to know whether the block is allocated already; + * preallocated blocks are unmapped but should be treated + * the same as allocated blocks. + */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); + if ((ret == 0) && !buffer_delay(bh_result)) { + /* the block isn't (pre)allocated yet, let's reserve space */ + /* + * XXX: __block_prepare_write() unmaps passed block, + * is it OK?
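+ * Once the reservation below succeeds we re-mark the buffer + * ourselves: mapped to block 0 with BH_New and BH_Delay set, which + * is the state the delayed allocation writeout path looks for.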
+ */ + ret = ext4_da_reserve_space(inode, 1); + if (ret) + /* not enough space to reserve */ + return ret; + + map_bh(bh_result, inode->i_sb, 0); + set_buffer_new(bh_result); + set_buffer_delay(bh_result); + } else if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + + return ret; +} +#define EXT4_DELALLOC_RSVED 1 +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + loff_t disksize = EXT4_I(inode)->i_disksize; + handle_t *handle = NULL; + + handle = ext4_journal_current_handle(); + if (!handle) { + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, 0, 0, 0); + BUG_ON(!ret); + } else { + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, create, 0, EXT4_DELALLOC_RSVED); + } + + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + + /* release reserved-but-unused meta blocks */ + if (buffer_delay(bh_result)) + ext4_da_release_space(inode, ret, 0); + + /* + * Update on-disk size along with block allocation + * we don't use 'extend_disksize' as size may change + * within already allocated block -bzzz + */ + disksize = ((loff_t) iblock + ret) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) { + /* + * XXX: replace with spinlock if seen contended -bzzz + */ + down_write(&EXT4_I(inode)->i_data_sem); + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + up_write(&EXT4_I(inode)->i_data_sem); + + if (EXT4_I(inode)->i_disksize == disksize) { + ret = ext4_mark_inode_dirty(handle, inode); + return ret; + } + } + ret = 0; + } + return ret; +} + +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) +{ + /* + * unmapped buffer is possible for holes. 
+ * delay buffer is possible with delayed allocation + */ + return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); +} + +static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + + /* + * we don't want to do block allocation in writepage + * so call get_block_wrap with create = 0 + */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, + bh_result, 0, 0, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + return ret; +} + +/* + * gets called via ext4_da_writepages after taking page lock (have journal handle) + * gets called via journal_submit_inode_data_buffers (no journal handle) + * gets called via shrink_page_list via pdflush (no journal handle) + * or via grab_cache_page when doing write_begin (have journal handle) + */ +static int ext4_da_writepage(struct page *page, + struct writeback_control *wbc) +{ + int ret = 0; + loff_t size; + unsigned long len; + struct buffer_head *page_bufs; + struct inode *inode = page->mapping->host; + + size = i_size_read(inode); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + page_bufs = page_buffers(page); + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_unmapped_or_delay)) { + /* + * We don't want to do block allocation, + * so redirty the page and return. + * We may reach here when we do a journal commit + * via journal_submit_inode_data_buffers. + * If we don't have a mapped block we just ignore + * them. We can also reach here via shrink_page_list + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } else { + /* + * The test for page_has_buffers() is subtle: + * We know the page is dirty but it lost buffers. That means + * that at some moment in time after write_begin()/write_end() + * has been called all buffers have been clean and thus they + * must have been written at least once. So they are all + * mapped and we can happily proceed with mapping them + * and writing the page. + * + * Try to initialize the buffer_heads and check whether + * all are mapped and non-delay. We don't want to + * do block allocation here. + */ + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ext4_normal_get_block_write); + if (!ret) { + page_bufs = page_buffers(page); + /* check whether all are mapped and non-delay */ + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_unmapped_or_delay)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } else { + /* + * We can't do block allocation here, + * so just redirty the page, unlock it + * and return + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } + + if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) + ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); + else + ret = block_write_full_page(page, + ext4_normal_get_block_write, + wbc); + + return ret; +} + +/* + * For now just follow the DIO way to estimate the max credits + * needed to write out EXT4_MAX_WRITEBACK_PAGES. + * todo: we need to calculate the max credits needed for + * extent-based files; currently the DIO credits are based on the + * indirect-blocks mapping.
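+ * For extent-mapped files that indirect-based figure is presumably an + * overestimate, so the reserved credits should still be sufficient.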
+ * + * Probably should have a generic way to calculate credits + * for DIO, writepages, and truncate + */ +#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS +#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS + +static int ext4_da_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + handle_t *handle = NULL; + int needed_blocks; + int ret = 0; + long to_write; + loff_t range_start = 0; + + /* + * No pages to write? This is mainly a kludge to avoid starting + * a transaction for special inodes like journal inode on last iput() + * because that could violate lock ordering on umount + */ + if (!mapping->nrpages) + return 0; + + /* + * Estimate the worst-case credits needed to write out + * EXT4_MAX_WRITEBACK_PAGES pages + */ + needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; + + to_write = wbc->nr_to_write; + if (!wbc->range_cyclic) { + /* + * If range_cyclic is not set force range_cont + * and save the old writeback_index + */ + wbc->range_cont = 1; + range_start = wbc->range_start; + } + + while (!ret && to_write) { + /* start a new transaction*/ + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_writepages; + } + if (ext4_should_order_data(inode)) { + /* + * With ordered mode we need to add + * the inode to the journal handle + * when we do block allocation. + */ + ret = ext4_jbd2_file_inode(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out_writepages; + } + + } + /* + * cap the number of dirty pages that may be written at a time + * to fit into the reserved transaction credits + */ + if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) + wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; + + to_write -= wbc->nr_to_write; + ret = mpage_da_writepages(mapping, wbc, + ext4_da_get_block_write); + ext4_journal_stop(handle); + if (wbc->nr_to_write) { + /* + * There is no more writeout needed, + * or we requested a non-blocking writeout + * and we found the device congested + */ + to_write += wbc->nr_to_write; + break; + } + wbc->nr_to_write = to_write; + } + +out_writepages: + wbc->nr_to_write = to_write; + if (range_start) + wbc->range_start = range_start; + return ret; +} + +static int ext4_da_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) + { + int ret, retries = 0; + struct page *page; + pgoff_t index; + unsigned from, to; + struct inode *inode = mapping->host; + handle_t *handle; + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + +retry: + /* + * With delayed allocation, we don't log the i_disksize update + * if there is delayed block allocation. But we still need + * to journal the i_disksize update when a write extends the + * end of file over an already mapped buffer.
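+ * That update only dirties the inode itself, which is why a single + * credit is passed to ext4_journal_start() below.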
+ */ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + page = __grab_cache_page(mapping, index); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + ext4_da_get_block_prep); + if (ret < 0) { + unlock_page(page); + ext4_journal_stop(handle); + page_cache_release(page); + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} + +/* + * Check if we should update i_disksize + * when writing to the end of file without requiring block allocation + */ +static int ext4_da_should_update_i_disksize(struct page *page, + unsigned long offset) +{ + struct buffer_head *bh; + struct inode *inode = page->mapping->host; + unsigned int idx; + int i; + + bh = page_buffers(page); + idx = offset >> inode->i_blkbits; + + for (i = 0; i < idx; i++) + bh = bh->b_this_page; + + if (!buffer_mapped(bh) || (buffer_delay(bh))) + return 0; + return 1; +} + +static int ext4_da_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int ret = 0, ret2; + handle_t *handle = ext4_journal_current_handle(); + loff_t new_i_size; + unsigned long start, end; + + start = pos & (PAGE_CACHE_SIZE - 1); + end = start + copied - 1; + + /* + * generic_write_end() will run mark_inode_dirty() if i_size + * changes. So let's piggyback the i_disksize mark_inode_dirty + * into that. + */ + + new_i_size = pos + copied; + if (new_i_size > EXT4_I(inode)->i_disksize) { + if (ext4_da_should_update_i_disksize(page, end)) { + down_write(&EXT4_I(inode)->i_data_sem); + if (new_i_size > EXT4_I(inode)->i_disksize) { + /* + * Updating i_disksize when extending file + * without needing block allocation + */ + if (ext4_should_order_data(inode)) + ret = ext4_jbd2_file_inode(handle, + inode); + + EXT4_I(inode)->i_disksize = new_i_size; + } + up_write(&EXT4_I(inode)->i_data_sem); + } + } + ret2 = generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (ret2 < 0) + ret = ret2; + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + return ret ? ret : copied; +} + +static void ext4_da_invalidatepage(struct page *page, unsigned long offset) +{ + /* + * Drop reserved blocks + */ + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + ext4_da_page_release_reservation(page, offset); + +out: + ext4_invalidatepage(page, offset); + + return; +} + /* * bmap() is special. It gets used by applications such as lilo and by @@ -1412,12 +1995,22 @@ static int ext4_journalled_write_end(str * So, if we see any bmap calls here on a modified, data-journaled file, * take extra steps to flush any blocks which might be in the cache.
*/ -static sector_t ext4_bmap(struct address_space *mapping, sector_t block) +sector_t ext4_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; journal_t *journal; int err; + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + test_opt(inode->i_sb, DELALLOC)) { + /* + * With delalloc we want to sync the file + * so that we can make sure we allocate + * blocks for the file + */ + filemap_write_and_wait(mapping); + } + if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { /* * This is a REALLY heavyweight approach, but the use of @@ -1462,21 +2055,17 @@ static int bput_one(handle_t *handle, st return 0; } -static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) -{ - if (buffer_mapped(bh)) - return ext4_journal_dirty_data(handle, bh); - return 0; -} - /* - * Note that we always start a transaction even if we're not journalling - * data. This is to preserve ordering: any hole instantiation within - * __block_write_full_page -> ext4_get_block() should be journalled - * along with the data so we don't crash and then get metadata which - * refers to old data. + * Note that we don't need to start a transaction unless we're journaling data + * because we should have holes filled from ext4_page_mkwrite(). We don't even + * need to add the inode to the transaction's list in ordered mode because if + * we are writing back data added by write(), the inode is already there and if + * we are writing back data modified via mmap(), no one guarantees in which + * transaction the data will hit the disk. In case we are journaling data, we + * cannot start a transaction directly because transaction start ranks above + * page lock so we have to do some magic. * - * In all journalling modes block_write_full_page() will start the I/O. + * In all journaling modes block_write_full_page() will start the I/O. * * Problem: * @@ -1518,105 +2107,103 @@ static int jbd2_journal_dirty_data_fn(ha * disastrous. Any write() or metadata operation will sync the fs for * us. * - * AKPM2: if all the page's buffers are mapped to disk and !data=journal, - * we don't need to open a transaction here. */ -static int ext4_ordered_writepage(struct page *page, +static int __ext4_normal_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - struct buffer_head *page_bufs; - handle_t *handle = NULL; - int ret = 0; - int err; - - J_ASSERT(PageLocked(page)); - /* - * We give up here if we're reentered, because it might be for a - * different filesystem. - */ - if (ext4_journal_current_handle()) - goto out_fail; + if (test_opt(inode->i_sb, NOBH)) + return nobh_writepage(page, + ext4_normal_get_block_write, wbc); + else + return block_write_full_page(page, + ext4_normal_get_block_write, + wbc); +} - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); +static int ext4_normal_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + loff_t size = i_size_read(inode); + loff_t len; - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_fail; - } + J_ASSERT(PageLocked(page)); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); + if (page_has_buffers(page)) { + /* if the page has buffers, they should all be mapped + * and allocated.
If there are not buffers attached + * to the page we know the page is dirty but it lost + * buffers. That means that at some moment in time + * after write_begin() / write_end() has been called + * all buffers have been clean and thus they must have been + * written at least once. So they are all mapped and we can + * happily proceed with mapping them and writing the page. + */ + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped_or_delay)); } - page_bufs = page_buffers(page); - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bget_one); - - ret = block_write_full_page(page, ext4_get_block, wbc); - /* - * The page can become unlocked at any point now, and - * truncate can then come in and change things. So we - * can't touch *page from now on. But *page_bufs is - * safe due to elevated refcount. - */ - - /* - * And attach them to the current transaction. But only if - * block_write_full_page() succeeded. Otherwise they are unmapped, - * and generally junk. - */ - if (ret == 0) { - err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, - NULL, jbd2_journal_dirty_data_fn); - if (!ret) - ret = err; - } - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bput_one); - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - return ret; + if (!ext4_journal_current_handle()) + return __ext4_normal_writepage(page, wbc); -out_fail: redirty_page_for_writepage(wbc, page); unlock_page(page); - return ret; + return 0; } -static int ext4_writeback_writepage(struct page *page, +static int __ext4_journalled_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct buffer_head *page_bufs; handle_t *handle = NULL; int ret = 0; int err; - if (ext4_journal_current_handle()) - goto out_fail; + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ext4_normal_get_block_write); + if (ret != 0) + goto out_unlock; + + page_bufs = page_buffers(page); + walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, + bget_one); + /* As soon as we unlock the page, it can go away, but we have + * references to buffers so we are safe */ + unlock_page(page); handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_fail; + goto out; } - if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, ext4_get_block, wbc); - else - ret = block_write_full_page(page, ext4_get_block, wbc); + ret = walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); + err = walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, write_end_fn); + if (ret == 0) + ret = err; err = ext4_journal_stop(handle); if (!ret) ret = err; - return ret; -out_fail: - redirty_page_for_writepage(wbc, page); + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bput_one); + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; + goto out; + +out_unlock: unlock_page(page); +out: return ret; } @@ -1624,59 +2211,53 @@ static int ext4_journalled_writepage(str struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - handle_t *handle = NULL; - int ret = 0; - int err; + loff_t size = i_size_read(inode); + loff_t len; - if (ext4_journal_current_handle()) - goto no_write; + J_ASSERT(PageLocked(page)); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & 
~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto no_write; + if (page_has_buffers(page)) { + /* if page has buffers it should all be mapped + * and allocated. If there are not buffers attached + * to the page we know the page is dirty but it lost + * buffers. That means that at some moment in time + * after write_begin() / write_end() has been called + * all buffers have been clean and thus they must have been + * written at least once. So they are all mapped and we can + * happily proceed with mapping them and writing the page. + */ + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped_or_delay)); } - if (!page_has_buffers(page) || PageChecked(page)) { + if (ext4_journal_current_handle()) + goto no_write; + + if (PageChecked(page)) { /* * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. */ ClearPageChecked(page); - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext4_get_block); - if (ret != 0) { - ext4_journal_stop(handle); - goto out_unlock; - } - ret = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); - - err = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, write_end_fn); - if (ret == 0) - ret = err; - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; - unlock_page(page); + return __ext4_journalled_writepage(page, wbc); } else { /* * It may be a page full of checkpoint-mode buffers. We don't * really know unless we go poke around in the buffer_heads. * But block_write_full_page will do the right thing. */ - ret = block_write_full_page(page, ext4_get_block, wbc); + return block_write_full_page(page, + ext4_normal_get_block_write, + wbc); } - err = ext4_journal_stop(handle); - if (!ret) - ret = err; -out: - return ret; - no_write: redirty_page_for_writepage(wbc, page); -out_unlock: unlock_page(page); - goto out; + return 0; } static int ext4_readpage(struct file *file, struct page *page) @@ -1819,7 +2400,7 @@ static int ext4_journalled_set_page_dirt static const struct address_space_operations ext4_ordered_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_ordered_writepage, + .writepage = ext4_normal_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_ordered_write_end, @@ -1833,7 +2414,7 @@ static const struct address_space_operat static const struct address_space_operations ext4_writeback_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_writeback_writepage, + .writepage = ext4_normal_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_writeback_write_end, @@ -1857,10 +2438,31 @@ static const struct address_space_operat .releasepage = ext4_releasepage, }; +static const struct address_space_operations ext4_da_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_da_writepage, + .writepages = ext4_da_writepages, + .sync_page = block_sync_page, + .write_begin = ext4_da_write_begin, + .write_end = ext4_da_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_da_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, +}; + void ext4_set_aops(struct inode *inode) { - if (ext4_should_order_data(inode)) + if (ext4_should_order_data(inode) && + 
test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else if (ext4_should_order_data(inode)) inode->i_mapping->a_ops = &ext4_ordered_aops; + else if (ext4_should_writeback_data(inode) && + test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; else if (ext4_should_writeback_data(inode)) inode->i_mapping->a_ops = &ext4_writeback_aops; else @@ -1873,7 +2475,7 @@ void ext4_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -int ext4_block_truncate_page(handle_t *handle, struct page *page, +int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; @@ -1882,8 +2484,13 @@ int ext4_block_truncate_page(handle_t *h ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; + struct page *page; int err = 0; + page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); + if (!page) + return -EINVAL; + blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -1956,7 +2563,7 @@ int ext4_block_truncate_page(handle_t *h err = ext4_journal_dirty_metadata(handle, bh); } else { if (ext4_should_order_data(inode)) - err = ext4_journal_dirty_data(handle, bh); + err = ext4_jbd2_file_inode(handle, inode); mark_buffer_dirty(bh); } @@ -2347,7 +2954,6 @@ void ext4_truncate(struct inode *inode) int n; ext4_lblk_t last_block; unsigned blocksize = inode->i_sb->s_blocksize; - struct page *page; if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) @@ -2357,41 +2963,21 @@ void ext4_truncate(struct inode *inode) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; - /* - * We have to lock the EOF page here, because lock_page() nests - * outside jbd2_journal_start(). - */ - if ((inode->i_size & (blocksize - 1)) == 0) { - /* Block boundary? Nothing to do */ - page = NULL; - } else { - page = grab_cache_page(mapping, - inode->i_size >> PAGE_CACHE_SHIFT); - if (!page) - return; - } - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { - ext4_ext_truncate(inode, page); + ext4_ext_truncate(inode); return; } handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) return; /* AKPM: return what? */ - } last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - if (page) - ext4_block_truncate_page(handle, page, mapping, inode->i_size); + if (inode->i_size & (blocksize - 1)) + if (ext4_block_truncate_page(handle, mapping, inode->i_size)) + goto out_stop; n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) @@ -2410,6 +2996,11 @@ void ext4_truncate(struct inode *inode) goto out_stop; /* + * From here we block out all ext4_get_block() callers who want to + * modify the block allocation tree. + */ + down_write(&ei->i_data_sem); + /* * The orphan list entry will now protect us from any crash which * occurs before the truncate completes, so it is now safe to propagate * the new, shorter inode size (held for now in i_size) into the @@ -2418,12 +3009,6 @@ void ext4_truncate(struct inode *inode) */ ei->i_disksize = inode->i_size; - /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. 
- */ - down_write(&ei->i_data_sem); - if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); goto do_indirects; } @@ -3107,7 +3692,14 @@ int ext4_write_inode(struct inode *inode * be freed, so we have a strong guarantee that no future commit will * leave these blocks visible to the user.) * - * Called with inode->sem down. + * Another thing we have to ensure is that if we are in ordered mode + * and inode is still attached to the committing transaction, we must + * start writeout of all the dirty pages which are being truncated. + * This way we are sure that all the data written in the previous + * transaction are already on disk (truncate waits for pages under + * writeback). + * + * Called with inode->i_mutex down. */ int ext4_setattr(struct dentry *dentry, struct iattr *attr) { @@ -3173,6 +3765,22 @@ int ext4_setattr(struct dentry *dentry, if (!error) error = rc; ext4_journal_stop(handle); + + if (ext4_should_order_data(inode)) { + error = ext4_begin_ordered_truncate(inode, + attr->ia_size); + if (error) { + /* Do as much error cleanup as possible */ + handle = ext4_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + goto err_out; + } + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + goto err_out; + } + } } rc = inode_setattr(inode, attr); @@ -3506,3 +4114,64 @@ int ext4_change_inode_journal_flag(struc return err; } + +static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh); +} + +int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + loff_t size; + unsigned long len; + int ret = -EINVAL; + struct file *file = vma->vm_file; + struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + + /* + * Get i_alloc_sem to stop truncates messing with the inode. We cannot + * get i_mutex because we are already holding mmap_sem. + */ + down_read(&inode->i_alloc_sem); + size = i_size_read(inode); + if (page->mapping != mapping || size <= page_offset(page) + || !PageUptodate(page)) { + /* page got truncated from under us? */ + goto out_unlock; + } + ret = 0; + if (PageMappedToDisk(page)) + goto out_unlock; + + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + /* return if we have all the buffers mapped */ + if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped)) + goto out_unlock; + } + /* + * OK, we need to fill the hole... Do write_begin/write_end + * to do block allocation/reservation. We are not holding + * inode->i_mutex here, which allows parallel write_begin and + * write_end calls.
lock_page prevent this from happening + * on the same page though + */ + ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), + len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); + if (ret < 0) + goto out_unlock; + ret = mapping->a_ops->write_end(file, mapping, page_offset(page), + len, len, page, NULL); + if (ret < 0) + goto out_unlock; + ret = 0; +out_unlock: + up_read(&inode->i_alloc_sem); + return ret; +} Index: linux-2.6/fs/ext4/xattr.c =================================================================== --- linux-2.6.orig/fs/ext4/xattr.c 2008-06-05 13:44:20.539983023 -0500 +++ linux-2.6/fs/ext4/xattr.c 2008-07-05 12:51:22.613291454 -0500 @@ -810,7 +810,7 @@ inserted: /* We need to allocate a new block */ ext4_fsblk_t goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); - ext4_fsblk_t block = ext4_new_block(handle, inode, + ext4_fsblk_t block = ext4_new_meta_block(handle, inode, goal, &error); if (error) goto cleanup; Index: linux-2.6/fs/ext4/fsync.c =================================================================== --- linux-2.6.orig/fs/ext4/fsync.c 2008-06-05 13:44:20.524045910 -0500 +++ linux-2.6/fs/ext4/fsync.c 2008-07-05 12:51:22.702290943 -0500 @@ -27,6 +27,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" @@ -45,6 +46,7 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; int ret = 0; J_ASSERT(ext4_journal_current_handle() == NULL); @@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, s .nr_to_write = 0, /* sys_fsync did this */ }; ret = sync_inode(inode, &wbc); + if (journal && (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); } out: return ret; Index: linux-2.6/fs/ext4/resize.c =================================================================== --- linux-2.6.orig/fs/ext4/resize.c 2008-06-24 14:23:40.900193882 -0500 +++ linux-2.6/fs/ext4/resize.c 2008-07-05 12:51:22.820291009 -0500 @@ -866,6 +866,15 @@ int ext4_group_add(struct super_block *s gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); /* + * We can allocate memory for mb_alloc based on the new group + * descriptor + */ + if (test_opt(sb, MBALLOC)) { + err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); + if (err) + goto exit_journal; + } + /* * Make the new blocks and inodes valid next. We do this before * increasing the group count so that once the group is enabled, * all of its blocks and inodes are already valid. @@ -957,6 +966,8 @@ int ext4_group_extend(struct super_block handle_t *handle; int err; unsigned long freed_blocks; + ext4_group_t group; + struct ext4_group_info *grp; /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after @@ -988,7 +999,7 @@ int ext4_group_extend(struct super_block } /* Handle the remaining blocks in the last group only. */ - ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last); + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); if (last == 0) { ext4_warning(sb, __func__, @@ -1060,6 +1071,45 @@ int ext4_group_extend(struct super_block o_blocks_count + add); if ((err = ext4_journal_stop(handle))) goto exit_put; + + /* + * Mark mballoc pages as not up to date so that they will be updated + * next time they are loaded by ext4_mb_load_buddy. 
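+ * Each group owns two consecutive blocks in the s_buddy_cache inode + * (group * 2 and group * 2 + 1), so both backing pages must be + * invalidated.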
+ */ + if (test_opt(sb, MBALLOC)) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + int blocks_per_page; + int block; + int pnum; + struct page *page; + + /* Set buddy page as not up to date */ + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + block = group * 2; + pnum = block / blocks_per_page; + page = find_get_page(inode->i_mapping, pnum); + if (page != NULL) { + ClearPageUptodate(page); + page_cache_release(page); + } + + /* Set bitmap page as not up to date */ + block++; + pnum = block / blocks_per_page; + page = find_get_page(inode->i_mapping, pnum); + if (page != NULL) { + ClearPageUptodate(page); + page_cache_release(page); + } + + /* Get the info on the last group */ + grp = ext4_get_group_info(sb, group); + + /* Update free blocks in group info */ + ext4_mb_update_group_info(grp, add); + } + if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", ext4_blocks_count(es)); Index: linux-2.6/Documentation/filesystems/ext4.txt =================================================================== --- linux-2.6.orig/Documentation/filesystems/ext4.txt 2008-06-16 11:34:05.825983154 -0500 +++ linux-2.6/Documentation/filesystems/ext4.txt 2008-07-05 12:51:24.014291691 -0500 @@ -13,72 +13,89 @@ Mailing list: linux-ext4@vger.kernel.org 1. Quick usage instructions: =========================== - - Grab updated e2fsprogs from + - Compile and install the latest version of e2fsprogs (at least + 1.41-WIP-0617) from: + ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/ - This is a patchset on top of e2fsprogs-1.39, which can be found at - ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ - - It's still mke2fs -j /dev/hda1 + or grab the latest git repository from: + + git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git + + - Create a new filesystem using the ext4dev filesystem type: + + # mke2fs -t ext4dev /dev/hda1 + + Or configure an existing ext3 filesystem to support extents and set + the test_fs flag to indicate that it's ok for an in-development + filesystem to touch this filesystem: + + # tune2fs -O extents -E test_fs /dev/hda1 - mount /dev/hda1 /wherever -t ext4dev + If the filesystem was created with 128 byte inodes, it can be + converted to use 256 byte inodes for greater efficiency via: - - To enable extents, + # tune2fs -I 256 /dev/hda1 - mount /dev/hda1 /wherever -t ext4dev -o extents + (Note: we currently do not have tools to convert an ext4dev + filesystem back to ext3; so please do not try this on production + filesystems.) - - The filesystem is compatible with the ext3 driver until you add a file - which has extents (ie: `mount -o extents', then create a file). + - Mounting: - NOTE: The "extents" mount flag is temporary. It will soon go away and - extents will be enabled by the "-o extents" flag to mke2fs or tune2fs + # mount -t ext4dev /dev/hda1 /wherever - When comparing performance with other filesystems, remember that - ext3/4 by default offers higher data integrity guarantees than most. So - when comparing with a metadata-only journalling filesystem, use `mount -o - data=writeback'. And you might as well use `mount -o nobh' too along - with it. Making the journal larger than the mke2fs default often helps - performance with metadata-intensive workloads. + ext3/4 by default offers higher data integrity guarantees than most. + So when comparing with a metadata-only journalling filesystem, such + as ext3, use `mount -o data=writeback'.
And you might as well use + `mount -o nobh' too along with it. Making the journal larger than + the mke2fs default often helps performance with metadata-intensive + workloads. 2. Features =========== 2.1 Currently available -* ability to use filesystems > 16TB +* ability to use filesystems > 16TB (e2fsprogs support not available yet) * extent format reduces metadata overhead (RAM, IO for access, transactions) * extent format more robust in face of on-disk corruption due to magics, * internal redunancy in tree - -2.1 Previously available, soon to be enabled by default by "mkefs.ext4": - -* dir_index and resize inode will be on by default -* large inodes will be used by default for fast EAs, nsec timestamps, etc +* improved file allocation (multi-block alloc) +* fix 32000 subdirectory limit +* nsec timestamps for mtime, atime, ctime, create time +* inode version field on disk (NFSv4, Lustre) +* reduced e2fsck time via uninit_bg feature +* journal checksumming for robustness, performance +* persistent file preallocation (e.g. for streaming media, databases) +* ability to pack bitmaps and inode tables into larger virtual groups via the + flex_bg feature +* large file support +* Inode allocation using large virtual block groups via flex_bg +* delayed allocation +* large block (up to pagesize) support +* efficient new ordered mode in JBD2 and ext4 (avoids using buffer heads to + force the ordering) 2.2 Candidate features for future inclusion +* Online defrag (patches available but not well tested) +* reduced mke2fs time via lazy itable initialization in conjunction with + the uninit_bg feature (capability to do this is available in e2fsprogs + but a kernel thread to do lazy zeroing of unused inode table blocks + after filesystem is first mounted is required for safety) + +There are several others under discussion; whether they all make it in is +partly a function of how much time everyone has to work on them. Features like +metadata checksumming have been discussed and planned for a bit but no patches +exist yet so I'm not sure they're in the near-term roadmap. -* improved file allocation (multi-block alloc, delayed alloc; basically done) -* fix 32000 subdirectory limit (patch exists, needs some e2fsck work) -* nsec timestamps for mtime, atime, ctime, create time (patch exists, - needs some e2fsck work) -* inode version field on disk (NFSv4, Lustre; prototype exists) -* reduced mke2fs/e2fsck time via uninitialized groups (prototype exists) -* journal checksumming for robustness, performance (prototype exists) -* persistent file preallocation (e.g for streaming media, databases) +The big performance win will come with mballoc, delalloc and flex_bg +grouping of bitmaps and inode tables. Some test results available here: -Features like metadata checksumming have been discussed and planned for -a bit but no patches exist yet so I'm not sure they're in the near-term -roadmap. - -The big performance win will come with mballoc and delalloc. CFS has -been using mballoc for a few years already with Lustre, and IBM + Bull -did a lot of benchmarking on it. The reason it isn't in the first set of -patches is partly a manageability issue, and partly because it doesn't -directly affect the on-disk format (outside of much better allocation) -so it isn't critical to get into the first round of changes. I believe -Alex is working on a new set of patches right now.
- http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html + - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html 3. Options ========== @@ -222,9 +239,11 @@ stripe=n Number of filesystem blocks th to use for allocation size and alignment. For RAID5/6 systems this should be the number of data disks * RAID chunk size in file system blocks. - +delalloc (*) Defer block allocation until write-out time. +nodelalloc Disable delayed allocation. Blocks are allocated + when data is copied from user space to the page cache. Data Mode ---------- +========= There are 3 different data modes: * writeback mode @@ -236,10 +255,10 @@ typically provide the best ext4 performa * ordered mode In data=ordered mode, ext4 only officially journals metadata, but it logically -groups metadata and data blocks into a single unit called a transaction. When -it's time to write the new metadata out to disk, the associated data blocks -are written first. In general, this mode performs slightly slower than -writeback but significantly faster than journal mode. +groups metadata information related to data changes with the data blocks into a +single unit called a transaction. When it's time to write the new metadata +out to disk, the associated data blocks are written first. In general, +this mode performs slightly slower than writeback but significantly faster +than journal mode. * journal mode data=journal mode provides full data and metadata journaling. All new data is @@ -247,7 +266,8 @@ written to the journal first, and then t In the event of a crash, the journal can be replayed, bringing both data and metadata into a consistent state. This mode is the slowest except when data needs to be read from and written to disk at the same time where it -outperforms all others modes. +outperforms all other modes. Currently ext4 does not have delayed +allocation support if this data journalling mode is selected. References ========== @@ -256,7 +276,8 @@ kernel source: programs: http://e2fsprogs.sourceforge.net/ - http://ext2resize.sourceforge.net useful links: http://fedoraproject.org/wiki/ext3-devel http://www.bullopensource.org/ext4/ + http://ext4.wiki.kernel.org/index.php/Main_Page + http://fedoraproject.org/wiki/Features/Ext4 Index: linux-2.6/fs/ext4/file.c =================================================================== --- linux-2.6.orig/fs/ext4/file.c 2008-06-05 13:44:20.524045910 -0500 +++ linux-2.6/fs/ext4/file.c 2008-07-05 12:51:24.681390891 -0500 @@ -123,6 +123,26 @@ force_commit: return ret; } +static struct vm_operations_struct ext4_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = ext4_page_mkwrite, +}; + +static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &ext4_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + +/* XXX ERS should this go into this file?
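+ * (a declaration in ext4.h next to the other prototypes would probably + * be tidier than this local extern)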
*/ +extern int ext4_fiemap(struct inode *inode, unsigned long arg); + const struct file_operations ext4_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -133,7 +153,7 @@ const struct file_operations ext4_file_o #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ext4_file_mmap, .open = generic_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, @@ -152,5 +172,6 @@ const struct inode_operations ext4_file_ #endif .permission = ext4_permission, .fallocate = ext4_fallocate, + .fiemap = ext4_fiemap, }; Index: linux-2.6/fs/buffer.c =================================================================== --- linux-2.6.orig/fs/buffer.c 2008-06-05 13:44:20.004983761 -0500 +++ linux-2.6/fs/buffer.c 2008-07-05 12:51:23.293291194 -0500 @@ -1691,11 +1691,13 @@ static int __block_write_full_page(struc */ clear_buffer_dirty(bh); set_buffer_uptodate(bh); - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { + } else if ((!buffer_mapped(bh) || buffer_delay(bh)) + && buffer_dirty(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) goto recover; + clear_buffer_delay(bh); if (buffer_new(bh)) { /* blockdev mappings never come here */ clear_buffer_new(bh); @@ -1774,7 +1776,8 @@ recover: bh = head; /* Recovery: lock and submit the mapped buffers */ do { - if (buffer_mapped(bh) && buffer_dirty(bh)) { + if (buffer_mapped(bh) && buffer_dirty(bh) + && !buffer_delay(bh)) { lock_buffer(bh); mark_buffer_async_write(bh); } else { @@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct page *page, void *fsdata) { struct inode *inode = mapping->host; + int i_size_changed = 0; copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, */ if (pos+copied > inode->i_size) { i_size_write(inode, pos+copied); - mark_inode_dirty(inode); + i_size_changed = 1; } unlock_page(page); page_cache_release(page); + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. 
+ */ + if (i_size_changed) + mark_inode_dirty(inode); + return copied; } EXPORT_SYMBOL(generic_write_end); Index: linux-2.6/include/linux/fs.h =================================================================== --- linux-2.6.orig/include/linux/fs.h 2008-06-16 11:34:12.718481621 -0500 +++ linux-2.6/include/linux/fs.h 2008-07-05 12:51:24.606302244 -0500 @@ -228,6 +228,7 @@ extern int dir_notify_enable; #define FS_IOC_SETFLAGS _IOW('f', 2, long) #define FS_IOC_GETVERSION _IOR('v', 1, long) #define FS_IOC_SETVERSION _IOW('v', 2, long) +#define FS_IOC_FIEMAP _IOWR('f', 10, struct fiemap) #define FS_IOC32_GETFLAGS _IOR('f', 1, int) #define FS_IOC32_SETFLAGS _IOW('f', 2, int) #define FS_IOC32_GETVERSION _IOR('v', 1, int) @@ -288,6 +289,7 @@ extern int dir_notify_enable; #include #include #include +#include #include #include @@ -1273,6 +1275,7 @@ struct inode_operations { void (*truncate_range)(struct inode *, loff_t, loff_t); long (*fallocate)(struct inode *inode, int mode, loff_t offset, loff_t len); + int (*fiemap) (struct inode *, unsigned long arg); }; struct seq_file; @@ -1741,6 +1744,8 @@ extern int wait_on_page_writeback_range( pgoff_t start, pgoff_t end); extern int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode); +extern int filemap_fdatawrite_range(struct address_space *mapping, + loff_t start, loff_t end); extern long do_fsync(struct file *file, int datasync); extern void sync_supers(void); Index: linux-2.6/mm/filemap.c =================================================================== --- linux-2.6.orig/mm/filemap.c 2008-06-05 13:44:44.127982824 -0500 +++ linux-2.6/mm/filemap.c 2008-07-05 12:51:23.065702996 -0500 @@ -236,11 +236,12 @@ int filemap_fdatawrite(struct address_sp } EXPORT_SYMBOL(filemap_fdatawrite); -static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, +int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } +EXPORT_SYMBOL(filemap_fdatawrite_range); /** * filemap_flush - mostly a non-blocking flush Index: linux-2.6/fs/jbd2/journal.c =================================================================== --- linux-2.6.orig/fs/jbd2/journal.c 2008-06-05 13:44:21.035983743 -0500 +++ linux-2.6/fs/jbd2/journal.c 2008-07-05 12:51:23.236291490 -0500 @@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_update EXPORT_SYMBOL(jbd2_journal_get_write_access); EXPORT_SYMBOL(jbd2_journal_get_create_access); EXPORT_SYMBOL(jbd2_journal_get_undo_access); -EXPORT_SYMBOL(jbd2_journal_dirty_data); EXPORT_SYMBOL(jbd2_journal_dirty_metadata); EXPORT_SYMBOL(jbd2_journal_release_buffer); EXPORT_SYMBOL(jbd2_journal_forget); @@ -82,6 +81,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_pa EXPORT_SYMBOL(jbd2_journal_invalidatepage); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); +EXPORT_SYMBOL(jbd2_journal_file_inode); +EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); @@ -2195,6 +2198,54 @@ void jbd2_journal_put_journal_head(struc } /* + * Initialize jbd inode head + */ +void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) +{ + jinode->i_transaction = NULL; + jinode->i_next_transaction = NULL; + jinode->i_vfs_inode = inode; 
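+	/*
+	 * Typical usage (sketch, not part of this patch): a filesystem
+	 * embeds a struct jbd2_inode in its per-inode info and calls
+	 *
+	 *	jbd2_journal_init_jbd_inode(&EXT4_I(inode)->jinode, inode);
+	 *
+	 * once when the in-core inode is set up; only then can the inode
+	 * be filed on a transaction's list with jbd2_journal_file_inode().
+	 */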
+ jinode->i_flags = 0; + INIT_LIST_HEAD(&jinode->i_list); +} + +/* + * Function to be called before we start removing inode from memory (i.e., + * clear_inode() is a fine place to be called from). It removes inode from + * transaction's lists. + */ +void jbd2_journal_release_jbd_inode(journal_t *journal, + struct jbd2_inode *jinode) +{ + int writeout = 0; + + if (!journal) + return; +restart: + spin_lock(&journal->j_list_lock); + /* Is commit writing out inode - we have to wait */ + if (jinode->i_flags & JI_COMMIT_RUNNING) { + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); + wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&journal->j_list_lock); + schedule(); + finish_wait(wq, &wait.wait); + goto restart; + } + + /* Do we need to wait for data writeback? */ + if (journal->j_committing_transaction == jinode->i_transaction) + writeout = 1; + if (jinode->i_transaction) { + list_del(&jinode->i_list); + jinode->i_transaction = NULL; + } + spin_unlock(&journal->j_list_lock); +} + +/* * debugfs tunables */ #ifdef CONFIG_JBD2_DEBUG Index: linux-2.6/fs/ext4/ext4_jbd2.h =================================================================== --- linux-2.6.orig/fs/ext4/ext4_jbd2.h 2008-06-05 13:44:20.522046081 -0500 +++ linux-2.6/fs/ext4/ext4_jbd2.h 2008-07-05 12:51:23.184353602 -0500 @@ -154,8 +154,6 @@ int __ext4_journal_dirty_metadata(const #define ext4_journal_forget(handle, bh) \ __ext4_journal_forget(__FUNCTION__, (handle), (bh)) -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh); - handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); int __ext4_journal_stop(const char *where, handle_t *handle); @@ -192,6 +190,11 @@ static inline int ext4_journal_force_com return jbd2_journal_force_commit(journal); } +static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); +} + /* super.c */ int ext4_force_commit(struct super_block *sb); Index: linux-2.6/fs/jbd2/checkpoint.c =================================================================== --- linux-2.6.orig/fs/jbd2/checkpoint.c 2008-06-05 13:44:21.034983059 -0500 +++ linux-2.6/fs/jbd2/checkpoint.c 2008-07-05 12:51:23.226291217 -0500 @@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(jou J_ASSERT(transaction->t_state == T_FINISHED); J_ASSERT(transaction->t_buffers == NULL); - J_ASSERT(transaction->t_sync_datalist == NULL); J_ASSERT(transaction->t_forget == NULL); J_ASSERT(transaction->t_iobuf_list == NULL); J_ASSERT(transaction->t_shadow_list == NULL); Index: linux-2.6/fs/mpage.c =================================================================== --- linux-2.6.orig/fs/mpage.c 2008-06-05 13:44:21.442983821 -0500 +++ linux-2.6/fs/mpage.c 2008-07-05 12:51:23.871291021 -0500 @@ -10,6 +10,8 @@ * Initial version * 27Jun2002 axboe@suse.de * use bio_add_page() to build bio's just the right size + * 26Jul2007 alex@clusterfs.com AKA bzzz + * basic delayed allocation support */ #include @@ -710,3 +712,404 @@ int mpage_writepage(struct page *page, g return ret; } EXPORT_SYMBOL(mpage_writepage); + +/* + * Delayed allocation stuff + */ + +struct mpage_da_data { + struct inode *inode; + struct buffer_head lbh; /* extent of blocks */ + unsigned long first_page, next_page; /* extent of pages */ + get_block_t *get_block; + struct writeback_control *wbc; +}; + + +/* + * mpage_da_submit_io - walks through extent of pages and try to 
write + * them with __mpage_writepage() + * + * @mpd->inode: inode + * @mpd->first_page: first page of the extent + * @mpd->next_page: page after the last page of the extent + * @mpd->get_block: the filesystem's block mapper function + * + * By the time mpage_da_submit_io() is called we expect all blocks + * to be allocated. this may be wrong if allocation failed. + * + * As pages are already locked by write_cache_pages(), we can't use it + */ +static int mpage_da_submit_io(struct mpage_da_data *mpd) +{ + struct address_space *mapping = mpd->inode->i_mapping; + struct mpage_data mpd_pp = { + .bio = NULL, + .last_block_in_bio = 0, + .get_block = mpd->get_block, + .use_writepage = 1, + }; + int ret = 0, err, nr_pages, i; + unsigned long index, end; + struct pagevec pvec; + + BUG_ON(mpd->next_page <= mpd->first_page); + + pagevec_init(&pvec, 0); + index = mpd->first_page; + end = mpd->next_page - 1; + + while (index <= end) { + /* XXX: optimize tail */ + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + index++; + + err = __mpage_writepage(page, mpd->wbc, &mpd_pp); + + /* + * In error case, we have to continue because + * remaining pages are still locked + * XXX: unlock and re-dirty them? + */ + if (ret == 0) + ret = err; + } + pagevec_release(&pvec); + } + if (mpd_pp.bio) + mpage_bio_submit(WRITE, mpd_pp.bio); + + return ret; +} + +/* + * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers + * + * @mpd->inode - inode to walk through + * @exbh->b_blocknr - first block on a disk + * @exbh->b_size - amount of space in bytes + * @logical - first logical block to start assignment with + * + * the function goes through all passed space and put actual disk + * block numbers into buffer heads, dropping BH_Delay + */ +static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, + struct buffer_head *exbh) +{ + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + int blocks = exbh->b_size >> inode->i_blkbits; + sector_t pblock = exbh->b_blocknr, cur_logical; + struct buffer_head *head, *bh; + unsigned long index, end; + struct pagevec pvec; + int nr_pages, i; + + index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + + while (index <= end) { + /* XXX: optimize tail */ + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + index++; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + BUG_ON(!page_has_buffers(page)); + + bh = page_buffers(page); + head = bh; + + /* skip blocks out of the range */ + do { + if (cur_logical >= logical) + break; + cur_logical++; + } while ((bh = bh->b_this_page) != head); + + do { + if (cur_logical >= logical + blocks) + break; + if (buffer_delay(bh)) { + bh->b_blocknr = pblock; + clear_buffer_delay(bh); + } else if (buffer_mapped(bh)) + BUG_ON(bh->b_blocknr != pblock); + + cur_logical++; + pblock++; + } while ((bh = bh->b_this_page) != head); + } + pagevec_release(&pvec); + } +} + + +/* + * __unmap_underlying_blocks - just a helper function to unmap + * set of blocks described by @bh + */ +static 
inline void __unmap_underlying_blocks(struct inode *inode, + struct buffer_head *bh) +{ + struct block_device *bdev = inode->i_sb->s_bdev; + int blocks, i; + + blocks = bh->b_size >> inode->i_blkbits; + for (i = 0; i < blocks; i++) + unmap_underlying_metadata(bdev, bh->b_blocknr + i); +} + +/* + * mpage_da_map_blocks - go through given space + * + * @mpd->lbh - bh describing space + * @mpd->get_block - the filesystem's block mapper function + * + * The function skips space we know is already mapped to disk blocks. + * + * The function ignores errors ->get_block() returns, thus real + * error handling is postponed to __mpage_writepage() + */ +static void mpage_da_map_blocks(struct mpage_da_data *mpd) +{ + struct buffer_head *lbh = &mpd->lbh; + int err = 0, remain = lbh->b_size; + sector_t next = lbh->b_blocknr; + struct buffer_head new; + + /* + * We consider only non-mapped and non-allocated blocks + */ + if (buffer_mapped(lbh) && !buffer_delay(lbh)) + return; + + while (remain) { + new.b_state = lbh->b_state; + new.b_blocknr = 0; + new.b_size = remain; + err = mpd->get_block(mpd->inode, next, &new, 1); + if (err) { + /* + * Rather than implement own error handling + * here, we just leave remaining blocks + * unallocated and try again with ->writepage() + */ + break; + } + BUG_ON(new.b_size == 0); + + if (buffer_new(&new)) + __unmap_underlying_blocks(mpd->inode, &new); + + /* + * If blocks are delayed marked, we need to + * put actual blocknr and drop delayed bit + */ + if (buffer_delay(lbh)) + mpage_put_bnr_to_bhs(mpd, next, &new); + + /* go for the remaining blocks */ + next += new.b_size >> mpd->inode->i_blkbits; + remain -= new.b_size; + } +} + +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) + +/* + * mpage_add_bh_to_extent - try to add one more block to extent of blocks + * + * @mpd->lbh - extent of blocks + * @logical - logical number of the block in the file + * @bh - bh of the block (used to access block's state) + * + * the function is used to collect contig. blocks in same state + */ +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, + sector_t logical, struct buffer_head *bh) +{ + struct buffer_head *lbh = &mpd->lbh; + sector_t next; + + next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); + + /* + * First block in the extent + */ + if (lbh->b_size == 0) { + lbh->b_blocknr = logical; + lbh->b_size = bh->b_size; + lbh->b_state = bh->b_state & BH_FLAGS; + return; + } + + /* + * Can we merge the block to our big extent? + */ + if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { + lbh->b_size += bh->b_size; + return; + } + + /* + * We couldn't merge the block to our extent, so we + * need to flush current extent and start new one + */ + mpage_da_map_blocks(mpd); + + /* + * Now start a new extent + */ + lbh->b_size = bh->b_size; + lbh->b_state = bh->b_state & BH_FLAGS; + lbh->b_blocknr = logical; +} + +/* + * __mpage_da_writepage - finds extent of pages and blocks + * + * @page: page to consider + * @wbc: not used, we just follow rules + * @data: context + * + * The function finds extents of pages and scan them for all blocks. + */ +static int __mpage_da_writepage(struct page *page, + struct writeback_control *wbc, void *data) +{ + struct mpage_da_data *mpd = data; + struct inode *inode = mpd->inode; + struct buffer_head *bh, *head, fake; + sector_t logical; + + /* + * Can we merge this page to current extent? + */ + if (mpd->next_page != page->index) { + /* + * Nope, we can't. 
So, we map non-allocated blocks
+		 * and start IO on them using __mpage_writepage()
+		 */
+		if (mpd->next_page != mpd->first_page) {
+			mpage_da_map_blocks(mpd);
+			mpage_da_submit_io(mpd);
+		}
+
+		/*
+		 * Start next extent of pages ...
+		 */
+		mpd->first_page = page->index;
+
+		/*
+		 * ... and blocks
+		 */
+		mpd->lbh.b_size = 0;
+		mpd->lbh.b_state = 0;
+		mpd->lbh.b_blocknr = 0;
+	}
+
+	mpd->next_page = page->index + 1;
+	logical = (sector_t) page->index <<
+		  (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	if (!page_has_buffers(page)) {
+		/*
+		 * There are no buffer heads attached to the page yet
+		 * (mmap?), so we treat the page as full of dirty blocks
+		 */
+		bh = &fake;
+		bh->b_size = PAGE_CACHE_SIZE;
+		bh->b_state = 0;
+		set_buffer_dirty(bh);
+		set_buffer_uptodate(bh);
+		mpage_add_bh_to_extent(mpd, logical, bh);
+	} else {
+		/*
+		 * Page with regular buffer heads, just add all dirty ones
+		 */
+		head = page_buffers(page);
+		bh = head;
+		do {
+			BUG_ON(buffer_locked(bh));
+			if (buffer_dirty(bh))
+				mpage_add_bh_to_extent(mpd, logical, bh);
+			logical++;
+		} while ((bh = bh->b_this_page) != head);
+	}
+
+	return 0;
+}
+
+/*
+ * mpage_da_writepages - walk the list of dirty pages of the given
+ * address space, allocate non-allocated blocks, map newly-allocated
+ * blocks to existing bhs and issue IO on them
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @get_block: the filesystem's block mapper function.
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * In order to avoid duplication of logic that deals with partial pages,
+ * multiple bios per page, etc., we find non-allocated blocks, allocate
+ * them with minimal calls to ->get_block() and re-use __mpage_writepage()
+ *
+ * It's important that we call __mpage_writepage() only once for each
+ * involved page, otherwise we'd have to implement more complicated logic
+ * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
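+ *
+ * A hypothetical caller (the filesystem-side hookup is outside this
+ * file; the names below are illustrative only):
+ *
+ *	static int myfs_writepages(struct address_space *mapping,
+ *				   struct writeback_control *wbc)
+ *	{
+ *		return mpage_da_writepages(mapping, wbc,
+ *					   myfs_da_get_block);
+ *	}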
+ * + * See comments to mpage_writepages() + */ +int mpage_da_writepages(struct address_space *mapping, + struct writeback_control *wbc, get_block_t get_block) +{ + struct mpage_da_data mpd; + int ret; + + if (!get_block) + return generic_writepages(mapping, wbc); + + mpd.wbc = wbc; + mpd.inode = mapping->host; + mpd.lbh.b_size = 0; + mpd.lbh.b_state = 0; + mpd.lbh.b_blocknr = 0; + mpd.first_page = 0; + mpd.next_page = 0; + mpd.get_block = get_block; + + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); + + /* + * Handle last extent of pages + */ + if (mpd.next_page != mpd.first_page) { + mpage_da_map_blocks(&mpd); + mpage_da_submit_io(&mpd); + } + + return ret; +} +EXPORT_SYMBOL(mpage_da_writepages); Index: linux-2.6/include/linux/mpage.h =================================================================== --- linux-2.6.orig/include/linux/mpage.h 2008-06-05 13:44:40.006046000 -0500 +++ linux-2.6/include/linux/mpage.h 2008-07-05 12:51:23.376568065 -0500 @@ -18,6 +18,8 @@ int mpage_readpages(struct address_space int mpage_readpage(struct page *page, get_block_t get_block); int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block); +int mpage_da_writepages(struct address_space *mapping, + struct writeback_control *wbc, get_block_t get_block); int mpage_writepage(struct page *page, get_block_t *get_block, struct writeback_control *wbc); Index: linux-2.6/include/linux/percpu_counter.h =================================================================== --- linux-2.6.orig/include/linux/percpu_counter.h 2008-06-05 13:44:40.321982718 -0500 +++ linux-2.6/include/linux/percpu_counter.h 2008-07-05 12:51:23.460291452 -0500 @@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percp void percpu_counter_destroy(struct percpu_counter *fbc); void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch); -s64 __percpu_counter_sum(struct percpu_counter *fbc); +s64 __percpu_counter_sum(struct percpu_counter *fbc, int set); static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { @@ -44,13 +44,19 @@ static inline void percpu_counter_add(st static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { - s64 ret = __percpu_counter_sum(fbc); + s64 ret = __percpu_counter_sum(fbc, 0); return ret < 0 ? 0 : ret; } +static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc) +{ + return __percpu_counter_sum(fbc, 1); +} + + static inline s64 percpu_counter_sum(struct percpu_counter *fbc) { - return __percpu_counter_sum(fbc); + return __percpu_counter_sum(fbc, 0); } static inline s64 percpu_counter_read(struct percpu_counter *fbc) Index: linux-2.6/lib/percpu_counter.c =================================================================== --- linux-2.6.orig/lib/percpu_counter.c 2008-06-05 13:44:44.008985568 -0500 +++ linux-2.6/lib/percpu_counter.c 2008-07-05 12:51:23.540290552 -0500 @@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add); * Add up all the per-cpu counts, return the result. 
This is a more accurate * but much slower version of percpu_counter_read_positive() */ -s64 __percpu_counter_sum(struct percpu_counter *fbc) +s64 __percpu_counter_sum(struct percpu_counter *fbc, int set) { s64 ret; int cpu; @@ -62,7 +62,12 @@ s64 __percpu_counter_sum(struct percpu_c for_each_online_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; + if (set) + *pcount = 0; } + if (set) + fbc->count = ret; + spin_unlock(&fbc->lock); return ret; } Index: linux-2.6/fs/ext4/ext4_extents.h =================================================================== --- linux-2.6.orig/fs/ext4/ext4_extents.h 2008-06-05 13:44:20.522046081 -0500 +++ linux-2.6/fs/ext4/ext4_extents.h 2008-07-05 12:51:24.715294680 -0500 @@ -124,6 +124,19 @@ struct ext4_ext_path { #define EXT4_EXT_CACHE_GAP 1 #define EXT4_EXT_CACHE_EXTENT 2 +/* + * to be called by ext4_ext_walk_space() + * negative retcode - error + * positive retcode - signal for ext4_ext_walk_space(), see below + * callback must return valid extent (passed or newly created) + */ +typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, + struct ext4_ext_cache *, + struct ext4_extent *, void *); + +#define EXT_CONTINUE 0 +#define EXT_BREAK 1 +#define EXT_REPEAT 2 #define EXT_MAX_BLOCK 0xffffffff @@ -212,6 +225,7 @@ static inline int ext4_ext_get_actual_le (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); } +extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); extern int ext4_extent_tree_init(handle_t *, struct inode *); @@ -221,6 +235,7 @@ extern int ext4_ext_try_to_merge(struct struct ext4_extent *); extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); +extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, ext_prepare_callback, void *); extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *); extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, @@ -228,5 +243,15 @@ extern int ext4_ext_search_left(struct i extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, ext4_lblk_t *, ext4_fsblk_t *); extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); +extern void ext4_ext_drop_refs(struct ext4_ext_path *path); +extern ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t block); +extern int ext4_ext_insert_extent_defrag(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int defrag); +extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); + #endif /* _EXT4_EXTENTS */ Index: linux-2.6/include/linux/writeback.h =================================================================== --- linux-2.6.orig/include/linux/writeback.h 2008-06-05 13:44:41.026983805 -0500 +++ linux-2.6/include/linux/writeback.h 2008-07-05 12:51:23.813471334 -0500 @@ -63,6 +63,7 @@ struct writeback_control { unsigned for_writepages:1; /* This is a writepages() call */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned more_io:1; /* more io to be dispatched */ + unsigned range_cont:1; }; /* Index: linux-2.6/mm/page-writeback.c 
=================================================================== --- linux-2.6.orig/mm/page-writeback.c 2008-06-05 13:44:44.152982933 -0500 +++ linux-2.6/mm/page-writeback.c 2008-07-05 12:51:23.846292034 -0500 @@ -956,6 +956,9 @@ retry: } if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = index; + + if (wbc->range_cont) + wbc->range_start = index << PAGE_CACHE_SHIFT; return ret; } EXPORT_SYMBOL(write_cache_pages); Index: linux-2.6/fs/ext4/Makefile =================================================================== --- linux-2.6.orig/fs/ext4/Makefile 2008-06-05 13:44:20.520046428 -0500 +++ linux-2.6/fs/ext4/Makefile 2008-07-05 12:51:24.081329189 -0500 @@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o + ext4_jbd2.o migrate.o mballoc.o defrag.o ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o Index: linux-2.6/fs/ext4/defrag.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6/fs/ext4/defrag.c 2008-07-05 12:51:24.454291246 -0500 @@ -0,0 +1,2120 @@ +/* + * Copyright (c) 2008, NEC Software Tohoku, Ltd. + * Written by Takashi Sato + * Akira Fujita + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +/* Online defragmentation for EXT4 */ + +#include +#include "ext4_jbd2.h" +#include "ext4_extents.h" +#include "group.h" + +#define EXT_SET_EXTENT_DATA(src, dest) do { \ + dest.block = le32_to_cpu(src->ee_block); \ + dest.start = ext_pblock(src); \ + dest.len = le16_to_cpu(src->ee_len); \ + } while (0) + +/** + * ext4_defrag_next_extent - Search for the next extent and set it to "extent" + * + * @inode: inode which is searched + * @path: this will obtain data for the next extent + * @extent: pointer to the next extent we have just gotten + * + * This function returns 0 or 1(last entry) if succeed, otherwise + * returns -EIO. 
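+ *
+ * Typical iteration over a file's extent tree (sketch of how this
+ * helper is used later in this file; error handling omitted):
+ *
+ *	path = ext4_ext_find_extent(inode, offset, NULL);
+ *	ext = path[ext_depth(inode)].p_ext;
+ *	while (ext4_defrag_next_extent(inode, path, &ext) == 0)
+ *		use(ext);	(use() is a placeholder)
+ *
+ * where a return of 1 means `ext' was already the last extent.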
+ */ +static int +ext4_defrag_next_extent(struct inode *inode, struct ext4_ext_path *path, + struct ext4_extent **extent) +{ + int ppos, leaf_ppos = path->p_depth; + + ppos = leaf_ppos; + if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { + /* leaf block */ + *extent = ++path[ppos].p_ext; + return 0; + } + + while (--ppos >= 0) { + if (EXT_LAST_INDEX(path[ppos].p_hdr) > + path[ppos].p_idx) { + int cur_ppos = ppos; + + /* index block */ + path[ppos].p_idx++; + path[ppos].p_block = idx_pblock(path[ppos].p_idx); + if (path[ppos+1].p_bh) + brelse(path[ppos+1].p_bh); + path[ppos+1].p_bh = + sb_bread(inode->i_sb, path[ppos].p_block); + if (!path[ppos+1].p_bh) + goto err; + path[ppos+1].p_hdr = + ext_block_hdr(path[ppos+1].p_bh); + + /* Halfway index block */ + while (++cur_ppos < leaf_ppos) { + path[cur_ppos].p_idx = + EXT_FIRST_INDEX(path[cur_ppos].p_hdr); + path[cur_ppos].p_block = + idx_pblock(path[cur_ppos].p_idx); + if (path[cur_ppos+1].p_bh) + brelse(path[cur_ppos+1].p_bh); + path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, + path[cur_ppos].p_block); + if (!path[cur_ppos+1].p_bh) + goto err; + path[cur_ppos+1].p_hdr = + ext_block_hdr(path[cur_ppos+1].p_bh); + } + + /* leaf block */ + path[leaf_ppos].p_ext = *extent = + EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); + return 0; + } + } + /* We found the last extent */ + return 1; +err: + if (path) + ext4_ext_drop_refs(path); + return -EIO; +} + +/** + * ext4_defrag_extents_info - Get extents information + * + * @sb: for ext4_iget() + * @ext_info: pointer to ext4_extents_info + * @ext_info->ino: describe an inode which is used to get + * extent information + * @ext_info->max_entries: defined by DEFRAG_MAX_ENT + * @ext_info->entries: amount of extents (output) + * @ext_info->ext[]: array of extent (output) + * @ext_info->offset: starting block offset of targeted extent + * (file relative) + * + * This function returns 0 if the next extent(s) exists, + * or returns 1 if the next extent doesn't exist, + * otherwise returns error value. + */ +static int +ext4_defrag_extents_info(struct super_block *sb, + struct ext4_extents_info *ext_info) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ext = NULL; + struct inode *inode = NULL; + ext4_lblk_t offset = ext_info->f_offset; + int max_entries = ext_info->max_entries; + int depth, entries = 0; + int err = 0; + int ret = 0; + + inode = ext4_iget(sb, ext_info->ino); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + down_write(&EXT4_I(inode)->i_data_sem); + + /* Return -ENOENT if a file does not exist */ + if (!inode->i_nlink || inode->i_ino < EXT4_GOOD_OLD_FIRST_INO || + !S_ISREG(inode->i_mode)) { + ext_info->entries = 0; + err = -ENOENT; + goto out; + } + + path = ext4_ext_find_extent(inode, offset, NULL); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out; + } + depth = ext_depth(inode); + + /* Skip the 0 size file */ + if (path[depth].p_ext == NULL) { + ext_info->entries = 0; + goto out; + } + ext = path[depth].p_ext; + EXT_SET_EXTENT_DATA(ext, ext_info->ext[entries]); + entries = 1; + + /* + * The ioctl repeats this loop 'max_entries' times. + * So we have to call this function again if @inode had + * more the number of extents than 'max_entries'. 
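+ *
+ * From user space this becomes a simple loop (illustrative sketch,
+ * not shipped with this patch; error handling trimmed):
+ *
+ *	struct ext4_extents_info info = { .ino = ino,
+ *					  .max_entries = DEFRAG_MAX_ENT };
+ *
+ *	do {
+ *		err = ioctl(fd, EXT4_IOC_EXTENTS_INFO, &info);
+ *		for (i = 0; i < info.entries && err >= 0; i++)
+ *			use(&info.ext[i]);	(use() is a placeholder)
+ *	} while (err == 0);	(err == 1: the last extent was returned)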
+ */ + while (entries < max_entries) { + ret = ext4_defrag_next_extent(inode, path, &ext); + if (ret == 0) { + /* Found the next extent (it means not the last one) */ + EXT_SET_EXTENT_DATA(ext, ext_info->ext[entries]); + entries++; + + /* + * In case @inode has > 'max_entries' extents, + * we must call this function again and restart from + * 'max_entries * n + 1'th extent. + * 'n' is the number of calling this function + * at the same @inode. + */ + if (entries == max_entries) { + ext_info->f_offset = + le32_to_cpu(ext->ee_block) + + le16_to_cpu(ext->ee_len); + /* Check the extent is the last one or not */ + ret = + ext4_defrag_next_extent(inode, path, &ext); + if (ret == 1) { + err = ret; + } else if (ret < 0) { + /* Failed to get the next extent */ + err = ret; + goto out; + } + break; + } + + } else if (ret == 1) { + /* The extent is the last one */ + ext_info->f_offset = 0; + err = ret; + break; + } else { + /* Failed to get the next extent */ + err = ret; + goto out; + } + } + + ext_info->entries = entries; + +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + up_write(&EXT4_I(inode)->i_data_sem); + iput(inode); + return err; +} + +/** + * ext4_defrag_reserve_blocks - Reserve blocks for defrag + * + * @org_inode: original inode + * @goal: the goal offset of the block reservation + * @len: blocks count we need to reserve + * + * This function returns 0 if succeed, otherwise returns error value. + */ + +static int +ext4_defrag_reserve_blocks(struct inode *org_inode, ext4_fsblk_t goal, int len) +{ + struct super_block *sb = NULL; + handle_t *handle; + struct buffer_head *bitmap_bh = NULL; + struct ext4_block_alloc_info *block_i; + struct ext4_reserve_window_node *my_rsv = NULL; + unsigned short windowsz = 0; + ext4_group_t group_no; + ext4_grpblk_t grp_target_blk; + int err = 0; + + down_write(&EXT4_I(org_inode)->i_data_sem); + + handle = ext4_journal_start(org_inode, EXT4_RESERVE_TRANS_BLOCKS); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + handle = NULL; + goto out; + } + + if (S_ISREG(org_inode->i_mode) && + !EXT4_I(org_inode)->i_block_alloc_info) { + ext4_init_block_alloc_info(org_inode); + } else if (!S_ISREG(org_inode->i_mode)) { + printk(KERN_ERR "ext4 defrag: Invalid file type\n"); + err = -EINVAL; + goto out; + } + + sb = org_inode->i_sb; + if (!sb) { + printk(KERN_ERR "ext4 defrag: Non-existent device\n"); + err = -ENXIO; + goto out; + } + ext4_get_group_no_and_offset(sb, goal, &group_no, + &grp_target_blk); + + block_i = EXT4_I(org_inode)->i_block_alloc_info; + /* Block reservation should be enabled */ + BUG_ON(!block_i); + + windowsz = block_i->rsv_window_node.rsv_goal_size; + /* Goal size should be set */ + BUG_ON(!windowsz); + + my_rsv = &block_i->rsv_window_node; + + bitmap_bh = ext4_read_block_bitmap(sb, group_no); + if (!bitmap_bh) { + err = -ENOSPC; + goto out; + } + + BUFFER_TRACE(bitmap_bh, "get undo access for new block"); + err = ext4_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto out; + + err = alloc_new_reservation(my_rsv, grp_target_blk, sb, + group_no, bitmap_bh); + if (err < 0) { + printk(KERN_ERR "ext4 defrag: Block reservation failed." 
+ "offset [%d], bg[%lu]\n", grp_target_blk, group_no); + ext4_discard_reservation(org_inode); + goto out; + } else if (len > EXT4_DEFAULT_RESERVE_BLOCKS) { + try_to_extend_reservation(my_rsv, sb, + len - EXT4_DEFAULT_RESERVE_BLOCKS); + } + +out: + up_write(&EXT4_I(org_inode)->i_data_sem); + ext4_journal_release_buffer(handle, bitmap_bh); + brelse(bitmap_bh); + + if (handle) + ext4_journal_stop(handle); + + return err; +} + +/** + * ext4_defrag_block_within_rsv - Is target extent reserved ? + * + * @org_inode: original inode + * @ex_start: physical block offset of the extent which already moved + * @ex_len: block length of the extent + * + * This function returns 0 if succeed, otherwise returns error value. + */ +static int +ext4_defrag_block_within_rsv(struct inode *org_inode, ext4_fsblk_t ex_start, + int ex_len) +{ + struct super_block *sb = org_inode->i_sb; + struct ext4_block_alloc_info *block_i; + ext4_group_t group_no; + ext4_grpblk_t grp_blk; + struct ext4_reserve_window_node *rsv; + + block_i = EXT4_I(org_inode)->i_block_alloc_info; + /* Block reservation should be enabled */ + BUG_ON(!block_i); + + /* Goal size should be set */ + BUG_ON(!block_i->rsv_window_node.rsv_goal_size); + + rsv = &block_i->rsv_window_node; + if (rsv_is_empty(&rsv->rsv_window)) { + printk(KERN_ERR "ext4 defrag: Reservation window is empty\n"); + return -ENOSPC; + } + + ext4_get_group_no_and_offset(sb, ex_start, &group_no, &grp_blk); + + if (!goal_in_my_reservation(&rsv->rsv_window, grp_blk, group_no, sb) + || !goal_in_my_reservation(&rsv->rsv_window, + grp_blk + ex_len - 1, group_no, sb)){ + /* Goal blocks are not in the reservation window */ + printk(KERN_ERR "ext4 defrag: %d or %d in bg %lu is " + "not in rsv_window\n", grp_blk, + grp_blk + ex_len - 1, group_no); + return -ENOSPC; + } + return 0; +} + +/* + * ext4_defrag_reserve_fblocks - + * Reserve free blocks with ext4_defrag_reserve_blocks + * + * @org_inode: original inode to get a block group number + * @ext_info: freeblocks distribution which stored extent-like style + * @ext_info->ext[]: an array of struct ext4_extents_data + * + * This function returns 0 if succeed, otherwise returns error value. + */ +static int +ext4_defrag_reserve_fblocks(struct inode *org_inode, + struct ext4_extents_info *ext_info) +{ + ext4_fsblk_t ex_start = 0; + int i, len, ret; + + for (i = 0; i < ext_info->entries; i++) { + ex_start = ext_info->ext[i].start; + len = ext_info->ext[i].len; + + ret = ext4_defrag_reserve_blocks(org_inode, ex_start, len); + if (ret < 0) { + printk(KERN_ERR "ext4 defrag: " + "Block reservation failed. offset [%llu], " + "length [%d]\n", ex_start, len); + goto err; + } + + /* Confirm that blocks are in the reservation window */ + ret = ext4_defrag_block_within_rsv(org_inode, ex_start, len); + if (ret < 0) { + printk(KERN_ERR "ext4 defrag: " + "Reservation window is not set. " + "offset [%llu], length [%d]\n", ex_start, len); + goto err; + } + } + return ret; + +err: + down_write(&EXT4_I(org_inode)->i_data_sem); + ext4_discard_reservation(org_inode); + up_write(&EXT4_I(org_inode)->i_data_sem); + return ret; +} + +/** + * ext4_defrag_move_victim - Create free space for defrag + * + * @target_filp: target file + * @ext_info: target extents array to move + * + * This function returns 0 if succeed, otherwise + * returns error value. 
+ */ +static int +ext4_defrag_move_victim(struct file *target_filp, + struct ext4_extents_info *ext_info) +{ + struct inode *org_inode = target_filp->f_dentry->d_inode; + struct super_block *sb = org_inode->i_sb; + struct file victim_file; + struct dentry victim_dent; + struct inode *victim_inode; + struct ext4_extent_data ext; + ext4_fsblk_t goal = ext_info->goal; + ext4_group_t group; + ext4_grpblk_t grp_off; + int ret, i; + + /* Setup dummy extent data */ + ext.len = 0; + + /* Get the inode of the victim file */ + victim_inode = ext4_iget(sb, ext_info->ino); + if (IS_ERR(victim_inode)) + return PTR_ERR(victim_inode); + + /* Setup file for the victim file */ + victim_dent.d_inode = victim_inode; + victim_file.f_dentry = &victim_dent; + victim_file.f_mapping = victim_inode->i_mapping; + + /* Set the goal appropriate offset */ + if (goal == -1) { + ext4_get_group_no_and_offset(victim_inode->i_sb, + ext_info->ext[0].start, &group, &grp_off); + goal = ext4_group_first_block_no(sb, group + 1); + } + + for (i = 0; i < ext_info->entries; i++) { + /* Move original blocks to another block group */ + ret = ext4_defrag(&victim_file, ext_info->ext[i].block, + ext_info->ext[i].len, goal, DEFRAG_FORCE_VICTIM, &ext); + if (ret < 0) { + printk(KERN_ERR "ext4 defrag: " + "Moving victim file failed. ino [%llu]\n", + ext_info->ino); + goto err; + } + + /* Sync journal blocks before reservation */ + ret = ext4_force_commit(sb); + if (ret) { + printk(KERN_ERR "ext4 defrag: " + "ext4_force_commit failed(%d)\n", ret); + goto err; + } + } + + iput(victim_inode); + return 0; +err: + down_write(&EXT4_I(org_inode)->i_data_sem); + ext4_discard_reservation(org_inode); + up_write(&EXT4_I(org_inode)->i_data_sem); + iput(victim_inode); + return ret; +} + +/** + * ext4_defrag_fblocks_distribution - Search free blocks distribution + * + * @org_inode: original inode + * @ext_info: ext4_extents_info + * + * This function returns 0 if succeed, otherwise returns error value. + */ +static int +ext4_defrag_fblocks_distribution(struct inode *org_inode, + struct ext4_extents_info *ext_info) +{ + struct buffer_head *bitmap_bh = NULL; + struct super_block *sb = org_inode->i_sb; + handle_t *handle; + ext4_group_t group_no; + ext4_grpblk_t start, end; + ext4_fsblk_t start_block = 0; + int i, err; + int num = 0; + int len = 0; + int block_set = 0; + int extra_block = 0; + + if (!sb) { + printk(KERN_ERR "ext4 defrag: Non-existent device\n"); + return -ENOSPC; + } + + group_no = (org_inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); + start = ext_info->g_offset; + end = EXT4_BLOCKS_PER_GROUP(sb) - 1; + + /* We consider about the boot block if bs = 1k */ + if (sb->s_blocksize == 1024) + extra_block = 1; + + handle = ext4_journal_start(org_inode, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + return err; + } + + bitmap_bh = ext4_read_block_bitmap(sb, group_no); + if (!bitmap_bh) { + err = -EIO; + goto out; + } + + BUFFER_TRACE(bitmap_bh, "get undo access for new block"); + err = ext4_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto out; + + for (i = start; i <= end ; i++) { + if (bitmap_search_next_usable_block(i, bitmap_bh, i + 1) >= 0) { + len++; + /* + * Reset start_block if the free block is + * the head of region. 
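+			 * e.g. for an availability pattern busy, free,
+			 * free, free, busy starting at bit b: start_block
+			 * is latched at b + 1, len counts up to 3, and
+			 * the else-branch below flushes {start_block, 3}
+			 * into ext_info->ext[]. (Illustrative only.)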
+ */ + if (!block_set) { + start_block = + i + group_no * EXT4_BLOCKS_PER_GROUP(sb) + + extra_block; + block_set = 1; + } + } else if (len) { + ext_info->ext[num].start = start_block; + ext_info->ext[num].len = len; + num++; + len = 0; + block_set = 0; + if (num == ext_info->max_entries) { + ext_info->g_offset = i + 1; + break; + } + } + if (i == end && len) { + ext_info->ext[num].start = start_block; + ext_info->ext[num].len = len; + num++; + } + } + + ext_info->entries = num; +out: + ext4_journal_release_buffer(handle, bitmap_bh); + brelse(bitmap_bh); + + if (handle) + ext4_journal_stop(handle); + + return err; +} + +int ext4_defrag_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int err = 0; + + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL || + cmd == EXT4_IOC_FIBMAP)) { + printk(KERN_ERR "ext4 defrag: ino[%lu] is not extents " + "based file\n", inode->i_ino); + return -EOPNOTSUPP; + } + + if (cmd == EXT4_IOC_FIBMAP) { + ext4_fsblk_t __user *p = (ext4_fsblk_t __user *)arg; + ext4_fsblk_t block = 0; + struct address_space *mapping = filp->f_mapping; + + if (copy_from_user(&block, (ext4_fsblk_t __user *)arg, + sizeof(block))) + return -EFAULT; + + block = ext4_bmap(mapping, block); + + return put_user(block, p); + } else if (cmd == EXT4_IOC_GROUP_INFO) { + struct ext4_group_data_info grp_data; + + if (copy_from_user(&grp_data, + (struct ext4_group_data_info __user *)arg, + sizeof(grp_data))) + return -EFAULT; + + grp_data.s_blocks_per_group = + EXT4_BLOCKS_PER_GROUP(inode->i_sb); + grp_data.s_inodes_per_group = + EXT4_INODES_PER_GROUP(inode->i_sb); + + if (copy_to_user((struct ext4_group_data_info __user *)arg, + &grp_data, sizeof(grp_data))) + return -EFAULT; + } else if (cmd == EXT4_IOC_FREE_BLOCKS_INFO) { + struct ext4_extents_info ext_info; + + if (copy_from_user(&ext_info, + (struct ext4_extents_info __user *)arg, + sizeof(ext_info))) + return -EFAULT; + + BUG_ON(ext_info.ino != inode->i_ino); + + err = ext4_defrag_fblocks_distribution(inode, &ext_info); + + if (!err) + err = copy_to_user( + (struct ext4_extents_info __user *)arg, + &ext_info, sizeof(ext_info)); + } else if (cmd == EXT4_IOC_EXTENTS_INFO) { + struct ext4_extents_info ext_info; + + if (copy_from_user(&ext_info, + (struct ext4_extents_info __user *)arg, + sizeof(ext_info))) + return -EFAULT; + + err = ext4_defrag_extents_info(inode->i_sb, &ext_info); + if (err >= 0) { + if (copy_to_user((struct ext4_extents_info __user *)arg, + &ext_info, sizeof(ext_info))) + return -EFAULT; + } + } else if (cmd == EXT4_IOC_RESERVE_BLOCK) { + struct ext4_extents_info ext_info; + + if (copy_from_user(&ext_info, + (struct ext4_extents_info __user *)arg, + sizeof(ext_info))) + return -EFAULT; + + err = ext4_defrag_reserve_fblocks(inode, &ext_info); + } else if (cmd == EXT4_IOC_MOVE_VICTIM) { + struct ext4_extents_info ext_info; + + if (copy_from_user(&ext_info, + (struct ext4_extents_info __user *)arg, + sizeof(ext_info))) + return -EFAULT; + + err = ext4_defrag_move_victim(filp, &ext_info); + + } else if (cmd == EXT4_IOC_BLOCK_RELEASE) { + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_reservation(inode); + up_write(&EXT4_I(inode)->i_data_sem); + } else if (cmd == EXT4_IOC_DEFRAG) { + struct ext4_ext_defrag_data defrag; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + + if (!capable(CAP_DAC_OVERRIDE)) { + if ((inode->i_mode & S_IRUSR) != S_IRUSR) + return -EACCES; + if (current->fsuid != inode->i_uid) + return -EACCES; + } + + if (copy_from_user(&defrag, + (struct 
ext4_ext_defrag_data __user *)arg, + sizeof(defrag))) + return -EFAULT; + + /* Check goal offset if goal offset was given from userspace */ + if (defrag.goal != -1 && + ext4_blocks_count(es) <= defrag.goal) { + printk(KERN_ERR "ext4 defrag: Invalid goal offset" + " %llu, you can set goal offset up to %llu\n", + defrag.goal, ext4_blocks_count(es) - 1); + return -EINVAL; + } + + err = ext4_defrag(filp, defrag.start_offset, + defrag.defrag_size, defrag.goal, defrag.flag, + &defrag.ext); + } + + return err; +} + +/** + * ext4_defrag_merge_across_blocks - Merge extents across leaf block + * + * @handle: journal handle + * @org_inode: original inode + * @o_start: first original extent to be defraged + * @o_end: last original extent to be defraged + * @start_ext: first new extent to be merged + * @new_ext: middle of new extent to be merged + * @end_ext: last new extent to be merged + * @phase: phase of the force defrag mode + * + * This function returns 0 if succeed, otherwise returns error value. + */ +static int +ext4_defrag_merge_across_blocks(handle_t *handle, struct inode *org_inode, + struct ext4_extent *o_start, struct ext4_extent *o_end, + struct ext4_extent *start_ext, struct ext4_extent *new_ext, + struct ext4_extent *end_ext, int phase) +{ + struct ext4_ext_path *org_path = NULL; + ext4_lblk_t eblock = 0; + int new_flag = 0; + int end_flag = 0; + int defrag_flag; + int err; + + if (phase == DEFRAG_FORCE_VICTIM) + defrag_flag = 1; + else + defrag_flag = 0; + + if (le16_to_cpu(start_ext->ee_len) && + le16_to_cpu(new_ext->ee_len) && + le16_to_cpu(end_ext->ee_len)) { + + if (o_start == o_end) { + + /* start_ext new_ext end_ext + * dest |---------|-----------|--------| + * org |------------------------------| + */ + + end_flag = 1; + } else { + + /* start_ext new_ext end_ext + * dest |---------|----------|---------| + * org |---------------|--------------| + */ + + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); + } + + o_start->ee_len = start_ext->ee_len; + new_flag = 1; + + } else if (le16_to_cpu(start_ext->ee_len) && + le16_to_cpu(new_ext->ee_len) && + !le16_to_cpu(end_ext->ee_len) && + o_start == o_end) { + + /* start_ext new_ext + * dest |--------------|---------------| + * org |------------------------------| + */ + + o_start->ee_len = start_ext->ee_len; + new_flag = 1; + + } else if (!le16_to_cpu(start_ext->ee_len) && + le16_to_cpu(new_ext->ee_len) && + le16_to_cpu(end_ext->ee_len) && + o_start == o_end) { + + /* new_ext end_ext + * dest |--------------|---------------| + * org |------------------------------| + */ + + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); + + /* + * Set 0 to the extent block if new_ext was + * the first block. 
+ */ + if (!new_ext->ee_block) + eblock = 0; + else + eblock = le32_to_cpu(new_ext->ee_block); + + new_flag = 1; + } else { + printk(KERN_ERR "ext4 defrag: Unexpected merge case\n"); + return -EIO; + } + + if (new_flag) { + org_path = ext4_ext_find_extent(org_inode, eblock, NULL); + if (IS_ERR(org_path)) { + err = PTR_ERR(org_path); + org_path = NULL; + goto out; + } + err = ext4_ext_insert_extent_defrag(handle, org_inode, + org_path, new_ext, defrag_flag); + if (err) + goto out; + } + + if (end_flag) { + org_path = ext4_ext_find_extent(org_inode, + le32_to_cpu(end_ext->ee_block) - 1, org_path); + if (IS_ERR(org_path)) { + err = PTR_ERR(org_path); + org_path = NULL; + goto out; + } + err = ext4_ext_insert_extent_defrag(handle, org_inode, + org_path, end_ext, defrag_flag); + if (err) + goto out; + } +out: + if (org_path) { + ext4_ext_drop_refs(org_path); + kfree(org_path); + } + + return err; + +} + +/** + * ext4_defrag_merge_inside_block - Merge new extent to the extent block + * + * @o_start: first original extent to be merged + * @o_end: last original extent to be merged + * @start_ext: first new extent to be merged + * @new_ext: middle of new extent to be merged + * @end_ext: last new extent to be merged + * @eh: extent header of target leaf block + * @replaced: the number of blocks which will be replaced with new_ext + * @range_to_move: used to decide how to merge + * + * This function always returns 0. + */ +static int +ext4_defrag_merge_inside_block(struct ext4_extent *o_start, + struct ext4_extent *o_end, struct ext4_extent *start_ext, + struct ext4_extent *new_ext, struct ext4_extent *end_ext, + struct ext4_extent_header *eh, ext4_fsblk_t replaced, + int range_to_move) +{ + int i = 0; + unsigned len; + + /* Move the existing extents */ + if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) { + len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) - + (unsigned long)(o_end + 1); + memmove(o_end + 1 + range_to_move, o_end + 1, len); + } + + /* Insert start entry */ + if (le16_to_cpu(start_ext->ee_len)) + o_start[i++].ee_len = start_ext->ee_len; + + /* Insert new entry */ + if (le16_to_cpu(new_ext->ee_len)) { + o_start[i].ee_block = new_ext->ee_block; + o_start[i].ee_len = cpu_to_le16(replaced); + ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext)); + } + + /* Insert end entry */ + if (end_ext->ee_len) + o_start[i] = *end_ext; + + /* Increment the total entries counter on the extent block */ + le16_add_cpu(&eh->eh_entries, range_to_move); + + return 0; +} + +/** + * ext4_defrag_merge_extents - Merge new extent + * + * @handle: journal handle + * @org_inode: original inode + * @org_path: path indicates first extent to be defraged + * @o_start: first original extent to be defraged + * @o_end: last original extent to be defraged + * @start_ext: first new extent to be merged + * @new_ext: middle of new extent to be merged + * @end_ext: last new extent to be merged + * @replaced: the number of blocks which will be replaced with new_ext + * @phase: phase of the force defrag mode + * + * This function returns 0 if succeed, otherwise returns error value. 
+ */ +static int +ext4_defrag_merge_extents(handle_t *handle, struct inode *org_inode, + struct ext4_ext_path *org_path, + struct ext4_extent *o_start, struct ext4_extent *o_end, + struct ext4_extent *start_ext, struct ext4_extent *new_ext, + struct ext4_extent *end_ext, ext4_fsblk_t replaced, int phase) +{ + struct ext4_extent_header *eh; + unsigned need_slots, slots_range; + int range_to_move, depth, ret; + + /* + * The extents need to be inserted + * start_extent + new_extent + end_extent. + */ + need_slots = (le16_to_cpu(start_ext->ee_len) ? 1 : 0) + + (le16_to_cpu(end_ext->ee_len) ? 1 : 0) + + (le16_to_cpu(new_ext->ee_len) ? 1 : 0); + + /* The number of slots between start and end */ + slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1) + / sizeof(struct ext4_extent); + + /* Range to move the end of extent */ + range_to_move = need_slots - slots_range; + depth = org_path->p_depth; + org_path += depth; + eh = org_path->p_hdr; + + if (depth) { + /* Register to journal */ + ret = ext4_journal_get_write_access(handle, org_path->p_bh); + if (ret) + return ret; + } + + /* Expansion */ + if (range_to_move > 0 && + (range_to_move > le16_to_cpu(eh->eh_max) + - le16_to_cpu(eh->eh_entries))) { + + ret = ext4_defrag_merge_across_blocks(handle, org_inode, + o_start, o_end, start_ext, new_ext, + end_ext, phase); + if (ret < 0) + return ret; + } else { + ret = ext4_defrag_merge_inside_block(o_start, o_end, + start_ext, new_ext, end_ext, eh, + replaced, range_to_move); + if (ret < 0) + return ret; + } + + if (depth) { + ret = ext4_journal_dirty_metadata(handle, org_path->p_bh); + if (ret) + return ret; + } else { + ret = ext4_mark_inode_dirty(handle, org_inode); + if (ret < 0) + return ret; + } + + return 0; + +} + +/** + * ext4_defrag_leaf_block - Defragmentation for one leaf extent block + * + * @handle: journal handle + * @org_inode: original inode + * @org_path: path indicates first extent to be defraged + * @dext: destination extent + * @from: start offset on the target file + * @phase: phase of the force defrag mode + * + * This function returns 0 if succeed, otherwise returns error value. 
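+ *
+ * Example of the split performed here (illustrative numbers): if the
+ * original extent covers logical blocks 0-99 and @dext replaces
+ * blocks 30-69, the originals are carved into
+ *
+ *	start_ext: logical 0,  len 30	(head kept from the original)
+ *	new_ext:   logical 30, len 40	(the destination blocks)
+ *	end_ext:   logical 70, len 30	(tail kept from the original)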
+ */ +static int +ext4_defrag_leaf_block(handle_t *handle, struct inode *org_inode, + struct ext4_ext_path *org_path, struct ext4_extent *dext, + ext4_lblk_t *from, int phase) +{ + struct ext4_extent *oext, *o_start = NULL, *o_end = NULL, *prev_ext; + struct ext4_extent new_ext, start_ext, end_ext; + ext4_fsblk_t replaced = 0; + ext4_lblk_t new_end, lblock; + unsigned long depth; + unsigned short len; + ext4_fsblk_t new_phys_end; + int ret; + + depth = ext_depth(org_inode); + start_ext.ee_len = end_ext.ee_len = 0; + o_start = o_end = oext = org_path[depth].p_ext; + ext4_ext_store_pblock(&new_ext, ext_pblock(dext)); + new_ext.ee_len = dext->ee_len; + len = le16_to_cpu(new_ext.ee_len); + new_ext.ee_block = cpu_to_le32(*from); + lblock = le32_to_cpu(oext->ee_block); + new_end = le32_to_cpu(new_ext.ee_block) + + le16_to_cpu(new_ext.ee_len) - 1; + new_phys_end = ext_pblock(&new_ext) + + le16_to_cpu(new_ext.ee_len) - 1; + + /* + * First original extent + * dest |---------------| + * org |---------------| + */ + if (le32_to_cpu(new_ext.ee_block) > + le32_to_cpu(oext->ee_block) && + le32_to_cpu(new_ext.ee_block) < + le32_to_cpu(oext->ee_block) + + le16_to_cpu(oext->ee_len)) { + start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) + - le32_to_cpu(oext->ee_block)); + replaced += le16_to_cpu(oext->ee_len) + - le16_to_cpu(start_ext.ee_len); + } else if (oext > EXT_FIRST_EXTENT(org_path[depth].p_hdr)) { + /* We can merge previous extent. */ + prev_ext = oext - 1; + if (((ext_pblock(prev_ext) + le16_to_cpu(prev_ext->ee_len)) + == ext_pblock(&new_ext)) + && (le32_to_cpu(prev_ext->ee_block) + + le16_to_cpu(prev_ext->ee_len) + == le32_to_cpu(new_ext.ee_block))) { + o_start = prev_ext; + start_ext.ee_len = cpu_to_le16( + le16_to_cpu(prev_ext->ee_len) + + le16_to_cpu(new_ext.ee_len)); + new_ext.ee_len = 0; + } + } + + for (;;) { + /* The extent for destination must be found. */ + BUG_ON(!oext || lblock != le32_to_cpu(oext->ee_block)); + lblock += le16_to_cpu(oext->ee_len); + + /* + * Middle of original extent + * dest |-------------------| + * org |-----------------| + */ + if (le32_to_cpu(new_ext.ee_block) <= + le32_to_cpu(oext->ee_block) && + new_end >= le32_to_cpu(oext->ee_block) + + le16_to_cpu(oext->ee_len) - 1) + replaced += le16_to_cpu(oext->ee_len); + + /* + * Last original extent + * dest |----------------| + * org |---------------| + */ + if (new_end >= le32_to_cpu(oext->ee_block) && + new_end < le32_to_cpu(oext->ee_block) + + le16_to_cpu(oext->ee_len) - 1) { + end_ext.ee_len + = cpu_to_le16(le32_to_cpu(oext->ee_block) + + le16_to_cpu(oext->ee_len) - 1 - new_end); + ext4_ext_store_pblock(&end_ext, (ext_pblock(o_end) + + le16_to_cpu(oext->ee_len) + - le16_to_cpu(end_ext.ee_len))); + end_ext.ee_block + = cpu_to_le32(le32_to_cpu(o_end->ee_block) + + le16_to_cpu(oext->ee_len) + - le16_to_cpu(end_ext.ee_len)); + replaced += le16_to_cpu(oext->ee_len) + - le16_to_cpu(end_ext.ee_len); + } + + /* + * Detected the block end, reached the number of replaced + * blocks to dext->ee_len. Then merge the extent. 
+ */ + if (oext == EXT_LAST_EXTENT(org_path[depth].p_hdr) || + new_end <= le32_to_cpu(oext->ee_block) + + le16_to_cpu(oext->ee_len) - 1) { + ret = ext4_defrag_merge_extents(handle, org_inode, + org_path, o_start, o_end, &start_ext, + &new_ext, &end_ext, replaced, phase); + if (ret < 0) + return ret; + + /* All expected blocks are replaced */ + if (le16_to_cpu(new_ext.ee_len) <= 0) + return 0; + + /* Re-calculate new_ext */ + le16_add_cpu(&new_ext.ee_len, -replaced); + le32_add_cpu(&new_ext.ee_block, replaced); + ext4_ext_store_pblock(&new_ext, ext_pblock(&new_ext) + + replaced); + replaced = 0; + start_ext.ee_len = end_ext.ee_len = 0; + o_start = NULL; + + /* All expected blocks are replaced. */ + if (le16_to_cpu(new_ext.ee_len) <= 0) + return 0; + } + + /* Get the next extent for original. */ + if (org_path) + ext4_ext_drop_refs(org_path); + org_path = ext4_ext_find_extent(org_inode, lblock, org_path); + if (IS_ERR(org_path)) { + ret = PTR_ERR(org_path); + org_path = NULL; + return ret; + } + depth = ext_depth(org_inode); + oext = org_path[depth].p_ext; + if (le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len) + <= lblock) + return -ENOENT; + + o_end = oext; + if (!o_start) + o_start = oext; + } +} + +/** + * ext4_defrag_replace_branches - Replace original extents with new extents + * + * @handle: journal handle + * @org_inode: original inode + * @dest_inode: temporary inode + * @from_page: page offset of org_inode + * @dest_from_page: page offset of dest_inode + * @count_page: page count to be replaced + * @phase: phase of the force defrag mode + * + * This function returns 0 if succeed, otherwise returns error value. + * Replace extents for blocks from "from" to "from + count - 1". + */ +static int +ext4_defrag_replace_branches(handle_t *handle, struct inode *org_inode, + struct inode *dest_inode, pgoff_t from_page, + pgoff_t dest_from_page, pgoff_t count_page, int phase) +{ + struct ext4_ext_path *org_path = NULL; + struct ext4_ext_path *dest_path = NULL; + struct ext4_extent *oext, *dext, *swap_ext; + struct ext4_extent tmp_ext, tmp_ext2; + ext4_lblk_t from, count, dest_off, diff, org_diff; + int err = 0; + int depth; + int replaced_count = 0; + + from = (ext4_lblk_t)from_page << + (PAGE_CACHE_SHIFT - dest_inode->i_blkbits); + count = (ext4_lblk_t)count_page << + (PAGE_CACHE_SHIFT - dest_inode->i_blkbits); + dest_off = (ext4_lblk_t)dest_from_page << + (PAGE_CACHE_SHIFT - dest_inode->i_blkbits); + + /* Get the original extent for the block "from" */ + org_path = ext4_ext_find_extent(org_inode, from, NULL); + if (IS_ERR(org_path)) { + err = PTR_ERR(org_path); + org_path = NULL; + goto out; + } + + /* Get the destination extent for the head */ + dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL); + if (IS_ERR(dest_path)) { + err = PTR_ERR(dest_path); + dest_path = NULL; + goto out; + } + depth = ext_depth(dest_inode); + dext = dest_path[depth].p_ext; + /* When dext is too large, pick up the target range. 
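+	 * e.g. if dext covers logical blocks 100-199 but dest_off is
+	 * 130, diff is 30 and tmp_ext becomes a 70-block extent starting
+	 * at block 130, capped to `count' just below (illustrative
+	 * numbers).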
*/ + diff = dest_off - le32_to_cpu(dext->ee_block); + ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff); + tmp_ext.ee_block = cpu_to_le32(le32_to_cpu(dext->ee_block) + diff); + tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff); + if (count < le16_to_cpu(tmp_ext.ee_len)) + tmp_ext.ee_len = cpu_to_le16(count); + dext = &tmp_ext; + + depth = ext_depth(org_inode); + oext = org_path[depth].p_ext; + org_diff = from - le32_to_cpu(oext->ee_block); + ext4_ext_store_pblock(&tmp_ext2, ext_pblock(oext) + org_diff); + tmp_ext2.ee_block = tmp_ext.ee_block; + + /* Adjust extent length when blocksize != pagesize */ + if (le16_to_cpu(tmp_ext.ee_len) <= + le16_to_cpu(oext->ee_len) - org_diff) { + tmp_ext2.ee_len = tmp_ext.ee_len; + } else { + tmp_ext2.ee_len = cpu_to_le16(le16_to_cpu(oext->ee_len) + - org_diff); + tmp_ext.ee_len = tmp_ext2.ee_len; + } + swap_ext = &tmp_ext2; + + /* Loop for the destination extents */ + while (1) { + /* The extent for destination must be found. */ + BUG_ON(!dext || dest_off != le32_to_cpu(dext->ee_block)); + + /* Loop for the original extent blocks */ + err = ext4_defrag_leaf_block(handle, org_inode, + org_path, dext, &from, phase); + if (err < 0) + goto out; + + /* + * We need the function which fixes extent information for + * inserting. + * e.g. ext4_defrag_merge_extents() + */ + err = ext4_defrag_leaf_block(handle, dest_inode, + dest_path, swap_ext, &dest_off, -1); + if (err < 0) + goto out; + + replaced_count += le16_to_cpu(dext->ee_len); + dest_off += le16_to_cpu(dext->ee_len); + from += le16_to_cpu(dext->ee_len); + + /* Already moved the expected blocks */ + if (replaced_count >= count) + break; + + if (org_path) + ext4_ext_drop_refs(org_path); + org_path = ext4_ext_find_extent(org_inode, from, NULL); + if (IS_ERR(org_path)) { + err = PTR_ERR(org_path); + org_path = NULL; + goto out; + } + depth = ext_depth(org_inode); + oext = org_path[depth].p_ext; + if (le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len) + <= from) { + err = 0; + goto out; + } + + if (dest_path) + ext4_ext_drop_refs(dest_path); + dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL); + if (IS_ERR(dest_path)) { + err = PTR_ERR(dest_path); + dest_path = NULL; + goto out; + } + depth = ext_depth(dest_inode); + dext = dest_path[depth].p_ext; + if (le32_to_cpu(dext->ee_block) + le16_to_cpu(dext->ee_len) + <= dest_off) { + err = 0; + goto out; + } + + /* When dext is too large, pick up the target range. 
+		diff = dest_off - le32_to_cpu(dext->ee_block);
+		ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
+		tmp_ext.ee_block =
+			cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
+		tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
+
+		if (count - replaced_count < le16_to_cpu(tmp_ext.ee_len))
+			tmp_ext.ee_len = cpu_to_le16(count - replaced_count);
+
+		dext = &tmp_ext;
+
+		org_diff = from - le32_to_cpu(oext->ee_block);
+		ext4_ext_store_pblock(&tmp_ext2, ext_pblock(oext) + org_diff);
+		tmp_ext2.ee_block = tmp_ext.ee_block;
+
+		/* Adjust extent length when blocksize != pagesize */
+		if (le16_to_cpu(tmp_ext.ee_len) <=
+		    le16_to_cpu(oext->ee_len) - org_diff) {
+			tmp_ext2.ee_len = tmp_ext.ee_len;
+		} else {
+			tmp_ext2.ee_len = cpu_to_le16(le16_to_cpu(oext->ee_len)
+						- org_diff);
+			tmp_ext.ee_len = tmp_ext2.ee_len;
+		}
+		swap_ext = &tmp_ext2;
+	}
+
+out:
+	if (org_path) {
+		ext4_ext_drop_refs(org_path);
+		kfree(org_path);
+	}
+	if (dest_path) {
+		ext4_ext_drop_refs(dest_path);
+		kfree(dest_path);
+	}
+
+	return err;
+}
+
+/**
+ * ext4_defrag_fill_ar - Prepare a multiple block allocation request for tmp inode
+ *
+ * @org_inode: original inode
+ * @dest_inode: temporary inode
+ * @ar: allocation request for multiple block allocation
+ * @org_path: path indicating the original inode's extent
+ * @dest_path: path indicating the temporary inode's extent
+ * @req_blocks: contiguous blocks count we need
+ * @iblock: target file offset
+ * @goal: goal offset
+ * @phase: phase of the force defrag mode
+ */
+static void
+ext4_defrag_fill_ar(struct inode *org_inode, struct inode *dest_inode,
+			struct ext4_allocation_request *ar,
+			struct ext4_ext_path *org_path,
+			struct ext4_ext_path *dest_path,
+			ext4_fsblk_t req_blocks, ext4_lblk_t iblock,
+			ext4_fsblk_t goal, int phase)
+{
+	ext4_group_t org_grp_no;
+	ext4_grpblk_t org_blk_off;
+	int org_depth = ext_depth(org_inode);
+
+	if (phase == DEFRAG_FORCE_VICTIM) {
+		ext4_get_group_no_and_offset(org_inode->i_sb,
+				ext_pblock(org_path[org_depth].p_ext),
+				&org_grp_no, &org_blk_off);
+		ar->excepted_group = org_grp_no;
+	} else {
+		/* Allocate contiguous blocks in any block group */
+		ar->excepted_group = -1;
+	}
+
+	ar->inode = dest_inode;
+	ar->len = req_blocks;
+	ar->logical = iblock;
+	ar->flags = EXT4_MB_HINT_DATA | EXT4_MB_HINT_RESERVED
+			| EXT4_MB_HINT_NOPREALLOC;
+	ar->lleft = 0;
+	ar->pleft = 0;
+	ar->lright = 0;
+	ar->pright = 0;
+
+	if (goal)
+		ar->goal = goal;
+	else
+		ar->goal = ext4_ext_find_goal(dest_inode, dest_path, iblock);
+}
+
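+/*
+ * For reference, a request filled in by ext4_defrag_fill_ar() for the
+ * normal (non-victim) case looks roughly like this (illustrative values;
+ * req_blocks and iblock come from the caller):
+ *
+ *	ar.inode          = dest_inode;
+ *	ar.len            = req_blocks;	// e.g. 2048 blocks
+ *	ar.logical        = iblock;
+ *	ar.excepted_group = -1;		// no block group is excluded
+ *	ar.flags          = EXT4_MB_HINT_DATA | EXT4_MB_HINT_RESERVED |
+ *			    EXT4_MB_HINT_NOPREALLOC;
+ *	ar.goal           = goal ?: ext4_ext_find_goal(dest_inode,
+ *						       dest_path, iblock);
+ */
+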
+/**
+ * ext4_defrag_alloc_blocks - Allocate contiguous blocks to temporary inode
+ *
+ * @handle: journal handle
+ * @org_inode: original inode
+ * @dest_inode: temporary inode for multiple block allocation
+ * @ar: allocation request for multiple block allocation
+ * @dest_path: path indicating the temporary inode's extent
+ * @newblock: start offset of contiguous blocks
+ *
+ * This function returns 0 if succeeded, otherwise returns an error value.
+ */
+static int
+ext4_defrag_alloc_blocks(handle_t *handle, struct inode *org_inode,
+		struct inode *dest_inode, struct ext4_allocation_request *ar,
+		struct ext4_ext_path *dest_path, ext4_fsblk_t *newblock)
+{
+	struct super_block *sb = org_inode->i_sb;
+	struct buffer_head *bh = NULL;
+	int err, i, credits = 0;
+
+	credits = ext4_ext_calc_credits_for_insert(dest_inode, dest_path);
+	err = ext4_ext_journal_restart(handle,
+			credits + EXT4_TRANS_META_BLOCKS);
+	if (err)
+		return err;
+
+	*newblock = ext4_mb_new_blocks(handle, ar, &err);
+	if (err)
+		return err;
+
+	/*
+	 * A dirty buffer_head would cause an overwrite
+	 * if ext4_mb_new_blocks() allocates a block
+	 * which used to be a metadata block.
+	 * We should call unmap_underlying_metadata()
+	 * to clear the dirty flag.
+	 */
+	for (i = 0; i < ar->len; i++) {
+		bh = sb_find_get_block(sb, *newblock + i);
+		unmap_underlying_metadata(sb->s_bdev, *newblock + i);
+	}
+
+	return err;
+}
+
+/**
+ * ext4_defrag_check_phase
+ *	- Check the condition of allocated blocks (force defrag mode only)
+ *
+ * @ar: allocation request for multiple block allocation
+ * @dest_grp_no: block group number of the allocated blocks
+ * @goal_grp_no: block group number of the block allocation destination
+ * @alloc_total: sum total of the allocated blocks
+ * @req_blocks: contiguous blocks count we need
+ * @phase: phase of the force defrag mode
+ *
+ * This function returns 0 if succeeded, otherwise returns an error value.
+ */
+static int
+ext4_defrag_check_phase(struct ext4_allocation_request *ar,
+		ext4_group_t dest_grp_no, ext4_group_t goal_grp_no,
+		ext4_fsblk_t alloc_total, ext4_lblk_t req_blocks,
+		int phase)
+{
+	int err = 0;
+
+	switch (phase) {
+	case DEFRAG_FORCE_TRY:
+		/* If there is not enough space, return -ENOSPC. */
+		if (ar->len != req_blocks)
+			/* -ENOSPC triggers the DEFRAG_FORCE_VICTIM phase. */
+			err = -ENOSPC;
+		break;
+	case DEFRAG_FORCE_VICTIM:
+		/* We can't allocate new blocks in the same block group. */
+		if (dest_grp_no == ar->excepted_group) {
+			printk(KERN_ERR "ext4 defrag: Failed to allocate"
+				" victim file to other block group\n");
+			err = -ENOSPC;
+		}
+		break;
+	case DEFRAG_FORCE_GATHER:
+		/* The reserved blocks may already be in use by another process. */
+		if (dest_grp_no != goal_grp_no
+			|| alloc_total != req_blocks) {
+			printk(KERN_ERR "ext4 defrag: Reserved blocks are"
+				" already used by other process\n");
+			err = -EIO;
+		}
+		break;
+	}
+
+	return err;
+}
+
+/**
+ * ext4_defrag_partial - Defrag a file per page
+ *
+ * @tmp_inode: temporary inode
+ * @filp: pointer to file
+ * @org_offset: page index on the original file
+ * @dest_offset: page index on the temporary file
+ * @phase: phase of the force defrag mode
+ *
+ * This function returns 0 if succeeded, otherwise returns an error value.
+ */
+static int
+ext4_defrag_partial(struct inode *tmp_inode, struct file *filp,
+			pgoff_t org_offset, pgoff_t dest_offset, int phase)
+{
+	struct inode *org_inode = filp->f_dentry->d_inode;
+	struct address_space *mapping = org_inode->i_mapping;
+	struct buffer_head *bh;
+	struct page *page;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	handle_t *handle;
+	pgoff_t offset_in_page = PAGE_SIZE;
+	int ret, i, jblocks, blocks_per_page;
+	int blocksize = org_inode->i_sb->s_blocksize;
+	long long offs = org_offset << PAGE_CACHE_SHIFT;
+	unsigned long blk_off = 0;
+	unsigned int w_flags = 0;
+	void *fsdata;
+
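+	/*
+	 * Illustrative credit arithmetic: ext4_writepage_trans_blocks()
+	 * estimates the journal credits needed to rewrite one page worth
+	 * of blocks plus the metadata (bitmaps, group descriptors, etc.)
+	 * that may change with them; the factor of two below reserves
+	 * that worst case once for org_inode and once for tmp_inode.
+	 */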
+	/*
+	 * We need twice the amount of ordinary journal buffers because
+	 * inode and tmp_inode may each change different metadata blocks.
+	 */
+	jblocks = ext4_writepage_trans_blocks(org_inode) * 2;
+	handle = ext4_journal_start(org_inode, jblocks);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		return ret;
+	}
+
+	if (segment_eq(get_fs(), KERNEL_DS))
+		w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+	if (org_offset == ((org_inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
+		offset_in_page = (org_inode->i_size & (PAGE_CACHE_SIZE - 1));
+		/*
+		 * Set offset_in_page to PAGE_CACHE_SIZE so that it is not 0
+		 * when org_offset is the last page and i_size is a
+		 * multiple of PAGE_CACHE_SIZE.
+		 */
+		if (offset_in_page == 0)
+			offset_in_page = PAGE_CACHE_SIZE;
+	}
+
+	up_write(&EXT4_I(org_inode)->i_data_sem);
+	ret = a_ops->write_begin(filp, mapping, offs,
+				offset_in_page, w_flags, &page, &fsdata);
+	down_write(&EXT4_I(org_inode)->i_data_sem);
+
+	if (unlikely(ret < 0))
+		goto out;
+
+	if (!PageUptodate(page)) {
+		mapping->a_ops->readpage(filp, page);
+		lock_page(page);
+	}
+
+	/*
+	 * try_to_release_page() doesn't call releasepage in writeback mode.
+	 * We should care about the order of writing to the same file
+	 * by multiple defrag processes.
+	 * We need to call wait_on_page_writeback() to wait for the
+	 * writeback of the page.
+	 */
+	if (PageWriteback(page))
+		wait_on_page_writeback(page);
+
+	/* Release old bh and drop refs */
+	try_to_release_page(page, 0);
+	ret = ext4_defrag_replace_branches(handle, org_inode, tmp_inode,
+					org_offset, dest_offset, 1, phase);
+	if (ret < 0)
+		goto out;
+
+	/* Clear the inode cache so it does not refer to the old data */
+	ext4_ext_invalidate_cache(org_inode);
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << org_inode->i_blkbits, 0);
+
+	blocks_per_page = PAGE_SIZE / blocksize;
+	blk_off = org_offset * blocks_per_page;
+
+	bh = page_buffers(page);
+	for (i = 0; i < blocks_per_page; i++) {
+		up_write(&EXT4_I(org_inode)->i_data_sem);
+		ret = ext4_get_block(org_inode, blk_off++, bh, 0);
+		down_write(&EXT4_I(org_inode)->i_data_sem);
+
+		if (ret < 0)
+			goto out;
+
+		if (bh->b_this_page != NULL)
+			bh = bh->b_this_page;
+	}
+
+	ret = a_ops->write_end(filp, mapping, offs, offset_in_page,
+				offset_in_page, page, fsdata);
+
+	if (unlikely(ret < 0))
+		goto out;
+out:
+	ext4_journal_stop(handle);
+
+	return (ret < 0 ? ret : 0);
+}
+
+/**
+ * ext4_defrag_comp_ext_count - Check whether fragmentation is improved or not
+ *
+ * @org_inode: original inode
+ * @org_path: the structure holding some info about the
+ *	      original extent tree
+ * @tar_end: the last block number of the allocated blocks
+ * @sum_tmp: the extents count in the allocated blocks
+ * @goal: block offset for allocation
+ * @phase: phase of the force defrag mode
+ *
+ * This function returns the values as below.
+ *	0 (improved)
+ *	1 (not improved)
+ *	negative value (error case)
+ */
+static int
+ext4_defrag_comp_ext_count(struct inode *org_inode,
+			struct ext4_ext_path *org_path, ext4_lblk_t tar_end,
+			int sum_tmp, ext4_fsblk_t goal, int phase)
+{
+	struct ext4_extent *ext = NULL;
+	int depth = ext_depth(org_inode);
+	int last_extent = 0;
+	int sum_org = 0;
+	int ret = 0;
+
+	ext = org_path[depth].p_ext;
+
+	/*
+	 * Compare the number of newly allocated extents to
+	 * the number of existing ones.
+	 */
+	while (1) {
+		if (!last_extent)
+			++sum_org;
+		if (tar_end <= (le32_to_cpu(ext->ee_block) +
+				le16_to_cpu(ext->ee_len) - 1) ||
+				last_extent) {
+			/*
+			 * Fail if goal is not set and the fragmentation
+			 * is not improved.
+			 */
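+			/*
+			 * E.g. (hypothetical): a range that had sum_org = 5
+			 * extents rebuilt into sum_tmp = 2 extents is an
+			 * improvement (returns 0); 5 rebuilt into 5 with no
+			 * goal set returns 1; 5 into 6 returns -ENOSPC.
+			 */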
+			if (sum_org == sum_tmp && !goal) {
+				/* Not improved */
+				ret = 1;
+			} else if (sum_org < sum_tmp &&
+					phase != DEFRAG_FORCE_VICTIM) {
+				/* Fragmentation increased */
+				ret = -ENOSPC;
+				printk(KERN_ERR "ext4 defrag: "
+					"Insufficient free blocks\n");
+			}
+			break;
+		}
+		last_extent =
+			ext4_defrag_next_extent(org_inode, org_path, &ext);
+		if (last_extent < 0) {
+			ret = last_extent;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * ext4_defrag_new_extent_tree - Get contiguous blocks and build an extent tree
+ *
+ * @org_inode: original inode
+ * @tmp_inode: temporary inode
+ * @org_path: path indicating the original inode's extent
+ * @tar_start: starting offset to allocate in blocks
+ * @tar_blocks: the number of blocks to allocate
+ * @iblock: file-related offset
+ * @goal: block offset for allocation
+ * @phase: phase of the force defrag mode
+ *
+ * This function returns the values as below:
+ *	0 (succeeded)
+ *	1 (not improved)
+ *	negative value (error case)
+ */
+static int
+ext4_defrag_new_extent_tree(struct inode *org_inode, struct inode *tmp_inode,
+			struct ext4_ext_path *org_path, ext4_lblk_t tar_start,
+			ext4_lblk_t tar_blocks, ext4_lblk_t iblock,
+			ext4_fsblk_t goal, int phase)
+{
+	handle_t *handle;
+	struct ext4_extent_header *eh = NULL;
+	struct ext4_allocation_request ar;
+	struct ext4_ext_path *dest_path = NULL;
+	struct ext4_extent newex;
+	ext4_fsblk_t alloc_total = 0;
+	ext4_fsblk_t newblock = 0;
+	ext4_lblk_t tar_end = tar_start + tar_blocks - 1;
+	ext4_group_t dest_group_no, goal_group_no;
+	ext4_grpblk_t dest_blk_off, goal_blk_off;
+	int sum_tmp = 0;
+	int metadata = 1;
+	int ret, ret2;
+
+	eh = ext_inode_hdr(tmp_inode);
+	eh->eh_depth = 0;
+
+	dest_path = ext4_ext_find_extent(tmp_inode, iblock, NULL);
+	if (IS_ERR(dest_path)) {
+		ret = PTR_ERR(dest_path);
+		dest_path = NULL;
+		goto out2;
+	}
+
+	/* Fill struct ext4_allocation_request with necessary info */
+	ext4_defrag_fill_ar(org_inode, tmp_inode, &ar, org_path,
+				dest_path, tar_blocks, iblock, goal, phase);
+
+	handle = ext4_journal_start(tmp_inode, 0);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out2;
+	}
+
+	ext4_get_group_no_and_offset(tmp_inode->i_sb, goal,
+				&goal_group_no, &goal_blk_off);
+
+	while (alloc_total != tar_blocks) {
+		/* Allocate blocks */
+		ret = ext4_defrag_alloc_blocks(handle, org_inode, tmp_inode,
+						&ar, dest_path, &newblock);
+		if (ret < 0)
+			goto out;
+
+		ext4_get_group_no_and_offset(tmp_inode->i_sb, newblock,
+					&dest_group_no, &dest_blk_off);
+
+		alloc_total += ar.len;
+
+		/* The checks done only in force mode */
+		if (phase) {
+			ret = ext4_defrag_check_phase(&ar, dest_group_no,
+						goal_group_no, alloc_total,
+						tar_blocks, phase);
+			if (ret < 0)
+				goto out;
+		}
+
+		newex.ee_block = cpu_to_le32(alloc_total - ar.len);
+		ext4_ext_store_pblock(&newex, newblock);
+		newex.ee_len = cpu_to_le16(ar.len);
+
+		ret = ext4_ext_insert_extent(handle, tmp_inode,
+						dest_path, &newex);
+		if (ret < 0)
+			goto out;
+
+		if (!phase)
+			ar.goal = newblock + ar.len;
+		ar.len = tar_blocks - alloc_total;
+		sum_tmp++;
+	}
+
+	ret = ext4_defrag_comp_ext_count(org_inode, org_path, tar_end,
+					sum_tmp, goal, phase);
+
+out:
+	if (ret < 0 || ret == 1) {
+		if (ar.len)
+			ext4_free_blocks(handle, tmp_inode, newblock, ar.len,
+						metadata);
+		/* Failed case: we have to remove the halfway-allocated blocks */
+		ret2 = ext4_ext_remove_space(tmp_inode, 0);
+		if (ret2) {
+			printk(KERN_ERR "ext4 defrag: "
+				"Failed to remove temporary inode blocks\n");
+			ret = ret2;
+		}
+	}
+
+	ext4_journal_stop(handle);
+
+out2:
+	if (dest_path) {
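+		/* Drop the references taken by ext4_ext_find_extent() */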
+		ext4_ext_drop_refs(dest_path);
+		kfree(dest_path);
+	}
+
+	return ret;
+}
+
+/**
+ * ext4_defrag_check - Check the environment whether a defrag can be done
+ *
+ * @org_inode: original inode
+ * @ext: extent to be moved (force defrag mode only)
+ * @defrag_size: size of defrag in blocks
+ * @goal: pointer to block offset for allocation
+ * @phase: phase of the force defrag mode
+ *
+ * This function returns 0 if succeeded, otherwise returns an error value.
+ */
+static int
+ext4_defrag_check(struct inode *org_inode, struct ext4_extent_data *ext,
+		ext4_lblk_t defrag_size, ext4_fsblk_t *goal, int *phase)
+{
+	/* ext4 online defrag supports only 4KB block size */
+	if (org_inode->i_sb->s_blocksize != DEFRAG_BLOCK_SIZE) {
+		printk(KERN_ERR "ext4 defrag: ext4 online defrag supports "
+			"only 4KB block size for the moment.\n");
+		return -EOPNOTSUPP;
+	}
+
+	/* ext4 online defrag needs the mballoc mount option. */
+	if (!test_opt(org_inode->i_sb, MBALLOC)) {
+		printk(KERN_ERR "ext4 defrag: multiblock allocation "
+			"is disabled\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (ext->len) {
+		/* Setup for the force defrag mode */
+		if (ext->len < defrag_size) {
+			printk(KERN_ERR "ext4 defrag: "
+				"Invalid length of extent\n");
+			return -EINVAL;
+		}
+		*phase = DEFRAG_FORCE_GATHER;
+		*goal = ext->start;
+	}
+
+	return 0;
+}
+
+/**
+ * ext4_defrag_init_tmp_inode - Create a temporary inode
+ *
+ * @org_inode: original inode
+ *
+ * This function returns a pointer to the struct inode if succeeded,
+ * otherwise returns an error value.
+ */
+static struct inode *
+ext4_defrag_init_tmp_inode(struct inode *org_inode)
+{
+	handle_t *handle;
+	struct inode *tmp_inode;
+
+	handle = ext4_journal_start(org_inode,
+		EXT4_DATA_TRANS_BLOCKS(org_inode->i_sb) +
+		EXT4_INDEX_EXTRA_TRANS_BLOCKS + 4 +
+		2 * EXT4_QUOTA_INIT_BLOCKS(org_inode->i_sb));
+	if (IS_ERR(handle))
+		/* Return error code */
+		return (struct inode *)handle;
+
+	tmp_inode = ext4_new_inode(handle,
+			org_inode->i_sb->s_root->d_inode, S_IFREG);
+	if (IS_ERR(tmp_inode))
+		goto out;
+
+	i_size_write(tmp_inode, i_size_read(org_inode));
+	tmp_inode->i_nlink = 0;
+	ext4_ext_tree_init(handle, tmp_inode);
+	ext4_orphan_add(handle, tmp_inode);
+
+out:
+	ext4_journal_stop(handle);
+
+	return tmp_inode;
+}
+
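+/*
+ * A userspace defragmenter is expected to drive the entry point below
+ * through EXT4_IOC_DEFRAG, roughly as follows (illustrative sketch only;
+ * the file descriptor and sizes are hypothetical):
+ *
+ *	struct ext4_ext_defrag_data df;
+ *
+ *	memset(&df, 0, sizeof(df));
+ *	df.start_offset = 0;		// first block to defrag
+ *	df.defrag_size = 16384;		// 64MB worth of 4KB blocks
+ *	df.goal = 0;			// no explicit goal: normal mode
+ *	if (ioctl(fd, EXT4_IOC_DEFRAG, &df) < 0)
+ *		perror("EXT4_IOC_DEFRAG");
+ */
+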
+/**
+ * ext4_defrag - Defrag the specified range of a file
+ *
+ * If no option is specified, ext4_defrag() proceeds in the following order:
+ * 1. ext4_defrag() calculates the block number where the defrag terminates
+ *    from the start block number (block_start) and the size of the
+ *    defragged data (defrag_size) given as arguments.
+ *    If block_start points into a hole, the extent start offsets pointed
+ *    to by ext_cur (the current extent), holecheck_path and org_path are
+ *    set to the first extent behind the hole.
+ * 2. Repeat step 3 to step 5 until holecheck_path points to the last
+ *    extent or ext_cur exceeds block_end, the last logical block number.
+ * 3. To get the length of a contiguous area, call ext4_defrag_next_extent()
+ *    repeatedly with ext_cur (initially taken from holecheck_path) until
+ *    a discontiguous extent is found, the start logical block number
+ *    exceeds block_end, or the extent is the last extent.
+ * 4. After determining the length of the contiguous area, allocate that
+ *    many contiguous blocks to a temporary inode
+ *    with ext4_defrag_new_extent_tree().
+ * 5. Exchange the original inode data with the temporary inode data,
+ *    page by page, from page_offset to seq_end_page.
+ *    The start page indexes of the data are given as arguments:
+ *    page_offset for the original inode, dest_offset for the temporary one.
+ * 6. Update holecheck_path and org_path to point to the next extent to
+ *    process, and release the temporary inode holding the original
+ *    fragmented data. Then return to step 2.
+ * 7. Release holecheck_path, org_path and the temporary inode, and return
+ *    defrag_size, the size of the defragged data.
+ *    The defrag_size is used by the defrag command to calculate the file
+ *    offset where the next defrag processing starts.
+ *    (Since the defrag command calls defrag_ioctl() in 64MB units,
+ *    a file bigger than 64MB calls defrag_ioctl() multiple times.)
+ *
+ * @filp: pointer to file
+ * @block_start: starting offset to defrag in blocks
+ * @defrag_size: size of defrag in blocks
+ * @goal: block offset for allocation
+ * @phase: phase of the force defrag mode
+ * @ext: extent to be moved (force defrag mode only)
+ *
+ * This function returns the number of blocks if succeeded, otherwise
+ * returns an error value.
+ */
+int
+ext4_defrag(struct file *filp, ext4_lblk_t block_start,
+		ext4_lblk_t defrag_size, ext4_fsblk_t goal, int phase,
+		struct ext4_extent_data *ext)
+{
+	struct inode *org_inode = filp->f_dentry->d_inode, *tmp_inode = NULL;
+	struct ext4_ext_path *org_path = NULL, *holecheck_path = NULL;
+	struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
+	ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
+	pgoff_t page_offset, seq_end_page, dest_offset;
+	int ret, depth, seq_extents, last_extent = 0;
+
+	/* Check the filesystem environment whether defrag can be done */
+	ret = ext4_defrag_check(org_inode, ext, defrag_size, &goal, &phase);
+	if (ret < 0)
+		return ret;
+
+	file_end = (org_inode->i_size - 1) >> org_inode->i_blkbits;
+	block_end = block_start + defrag_size - 1;
+	if (file_end < block_end)
+		defrag_size -= block_end - file_end;
+
+	mutex_lock(&org_inode->i_mutex);
+	down_write(&EXT4_I(org_inode)->i_data_sem);
+
+	org_path = ext4_ext_find_extent(org_inode, block_start, NULL);
+	if (IS_ERR(org_path)) {
+		ret = PTR_ERR(org_path);
+		org_path = NULL;
+		goto out;
+	}
+
+	/* Get path structure to check the hole */
+	holecheck_path = ext4_ext_find_extent(org_inode, block_start, NULL);
+	if (IS_ERR(holecheck_path)) {
+		ret = PTR_ERR(holecheck_path);
+		holecheck_path = NULL;
+		goto out;
+	}
+
+	depth = ext_depth(org_inode);
+	ext_cur = holecheck_path[depth].p_ext;
+	if (ext_cur == NULL)
+		goto out;
+
+	/*
+	 * Get the proper extent whose ee_block is beyond block_start
+	 * if block_start was within a hole.
+	 */
+	if (le32_to_cpu(ext_cur->ee_block) +
+	    le16_to_cpu(ext_cur->ee_len) - 1 < block_start) {
+		last_extent = ext4_defrag_next_extent(org_inode,
+					holecheck_path, &ext_cur);
+		if (last_extent < 0) {
+			ret = last_extent;
+			goto out;
+		}
+		last_extent = ext4_defrag_next_extent(org_inode, org_path,
+							&ext_dummy);
+		if (last_extent < 0) {
+			ret = last_extent;
+			goto out;
+		}
+	}
+	seq_extents = 1;
+	seq_start = le32_to_cpu(ext_cur->ee_block);
+
+	/* No blocks within the specified range. */
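+	/*
+	 * (i.e. the first extent at or after block_start already starts
+	 * beyond block_end, so the requested range maps to a hole.)
+	 */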
+	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
+		printk(KERN_INFO "ext4 defrag: The specified range of file"
+			" may be the hole\n");
+		goto out;
+	}
+
+	/* Adjust start blocks */
+	add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
+			 le16_to_cpu(ext_cur->ee_len), block_end + 1) -
+		     max(le32_to_cpu(ext_cur->ee_block), block_start);
+
+	while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
+		seq_blocks += add_blocks;
+
+		/* Create a temporary inode to exchange data blocks with */
+		tmp_inode = ext4_defrag_init_tmp_inode(org_inode);
+		if (IS_ERR(tmp_inode)) {
+			ret = PTR_ERR(tmp_inode);
+			tmp_inode = NULL;
+			goto out;
+		}
+
+		/* Adjust tail blocks */
+		if (seq_start + seq_blocks - 1 > block_end)
+			seq_blocks = block_end - seq_start + 1;
+
+		ext_prev = ext_cur;
+		last_extent = ext4_defrag_next_extent(org_inode,
+					holecheck_path, &ext_cur);
+		if (last_extent < 0) {
+			ret = last_extent;
+			break;
+		}
+		if (!last_extent)
+			seq_extents++;
+		add_blocks = le16_to_cpu(ext_cur->ee_len);
+
+		/*
+		 * Extend the length of the contiguous area (seq_blocks)
+		 * if the extents are contiguous.
+		 */
+		if (le32_to_cpu(ext_prev->ee_block) +
+		    le16_to_cpu(ext_prev->ee_len) ==
+		    le32_to_cpu(ext_cur->ee_block) &&
+		    block_end >= le32_to_cpu(ext_cur->ee_block) &&
+		    !last_extent) {
+			if (tmp_inode) {
+				iput(tmp_inode);
+				tmp_inode = NULL;
+			}
+			continue;
+		}
+
+		/* Found an isolated block */
+		if (seq_extents == 1 && !goal) {
+			seq_start = le32_to_cpu(ext_cur->ee_block);
+			goto CLEANUP;
+		}
+
+		ret = ext4_defrag_new_extent_tree(org_inode, tmp_inode,
+					org_path, seq_start, seq_blocks,
+					block_start, goal, phase);
+
+		if (ret < 0) {
+			break;
+		} else if (ret == 1 && (!goal || (goal && !phase))) {
+			ret = 0;
+			seq_start = le32_to_cpu(ext_cur->ee_block);
+			goto CLEANUP;
+		}
+
+		page_offset = seq_start >>
+				(PAGE_CACHE_SHIFT - org_inode->i_blkbits);
+		dest_offset = 0;
+		seq_end_page = (seq_start + seq_blocks - 1) >>
+				(PAGE_CACHE_SHIFT - org_inode->i_blkbits);
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+
+		/*
+		 * Discard all preallocations.
+		 * This is a provisional solution.
+		 * When a true ext4_mb_return_to_preallocation() is
+		 * implemented, this will be removed.
+		 */
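+		/*
+		 * For the page-by-page exchange below: with 4KB blocks and
+		 * 4KB pages (the only supported geometry for now), block
+		 * and page indexes coincide, so e.g. seq_start = 100 and
+		 * seq_blocks = 16 walk pages 100 through 115 (hypothetical
+		 * numbers).
+		 */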
+		ext4_mb_discard_inode_preallocations(org_inode);
+
+		while (page_offset <= seq_end_page) {
+			/* Swap original branches with new branches */
+			ret = ext4_defrag_partial(tmp_inode, filp,
+					page_offset, dest_offset, phase);
+			if (ret < 0)
+				goto out;
+
+			page_offset++;
+			dest_offset++;
+		}
+
+		/* Decrease buffer counter */
+		if (holecheck_path)
+			ext4_ext_drop_refs(holecheck_path);
+		holecheck_path = ext4_ext_find_extent(org_inode,
+					seq_start, holecheck_path);
+		if (IS_ERR(holecheck_path)) {
+			ret = PTR_ERR(holecheck_path);
+			holecheck_path = NULL;
+			break;
+		}
+		depth = holecheck_path->p_depth;
+
+CLEANUP:
+		/* Decrease buffer counter */
+		if (org_path)
+			ext4_ext_drop_refs(org_path);
+		org_path = ext4_ext_find_extent(org_inode, seq_start,
+					org_path);
+		if (IS_ERR(org_path)) {
+			ret = PTR_ERR(org_path);
+			org_path = NULL;
+			break;
+		}
+
+		ext_cur = holecheck_path[depth].p_ext;
+		add_blocks = le16_to_cpu(ext_cur->ee_len);
+		seq_blocks = 0;
+		dest_offset = 0;
+		seq_extents = 1;
+
+		if (tmp_inode) {
+			iput(tmp_inode);
+			tmp_inode = NULL;
+		}
+	}
+
+out:
+	if (org_path) {
+		ext4_ext_drop_refs(org_path);
+		kfree(org_path);
+	}
+	if (holecheck_path) {
+		ext4_ext_drop_refs(holecheck_path);
+		kfree(holecheck_path);
+	}
+
+	if (phase == DEFRAG_FORCE_GATHER)
+		/* Release reserved block in force mode */
+		ext4_discard_reservation(org_inode);
+
+	up_write(&EXT4_I(org_inode)->i_data_sem);
+	mutex_unlock(&org_inode->i_mutex);
+
+	if (tmp_inode)
+		iput(tmp_inode);
+
+	return (ret ? ret : defrag_size);
+}
Index: linux-2.6/fs/ext4/ioctl.c
===================================================================
--- linux-2.6.orig/fs/ext4/ioctl.c	2008-06-05 13:44:20.526046257 -0500
+++ linux-2.6/fs/ext4/ioctl.c	2008-07-05 12:51:24.553290999 -0500
@@ -241,6 +241,16 @@ setversion_out:
 		return err;
 	}
+	case EXT4_IOC_FIBMAP:
+	case EXT4_IOC_DEFRAG:
+	case EXT4_IOC_GROUP_INFO:
+	case EXT4_IOC_FREE_BLOCKS_INFO:
+	case EXT4_IOC_EXTENTS_INFO:
+	case EXT4_IOC_RESERVE_BLOCK:
+	case EXT4_IOC_MOVE_VICTIM:
+	case EXT4_IOC_BLOCK_RELEASE: {
+		return ext4_defrag_ioctl(inode, filp, cmd, arg);
+	}
 	case EXT4_IOC_GROUP_ADD: {
 		struct ext4_new_group_data input;
 		struct super_block *sb = inode->i_sb;
Index: linux-2.6/fs/ext4/mballoc.h
===================================================================
--- linux-2.6.orig/fs/ext4/mballoc.h	2008-06-05 13:44:20.529046073 -0500
+++ linux-2.6/fs/ext4/mballoc.h	2008-07-05 12:51:24.579300477 -0500
@@ -205,6 +205,7 @@ struct ext4_allocation_context {
 	struct page *ac_buddy_page;
 	struct ext4_prealloc_space *ac_pa;
 	struct ext4_locality_group *ac_lg;
+	long long ac_excepted_group;
 };
 
 #define AC_STATUS_CONTINUE	1
Index: linux-2.6/include/linux/fiemap.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/include/linux/fiemap.h	2008-07-05 12:51:24.595291345 -0500
@@ -0,0 +1,49 @@
+/*
+ * FIEMAP ioctl infrastructure.
+ *
+ * Copyright (C) 2007 Cluster File Systems, Inc
+ *
+ * Author: Kalpak Shah
+ *	   Andreas Dilger
+ */
+
+#ifndef _LINUX_FIEMAP_H
+#define _LINUX_FIEMAP_H
+
+struct fiemap_extent {
+	__u64 fe_offset; /* offset in bytes for the start of the extent */
+	__u64 fe_length; /* length in bytes for the extent */
+	__u32 fe_flags;  /* returned FIEMAP_EXTENT_* flags for the extent */
+	__u32 fe_lun;    /* logical device number for extent (starting at 0) */
+};
+
+struct fiemap {
+	__u64 fm_start;		/* logical starting byte offset (in/out) */
+	__u64 fm_length;	/* logical length of map (in/out) */
+	__u32 fm_flags;		/* FIEMAP_FLAG_* flags for request (in/out) */
+	__u32 fm_extent_count;	/* number of extents in fm_extents (in/out) */
+	__u64 fm_end_offset;	/* logical offset of end of mapping in last ioctl (out) */
+	struct fiemap_extent fm_extents[0];
+};
+
+#define FIEMAP_FLAG_SYNC	0x00000001 /* sync file data before map */
+#define FIEMAP_FLAG_HSM_READ	0x00000002 /* get data from HSM before map */
+#define FIEMAP_FLAG_NUM_EXTENTS	0x00000004 /* return only number of extents */
+#define FIEMAP_FLAG_INCOMPAT	0xff000000 /* error for unknown flags in here */
+
+#define FIEMAP_FLAG_LUN_OFFSET	0x01000000 /* use lun offsets, instead of
+					    * logical file offsets */
+
+#define FIEMAP_EXTENT_HOLE	0x00000001 /* has no data or space allocation */
+#define FIEMAP_EXTENT_UNWRITTEN	0x00000002 /* space allocated, but no data */
+#define FIEMAP_EXTENT_UNKNOWN	0x00000004 /* in use, location unknown */
+#define FIEMAP_EXTENT_ERROR	0x00000008 /* mapping error, errno in fe_start */
+#define FIEMAP_EXTENT_NO_DIRECT	0x00000010 /* cannot access data directly */
+#define FIEMAP_EXTENT_LAST	0x00000020 /* last extent in the file */
+#define FIEMAP_EXTENT_DELALLOC	0x00000040 /* has data but not yet written,
+					    * must have EXTENT_UNKNOWN set */
+#define FIEMAP_EXTENT_SECONDARY	0x00000080 /* data (also) in secondary storage,
+					    * not in primary if EXTENT_UNKNOWN */
+#define FIEMAP_EXTENT_EOF	0x00000100 /* if fm_start+fm_len is beyond EOF */
+
+#endif /* _LINUX_FIEMAP_H */
Index: linux-2.6/fs/ioctl.c
===================================================================
--- linux-2.6.orig/fs/ioctl.c	2008-06-05 13:44:21.026983204 -0500
+++ linux-2.6/fs/ioctl.c	2008-07-05 12:51:24.629290501 -0500
@@ -71,6 +71,34 @@ static int ioctl_fibmap(struct file *fil
 	return put_user(res, p);
 }
 
+static int ioctl_fiemap(struct file *filp, unsigned long arg)
+{
+	struct fiemap fiemap_s;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	int error = 0;
+
+	if (!inode->i_op->fiemap)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&fiemap_s, (struct fiemap __user *)arg,
+			   sizeof(struct fiemap)))
+		return -EFAULT;
+
+	/* Need arg sanity checking:
+	 * start >= 0?  Must be; unsigned.
+	 * length > 0? (or is -1 valid?)
+	 * extent count non-zero if not FLAG_NUM_EXTENTS
+	 */
+
+	/* Should fs do this under a lock? */
+	if (fiemap_s.fm_flags & FIEMAP_FLAG_SYNC)
+		filemap_write_and_wait(inode->i_mapping);
+
+	error = inode->i_op->fiemap(inode, arg);
+
+	return error;
+}
+
 static int file_ioctl(struct file *filp, unsigned int cmd,
 		unsigned long arg)
 {
@@ -80,6 +108,8 @@ static int file_ioctl(struct file *filp,
 	switch (cmd) {
 	case FIBMAP:
 		return ioctl_fibmap(filp, p);
+	case FS_IOC_FIEMAP:
+		return ioctl_fiemap(filp, arg);
 	case FIGETBSZ:
 		return put_user(inode->i_sb->s_blocksize, p);
 	case FIONREAD: