Merge branch 'i2c-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jdelvare/staging
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 17 Dec 2012 01:41:26 +0000 (17:41 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 17 Dec 2012 01:41:26 +0000 (17:41 -0800)
Pull i2c update from Jean Delvare:
 "This is my last pull request for the i2c subsystem.  It includes all
  the patches I collected between kernel v3.7-rc1 and me passing i2c
  maintenance duties over to Wolfram.

  Future patches to the many i2c bus drivers I still maintain will go
  through Wolfram's tree."

* 'i2c-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jdelvare/staging:
  i2c: Mention functionality flags in SMBus protocol documentation
  i2c-piix4: Convert dev_printk(KERN_<LEVEL> to dev_<level>(
  i2c-i801: Enable interrupts for all post-ICH5 chips
  i2c-i801: Add device tree support
  MAINTAINERS: Fix drivers/i2c/busses/i2c-stub.c

105 files changed:
Documentation/filesystems/ext4.txt
Documentation/kernel-parameters.txt
Documentation/prctl/seccomp_filter.txt
Documentation/security/keys.txt
arch/sh/mm/Kconfig
arch/x86/Kconfig
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_types.h
arch/x86/kernel/vsyscall_64.c
arch/x86/mm/pgtable.c
drivers/bus/Kconfig
drivers/char/tpm/tpm_ibmvtpm.c
drivers/char/tpm/tpm_ibmvtpm.h
drivers/input/keyboard/Kconfig
drivers/usb/phy/Kconfig
drivers/video/omap2/Kconfig
drivers/w1/masters/Kconfig
drivers/xen/swiotlb-xen.c
fs/Kconfig
fs/cifs/cifsacl.c
fs/ext4/Kconfig
fs/ext4/Makefile
fs/ext4/acl.c
fs/ext4/dir.c
fs/ext4/ext4.h
fs/ext4/ext4_extents.h
fs/ext4/ext4_jbd2.h
fs/ext4/extents.c
fs/ext4/extents_status.c [new file with mode: 0644]
fs/ext4/extents_status.h [new file with mode: 0644]
fs/ext4/file.c
fs/ext4/fsync.c
fs/ext4/ialloc.c
fs/ext4/indirect.c
fs/ext4/inline.c [new file with mode: 0644]
fs/ext4/inode.c
fs/ext4/mballoc.c
fs/ext4/migrate.c
fs/ext4/move_extent.c
fs/ext4/namei.c
fs/ext4/page-io.c
fs/ext4/resize.c
fs/ext4/super.c
fs/ext4/symlink.c
fs/ext4/xattr.c
fs/ext4/xattr.h
fs/jbd2/journal.c
fs/jbd2/transaction.c
fs/nfs/idmap.c
include/asm-generic/pgtable.h
include/linux/cred.h
include/linux/huge_mm.h
include/linux/hugetlb.h
include/linux/jbd2.h
include/linux/key.h
include/linux/mempolicy.h
include/linux/migrate.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/mmzone.h
include/linux/rmap.h
include/linux/sched.h
include/linux/swiotlb.h
include/linux/vm_event_item.h
include/linux/vmstat.h
include/trace/events/ext4.h
include/trace/events/migrate.h [new file with mode: 0644]
include/uapi/linux/mempolicy.h
init/Kconfig
kernel/cred.c
kernel/fork.c
kernel/sched/core.c
kernel/sched/fair.c
kernel/sched/features.h
kernel/sched/sched.h
kernel/seccomp.c
kernel/sysctl.c
lib/swiotlb.c
mm/compaction.c
mm/huge_memory.c
mm/hugetlb.c
mm/internal.h
mm/ksm.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/migrate.c
mm/mmap.c
mm/mprotect.c
mm/mremap.c
mm/page_alloc.c
mm/pgtable-generic.c
mm/rmap.c
mm/vmstat.c
net/dns_resolver/dns_key.c
security/keys/key.c
security/keys/keyctl.c
security/keys/keyring.c
security/keys/process_keys.c
security/keys/request_key.c
security/smack/Kconfig
security/smack/smackfs.c
security/yama/yama_lsm.c

index 104322bf378c314061ae906433e0e81a9abde7e0..34ea4f1fa6ea7eefd359fb09e1605a6d6948910b 100644 (file)
@@ -200,12 +200,9 @@ inode_readahead_blks=n     This tuning parameter controls the maximum
                        table readahead algorithm will pre-read into
                        the buffer cache.  The default value is 32 blocks.
 
-nouser_xattr           Disables Extended User Attributes. If you have extended
-                       attribute support enabled in the kernel configuration
-                       (CONFIG_EXT4_FS_XATTR), extended attribute support
-                       is enabled by default on mount. See the attr(5) manual
-                       page and http://acl.bestbits.at/ for more information
-                       about extended attributes.
+nouser_xattr           Disables Extended User Attributes.  See the
+                       attr(5) manual page and http://acl.bestbits.at/
+                       for more information about extended attributes.
 
 noacl                  This option disables POSIX Access Control List
                        support. If ACL support is enabled in the kernel
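
For illustration (not part of the patch above): nouser_xattr is an
ordinary ext4 mount option, so with a hypothetical device and mount
point it would be passed as:

	mount -t ext4 -o nouser_xattr /dev/sdXY /mnt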
index 20e248cc03a9a64f08f55af30d10f8fe328c6861..ea8e5b48557674a3fc7725234237798aeef68a04 100644 (file)
@@ -2032,6 +2032,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
        nr_uarts=       [SERIAL] maximum number of UARTs to be registered.
 
+       numa_balancing= [KNL,X86] Enable or disable automatic NUMA balancing.
+                       Allowed values are enable and disable.
+
        numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
                        one of ['zone', 'node', 'default'] can be specified
                        This can be set from sysctl after boot.
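
For illustration (not part of the patch above): automatic NUMA
balancing can be turned off on a kernel that defaults it to on by
appending the documented option to the boot command line:

	numa_balancing=disable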
index 597c3c5813751ac54629cc690e1137b9f339dd66..1e469ef7577835ae690094ec47262912d899727a 100644 (file)
@@ -95,12 +95,15 @@ SECCOMP_RET_KILL:
 
 SECCOMP_RET_TRAP:
        Results in the kernel sending a SIGSYS signal to the triggering
-       task without executing the system call.  The kernel will
-       rollback the register state to just before the system call
-       entry such that a signal handler in the task will be able to
-       inspect the ucontext_t->uc_mcontext registers and emulate
-       system call success or failure upon return from the signal
-       handler.
+       task without executing the system call.  siginfo->si_call_addr
+       will show the address of the system call instruction, and
+       siginfo->si_syscall and siginfo->si_arch will indicate which
+       syscall was attempted.  The program counter will be as though
+       the syscall happened (i.e. it will not point to the syscall
+       instruction).  The return value register will contain an arch-
+       dependent value -- if resuming execution, set it to something
+       sensible.  (The architecture dependency is because replacing
+       it with -ENOSYS could overwrite some useful information.)
 
        The SECCOMP_RET_DATA portion of the return value will be passed
        as si_errno.
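
For illustration (not part of the patch above), a minimal userspace
SIGSYS handler that inspects the fields described here might look like
the sketch below.  It assumes a libc whose siginfo_t exposes the SIGSYS
members (si_syscall, si_arch, si_call_addr); older headers may spell
them differently, and fprintf is not async-signal-safe.

	#include <signal.h>
	#include <stdio.h>
	#include <string.h>

	static void sigsys_handler(int sig, siginfo_t *info, void *uc)
	{
		/* the filter's SECCOMP_RET_DATA arrives in si_errno */
		fprintf(stderr, "denied syscall %d (arch 0x%x) at %p, data %d\n",
			info->si_syscall, info->si_arch,
			info->si_call_addr, info->si_errno);
		/* to emulate success, patch the return value register in
		 * the ucontext_t pointed to by uc before returning */
	}

	static void install_sigsys_handler(void)
	{
		struct sigaction sa;

		memset(&sa, 0, sizeof(sa));
		sigemptyset(&sa.sa_mask);
		sa.sa_sigaction = sigsys_handler;
		sa.sa_flags = SA_SIGINFO;
		sigaction(SIGSYS, &sa, NULL);
	}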
@@ -123,6 +126,18 @@ SECCOMP_RET_TRACE:
        the BPF program return value will be available to the tracer
        via PTRACE_GETEVENTMSG.
 
+       The tracer can skip the system call by changing the syscall number
+       to -1.  Alternatively, the tracer can change the system call
+       requested by changing the system call to a valid syscall number.  If
+       the tracer asks to skip the system call, then the system call will
+       appear to return the value that the tracer puts in the return value
+       register.
+
+       The seccomp check will not be run again after the tracer is
+       notified.  (This means that seccomp-based sandboxes MUST NOT
+       allow use of ptrace, even of other sandboxed processes, without
+       extreme care; ptracers can use this mechanism to escape.)
+
 SECCOMP_RET_ALLOW:
        Results in the system call being executed.
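
For illustration of the SECCOMP_RET_TRACE rules above (not part of the
patch): an x86-64 tracer, stopped at the seccomp ptrace event, can skip
the pending syscall by writing -1 into the syscall-number register and
choosing the value the tracee will see returned.  Error handling is
omitted for brevity.

	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/user.h>

	static void skip_syscall(pid_t pid, long retval)
	{
		struct user_regs_struct regs;

		ptrace(PTRACE_GETREGS, pid, NULL, &regs);
		regs.orig_rax = -1;	/* skip the system call */
		regs.rax = retval;	/* returned to the tracee */
		ptrace(PTRACE_SETREGS, pid, NULL, &regs);
	}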
 
@@ -161,3 +176,50 @@ architecture supports both ptrace_event and seccomp, it will be able to
 support seccomp filter with minor fixup: SIGSYS support and seccomp return
 value checking.  Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER
 to its arch-specific Kconfig.
+
+
+
+Caveats
+-------
+
+The vDSO can cause some system calls to run entirely in userspace,
+leading to surprises when you run programs on different machines that
+fall back to real syscalls.  To minimize these surprises on x86, make
+sure you test with
+/sys/devices/system/clocksource/clocksource0/current_clocksource set to
+something like acpi_pm.
+
+On x86-64, vsyscall emulation is enabled by default.  (vsyscalls are
+legacy variants on vDSO calls.)  Currently, emulated vsyscalls will
+honor seccomp, with a few oddities:
+
+- A return value of SECCOMP_RET_TRAP will set a si_call_addr pointing to
+  the vsyscall entry for the given call and not the address after the
+  'syscall' instruction.  Any code which wants to restart the call
+  should be aware that (a) a ret instruction has been emulated and (b)
+  trying to resume the syscall will again trigger the standard vsyscall
+  emulation security checks, making resuming the syscall mostly
+  pointless.
+
+- A return value of SECCOMP_RET_TRACE will signal the tracer as usual,
+  but the syscall may not be changed to another system call using the
+  orig_rax register. It may only be changed to -1 in order to skip the
+  currently emulated call. Any other change MAY terminate the process.
+  The rip value seen by the tracer will be the syscall entry address;
+  this is different from normal behavior.  The tracer MUST NOT modify
+  rip or rsp.  (Do not rely on other changes terminating the process.
+  They might work.  For example, on some kernels, choosing a syscall
+  that only exists in future kernels will be correctly emulated (by
+  returning -ENOSYS).)
+
+To detect this quirky behavior, check for (addr & ~0x0C00) ==
+0xFFFFFFFFFF600000.  (For SECCOMP_RET_TRACE, use rip.  For
+SECCOMP_RET_TRAP, use siginfo->si_call_addr.)  Do not check any other
+condition: future kernels may improve vsyscall emulation and current
+kernels in vsyscall=native mode will behave differently, but the
+instructions at 0xF...F600{0,4,8,C}00 will not be system calls in these
+cases.
+
+Note that modern systems are unlikely to use vsyscalls at all -- they
+are a legacy feature and they are considerably slower than standard
+syscalls.  New code will use the vDSO, and vDSO-issued system calls
+are indistinguishable from normal system calls.
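
As a sketch of the detection check above (not part of the patch): addr
is rip for SECCOMP_RET_TRACE, or siginfo->si_call_addr for
SECCOMP_RET_TRAP.

	/* vsyscall entries live at 0xFFFFFFFFFF600{0,4,8}00 */
	static int is_emulated_vsyscall(unsigned long addr)
	{
		return (addr & ~0x0C00UL) == 0xFFFFFFFFFF600000UL;
	}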
index 7d9ca92022d8c97b2f9db88c66713211c9c8cc2a..7b4145d00452f259fe79eff4f400ecfc949a6fd1 100644 (file)
@@ -994,6 +994,23 @@ payload contents" for more information.
     reference pointer if successful.
 
 
+(*) A keyring can be created by:
+
+       struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
+                                 const struct cred *cred,
+                                 key_perm_t perm,
+                                 unsigned long flags,
+                                 struct key *dest);
+
+    This creates a keyring with the given attributes and returns it.  If dest
+    is not NULL, the new keyring will be linked into the keyring to which it
+    points.  No permission checks are made upon the destination keyring.
+
+    Error EDQUOT can be returned if the keyring would overload the quota (pass
+    KEY_ALLOC_NOT_IN_QUOTA in flags if the keyring shouldn't be accounted
+    towards the user's quota).  Error ENOMEM can also be returned.
+
+
 (*) To check the validity of a key, this function can be called:
 
        int validate_key(struct key *key);
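
As a usage sketch for keyring_alloc() above: the cifs hunk later in
this same merge creates its idmap keyring exactly this way (an unlinked
keyring, exempted from quota accounting):

	keyring = keyring_alloc(".cifs_idmap", 0, 0, cred,
				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
				KEY_USR_VIEW | KEY_USR_READ,
				KEY_ALLOC_NOT_IN_QUOTA, NULL);
	if (IS_ERR(keyring)) {
		ret = PTR_ERR(keyring);
		goto failed_put_cred;
	}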
index cb8f9920f4dd873b69050e81c4a46fc7e40f5007..0f7c852f355c25a77e20badccc65862ff83eb469 100644 (file)
@@ -111,6 +111,7 @@ config VSYSCALL
 config NUMA
        bool "Non Uniform Memory Access (NUMA) Support"
        depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL
+       select ARCH_WANT_NUMA_VARIABLE_LOCALITY
        default n
        help
          Some SH systems have many various memories scattered around
index 65a872bf72f9c693039540519b69f9c8238b7214..97f8c5ad8c2ddff2449da615e46c2faab3b9d067 100644 (file)
@@ -22,6 +22,8 @@ config X86
        def_bool y
        select HAVE_AOUT if X86_32
        select HAVE_UNSTABLE_SCHED_CLOCK
+       select ARCH_SUPPORTS_NUMA_BALANCING
+       select ARCH_WANTS_PROT_NUMA_PROT_NONE
        select HAVE_IDE
        select HAVE_OPROFILE
        select HAVE_PCSPKR_PLATFORM
index a1f780d45f7628a741939ba65e0f711d0fce468b..5199db2923d31ff88b94c54397daae2b279a7bc7 100644 (file)
@@ -404,7 +404,14 @@ static inline int pte_same(pte_t a, pte_t b)
 
 static inline int pte_present(pte_t a)
 {
-       return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
+       return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE |
+                              _PAGE_NUMA);
+}
+
+#define pte_accessible pte_accessible
+static inline int pte_accessible(pte_t a)
+{
+       return pte_flags(a) & _PAGE_PRESENT;
 }
 
 static inline int pte_hidden(pte_t pte)
@@ -420,7 +427,8 @@ static inline int pmd_present(pmd_t pmd)
         * the _PAGE_PSE flag will remain set at all times while the
         * _PAGE_PRESENT bit is clear).
         */
-       return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
+       return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE |
+                                _PAGE_NUMA);
 }
 
 static inline int pmd_none(pmd_t pmd)
@@ -479,6 +487,11 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
 
 static inline int pmd_bad(pmd_t pmd)
 {
+#ifdef CONFIG_NUMA_BALANCING
+       /* pmd_numa check */
+       if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)
+               return 0;
+#endif
        return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
 }
 
index ec8a1fc9505d5d180e8ac114f23813fa8f12960b..3c32db8c539d7d037ac93afc679da31e1808c402 100644 (file)
 #define _PAGE_FILE     (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
 #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
+/*
+ * _PAGE_NUMA indicates that this page will trigger a numa hinting
+ * minor page fault to gather numa placement statistics (see
+ * pte_numa()). The bit picked (8) is within the range between
+ * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't
+ * require changes to the swp entry format because that bit is always
+ * zero when the pte is not present.
+ *
+ * The bit picked must always be zero both when the pmd is present
+ * and when it is not present, so that we don't lose information when
+ * we set it while atomically clearing the present bit.
+ *
+ * Because we share the same bit (8) with _PAGE_PROTNONE, this can be
+ * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE
+ * couldn't reach, like handle_mm_fault() (see access_error in
+ * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for
+ * handle_mm_fault() to be invoked).
+ */
+#define _PAGE_NUMA     _PAGE_PROTNONE
+
 #define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |        \
                         _PAGE_ACCESSED | _PAGE_DIRTY)
 #define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |    \
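
A hedged sketch (not in this hunk) of how such an entry is recognized:
mirroring the pmd_bad() check added in pgtable.h above, a pte is a NUMA
hinting entry when _PAGE_NUMA is set while _PAGE_PRESENT is clear.

	static inline int pte_numa(pte_t pte)
	{
		return (pte_flags(pte) & (_PAGE_NUMA | _PAGE_PRESENT))
			== _PAGE_NUMA;
	}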
index 3a3e8c9e280dcac9ba655491b4300c1d698bfb5a..9a907a67be8f48abc0399017865a4697968ae2c3 100644 (file)
@@ -145,19 +145,6 @@ static int addr_to_vsyscall_nr(unsigned long addr)
        return nr;
 }
 
-#ifdef CONFIG_SECCOMP
-static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
-{
-       if (!seccomp_mode(&tsk->seccomp))
-               return 0;
-       task_pt_regs(tsk)->orig_ax = syscall_nr;
-       task_pt_regs(tsk)->ax = syscall_nr;
-       return __secure_computing(syscall_nr);
-}
-#else
-#define vsyscall_seccomp(_tsk, _nr) 0
-#endif
-
 static bool write_ok_or_segv(unsigned long ptr, size_t size)
 {
        /*
@@ -190,10 +177,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 {
        struct task_struct *tsk;
        unsigned long caller;
-       int vsyscall_nr;
+       int vsyscall_nr, syscall_nr, tmp;
        int prev_sig_on_uaccess_error;
        long ret;
-       int skip;
 
        /*
         * No point in checking CS -- the only way to get here is a user mode
@@ -225,56 +211,84 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
        }
 
        tsk = current;
-       /*
-        * With a real vsyscall, page faults cause SIGSEGV.  We want to
-        * preserve that behavior to make writing exploits harder.
-        */
-       prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
-       current_thread_info()->sig_on_uaccess_error = 1;
 
        /*
+        * Check for access_ok violations and find the syscall nr.
+        *
         * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
         * 64-bit, so we don't need to special-case it here.  For all the
         * vsyscalls, NULL means "don't write anything" not "write it at
         * address 0".
         */
-       ret = -EFAULT;
-       skip = 0;
        switch (vsyscall_nr) {
        case 0:
-               skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
-               if (skip)
-                       break;
-
                if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
-                   !write_ok_or_segv(regs->si, sizeof(struct timezone)))
-                       break;
+                   !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
+                       ret = -EFAULT;
+                       goto check_fault;
+               }
+
+               syscall_nr = __NR_gettimeofday;
+               break;
+
+       case 1:
+               if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
+                       ret = -EFAULT;
+                       goto check_fault;
+               }
+
+               syscall_nr = __NR_time;
+               break;
+
+       case 2:
+               if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+                   !write_ok_or_segv(regs->si, sizeof(unsigned))) {
+                       ret = -EFAULT;
+                       goto check_fault;
+               }
+
+               syscall_nr = __NR_getcpu;
+               break;
+       }
+
+       /*
+        * Handle seccomp.  regs->ip must be the original value.
+        * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
+        *
+        * We could optimize the seccomp disabled case, but performance
+        * here doesn't matter.
+        */
+       regs->orig_ax = syscall_nr;
+       regs->ax = -ENOSYS;
+       tmp = secure_computing(syscall_nr);
+       if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
+               warn_bad_vsyscall(KERN_DEBUG, regs,
+                                 "seccomp tried to change syscall nr or ip");
+               do_exit(SIGSYS);
+       }
+       if (tmp)
+               goto do_ret;  /* skip requested */
 
+       /*
+        * With a real vsyscall, page faults cause SIGSEGV.  We want to
+        * preserve that behavior to make writing exploits harder.
+        */
+       prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
+       current_thread_info()->sig_on_uaccess_error = 1;
+
+       ret = -EFAULT;
+       switch (vsyscall_nr) {
+       case 0:
                ret = sys_gettimeofday(
                        (struct timeval __user *)regs->di,
                        (struct timezone __user *)regs->si);
                break;
 
        case 1:
-               skip = vsyscall_seccomp(tsk, __NR_time);
-               if (skip)
-                       break;
-
-               if (!write_ok_or_segv(regs->di, sizeof(time_t)))
-                       break;
-
                ret = sys_time((time_t __user *)regs->di);
                break;
 
        case 2:
-               skip = vsyscall_seccomp(tsk, __NR_getcpu);
-               if (skip)
-                       break;
-
-               if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
-                   !write_ok_or_segv(regs->si, sizeof(unsigned)))
-                       break;
-
                ret = sys_getcpu((unsigned __user *)regs->di,
                                 (unsigned __user *)regs->si,
                                 NULL);
@@ -283,12 +297,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 
        current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
 
-       if (skip) {
-               if ((long)regs->ax <= 0L) /* seccomp errno emulation */
-                       goto do_ret;
-               goto done; /* seccomp trace/trap */
-       }
-
+check_fault:
        if (ret == -EFAULT) {
                /* Bad news -- userspace fed a bad pointer to a vsyscall. */
                warn_bad_vsyscall(KERN_INFO, regs,
@@ -311,7 +320,6 @@ do_ret:
        /* Emulate a ret instruction. */
        regs->ip = caller;
        regs->sp += 8;
-done:
        return true;
 
 sigsegv:
index 217eb705fac073188fb18e908b675340834e1f97..e27fbf887f3ba2b0fb41595710a9d8d9882aed1f 100644 (file)
@@ -301,6 +301,13 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
        free_page((unsigned long)pgd);
 }
 
+/*
+ * Used to set accessed or dirty bits in the page table entries
+ * on other architectures. On x86, the accessed and dirty bits
+ * are tracked by hardware. However, do_wp_page calls this function
+ * to also make the pte writeable at the same time the dirty bit is
+ * set. In that case we do actually need to write the PTE.
+ */
 int ptep_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pte_t *ptep,
                          pte_t entry, int dirty)
@@ -310,7 +317,6 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
        if (changed && dirty) {
                *ptep = entry;
                pte_update_defer(vma->vm_mm, address, ptep);
-               flush_tlb_page(vma, address);
        }
 
        return changed;
index bbec35d21fe53abf46c3ff8e0e83213b74f4c5b4..0f51ed687dc8442ed1a1c3107c9cb0f5b74bccda 100644 (file)
@@ -6,6 +6,7 @@ menu "Bus devices"
 
 config OMAP_OCP2SCP
        tristate "OMAP OCP2SCP DRIVER"
+       depends on ARCH_OMAP2PLUS
        help
          Driver to enable ocp2scp module which transforms ocp interface
          protocol to scp protocol. In OMAP4, USB PHY is connected via
index 7da840d487d27c338483f446a84104658ff993da..9978609d93b27ef2bdb92bafb72fdf481f1b089c 100644 (file)
@@ -38,8 +38,6 @@ static struct vio_device_id tpm_ibmvtpm_device_table[] = {
 };
 MODULE_DEVICE_TABLE(vio, tpm_ibmvtpm_device_table);
 
-DECLARE_WAIT_QUEUE_HEAD(wq);
-
 /**
  * ibmvtpm_send_crq - Send a CRQ request
  * @vdev:      vio device struct
@@ -83,6 +81,7 @@ static int tpm_ibmvtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count)
 {
        struct ibmvtpm_dev *ibmvtpm;
        u16 len;
+       int sig;
 
        ibmvtpm = (struct ibmvtpm_dev *)chip->vendor.data;
 
@@ -91,22 +90,23 @@ static int tpm_ibmvtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count)
                return 0;
        }
 
-       wait_event_interruptible(wq, ibmvtpm->crq_res.len != 0);
+       sig = wait_event_interruptible(ibmvtpm->wq, ibmvtpm->res_len != 0);
+       if (sig)
+               return -EINTR;
+
+       len = ibmvtpm->res_len;
 
-       if (count < ibmvtpm->crq_res.len) {
+       if (count < len) {
                dev_err(ibmvtpm->dev,
                        "Invalid size in recv: count=%ld, crq_size=%d\n",
-                       count, ibmvtpm->crq_res.len);
+                       count, len);
                return -EIO;
        }
 
        spin_lock(&ibmvtpm->rtce_lock);
-       memcpy((void *)buf, (void *)ibmvtpm->rtce_buf, ibmvtpm->crq_res.len);
-       memset(ibmvtpm->rtce_buf, 0, ibmvtpm->crq_res.len);
-       ibmvtpm->crq_res.valid = 0;
-       ibmvtpm->crq_res.msg = 0;
-       len = ibmvtpm->crq_res.len;
-       ibmvtpm->crq_res.len = 0;
+       memcpy((void *)buf, (void *)ibmvtpm->rtce_buf, len);
+       memset(ibmvtpm->rtce_buf, 0, len);
+       ibmvtpm->res_len = 0;
        spin_unlock(&ibmvtpm->rtce_lock);
        return len;
 }
@@ -273,7 +273,6 @@ static int tpm_ibmvtpm_remove(struct vio_dev *vdev)
        int rc = 0;
 
        free_irq(vdev->irq, ibmvtpm);
-       tasklet_kill(&ibmvtpm->tasklet);
 
        do {
                if (rc)
@@ -372,7 +371,6 @@ static int ibmvtpm_reset_crq(struct ibmvtpm_dev *ibmvtpm)
 static int tpm_ibmvtpm_resume(struct device *dev)
 {
        struct ibmvtpm_dev *ibmvtpm = ibmvtpm_get_data(dev);
-       unsigned long flags;
        int rc = 0;
 
        do {
@@ -387,10 +385,11 @@ static int tpm_ibmvtpm_resume(struct device *dev)
                return rc;
        }
 
-       spin_lock_irqsave(&ibmvtpm->lock, flags);
-       vio_disable_interrupts(ibmvtpm->vdev);
-       tasklet_schedule(&ibmvtpm->tasklet);
-       spin_unlock_irqrestore(&ibmvtpm->lock, flags);
+       rc = vio_enable_interrupts(ibmvtpm->vdev);
+       if (rc) {
+               dev_err(dev, "Error vio_enable_interrupts rc=%d\n", rc);
+               return rc;
+       }
 
        rc = ibmvtpm_crq_send_init(ibmvtpm);
        if (rc)
@@ -467,7 +466,7 @@ static struct ibmvtpm_crq *ibmvtpm_crq_get_next(struct ibmvtpm_dev *ibmvtpm)
        if (crq->valid & VTPM_MSG_RES) {
                if (++crq_q->index == crq_q->num_entry)
                        crq_q->index = 0;
-               rmb();
+               smp_rmb();
        } else
                crq = NULL;
        return crq;
@@ -535,11 +534,9 @@ static void ibmvtpm_crq_process(struct ibmvtpm_crq *crq,
                        ibmvtpm->vtpm_version = crq->data;
                        return;
                case VTPM_TPM_COMMAND_RES:
-                       ibmvtpm->crq_res.valid = crq->valid;
-                       ibmvtpm->crq_res.msg = crq->msg;
-                       ibmvtpm->crq_res.len = crq->len;
-                       ibmvtpm->crq_res.data = crq->data;
-                       wake_up_interruptible(&wq);
+                       /* len of the data in rtce buffer */
+                       ibmvtpm->res_len = crq->len;
+                       wake_up_interruptible(&ibmvtpm->wq);
                        return;
                default:
                        return;
@@ -559,38 +556,19 @@ static void ibmvtpm_crq_process(struct ibmvtpm_crq *crq,
 static irqreturn_t ibmvtpm_interrupt(int irq, void *vtpm_instance)
 {
        struct ibmvtpm_dev *ibmvtpm = (struct ibmvtpm_dev *) vtpm_instance;
-       unsigned long flags;
-
-       spin_lock_irqsave(&ibmvtpm->lock, flags);
-       vio_disable_interrupts(ibmvtpm->vdev);
-       tasklet_schedule(&ibmvtpm->tasklet);
-       spin_unlock_irqrestore(&ibmvtpm->lock, flags);
-
-       return IRQ_HANDLED;
-}
-
-/**
- * ibmvtpm_tasklet - Interrupt handler tasklet
- * @data:      ibm vtpm device struct
- *
- * Returns:
- *     Nothing
- **/
-static void ibmvtpm_tasklet(void *data)
-{
-       struct ibmvtpm_dev *ibmvtpm = data;
        struct ibmvtpm_crq *crq;
-       unsigned long flags;
 
-       spin_lock_irqsave(&ibmvtpm->lock, flags);
+       /* while loop is needed for initial setup (get version and
+        * get rtce_size). There should be only one tpm request at any
+        * given time.
+        */
        while ((crq = ibmvtpm_crq_get_next(ibmvtpm)) != NULL) {
                ibmvtpm_crq_process(crq, ibmvtpm);
                crq->valid = 0;
-               wmb();
+               smp_wmb();
        }
 
-       vio_enable_interrupts(ibmvtpm->vdev);
-       spin_unlock_irqrestore(&ibmvtpm->lock, flags);
+       return IRQ_HANDLED;
 }
 
 /**
@@ -650,9 +628,6 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev,
                goto reg_crq_cleanup;
        }
 
-       tasklet_init(&ibmvtpm->tasklet, (void *)ibmvtpm_tasklet,
-                    (unsigned long)ibmvtpm);
-
        rc = request_irq(vio_dev->irq, ibmvtpm_interrupt, 0,
                         tpm_ibmvtpm_driver_name, ibmvtpm);
        if (rc) {
@@ -666,13 +641,14 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev,
                goto init_irq_cleanup;
        }
 
+       init_waitqueue_head(&ibmvtpm->wq);
+
        crq_q->index = 0;
 
        ibmvtpm->dev = dev;
        ibmvtpm->vdev = vio_dev;
        chip->vendor.data = (void *)ibmvtpm;
 
-       spin_lock_init(&ibmvtpm->lock);
        spin_lock_init(&ibmvtpm->rtce_lock);
 
        rc = ibmvtpm_crq_send_init(ibmvtpm);
@@ -689,7 +665,6 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev,
 
        return rc;
 init_irq_cleanup:
-       tasklet_kill(&ibmvtpm->tasklet);
        do {
                rc1 = plpar_hcall_norets(H_FREE_CRQ, vio_dev->unit_address);
        } while (rc1 == H_BUSY || H_IS_LONG_BUSY(rc1));
index 4296eb4b4d82e24478268a9a4eaf55aa988b81d9..bd82a791f995d6657f15082bb3f3b957dbc499dc 100644 (file)
@@ -38,13 +38,12 @@ struct ibmvtpm_dev {
        struct vio_dev *vdev;
        struct ibmvtpm_crq_queue crq_queue;
        dma_addr_t crq_dma_handle;
-       spinlock_t lock;
-       struct tasklet_struct tasklet;
        u32 rtce_size;
        void __iomem *rtce_buf;
        dma_addr_t rtce_dma_handle;
        spinlock_t rtce_lock;
-       struct ibmvtpm_crq crq_res;
+       wait_queue_head_t wq;
+       u16 res_len;
        u32 vtpm_version;
 };
 
index 77629d33f03f2d087e7f9c1473fdaa2258a56877..febead4bf8a5d3b3c6f51711f4954c7aff78dba7 100644 (file)
@@ -544,6 +544,7 @@ config KEYBOARD_OMAP
 
 config KEYBOARD_OMAP4
        tristate "TI OMAP4+ keypad support"
+       depends on ARCH_OMAP2PLUS
        select INPUT_MATRIXKMAP
        help
          Say Y here if you want to use the OMAP4+ keypad.
index 7eb73c561bd2896d5c135a8c53e40e0ae352bad6..5de6e7f39f9cf4fd41faaf93bdb722b6420e9d3c 100644 (file)
@@ -6,6 +6,7 @@ comment "USB Physical Layer drivers"
 
 config OMAP_USB2
        tristate "OMAP USB2 PHY Driver"
+       depends on ARCH_OMAP2PLUS
        select USB_OTG_UTILS
        help
          Enable this to support the transceiver that is part of SOC. This
index 346d67d6cf4d5a24c6465388e4e7d166356dc5b3..b07b2b042e7e242e6ff515ef0348912106f8cccb 100644 (file)
@@ -1,6 +1,10 @@
 config OMAP2_VRFB
        bool
 
+if ARCH_OMAP2PLUS
+
 source "drivers/video/omap2/dss/Kconfig"
 source "drivers/video/omap2/omapfb/Kconfig"
 source "drivers/video/omap2/displays/Kconfig"
+
+endif
index c433a746e3f58317c8ea1e5187f70675e9dd0315..e8ca63a82b9777d6ae66cdcb34be199bd5fc7d7c 100644 (file)
@@ -60,6 +60,7 @@ config W1_MASTER_GPIO
 
 config HDQ_MASTER_OMAP
        tristate "OMAP HDQ driver"
+       depends on ARCH_OMAP
        help
          Say Y here if you want support for the 1-wire or HDQ Interface
          on an OMAP processor.
index 58db6df866ef3338f6fc787acd552fe03d0a0b69..af47e7594460a98afad10a5bb89e376fff509e7f 100644 (file)
@@ -338,9 +338,8 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
                                enum dma_data_direction dir,
                                struct dma_attrs *attrs)
 {
-       phys_addr_t phys = page_to_phys(page) + offset;
+       phys_addr_t map, phys = page_to_phys(page) + offset;
        dma_addr_t dev_addr = xen_phys_to_bus(phys);
-       void *map;
 
        BUG_ON(dir == DMA_NONE);
        /*
@@ -356,10 +355,10 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
         * Oh well, have to allocate and map a bounce buffer.
         */
        map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir);
-       if (!map)
+       if (map == SWIOTLB_MAP_ERROR)
                return DMA_ERROR_CODE;
 
-       dev_addr = xen_virt_to_bus(map);
+       dev_addr = xen_phys_to_bus(map);
 
        /*
         * Ensure that the address returned is DMA'ble
@@ -389,7 +388,7 @@ static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
 
        /* NOTE: We use dev_addr here, not paddr! */
        if (is_xen_swiotlb_buffer(dev_addr)) {
-               swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
+               swiotlb_tbl_unmap_single(hwdev, paddr, size, dir);
                return;
        }
 
@@ -434,8 +433,7 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
 
        /* NOTE: We use dev_addr here, not paddr! */
        if (is_xen_swiotlb_buffer(dev_addr)) {
-               swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir,
-                                      target);
+               swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target);
                return;
        }
 
@@ -494,11 +492,12 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
                if (swiotlb_force ||
                    !dma_capable(hwdev, dev_addr, sg->length) ||
                    range_straddles_page_boundary(paddr, sg->length)) {
-                       void *map = swiotlb_tbl_map_single(hwdev,
-                                                          start_dma_addr,
-                                                          sg_phys(sg),
-                                                          sg->length, dir);
-                       if (!map) {
+                       phys_addr_t map = swiotlb_tbl_map_single(hwdev,
+                                                                start_dma_addr,
+                                                                sg_phys(sg),
+                                                                sg->length,
+                                                                dir);
+                       if (map == SWIOTLB_MAP_ERROR) {
                                /* Don't panic here, we expect map_sg users
                                   to do proper error handling. */
                                xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
@@ -506,7 +505,7 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
                                sgl[0].dma_length = 0;
                                return DMA_ERROR_CODE;
                        }
-                       sg->dma_address = xen_virt_to_bus(map);
+                       sg->dma_address = xen_phys_to_bus(map);
                } else
                        sg->dma_address = dev_addr;
                sg->dma_length = sg->length;
index f95ae3a027f38dd3173d5c766eeeb0ecda58d2be..eaff24a19502aaf815cb8407152871da5d065665 100644 (file)
@@ -28,8 +28,8 @@ config FS_MBCACHE
        tristate
        default y if EXT2_FS=y && EXT2_FS_XATTR
        default y if EXT3_FS=y && EXT3_FS_XATTR
-       default y if EXT4_FS=y && EXT4_FS_XATTR
-       default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
+       default y if EXT4_FS=y
+       default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS
 
 source "fs/reiserfs/Kconfig"
 source "fs/jfs/Kconfig"
index 75c1ee6991433d90886f5d88dcb239f8ba2cafd6..5cbd00e740671dc13a0f4a07621513eb469762ac 100644 (file)
@@ -346,19 +346,15 @@ init_cifs_idmap(void)
        if (!cred)
                return -ENOMEM;
 
-       keyring = key_alloc(&key_type_keyring, ".cifs_idmap", 0, 0, cred,
-                           (KEY_POS_ALL & ~KEY_POS_SETATTR) |
-                           KEY_USR_VIEW | KEY_USR_READ,
-                           KEY_ALLOC_NOT_IN_QUOTA);
+       keyring = keyring_alloc(".cifs_idmap", 0, 0, cred,
+                               (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+                               KEY_USR_VIEW | KEY_USR_READ,
+                               KEY_ALLOC_NOT_IN_QUOTA, NULL);
        if (IS_ERR(keyring)) {
                ret = PTR_ERR(keyring);
                goto failed_put_cred;
        }
 
-       ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
-       if (ret < 0)
-               goto failed_put_key;
-
        ret = register_key_type(&cifs_idmap_key_type);
        if (ret < 0)
                goto failed_put_key;
index c22f17021b6eee7ca942a3525eb9f4fd23de6011..0a475c881852666d6f21b9a0fc82207d35b6be46 100644 (file)
@@ -39,22 +39,8 @@ config EXT4_USE_FOR_EXT23
          compiled kernel size by using one file system driver for
          ext2, ext3, and ext4 file systems.
 
-config EXT4_FS_XATTR
-       bool "Ext4 extended attributes"
-       depends on EXT4_FS
-       default y
-       help
-         Extended attributes are name:value pairs associated with inodes by
-         the kernel or by users (see the attr(5) manual page, or visit
-         <http://acl.bestbits.at/> for details).
-
-         If unsure, say N.
-
-         You need this for POSIX ACL support on ext4.
-
 config EXT4_FS_POSIX_ACL
        bool "Ext4 POSIX Access Control Lists"
-       depends on EXT4_FS_XATTR
        select FS_POSIX_ACL
        help
          POSIX Access Control Lists (ACLs) support permissions for users and
@@ -67,7 +53,6 @@ config EXT4_FS_POSIX_ACL
 
 config EXT4_FS_SECURITY
        bool "Ext4 Security Labels"
-       depends on EXT4_FS_XATTR
        help
          Security labels support alternative access control models
          implemented by security modules like SELinux.  This option
index 56fd8f865930e8347e0d48b67b9e5435970efab4..0310fec2ee3dbd39ddf3b25ae47b89147fe40031 100644 (file)
@@ -7,8 +7,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
 ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
                ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
                ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
-               mmp.o indirect.o
+               mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
+               xattr_trusted.o inline.o
 
-ext4-$(CONFIG_EXT4_FS_XATTR)           += xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)       += acl.o
 ext4-$(CONFIG_EXT4_FS_SECURITY)                += xattr_security.o
index d3c5b88fd89f22f9061b4905b756445936e7076b..e6e0d988439bf20e4570f209e1b16b4ffdb18a21 100644 (file)
@@ -423,8 +423,10 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
 
 retry:
        handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
-       if (IS_ERR(handle))
-               return PTR_ERR(handle);
+       if (IS_ERR(handle)) {
+               error = PTR_ERR(handle);
+               goto release_and_out;
+       }
        error = ext4_set_acl(handle, inode, type, acl);
        ext4_journal_stop(handle);
        if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
index 8e07d2a5a13952836c820c32419f340b148a8ce2..b8d877f6c1fa52991cb16d8bf76652e41e9dd72c 100644 (file)
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include "ext4.h"
-
-static unsigned char ext4_filetype_table[] = {
-       DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
+#include "xattr.h"
 
 static int ext4_dx_readdir(struct file *filp,
                           void *dirent, filldir_t filldir);
 
-static unsigned char get_dtype(struct super_block *sb, int filetype)
-{
-       if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
-           (filetype >= EXT4_FT_MAX))
-               return DT_UNKNOWN;
-
-       return (ext4_filetype_table[filetype]);
-}
-
 /**
  * Check if the given dir-inode refers to an htree-indexed directory
  * (or a directory which could potentially get converted to use htree
@@ -68,11 +56,14 @@ static int is_dx_dir(struct inode *inode)
  * Return 0 if the directory entry is OK, and 1 if there is a problem
  *
  * Note: this is the opposite of what ext2 and ext3 historically returned...
+ *
+ * bh passed here can be an inode block or a dir data block, depending
+ * on the inode inline data flag.
  */
 int __ext4_check_dir_entry(const char *function, unsigned int line,
                           struct inode *dir, struct file *filp,
                           struct ext4_dir_entry_2 *de,
-                          struct buffer_head *bh,
+                          struct buffer_head *bh, char *buf, int size,
                           unsigned int offset)
 {
        const char *error_msg = NULL;
@@ -85,9 +76,8 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
                error_msg = "rec_len % 4 != 0";
        else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
                error_msg = "rec_len is too small for name_len";
-       else if (unlikely(((char *) de - bh->b_data) + rlen >
-                         dir->i_sb->s_blocksize))
-               error_msg = "directory entry across blocks";
+       else if (unlikely(((char *) de - buf) + rlen > size))
+               error_msg = "directory entry across range";
        else if (unlikely(le32_to_cpu(de->inode) >
                        le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
                error_msg = "inode out of bounds";
@@ -98,14 +88,14 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
                ext4_error_file(filp, function, line, bh->b_blocknr,
                                "bad entry in directory: %s - offset=%u(%u), "
                                "inode=%u, rec_len=%d, name_len=%d",
-                               error_msg, (unsigned) (offset % bh->b_size),
+                               error_msg, (unsigned) (offset % size),
                                offset, le32_to_cpu(de->inode),
                                rlen, de->name_len);
        else
                ext4_error_inode(dir, function, line, bh->b_blocknr,
                                "bad entry in directory: %s - offset=%u(%u), "
                                "inode=%u, rec_len=%d, name_len=%d",
-                               error_msg, (unsigned) (offset % bh->b_size),
+                               error_msg, (unsigned) (offset % size),
                                offset, le32_to_cpu(de->inode),
                                rlen, de->name_len);
 
@@ -125,6 +115,14 @@ static int ext4_readdir(struct file *filp,
        int ret = 0;
        int dir_has_error = 0;
 
+       if (ext4_has_inline_data(inode)) {
+               int has_inline_data = 1;
+               ret = ext4_read_inline_dir(filp, dirent, filldir,
+                                          &has_inline_data);
+               if (has_inline_data)
+                       return ret;
+       }
+
        if (is_dx_dir(inode)) {
                err = ext4_dx_readdir(filp, dirent, filldir);
                if (err != ERR_BAD_DX_DIR) {
@@ -221,8 +219,9 @@ revalidate:
                while (!error && filp->f_pos < inode->i_size
                       && offset < sb->s_blocksize) {
                        de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
-                       if (ext4_check_dir_entry(inode, filp, de,
-                                                bh, offset)) {
+                       if (ext4_check_dir_entry(inode, filp, de, bh,
+                                                bh->b_data, bh->b_size,
+                                                offset)) {
                                /*
                                 * On error, skip the f_pos to the next block
                                 */
index df163da388c9ccfe82348c51be596c9fcfd8cc55..8462eb3c33aa22c62cccb88b030edfb599e2db66 100644 (file)
 #define ext4_debug(fmt, ...)   no_printk(fmt, ##__VA_ARGS__)
 #endif
 
+/*
+ * Turn on EXT_DEBUG to get lots of info about extents operations.
+ */
+#define EXT_DEBUG__
+#ifdef EXT_DEBUG
+#define ext_debug(fmt, ...)    printk(fmt, ##__VA_ARGS__)
+#else
+#define ext_debug(fmt, ...)    no_printk(fmt, ##__VA_ARGS__)
+#endif
+
 #define EXT4_ERROR_INODE(inode, fmt, a...) \
        ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
 
@@ -392,6 +402,7 @@ struct flex_groups {
 #define EXT4_EXTENTS_FL                        0x00080000 /* Inode uses extents */
 #define EXT4_EA_INODE_FL               0x00200000 /* Inode used for large EA */
 #define EXT4_EOFBLOCKS_FL              0x00400000 /* Blocks allocated beyond EOF */
+#define EXT4_INLINE_DATA_FL            0x10000000 /* Inode has inline data. */
 #define EXT4_RESERVED_FL               0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE           0x004BDFFF /* User visible flags */
@@ -448,28 +459,26 @@ enum {
        EXT4_INODE_EXTENTS      = 19,   /* Inode uses extents */
        EXT4_INODE_EA_INODE     = 21,   /* Inode used for large EA */
        EXT4_INODE_EOFBLOCKS    = 22,   /* Blocks allocated beyond EOF */
+       EXT4_INODE_INLINE_DATA  = 28,   /* Data in inode. */
        EXT4_INODE_RESERVED     = 31,   /* reserved for ext4 lib */
 };
 
-#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
-#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
-       printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
-               EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
-
-/*
- * Since it's pretty easy to mix up bit numbers and hex values, and we
- * can't do a compile-time test for ENUM values, we use a run-time
- * test to make sure that EXT4_XXX_FL is consistent with respect to
- * EXT4_INODE_XXX.  If all is well the printk and BUG_ON will all drop
- * out so it won't cost any extra space in the compiled kernel image.
- * But it's important that these values are the same, since we are
- * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
- * must be consistent with the values of FS_XXX_FL defined in
- * include/linux/fs.h and the on-disk values found in ext2, ext3, and
- * ext4 filesystems, and of course the values defined in e2fsprogs.
+/*
+ * Since it's pretty easy to mix up bit numbers and hex values, we use a
+ * build-time check to make sure that EXT4_XXX_FL is consistent with respect to
+ * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost
+ * any extra space in the compiled kernel image, otherwise, the build will fail.
+ * It's important that these values are the same, since we are using
+ * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent
+ * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk
+ * values found in ext2, ext3 and ext4 filesystems, and of course the values
+ * defined in e2fsprogs.
  *
  * It's not paranoia if Murphy's Law really *is* out to get you.  :-)
  */
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))
+
 static inline void ext4_check_flag_values(void)
 {
        CHECK_FLAG_VALUE(SECRM);
@@ -494,6 +503,7 @@ static inline void ext4_check_flag_values(void)
        CHECK_FLAG_VALUE(EXTENTS);
        CHECK_FLAG_VALUE(EA_INODE);
        CHECK_FLAG_VALUE(EOFBLOCKS);
+       CHECK_FLAG_VALUE(INLINE_DATA);
        CHECK_FLAG_VALUE(RESERVED);
 }
 
@@ -811,6 +821,8 @@ struct ext4_ext_cache {
        __u32           ec_len; /* must be 32bit to return holes */
 };
 
+#include "extents_status.h"
+
 /*
  * fourth extended file system inode data in memory
  */
@@ -833,7 +845,6 @@ struct ext4_inode_info {
 #endif
        unsigned long   i_flags;
 
-#ifdef CONFIG_EXT4_FS_XATTR
        /*
         * Extended attributes can be read independently of the main file
         * data. Taking i_mutex even when reading would cause contention
@@ -842,7 +853,6 @@ struct ext4_inode_info {
         * EAs.
         */
        struct rw_semaphore xattr_sem;
-#endif
 
        struct list_head i_orphan;      /* unlinked but open inodes */
 
@@ -888,6 +898,10 @@ struct ext4_inode_info {
        struct list_head i_prealloc_list;
        spinlock_t i_prealloc_lock;
 
+       /* extents status tree */
+       struct ext4_es_tree i_es_tree;
+       rwlock_t i_es_lock;
+
        /* ialloc */
        ext4_group_t    i_last_alloc_group;
 
@@ -902,6 +916,10 @@ struct ext4_inode_info {
        /* on-disk additional length */
        __u16 i_extra_isize;
 
+       /* Indicate the inline data space. */
+       u16 i_inline_off;
+       u16 i_inline_size;
+
 #ifdef CONFIG_QUOTA
        /* quota space reservation, managed internally by quota code */
        qsize_t i_reserved_quota;
@@ -1360,6 +1378,7 @@ enum {
        EXT4_STATE_DELALLOC_RESERVED,   /* blks already reserved for delalloc */
        EXT4_STATE_DIOREAD_LOCK,        /* Disable support for dio read
                                           nolocking */
+       EXT4_STATE_MAY_INLINE_DATA,     /* may have in-inode data */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)                                \
@@ -1481,7 +1500,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_DIRDATA          0x1000 /* data in dirent */
 #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */
 #define EXT4_FEATURE_INCOMPAT_LARGEDIR         0x4000 /* >2GB or 3-lvl htree */
-#define EXT4_FEATURE_INCOMPAT_INLINEDATA       0x8000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_INLINE_DATA      0x8000 /* data in inode */
 
 #define EXT2_FEATURE_COMPAT_SUPP       EXT4_FEATURE_COMPAT_EXT_ATTR
 #define EXT2_FEATURE_INCOMPAT_SUPP     (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1505,7 +1524,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
                                         EXT4_FEATURE_INCOMPAT_EXTENTS| \
                                         EXT4_FEATURE_INCOMPAT_64BIT| \
                                         EXT4_FEATURE_INCOMPAT_FLEX_BG| \
-                                        EXT4_FEATURE_INCOMPAT_MMP)
+                                        EXT4_FEATURE_INCOMPAT_MMP |    \
+                                        EXT4_FEATURE_INCOMPAT_INLINE_DATA)
 #define EXT4_FEATURE_RO_COMPAT_SUPP    (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
                                         EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1592,6 +1612,11 @@ struct ext4_dir_entry_tail {
        __le32  det_checksum;           /* crc32c(uuid+inum+dirblock) */
 };
 
+#define EXT4_DIRENT_TAIL(block, blocksize) \
+       ((struct ext4_dir_entry_tail *)(((void *)(block)) + \
+                                       ((blocksize) - \
+                                        sizeof(struct ext4_dir_entry_tail))))
+
 /*
  * Ext4 directory file types.  Only the low 3 bits are used.  The
  * other bits are reserved for now.
@@ -1936,14 +1961,42 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
 extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
                                  struct file *,
                                  struct ext4_dir_entry_2 *,
-                                 struct buffer_head *, unsigned int);
-#define ext4_check_dir_entry(dir, filp, de, bh, offset)                        \
+                                 struct buffer_head *, char *, int,
+                                 unsigned int);
+#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset)     \
        unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
-                                       (de), (bh), (offset)))
+                                       (de), (bh), (buf), (size), (offset)))
 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
                                    __u32 minor_hash,
                                    struct ext4_dir_entry_2 *dirent);
 extern void ext4_htree_free_dir_info(struct dir_private_info *p);
+extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+                            struct buffer_head *bh,
+                            void *buf, int buf_size,
+                            const char *name, int namelen,
+                            struct ext4_dir_entry_2 **dest_de);
+void ext4_insert_dentry(struct inode *inode,
+                       struct ext4_dir_entry_2 *de,
+                       int buf_size,
+                       const char *name, int namelen);
+static inline void ext4_update_dx_flag(struct inode *inode)
+{
+       if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+                                    EXT4_FEATURE_COMPAT_DIR_INDEX))
+               ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+}
+static unsigned char ext4_filetype_table[] = {
+       DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static inline  unsigned char get_dtype(struct super_block *sb, int filetype)
+{
+       if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
+           (filetype >= EXT4_FT_MAX))
+               return DT_UNKNOWN;
+
+       return ext4_filetype_table[filetype];
+}
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
@@ -1994,8 +2047,23 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
                                                ext4_lblk_t, int, int *);
 struct buffer_head *ext4_bread(handle_t *, struct inode *,
                                                ext4_lblk_t, int, int *);
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
+                        struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
                                struct buffer_head *bh_result, int create);
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+                          struct buffer_head *bh, int create);
+int ext4_walk_page_buffers(handle_t *handle,
+                          struct buffer_head *head,
+                          unsigned from,
+                          unsigned to,
+                          int *partial,
+                          int (*fn)(handle_t *handle,
+                                    struct buffer_head *bh));
+int do_journal_get_write_access(handle_t *handle,
+                               struct buffer_head *bh);
+#define FALL_BACK_TO_NONDELALLOC 1
+#define CONVERT_INLINE_DATA     2
 
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
 extern int  ext4_write_inode(struct inode *, struct writeback_control *);
@@ -2050,6 +2118,20 @@ extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
 extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
                                __u32 start_minor_hash, __u32 *next_hash);
+extern int search_dir(struct buffer_head *bh,
+                     char *search_buf,
+                     int buf_size,
+                     struct inode *dir,
+                     const struct qstr *d_name,
+                     unsigned int offset,
+                     struct ext4_dir_entry_2 **res_dir);
+extern int ext4_generic_delete_entry(handle_t *handle,
+                                    struct inode *dir,
+                                    struct ext4_dir_entry_2 *de_del,
+                                    struct buffer_head *bh,
+                                    void *entry_buf,
+                                    int buf_size,
+                                    int csum_size);
 
 /* resize.c */
 extern int ext4_group_add(struct super_block *sb,
@@ -2376,6 +2458,15 @@ extern void ext4_unwritten_wait(struct inode *inode);
 extern const struct inode_operations ext4_dir_inode_operations;
 extern const struct inode_operations ext4_special_inode_operations;
 extern struct dentry *ext4_get_parent(struct dentry *child);
+extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
+                                struct ext4_dir_entry_2 *de,
+                                int blocksize, int csum_size,
+                                unsigned int parent_ino, int dotdot_real_len);
+extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+                                  unsigned int blocksize);
+extern int ext4_handle_dirty_dirent_node(handle_t *handle,
+                                        struct inode *inode,
+                                        struct buffer_head *bh);
 
 /* symlink.c */
 extern const struct inode_operations ext4_symlink_inode_operations;
@@ -2393,6 +2484,9 @@ extern int ext4_check_blockref(const char *, unsigned int,
                               struct inode *, __le32 *, unsigned int);
 
 /* extents.c */
+struct ext4_ext_path;
+struct ext4_extent;
+
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
 extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
@@ -2410,8 +2504,27 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
                          ssize_t len);
 extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
                           struct ext4_map_blocks *map, int flags);
+extern int ext4_ext_calc_metadata_amount(struct inode *inode,
+                                        ext4_lblk_t lblocks);
+extern int ext4_extent_tree_init(handle_t *, struct inode *);
+extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
+                                                  int num,
+                                                  struct ext4_ext_path *path);
+extern int ext4_can_extents_be_merged(struct inode *inode,
+                                     struct ext4_extent *ex1,
+                                     struct ext4_extent *ex2);
+extern int ext4_ext_insert_extent(handle_t *, struct inode *,
+                                 struct ext4_ext_path *,
+                                 struct ext4_extent *, int);
+extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
+                                                 struct ext4_ext_path *);
+extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
+extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        __u64 start, __u64 len);
+
+
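The extent-path helpers re-exported above follow a strict ownership pattern,
visible in ext4_fill_fiemap_extents() below: the caller frees the path array
and must drop the buffer_head references first. A minimal sketch, assuming
the caller already holds i_data_sem for reading:

/* Sketch of the caller-owned extent path lifecycle. */
static int example_peek_extent(struct inode *inode, ext4_lblk_t lblk)
{
        struct ext4_ext_path *path;
        int depth;

        path = ext4_ext_find_extent(inode, lblk, NULL);
        if (IS_ERR(path))
                return PTR_ERR(path);   /* nothing was allocated on error */

        depth = ext_depth(inode);
        /* ... inspect path[depth].p_ext here ... */

        ext4_ext_drop_refs(path);       /* release buffer_head references */
        kfree(path);                    /* then free the path array itself */
        return 0;
}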
 /* move_extent.c */
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
                             __u64 start_orig, __u64 start_donor,
@@ -2445,14 +2558,10 @@ enum ext4_state_bits {
                                 * never, ever appear in a buffer_head's state
                                 * flag. See EXT4_MAP_FROM_CLUSTER to see where
                                 * this is used. */
-       BH_Da_Mapped,   /* Delayed allocated block that now has a mapping. This
-                        * flag is set when ext4_map_blocks is called on a
-                        * delayed allocated block to get its real mapping. */
 };
 
 BUFFER_FNS(Uninit, uninit)
 TAS_BUFFER_FNS(Uninit, uninit)
-BUFFER_FNS(Da_Mapped, da_mapped)
 
 /*
  * Add new method to test whether block and inode bitmaps are properly
@@ -2503,6 +2612,4 @@ extern void ext4_resize_end(struct super_block *sb);
 
 #endif /* __KERNEL__ */
 
-#include "ext4_extents.h"
-
 #endif /* _EXT4_H */
index cb1b2c919963290fd10d09ba12f6d8c53ace9fa6..487fda12bc00223f8533ce0699d96e8fd207df4e 100644 (file)
  */
 #define CHECK_BINSEARCH__
 
-/*
- * Turn on EXT_DEBUG to get lots of info about extents operations.
- */
-#define EXT_DEBUG__
-#ifdef EXT_DEBUG
-#define ext_debug(fmt, ...)    printk(fmt, ##__VA_ARGS__)
-#else
-#define ext_debug(fmt, ...)    no_printk(fmt, ##__VA_ARGS__)
-#endif
-
 /*
  * If EXT_STATS is defined then stats numbers are collected.
  * These number will be displayed at umount time.
@@ -143,20 +133,6 @@ struct ext4_ext_path {
  * structure for external API
  */
 
-/*
- * to be called by ext4_ext_walk_space()
- * negative retcode - error
- * positive retcode - signal for ext4_ext_walk_space(), see below
- * callback must return valid extent (passed or newly created)
- */
-typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t,
-                                       struct ext4_ext_cache *,
-                                       struct ext4_extent *, void *);
-
-#define EXT_CONTINUE   0
-#define EXT_BREAK      1
-#define EXT_REPEAT     2
-
 /*
  * Maximum number of logical blocks in a file; ext4_extent's ee_block is
  * __le32.
@@ -300,21 +276,5 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
                                     0xffff);
 }
 
-extern int ext4_ext_calc_metadata_amount(struct inode *inode,
-                                        ext4_lblk_t lblocks);
-extern int ext4_extent_tree_init(handle_t *, struct inode *);
-extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
-                                                  int num,
-                                                  struct ext4_ext_path *path);
-extern int ext4_can_extents_be_merged(struct inode *inode,
-                                     struct ext4_extent *ex1,
-                                     struct ext4_extent *ex2);
-extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
-extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
-                                                       struct ext4_ext_path *);
-extern void ext4_ext_drop_refs(struct ext4_ext_path *);
-extern int ext4_ext_check_inode(struct inode *inode);
-extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
-                                     int search_hint_reverse);
 #endif /* _EXT4_EXTENTS */
 
index 56d258c1830363d99ccf2edf7743633acf510942..7177f9b21cb2b58cbfece8c8b636281dc5acff6d 100644 (file)
@@ -254,13 +254,6 @@ static inline void ext4_handle_sync(handle_t *handle)
                handle->h_sync = 1;
 }
 
-static inline void ext4_handle_release_buffer(handle_t *handle,
-                                               struct buffer_head *bh)
-{
-       if (ext4_handle_valid(handle))
-               jbd2_journal_release_buffer(handle, bh);
-}
-
 static inline int ext4_handle_is_aborted(handle_t *handle)
 {
        if (ext4_handle_valid(handle))
index 7011ac967208e941272f09a07e3292d72ef576f9..26af22832a846d43d94c20e67e92a2ab825d835c 100644 (file)
@@ -41,6 +41,8 @@
 #include <asm/uaccess.h>
 #include <linux/fiemap.h>
 #include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "xattr.h"
 
 #include <trace/events/ext4.h>
 
@@ -109,6 +111,9 @@ static int ext4_split_extent_at(handle_t *handle,
                             int split_flag,
                             int flags);
 
+static int ext4_find_delayed_extent(struct inode *inode,
+                                   struct ext4_ext_cache *newex);
+
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
                                            struct inode *inode,
                                            int needed)
@@ -1959,27 +1964,33 @@ cleanup:
        return err;
 }
 
-static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
-                              ext4_lblk_t num, ext_prepare_callback func,
-                              void *cbdata)
+static int ext4_fill_fiemap_extents(struct inode *inode,
+                                   ext4_lblk_t block, ext4_lblk_t num,
+                                   struct fiemap_extent_info *fieinfo)
 {
        struct ext4_ext_path *path = NULL;
-       struct ext4_ext_cache cbex;
+       struct ext4_ext_cache newex;
        struct ext4_extent *ex;
-       ext4_lblk_t next, start = 0, end = 0;
+       ext4_lblk_t next, next_del, start = 0, end = 0;
        ext4_lblk_t last = block + num;
-       int depth, exists, err = 0;
-
-       BUG_ON(func == NULL);
-       BUG_ON(inode == NULL);
+       int exists, depth = 0, err = 0;
+       unsigned int flags = 0;
+       unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
 
        while (block < last && block != EXT_MAX_BLOCKS) {
                num = last - block;
                /* find extent for this block */
                down_read(&EXT4_I(inode)->i_data_sem);
+
+               if (path && ext_depth(inode) != depth) {
+                       /* depth was changed. we have to realloc path */
+                       kfree(path);
+                       path = NULL;
+               }
+
                path = ext4_ext_find_extent(inode, block, path);
-               up_read(&EXT4_I(inode)->i_data_sem);
                if (IS_ERR(path)) {
+                       up_read(&EXT4_I(inode)->i_data_sem);
                        err = PTR_ERR(path);
                        path = NULL;
                        break;
@@ -1987,13 +1998,16 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 
                depth = ext_depth(inode);
                if (unlikely(path[depth].p_hdr == NULL)) {
+                       up_read(&EXT4_I(inode)->i_data_sem);
                        EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
                        err = -EIO;
                        break;
                }
                ex = path[depth].p_ext;
                next = ext4_ext_next_allocated_block(path);
+               ext4_ext_drop_refs(path);
 
+               flags = 0;
                exists = 0;
                if (!ex) {
                        /* there is no extent yet, so try to allocate
@@ -2030,40 +2044,64 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
                BUG_ON(end <= start);
 
                if (!exists) {
-                       cbex.ec_block = start;
-                       cbex.ec_len = end - start;
-                       cbex.ec_start = 0;
+                       newex.ec_block = start;
+                       newex.ec_len = end - start;
+                       newex.ec_start = 0;
                } else {
-                       cbex.ec_block = le32_to_cpu(ex->ee_block);
-                       cbex.ec_len = ext4_ext_get_actual_len(ex);
-                       cbex.ec_start = ext4_ext_pblock(ex);
+                       newex.ec_block = le32_to_cpu(ex->ee_block);
+                       newex.ec_len = ext4_ext_get_actual_len(ex);
+                       newex.ec_start = ext4_ext_pblock(ex);
+                       if (ext4_ext_is_uninitialized(ex))
+                               flags |= FIEMAP_EXTENT_UNWRITTEN;
                }
 
-               if (unlikely(cbex.ec_len == 0)) {
-                       EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
-                       err = -EIO;
-                       break;
+               /*
+                * Find delayed extent and update newex accordingly. We call
+                * it even in the !exists case to find out whether newex is
+                * the last existing extent or not.
+                */
+               next_del = ext4_find_delayed_extent(inode, &newex);
+               if (!exists && next_del) {
+                       exists = 1;
+                       flags |= FIEMAP_EXTENT_DELALLOC;
                }
-               err = func(inode, next, &cbex, ex, cbdata);
-               ext4_ext_drop_refs(path);
+               up_read(&EXT4_I(inode)->i_data_sem);
 
-               if (err < 0)
+               if (unlikely(newex.ec_len == 0)) {
+                       EXT4_ERROR_INODE(inode, "newex.ec_len == 0");
+                       err = -EIO;
                        break;
+               }
 
-               if (err == EXT_REPEAT)
-                       continue;
-               else if (err == EXT_BREAK) {
-                       err = 0;
-                       break;
+               /* This is possible iff next == next_del == EXT_MAX_BLOCKS */
+               if (next == next_del) {
+                       flags |= FIEMAP_EXTENT_LAST;
+                       if (unlikely(next_del != EXT_MAX_BLOCKS ||
+                                    next != EXT_MAX_BLOCKS)) {
+                               EXT4_ERROR_INODE(inode,
+                                                "next extent == %u, next "
+                                                "delalloc extent = %u",
+                                                next, next_del);
+                               err = -EIO;
+                               break;
+                       }
                }
 
-               if (ext_depth(inode) != depth) {
-                       /* depth was changed. we have to realloc path */
-                       kfree(path);
-                       path = NULL;
+               if (exists) {
+                       err = fiemap_fill_next_extent(fieinfo,
+                               (__u64)newex.ec_block << blksize_bits,
+                               (__u64)newex.ec_start << blksize_bits,
+                               (__u64)newex.ec_len << blksize_bits,
+                               flags);
+                       if (err < 0)
+                               break;
+                       if (err == 1) {
+                               err = 0;
+                               break;
+                       }
                }
 
-               block = cbex.ec_block + cbex.ec_len;
+               block = newex.ec_block + newex.ec_len;
        }
 
        if (path) {
@@ -2156,7 +2194,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
                  struct ext4_extent *ex)
 {
        struct ext4_ext_cache *cex;
-       struct ext4_sb_info *sbi;
        int ret = 0;
 
        /*
@@ -2164,7 +2201,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
         */
        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
        cex = &EXT4_I(inode)->i_cached_extent;
-       sbi = EXT4_SB(inode->i_sb);
 
        /* has cache valid data? */
        if (cex->ec_len == 0)
@@ -2273,7 +2309,13 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
 int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
        int index;
-       int depth = ext_depth(inode);
+       int depth;
+
+       /* If we are converting the inline data, only one credit is needed here. */
+       if (ext4_has_inline_data(inode))
+               return 1;
+
+       depth = ext_depth(inode);
 
        if (chunk)
                index = depth * 2;
@@ -3461,115 +3503,34 @@ out:
 /**
  * ext4_find_delalloc_range: find delayed allocated block in the given range.
  *
- * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
- * whether there are any buffers marked for delayed allocation. It returns '1'
- * on the first delalloc'ed buffer head found. If no buffer head in the given
- * range is marked for delalloc, it returns 0.
- * lblk_start should always be <= lblk_end.
- * search_hint_reverse is to indicate that searching in reverse from lblk_end to
- * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
- * block sooner). This is useful when blocks are truncated sequentially from
- * lblk_start towards lblk_end.
+ * Return 1 if there is a delalloc block in the range, otherwise 0.
  */
 static int ext4_find_delalloc_range(struct inode *inode,
                                    ext4_lblk_t lblk_start,
-                                   ext4_lblk_t lblk_end,
-                                   int search_hint_reverse)
+                                   ext4_lblk_t lblk_end)
 {
-       struct address_space *mapping = inode->i_mapping;
-       struct buffer_head *head, *bh = NULL;
-       struct page *page;
-       ext4_lblk_t i, pg_lblk;
-       pgoff_t index;
-
-       if (!test_opt(inode->i_sb, DELALLOC))
-               return 0;
-
-       /* reverse search wont work if fs block size is less than page size */
-       if (inode->i_blkbits < PAGE_CACHE_SHIFT)
-               search_hint_reverse = 0;
+       struct extent_status es;
 
-       if (search_hint_reverse)
-               i = lblk_end;
+       es.start = lblk_start;
+       ext4_es_find_extent(inode, &es);
+       if (es.len == 0)
+               return 0; /* there is no delay extent in this tree */
+       else if (es.start <= lblk_start && lblk_start < es.start + es.len)
+               return 1;
+       else if (lblk_start <= es.start && es.start <= lblk_end)
+               return 1;
        else
-               i = lblk_start;
-
-       index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
-       while ((i >= lblk_start) && (i <= lblk_end)) {
-               page = find_get_page(mapping, index);
-               if (!page)
-                       goto nextpage;
-
-               if (!page_has_buffers(page))
-                       goto nextpage;
-
-               head = page_buffers(page);
-               if (!head)
-                       goto nextpage;
-
-               bh = head;
-               pg_lblk = index << (PAGE_CACHE_SHIFT -
-                                               inode->i_blkbits);
-               do {
-                       if (unlikely(pg_lblk < lblk_start)) {
-                               /*
-                                * This is possible when fs block size is less
-                                * than page size and our cluster starts/ends in
-                                * middle of the page. So we need to skip the
-                                * initial few blocks till we reach the 'lblk'
-                                */
-                               pg_lblk++;
-                               continue;
-                       }
-
-                       /* Check if the buffer is delayed allocated and that it
-                        * is not yet mapped. (when da-buffers are mapped during
-                        * their writeout, their da_mapped bit is set.)
-                        */
-                       if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
-                               page_cache_release(page);
-                               trace_ext4_find_delalloc_range(inode,
-                                               lblk_start, lblk_end,
-                                               search_hint_reverse,
-                                               1, i);
-                               return 1;
-                       }
-                       if (search_hint_reverse)
-                               i--;
-                       else
-                               i++;
-               } while ((i >= lblk_start) && (i <= lblk_end) &&
-                               ((bh = bh->b_this_page) != head));
-nextpage:
-               if (page)
-                       page_cache_release(page);
-               /*
-                * Move to next page. 'i' will be the first lblk in the next
-                * page.
-                */
-               if (search_hint_reverse)
-                       index--;
-               else
-                       index++;
-               i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-       }
-
-       trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
-                                       search_hint_reverse, 0, 0);
-       return 0;
+               return 0;
 }
 
-int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
-                              int search_hint_reverse)
+int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_lblk_t lblk_start, lblk_end;
        lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
        lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
 
-       return ext4_find_delalloc_range(inode, lblk_start, lblk_end,
-                                       search_hint_reverse);
+       return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
 }
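The mask arithmetic above rounds a block down to its bigalloc cluster
boundary; a worked example, assuming s_cluster_ratio == 16:

/*
 * lblk       = 37                  (0b100101)
 * lblk_start = 37 & ~(16 - 1) = 32 (0b100000)
 * lblk_end   = 32 + 16 - 1    = 47
 *
 * so the whole cluster [32, 47] is searched for delayed blocks.
 */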
 
 /**
@@ -3630,7 +3591,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
                lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
                lblk_to = lblk_from + c_offset - 1;
 
-               if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
+               if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
                        allocated_clusters--;
        }
 
@@ -3640,7 +3601,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
                lblk_from = lblk_start + num_blks;
                lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
 
-               if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
+               if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
                        allocated_clusters--;
        }
 
@@ -3663,8 +3624,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                  flags, allocated);
        ext4_ext_show_leaf(inode, path);
 
-       trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated,
-                                                   newblock);
+       trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
+                                                   allocated, newblock);
 
        /* get_block() before submit the IO, split the extent */
        if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
@@ -3911,7 +3872,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        struct ext4_extent newex, *ex, *ex2;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_fsblk_t newblock = 0;
-       int free_on_err = 0, err = 0, depth, ret;
+       int free_on_err = 0, err = 0, depth;
        unsigned int allocated = 0, offset = 0;
        unsigned int allocated_clusters = 0;
        struct ext4_allocation_request ar;
@@ -3927,7 +3888,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
                if (!newex.ee_start_lo && !newex.ee_start_hi) {
                        if ((sbi->s_cluster_ratio > 1) &&
-                           ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
+                           ext4_find_delalloc_cluster(inode, map->m_lblk))
                                map->m_flags |= EXT4_MAP_FROM_CLUSTER;
 
                        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -4007,15 +3968,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                                        ee_len, ee_start);
                                goto out;
                        }
-                       ret = ext4_ext_handle_uninitialized_extents(
+                       allocated = ext4_ext_handle_uninitialized_extents(
                                handle, inode, map, path, flags,
                                allocated, newblock);
-                       return ret;
+                       goto out3;
                }
        }
 
        if ((sbi->s_cluster_ratio > 1) &&
-           ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
+           ext4_find_delalloc_cluster(inode, map->m_lblk))
                map->m_flags |= EXT4_MAP_FROM_CLUSTER;
 
        /*
@@ -4284,8 +4245,8 @@ out2:
                kfree(path);
        }
 
-       trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
-               newblock, map->m_len, err ? err : allocated);
+out3:
+       trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
 
        return err ? err : allocated;
 }
@@ -4344,6 +4305,8 @@ void ext4_ext_truncate(struct inode *inode)
 
        last_block = (inode->i_size + sb->s_blocksize - 1)
                        >> EXT4_BLOCK_SIZE_BITS(sb);
+       err = ext4_es_remove_extent(inode, last_block,
+                                   EXT_MAX_BLOCKS - last_block);
        err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
 
        /* In a multi-transaction truncate, we only make the final
@@ -4434,6 +4397,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
        if (mode & FALLOC_FL_PUNCH_HOLE)
                return ext4_punch_hole(file, offset, len);
 
+       ret = ext4_convert_inline_data(inode);
+       if (ret)
+               return ret;
+
        trace_ext4_fallocate_enter(inode, offset, len, mode);
        map.m_lblk = offset >> blkbits;
        /*
@@ -4572,206 +4539,43 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 }
 
 /*
- * Callback function called for each extent to gather FIEMAP information.
+ * If newex is not an existing extent (newex->ec_start equals zero), find
+ * the delayed extent at the start of newex, update newex accordingly and
+ * return the start of the next delayed extent.
+ *
+ * If newex is an existing extent (newex->ec_start is not zero), return
+ * the start of the next delayed extent, or EXT_MAX_BLOCKS if no delayed
+ * extent is found. Leave newex unmodified.
  */
-static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
-                      struct ext4_ext_cache *newex, struct ext4_extent *ex,
-                      void *data)
+static int ext4_find_delayed_extent(struct inode *inode,
+                                   struct ext4_ext_cache *newex)
 {
-       __u64   logical;
-       __u64   physical;
-       __u64   length;
-       __u32   flags = 0;
-       int             ret = 0;
-       struct fiemap_extent_info *fieinfo = data;
-       unsigned char blksize_bits;
+       struct extent_status es;
+       ext4_lblk_t next_del;
 
-       blksize_bits = inode->i_sb->s_blocksize_bits;
-       logical = (__u64)newex->ec_block << blksize_bits;
+       es.start = newex->ec_block;
+       next_del = ext4_es_find_extent(inode, &es);
 
        if (newex->ec_start == 0) {
                /*
                 * No extent in the extent tree contains block @newex->ec_start;
                 * the block may sit in 1) a hole or 2) a delayed extent.
-                *
-                * Holes or delayed-extents are processed as follows.
-                * 1. lookup dirty pages with specified range in pagecache.
-                *    If no page is got, then there is no delayed-extent and
-                *    return with EXT_CONTINUE.
-                * 2. find the 1st mapped buffer,
-                * 3. check if the mapped buffer is both in the request range
-                *    and a delayed buffer. If not, there is no delayed-extent,
-                *    then return.
-                * 4. a delayed-extent is found, the extent will be collected.
                 */
-               ext4_lblk_t     end = 0;
-               pgoff_t         last_offset;
-               pgoff_t         offset;
-               pgoff_t         index;
-               pgoff_t         start_index = 0;
-               struct page     **pages = NULL;
-               struct buffer_head *bh = NULL;
-               struct buffer_head *head = NULL;
-               unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
-
-               pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
-               if (pages == NULL)
-                       return -ENOMEM;
-
-               offset = logical >> PAGE_SHIFT;
-repeat:
-               last_offset = offset;
-               head = NULL;
-               ret = find_get_pages_tag(inode->i_mapping, &offset,
-                                       PAGECACHE_TAG_DIRTY, nr_pages, pages);
-
-               if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
-                       /* First time, try to find a mapped buffer. */
-                       if (ret == 0) {
-out:
-                               for (index = 0; index < ret; index++)
-                                       page_cache_release(pages[index]);
-                               /* just a hole. */
-                               kfree(pages);
-                               return EXT_CONTINUE;
-                       }
-                       index = 0;
-
-next_page:
-                       /* Try to find the 1st mapped buffer. */
-                       end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
-                                 blksize_bits;
-                       if (!page_has_buffers(pages[index]))
-                               goto out;
-                       head = page_buffers(pages[index]);
-                       if (!head)
-                               goto out;
-
-                       index++;
-                       bh = head;
-                       do {
-                               if (end >= newex->ec_block +
-                                       newex->ec_len)
-                                       /* The buffer is out of
-                                        * the request range.
-                                        */
-                                       goto out;
-
-                               if (buffer_mapped(bh) &&
-                                   end >= newex->ec_block) {
-                                       start_index = index - 1;
-                                       /* get the 1st mapped buffer. */
-                                       goto found_mapped_buffer;
-                               }
-
-                               bh = bh->b_this_page;
-                               end++;
-                       } while (bh != head);
-
-                       /* No mapped buffer in the range found in this page,
-                        * We need to look up next page.
-                        */
-                       if (index >= ret) {
-                               /* There is no page left, but we need to limit
-                                * newex->ec_len.
-                                */
-                               newex->ec_len = end - newex->ec_block;
-                               goto out;
-                       }
-                       goto next_page;
-               } else {
-                       /*Find contiguous delayed buffers. */
-                       if (ret > 0 && pages[0]->index == last_offset)
-                               head = page_buffers(pages[0]);
-                       bh = head;
-                       index = 1;
-                       start_index = 0;
-               }
-
-found_mapped_buffer:
-               if (bh != NULL && buffer_delay(bh)) {
-                       /* 1st or contiguous delayed buffer found. */
-                       if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
-                               /*
-                                * 1st delayed buffer found, record
-                                * the start of extent.
-                                */
-                               flags |= FIEMAP_EXTENT_DELALLOC;
-                               newex->ec_block = end;
-                               logical = (__u64)end << blksize_bits;
-                       }
-                       /* Find contiguous delayed buffers. */
-                       do {
-                               if (!buffer_delay(bh))
-                                       goto found_delayed_extent;
-                               bh = bh->b_this_page;
-                               end++;
-                       } while (bh != head);
-
-                       for (; index < ret; index++) {
-                               if (!page_has_buffers(pages[index])) {
-                                       bh = NULL;
-                                       break;
-                               }
-                               head = page_buffers(pages[index]);
-                               if (!head) {
-                                       bh = NULL;
-                                       break;
-                               }
-
-                               if (pages[index]->index !=
-                                   pages[start_index]->index + index
-                                   - start_index) {
-                                       /* Blocks are not contiguous. */
-                                       bh = NULL;
-                                       break;
-                               }
-                               bh = head;
-                               do {
-                                       if (!buffer_delay(bh))
-                                               /* Delayed-extent ends. */
-                                               goto found_delayed_extent;
-                                       bh = bh->b_this_page;
-                                       end++;
-                               } while (bh != head);
-                       }
-               } else if (!(flags & FIEMAP_EXTENT_DELALLOC))
-                       /* a hole found. */
-                       goto out;
+               if (es.len == 0)
+                       /* A hole found. */
+                       return 0;
 
-found_delayed_extent:
-               newex->ec_len = min(end - newex->ec_block,
-                                               (ext4_lblk_t)EXT_INIT_MAX_LEN);
-               if (ret == nr_pages && bh != NULL &&
-                       newex->ec_len < EXT_INIT_MAX_LEN &&
-                       buffer_delay(bh)) {
-                       /* Have not collected an extent and continue. */
-                       for (index = 0; index < ret; index++)
-                               page_cache_release(pages[index]);
-                       goto repeat;
+               if (es.start > newex->ec_block) {
+                       /* A hole found. */
+                       newex->ec_len = min(es.start - newex->ec_block,
+                                           newex->ec_len);
+                       return 0;
                }
 
-               for (index = 0; index < ret; index++)
-                       page_cache_release(pages[index]);
-               kfree(pages);
+               newex->ec_len = es.start + es.len - newex->ec_block;
        }
 
-       physical = (__u64)newex->ec_start << blksize_bits;
-       length =   (__u64)newex->ec_len << blksize_bits;
-
-       if (ex && ext4_ext_is_uninitialized(ex))
-               flags |= FIEMAP_EXTENT_UNWRITTEN;
-
-       if (next == EXT_MAX_BLOCKS)
-               flags |= FIEMAP_EXTENT_LAST;
-
-       ret = fiemap_fill_next_extent(fieinfo, logical, physical,
-                                       length, flags);
-       if (ret < 0)
-               return ret;
-       if (ret == 1)
-               return EXT_BREAK;
-       return EXT_CONTINUE;
+       return next_del;
 }
 /* fiemap flags we can handle specified here */
 #define EXT4_FIEMAP_FLAGS      (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -4971,6 +4775,8 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
        ext4_ext_invalidate_cache(inode);
        ext4_discard_preallocations(inode);
 
+       err = ext4_es_remove_extent(inode, first_block,
+                                   stop_block - first_block);
        err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
 
        ext4_ext_invalidate_cache(inode);
@@ -4991,12 +4797,22 @@ out_mutex:
        mutex_unlock(&inode->i_mutex);
        return err;
 }
+
 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len)
 {
        ext4_lblk_t start_blk;
        int error = 0;
 
+       if (ext4_has_inline_data(inode)) {
+               int has_inline = 1;
+
+               error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline);
+
+               if (has_inline)
+                       return error;
+       }
+
        /* fallback to generic here if not in extents fmt */
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                return generic_block_fiemap(inode, fieinfo, start, len,
@@ -5018,11 +4834,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
 
                /*
-                * Walk the extent tree gathering extent information.
-                * ext4_ext_fiemap_cb will push extents back to user.
+                * Walk the extent tree gathering extent information
+                * and pushing extents back to the user.
                 */
-               error = ext4_ext_walk_space(inode, start_blk, len_blks,
-                                         ext4_ext_fiemap_cb, fieinfo);
+               error = ext4_fill_fiemap_extents(inode, start_blk,
+                                                len_blks, fieinfo);
        }
 
        return error;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
new file mode 100644 (file)
index 0000000..564d981
--- /dev/null
@@ -0,0 +1,500 @@
+/*
+ *  fs/ext4/extents_status.c
+ *
+ * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
+ * Modified by
+ *     Allison Henderson <achender@linux.vnet.ibm.com>
+ *     Hugh Dickins <hughd@google.com>
+ *     Zheng Liu <wenqing.lz@taobao.com>
+ *
+ * Ext4 extents status tree core functions.
+ */
+#include <linux/rbtree.h>
+#include "ext4.h"
+#include "extents_status.h"
+#include "ext4_extents.h"
+
+#include <trace/events/ext4.h>
+
+/*
+ * As discussed at the Ext4 Developer Workshop, we will introduce a
+ * new structure called the io tree to track all extent status, in
+ * order to solve some problems we have met (e.g. the reservation
+ * space warning) and to provide extent-level locking.  The delayed
+ * extent tree is the first step towards this goal.  It was originally
+ * built by Yongqiang Yang.  At that time its only goal was to track
+ * delayed extents in memory to simplify the implementation of fiemap
+ * and bigalloc, and to introduce lseek SEEK_DATA/SEEK_HOLE support.
+ * That is why it is still called the delay extent tree in the
+ * comments below.  But to better reflect what it does, it has been
+ * renamed to the extent status tree.
+ *
+ * Currently the first step has been done.  All delayed extents are
+ * tracked in the tree.  The tree is updated when a delayed allocation
+ * is issued and when the delayed extent is written out or
+ * invalidated.  Therefore the implementations of fiemap and bigalloc
+ * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
+ *
+ * The following comment describes the implementation of the extent
+ * status tree and future work.
+ */
+
+/*
+ * Extent status tree implementation for ext4.
+ *
+ *
+ * ==========================================================================
+ * Extent status encompasses delayed extents and extent locks
+ *
+ * 1. Why a delayed extent implementation?
+ *
+ * Without delayed extents, ext4 identifies a delayed extent by looking
+ * up the page cache; this leads to complicated, buggy and inefficient
+ * code.
+ *
+ * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
+ * to know whether a block or a range of blocks belongs to a delayed
+ * extent.
+ *
+ * Let us have a look at how they work without a delayed extent implementation.
+ *   --        FIEMAP
+ *     FIEMAP looks up the page cache to tell delayed allocations from holes.
+ *
+ *   --        SEEK_HOLE/DATA
+ *     SEEK_HOLE/DATA has the same problem as FIEMAP.
+ *
+ *   --        bigalloc
+ *     bigalloc looks up the page cache to figure out whether a block
+ *     is already under delayed allocation, to determine whether quota
+ *     reservation is needed for the cluster.
+ *
+ *   -- punch hole
+ *     punch hole looks up the page cache to identify a delayed extent.
+ *
+ *   --        writeout
+ *     Writeout looks up the whole page cache to see if a buffer is
+ *     mapped; if there are not very many delayed buffers, this is
+ *     time consuming.
+ *
+ * With a delayed extent implementation, FIEMAP, SEEK_HOLE/DATA,
+ * bigalloc and writeout can figure out whether a block or a range of
+ * blocks is under delayed allocation (i.e. belongs to a delayed
+ * extent) by searching the delayed extent tree.
+ *
+ *
+ * ==========================================================================
+ * 2. ext4 delayed extents implementation
+ *
+ *   --        delayed extent
+ *     A delayed extent is a range of blocks which are logically
+ *     contiguous and under delayed allocation.  Unlike an on-disk
+ *     extent in ext4, a delayed extent is an in-memory structure;
+ *     there is no corresponding on-disk data.  There is no limit on
+ *     the length of a delayed extent, so a delayed extent can contain
+ *     as many blocks as are logically contiguous.
+ *
+ *   --        delayed extent tree
+ *     Every inode has a delayed extent tree, and all blocks under
+ *     delayed allocation are added to the tree as delayed extents.
+ *     Delayed extents in the tree are ordered by logical block number.
+ *
+ *   --        operations on a delayed extent tree
+ *     There are three operations on a delayed extent tree: finding the
+ *     next delayed extent, adding a space (a range of blocks) and
+ *     removing a space.
+ *
+ *   --        race on a delayed extent tree
+ *     The delayed extent tree is protected by inode->i_es_lock.
+ *
+ *
+ * ==========================================================================
+ * 3. performance analysis
+ *   --        overhead
+ *     1. There is a cached extent for write access, so if writes are
+ *     not very random, adding-space operations run in O(1) time.
+ *
+ *   --        gain
+ *     1. The code is much simpler, more readable, more maintainable
+ *     and more efficient.
+ *
+ *
+ * ==========================================================================
+ * 4. TODO list
+ *   -- Track all extent status
+ *
+ *   -- Improve get block process
+ *
+ *   -- Extent-level locking
+ */
+
+static struct kmem_cache *ext4_es_cachep;
+
+int __init ext4_init_es(void)
+{
+       ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
+       if (ext4_es_cachep == NULL)
+               return -ENOMEM;
+       return 0;
+}
+
+void ext4_exit_es(void)
+{
+       if (ext4_es_cachep)
+               kmem_cache_destroy(ext4_es_cachep);
+}
+
+void ext4_es_init_tree(struct ext4_es_tree *tree)
+{
+       tree->root = RB_ROOT;
+       tree->cache_es = NULL;
+}
+
+#ifdef ES_DEBUG__
+static void ext4_es_print_tree(struct inode *inode)
+{
+       struct ext4_es_tree *tree;
+       struct rb_node *node;
+
+       printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
+       tree = &EXT4_I(inode)->i_es_tree;
+       node = rb_first(&tree->root);
+       while (node) {
+               struct extent_status *es;
+               es = rb_entry(node, struct extent_status, rb_node);
+               printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
+               node = rb_next(node);
+       }
+       printk(KERN_DEBUG "\n");
+}
+#else
+#define ext4_es_print_tree(inode)
+#endif
+
+static inline ext4_lblk_t extent_status_end(struct extent_status *es)
+{
+       BUG_ON(es->start + es->len < es->start);
+       return es->start + es->len - 1;
+}
+
+/*
+ * Search through the tree for a delayed extent with a given offset.  If
+ * it can't be found, try to find the next extent.
+ */
+static struct extent_status *__es_tree_search(struct rb_root *root,
+                                             ext4_lblk_t offset)
+{
+       struct rb_node *node = root->rb_node;
+       struct extent_status *es = NULL;
+
+       while (node) {
+               es = rb_entry(node, struct extent_status, rb_node);
+               if (offset < es->start)
+                       node = node->rb_left;
+               else if (offset > extent_status_end(es))
+                       node = node->rb_right;
+               else
+                       return es;
+       }
+
+       if (es && offset < es->start)
+               return es;
+
+       if (es && offset > extent_status_end(es)) {
+               node = rb_next(&es->rb_node);
+               return node ? rb_entry(node, struct extent_status, rb_node) :
+                             NULL;
+       }
+
+       return NULL;
+}
+
+/*
+ * ext4_es_find_extent: find the 1st delayed extent covering @es->start
+ * if it exists, otherwise the next extent after @es->start.
+ *
+ * @inode: the inode which owns delayed extents
+ * @es: delayed extent that we found
+ *
+ * Returns the first block of the next extent after @es, or
+ * EXT_MAX_BLOCKS if no delayed extent is found.
+ * The delayed extent found is returned via @es.
+ */
+ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
+{
+       struct ext4_es_tree *tree = NULL;
+       struct extent_status *es1 = NULL;
+       struct rb_node *node;
+       ext4_lblk_t ret = EXT_MAX_BLOCKS;
+
+       trace_ext4_es_find_extent_enter(inode, es->start);
+
+       read_lock(&EXT4_I(inode)->i_es_lock);
+       tree = &EXT4_I(inode)->i_es_tree;
+
+       /* check the cached delayed extent first */
+       if (tree->cache_es) {
+               es1 = tree->cache_es;
+               if (in_range(es->start, es1->start, es1->len)) {
+                       es_debug("%u cached by [%u/%u)\n",
+                                es->start, es1->start, es1->len);
+                       goto out;
+               }
+       }
+
+       es->len = 0;
+       es1 = __es_tree_search(&tree->root, es->start);
+
+out:
+       if (es1) {
+               tree->cache_es = es1;
+               es->start = es1->start;
+               es->len = es1->len;
+               node = rb_next(&es1->rb_node);
+               if (node) {
+                       es1 = rb_entry(node, struct extent_status, rb_node);
+                       ret = es1->start;
+               }
+       }
+
+       read_unlock(&EXT4_I(inode)->i_es_lock);
+
+       trace_ext4_es_find_extent_exit(inode, es, ret);
+       return ret;
+}
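The calling convention is easy to misread: @es carries the query block in and
the found extent out, while the return value names the start of the extent
after that one. A hedged sketch that walks every delayed extent of an inode
(example_walk_delayed is illustrative only):

/* Sketch: iterate over all delayed extents of an inode, front to back. */
static void example_walk_delayed(struct inode *inode)
{
        struct extent_status es;
        ext4_lblk_t next;

        es.start = 0;
        for (;;) {
                next = ext4_es_find_extent(inode, &es);
                if (es.len == 0)
                        break;  /* no delayed extent at or after es.start */
                es_debug("delayed [%u/%u)\n", es.start, es.len);
                if (next == EXT_MAX_BLOCKS)
                        break;  /* that was the last delayed extent */
                es.start = next;
        }
}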
+
+static struct extent_status *
+ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
+{
+       struct extent_status *es;
+       es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
+       if (es == NULL)
+               return NULL;
+       es->start = start;
+       es->len = len;
+       return es;
+}
+
+static void ext4_es_free_extent(struct extent_status *es)
+{
+       kmem_cache_free(ext4_es_cachep, es);
+}
+
+static struct extent_status *
+ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
+{
+       struct extent_status *es1;
+       struct rb_node *node;
+
+       node = rb_prev(&es->rb_node);
+       if (!node)
+               return es;
+
+       es1 = rb_entry(node, struct extent_status, rb_node);
+       if (es->start == extent_status_end(es1) + 1) {
+               es1->len += es->len;
+               rb_erase(&es->rb_node, &tree->root);
+               ext4_es_free_extent(es);
+               es = es1;
+       }
+
+       return es;
+}
+
+static struct extent_status *
+ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
+{
+       struct extent_status *es1;
+       struct rb_node *node;
+
+       node = rb_next(&es->rb_node);
+       if (!node)
+               return es;
+
+       es1 = rb_entry(node, struct extent_status, rb_node);
+       if (es1->start == extent_status_end(es) + 1) {
+               es->len += es1->len;
+               rb_erase(node, &tree->root);
+               ext4_es_free_extent(es1);
+       }
+
+       return es;
+}
+
+static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
+                             ext4_lblk_t len)
+{
+       struct rb_node **p = &tree->root.rb_node;
+       struct rb_node *parent = NULL;
+       struct extent_status *es;
+       ext4_lblk_t end = offset + len - 1;
+
+       BUG_ON(end < offset);
+       es = tree->cache_es;
+       if (es && offset == (extent_status_end(es) + 1)) {
+               es_debug("cached by [%u/%u)\n", es->start, es->len);
+               es->len += len;
+               es = ext4_es_try_to_merge_right(tree, es);
+               goto out;
+       } else if (es && es->start == end + 1) {
+               es_debug("cached by [%u/%u)\n", es->start, es->len);
+               es->start = offset;
+               es->len += len;
+               es = ext4_es_try_to_merge_left(tree, es);
+               goto out;
+       } else if (es && es->start <= offset &&
+                  end <= extent_status_end(es)) {
+               es_debug("cached by [%u/%u)\n", es->start, es->len);
+               goto out;
+       }
+
+       while (*p) {
+               parent = *p;
+               es = rb_entry(parent, struct extent_status, rb_node);
+
+               if (offset < es->start) {
+                       if (es->start == end + 1) {
+                               es->start = offset;
+                               es->len += len;
+                               es = ext4_es_try_to_merge_left(tree, es);
+                               goto out;
+                       }
+                       p = &(*p)->rb_left;
+               } else if (offset > extent_status_end(es)) {
+                       if (offset == extent_status_end(es) + 1) {
+                               es->len += len;
+                               es = ext4_es_try_to_merge_right(tree, es);
+                               goto out;
+                       }
+                       p = &(*p)->rb_right;
+               } else {
+                       if (extent_status_end(es) <= end)
+                               es->len = offset - es->start + len;
+                       goto out;
+               }
+       }
+
+       es = ext4_es_alloc_extent(offset, len);
+       if (!es)
+               return -ENOMEM;
+       rb_link_node(&es->rb_node, parent, p);
+       rb_insert_color(&es->rb_node, &tree->root);
+
+out:
+       tree->cache_es = es;
+       return 0;
+}
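__es_insert_extent() prefers extending an adjacent node over allocating a new
one; a worked example of the merge cases, in the half-open [start/len)
notation used by the debug output, with the tree initially holding [2/3) and
[8/2):

/*
 * insert [5/3)  -> extends [2/3) to [2/6); the right neighbour [8/2) is
 *                  now adjacent, so the two nodes merge into [2/8).
 * insert [12/1) -> adjacent to nothing: a new node [12/1) is allocated.
 */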
+
+/*
+ * ext4_es_insert_extent() adds a space to a delayed extent tree.
+ * inode->i_es_lock is taken for writing inside this function, so the
+ * caller must not already hold it.
+ *
+ * ext4_es_insert_extent is called by ext4_da_write_begin and
+ * ext4_es_remove_extent.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
+                         ext4_lblk_t len)
+{
+       struct ext4_es_tree *tree;
+       int err = 0;
+
+       trace_ext4_es_insert_extent(inode, offset, len);
+       es_debug("add [%u/%u) to extent status tree of inode %lu\n",
+                offset, len, inode->i_ino);
+
+       write_lock(&EXT4_I(inode)->i_es_lock);
+       tree = &EXT4_I(inode)->i_es_tree;
+       err = __es_insert_extent(tree, offset, len);
+       write_unlock(&EXT4_I(inode)->i_es_lock);
+
+       ext4_es_print_tree(inode);
+
+       return err;
+}
+
+/*
+ * ext4_es_remove_extent() removes a space from a delayed extent tree.
+ * inode->i_es_lock is taken for writing inside this function, so the
+ * caller must not already hold it.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
+                         ext4_lblk_t len)
+{
+       struct rb_node *node;
+       struct ext4_es_tree *tree;
+       struct extent_status *es;
+       struct extent_status orig_es;
+       ext4_lblk_t len1, len2, end;
+       int err = 0;
+
+       trace_ext4_es_remove_extent(inode, offset, len);
+       es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
+                offset, len, inode->i_ino);
+
+       end = offset + len - 1;
+       BUG_ON(end < offset);
+       write_lock(&EXT4_I(inode)->i_es_lock);
+       tree = &EXT4_I(inode)->i_es_tree;
+       es = __es_tree_search(&tree->root, offset);
+       if (!es)
+               goto out;
+       if (es->start > end)
+               goto out;
+
+       /* Simply invalidate cache_es. */
+       tree->cache_es = NULL;
+
+       orig_es.start = es->start;
+       orig_es.len = es->len;
+       len1 = offset > es->start ? offset - es->start : 0;
+       len2 = extent_status_end(es) > end ?
+              extent_status_end(es) - end : 0;
+       if (len1 > 0)
+               es->len = len1;
+       if (len2 > 0) {
+               if (len1 > 0) {
+                       err = __es_insert_extent(tree, end + 1, len2);
+                       if (err) {
+                               es->start = orig_es.start;
+                               es->len = orig_es.len;
+                               goto out;
+                       }
+               } else {
+                       es->start = end + 1;
+                       es->len = len2;
+               }
+               goto out;
+       }
+
+       if (len1 > 0) {
+               node = rb_next(&es->rb_node);
+               if (node)
+                       es = rb_entry(node, struct extent_status, rb_node);
+               else
+                       es = NULL;
+       }
+
+       while (es && extent_status_end(es) <= end) {
+               node = rb_next(&es->rb_node);
+               rb_erase(&es->rb_node, &tree->root);
+               ext4_es_free_extent(es);
+               if (!node) {
+                       es = NULL;
+                       break;
+               }
+               es = rb_entry(node, struct extent_status, rb_node);
+       }
+
+       if (es && es->start < end + 1) {
+               len1 = extent_status_end(es) - end;
+               es->start = end + 1;
+               es->len = len1;
+       }
+
+out:
+       write_unlock(&EXT4_I(inode)->i_es_lock);
+       ext4_es_print_tree(inode);
+       return err;
+}
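Removing a range from the middle of an extent splits it, which is why the
function may itself need to allocate; a worked example with the tree holding
[0/10):

/*
 * remove [4/2) -> len1 = 4 (blocks 0..3 survive on the left) and
 *                 len2 = 4 (blocks 6..9 survive on the right), so [0/10)
 *                 shrinks to [0/4) and a new node [6/4) is inserted.
 *                 If that insertion fails with -ENOMEM, the original
 *                 extent [0/10) is restored.
 */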
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
new file mode 100644 (file)
index 0000000..077f82d
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ *  fs/ext4/extents_status.h
+ *
+ * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
+ * Modified by
+ *     Allison Henderson <achender@linux.vnet.ibm.com>
+ *     Zheng Liu <wenqing.lz@taobao.com>
+ *
+ */
+
+#ifndef _EXT4_EXTENTS_STATUS_H
+#define _EXT4_EXTENTS_STATUS_H
+
+/*
+ * Turn on ES_DEBUG__ to get lots of info about extent status operations.
+ */
+#ifdef ES_DEBUG__
+#define es_debug(fmt, ...)     printk(fmt, ##__VA_ARGS__)
+#else
+#define es_debug(fmt, ...)     no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+struct extent_status {
+       struct rb_node rb_node;
+       ext4_lblk_t start;      /* first block the extent covers */
+       ext4_lblk_t len;        /* length of the extent in blocks */
+};
+
+struct ext4_es_tree {
+       struct rb_root root;
+       struct extent_status *cache_es; /* recently accessed extent */
+};
+
+extern int __init ext4_init_es(void);
+extern void ext4_exit_es(void);
+extern void ext4_es_init_tree(struct ext4_es_tree *tree);
+
+extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start,
+                                ext4_lblk_t len);
+extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start,
+                                ext4_lblk_t len);
+extern ext4_lblk_t ext4_es_find_extent(struct inode *inode,
+                               struct extent_status *es);
+
+#endif /* _EXT4_EXTENTS_STATUS_H */
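Taken together, the declarations above define a small lifecycle; a hedged
summary of where each call sits, with the hook points as wired up elsewhere
in this patch and the rest of the series:

/*
 * module init:     ext4_init_es()                          (slab cache)
 * per-inode init:  ext4_es_init_tree(&EXT4_I(inode)->i_es_tree);
 * delayed alloc:   ext4_es_insert_extent(inode, lblk, len);
 * writeout, truncate and punch hole:
 *                  ext4_es_remove_extent(inode, lblk, len);
 * fiemap and seek: es.start = lblk; ext4_es_find_extent(inode, &es);
 * module exit:     ext4_exit_es()
 */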
index bf3966bccd343393656ea394f790ba79f183f93f..b64a60bf105a0c884c104b251ba59a6e0ef93d1b 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/mount.h>
 #include <linux/path.h>
 #include <linux/quotaops.h>
+#include <linux/pagevec.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -285,6 +286,324 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
        return dquot_file_open(inode, filp);
 }
 
+/*
+ * Here we use ext4_map_blocks() to get a block mapping for an extent-based
+ * file rather than ext4_ext_walk_space(), because this way we can implement
+ * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped files in the same
+ * function.  Once the extent status tree has been fully implemented, it
+ * will track all extent status for a file and we can use it directly to
+ * retrieve the offset for SEEK_DATA/SEEK_HOLE.
+ */
+
+/*
+ * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we have to look up
+ * the page cache to check whether there is some data in the range
+ * [startoff, endoff], because if the range contains an unwritten extent,
+ * we treat that extent as data or as a hole depending on whether the
+ * page cache has data for it.
+ */
+static int ext4_find_unwritten_pgoff(struct inode *inode,
+                                    int origin,
+                                    struct ext4_map_blocks *map,
+                                    loff_t *offset)
+{
+       struct pagevec pvec;
+       unsigned int blkbits;
+       pgoff_t index;
+       pgoff_t end;
+       loff_t endoff;
+       loff_t startoff;
+       loff_t lastoff;
+       int found = 0;
+
+       blkbits = inode->i_sb->s_blocksize_bits;
+       startoff = *offset;
+       lastoff = startoff;
+       endoff = (map->m_lblk + map->m_len) << blkbits;
+
+       index = startoff >> PAGE_CACHE_SHIFT;
+       end = endoff >> PAGE_CACHE_SHIFT;
+
+       pagevec_init(&pvec, 0);
+       do {
+               int i, num;
+               unsigned long nr_pages;
+
+               num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
+               nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
+                                         (pgoff_t)num);
+               if (nr_pages == 0) {
+                       if (origin == SEEK_DATA)
+                               break;
+
+                       BUG_ON(origin != SEEK_HOLE);
+                       /*
+                        * If this is the first pass through the loop, or the
+                        * last offset is still before the end offset, there
+                        * is a hole at this offset.
+                        */
+                       if (lastoff == startoff || lastoff < endoff)
+                               found = 1;
+                       break;
+               }
+
+               /*
+                * If this is the first pass through the loop and the
+                * offset is smaller than the first page offset, there is
+                * a hole at this offset.
+                */
+               if (lastoff == startoff && origin == SEEK_HOLE &&
+                   lastoff < page_offset(pvec.pages[0])) {
+                       found = 1;
+                       break;
+               }
+
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+                       struct buffer_head *bh, *head;
+
+                       /*
+                        * If the current offset is not beyond the end of the
+                        * given range, it is a hole.
+                        */
+                       if (lastoff < endoff && origin == SEEK_HOLE &&
+                           page->index > end) {
+                               found = 1;
+                               *offset = lastoff;
+                               goto out;
+                       }
+
+                       lock_page(page);
+
+                       if (unlikely(page->mapping != inode->i_mapping)) {
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       if (!page_has_buffers(page)) {
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       lastoff = page_offset(page);
+                       bh = head = page_buffers(page);
+                       do {
+                               if (buffer_uptodate(bh) ||
+                                   buffer_unwritten(bh)) {
+                                       if (origin == SEEK_DATA)
+                                               found = 1;
+                               } else {
+                                       if (origin == SEEK_HOLE)
+                                               found = 1;
+                               }
+                               if (found) {
+                                       *offset = max_t(loff_t,
+                                                       startoff, lastoff);
+                                       unlock_page(page);
+                                       goto out;
+                               }
+                               lastoff += bh->b_size;
+                               bh = bh->b_this_page;
+                       } while (bh != head);
+
+                       lastoff = page_offset(page) + PAGE_SIZE;
+                       unlock_page(page);
+               }
+
+               /*
+                * If the number of pages found is less than the number we
+                * asked for, there is a hole after the last page.
+                */
+               if (nr_pages < num && origin == SEEK_HOLE) {
+                       found = 1;
+                       *offset = lastoff;
+                       break;
+               }
+
+               index = pvec.pages[i - 1]->index + 1;
+               pagevec_release(&pvec);
+       } while (index <= end);
+
+out:
+       pagevec_release(&pvec);
+       return found;
+}
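+
+/*
+ * For example (hypothetical scenario, 4K blocks assumed): after
+ * fallocate()ing 1MB and pwrite()ing one page at offset 4096, the whole
+ * extent is unwritten, but only that one page is uptodate in the page
+ * cache.  SEEK_DATA from 0 should then land on 4096 and SEEK_HOLE from
+ * 4096 on 8192, because the scan above classifies the unwritten range
+ * by its cached pages.
+ */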
+
+/*
+ * ext4_seek_data() retrieves the offset for SEEK_DATA.
+ */
+static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
+{
+       struct inode *inode = file->f_mapping->host;
+       struct ext4_map_blocks map;
+       struct extent_status es;
+       ext4_lblk_t start, last, end;
+       loff_t dataoff, isize;
+       int blkbits;
+       int ret = 0;
+
+       mutex_lock(&inode->i_mutex);
+
+       isize = i_size_read(inode);
+       if (offset >= isize) {
+               mutex_unlock(&inode->i_mutex);
+               return -ENXIO;
+       }
+
+       blkbits = inode->i_sb->s_blocksize_bits;
+       start = offset >> blkbits;
+       last = start;
+       end = isize >> blkbits;
+       dataoff = offset;
+
+       do {
+               map.m_lblk = last;
+               map.m_len = end - last + 1;
+               ret = ext4_map_blocks(NULL, inode, &map, 0);
+               if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
+                       if (last != start)
+                               dataoff = last << blkbits;
+                       break;
+               }
+
+               /*
+                * If there is a delayed extent at this offset,
+                * treat it as data.
+                */
+               es.start = last;
+               (void)ext4_es_find_extent(inode, &es);
+               if (last >= es.start &&
+                   last < es.start + es.len) {
+                       if (last != start)
+                               dataoff = last << blkbits;
+                       break;
+               }
+
+               /*
+                * If there is an unwritten extent at this offset,
+                * it is treated as data or as a hole depending on
+                * whether the page cache has data for it.
+                */
+               if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+                       int unwritten;
+                       unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
+                                                             &map, &dataoff);
+                       if (unwritten)
+                               break;
+               }
+
+               last++;
+               dataoff = last << blkbits;
+       } while (last <= end);
+
+       mutex_unlock(&inode->i_mutex);
+
+       if (dataoff > isize)
+               return -ENXIO;
+
+       if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
+               return -EINVAL;
+       if (dataoff > maxsize)
+               return -EINVAL;
+
+       if (dataoff != file->f_pos) {
+               file->f_pos = dataoff;
+               file->f_version = 0;
+       }
+
+       return dataoff;
+}
+
+/*
+ * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
+ */
+static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
+{
+       struct inode *inode = file->f_mapping->host;
+       struct ext4_map_blocks map;
+       struct extent_status es;
+       ext4_lblk_t start, last, end;
+       loff_t holeoff, isize;
+       int blkbits;
+       int ret = 0;
+
+       mutex_lock(&inode->i_mutex);
+
+       isize = i_size_read(inode);
+       if (offset >= isize) {
+               mutex_unlock(&inode->i_mutex);
+               return -ENXIO;
+       }
+
+       blkbits = inode->i_sb->s_blocksize_bits;
+       start = offset >> blkbits;
+       last = start;
+       end = isize >> blkbits;
+       holeoff = offset;
+
+       do {
+               map.m_lblk = last;
+               map.m_len = end - last + 1;
+               ret = ext4_map_blocks(NULL, inode, &map, 0);
+               if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
+                       last += ret;
+                       holeoff = last << blkbits;
+                       continue;
+               }
+
+               /*
+                * If there is a delayed extent at this offset,
+                * skip over it.
+                */
+               es.start = last;
+               (void)ext4_es_find_extent(inode, &es);
+               if (last >= es.start &&
+                   last < es.start + es.len) {
+                       last = es.start + es.len;
+                       holeoff = last << blkbits;
+                       continue;
+               }
+
+               /*
+                * If there is an unwritten extent at this offset,
+                * it is treated as data or as a hole depending on
+                * whether the page cache has data for it.
+                */
+               if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+                       int unwritten;
+                       unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
+                                                             &map, &holeoff);
+                       if (!unwritten) {
+                               last += ret;
+                               holeoff = last << blkbits;
+                               continue;
+                       }
+               }
+
+               /* find a hole */
+               break;
+       } while (last <= end);
+
+       mutex_unlock(&inode->i_mutex);
+
+       if (holeoff > isize)
+               holeoff = isize;
+
+       if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
+               return -EINVAL;
+       if (holeoff > maxsize)
+               return -EINVAL;
+
+       if (holeoff != file->f_pos) {
+               file->f_pos = holeoff;
+               file->f_version = 0;
+       }
+
+       return holeoff;
+}
+
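+/*
+ * A minimal userspace sketch of the interface wired up below.  The file
+ * name is hypothetical and error handling is omitted; on glibc,
+ * SEEK_DATA/SEEK_HOLE need _GNU_SOURCE, and lseek() returns -1 with
+ * errno set to ENXIO when no further data (or hole) exists:
+ *
+ *     #define _GNU_SOURCE
+ *     #include <fcntl.h>
+ *     #include <stdio.h>
+ *     #include <unistd.h>
+ *
+ *     int main(void)
+ *     {
+ *             int fd = open("sparse.bin", O_RDONLY);
+ *             off_t data = lseek(fd, 0, SEEK_DATA);
+ *             off_t hole = lseek(fd, data, SEEK_HOLE);
+ *
+ *             printf("data at %lld, hole at %lld\n",
+ *                    (long long)data, (long long)hole);
+ *             close(fd);
+ *             return 0;
+ *     }
+ */
+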
 /*
  * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
  * by calling generic_file_llseek_size() with the appropriate maxbytes
@@ -300,8 +619,19 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
        else
                maxbytes = inode->i_sb->s_maxbytes;
 
-       return generic_file_llseek_size(file, offset, origin,
-                                       maxbytes, i_size_read(inode));
+       switch (origin) {
+       case SEEK_SET:
+       case SEEK_CUR:
+       case SEEK_END:
+               return generic_file_llseek_size(file, offset, origin,
+                                               maxbytes, i_size_read(inode));
+       case SEEK_DATA:
+               return ext4_seek_data(file, offset, maxbytes);
+       case SEEK_HOLE:
+               return ext4_seek_hole(file, offset, maxbytes);
+       }
+
+       return -EINVAL;
 }
 
 const struct file_operations ext4_file_operations = {
@@ -326,12 +656,10 @@ const struct file_operations ext4_file_operations = {
 const struct inode_operations ext4_file_inode_operations = {
        .setattr        = ext4_setattr,
        .getattr        = ext4_getattr,
-#ifdef CONFIG_EXT4_FS_XATTR
        .setxattr       = generic_setxattr,
        .getxattr       = generic_getxattr,
        .listxattr      = ext4_listxattr,
        .removexattr    = generic_removexattr,
-#endif
        .get_acl        = ext4_get_acl,
        .fiemap         = ext4_fiemap,
 };
index be1d89f385b42e2268f879f8ee6f8478bb967081..dfbc1fe9667487518d965ecb361d1724f822f983 100644 (file)
@@ -44,7 +44,6 @@
  */
 static int ext4_sync_parent(struct inode *inode)
 {
-       struct writeback_control wbc;
        struct dentry *dentry = NULL;
        struct inode *next;
        int ret = 0;
@@ -66,10 +65,7 @@ static int ext4_sync_parent(struct inode *inode)
                ret = sync_mapping_buffers(inode->i_mapping);
                if (ret)
                        break;
-               memset(&wbc, 0, sizeof(wbc));
-               wbc.sync_mode = WB_SYNC_ALL;
-               wbc.nr_to_write = 0;         /* only write out the inode */
-               ret = sync_inode(inode, &wbc);
+               ret = sync_inode_metadata(inode, 1);
                if (ret)
                        break;
        }
index 3a100e7a62a8343d31912a423330e11ea40e2e2a..3f32c80124470e772c8f79dce042ccadde3bd66f 100644 (file)
@@ -762,7 +762,6 @@ got:
 
                BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
                err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
-               brelse(block_bitmap_bh);
 
                /* recheck and clear flag under lock if we still need to */
                ext4_lock_group(sb, group);
@@ -775,6 +774,7 @@ got:
                        ext4_group_desc_csum_set(sb, group, gdp);
                }
                ext4_unlock_group(sb, group);
+               brelse(block_bitmap_bh);
 
                if (err)
                        goto fail;
@@ -902,6 +902,10 @@ got:
 
        ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
 
+       ei->i_inline_off = 0;
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
+               ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+
        ret = inode;
        dquot_initialize(inode);
        err = dquot_alloc_inode(inode);
index 792e388e7b444bece02de64261be376227bd22bc..20862f96e8ae0e79dbf2a031990c68b315260a71 100644 (file)
@@ -22,6 +22,7 @@
 
 #include "ext4_jbd2.h"
 #include "truncate.h"
+#include "ext4_extents.h"      /* Needed for EXT_MAX_BLOCKS */
 
 #include <trace/events/ext4.h>
 
@@ -755,8 +756,7 @@ cleanup:
                partial--;
        }
 out:
-       trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
-                               map->m_pblk, map->m_len, err);
+       trace_ext4_ind_map_blocks_exit(inode, map, err);
        return err;
 }
 
@@ -1412,6 +1412,7 @@ void ext4_ind_truncate(struct inode *inode)
        down_write(&ei->i_data_sem);
 
        ext4_discard_preallocations(inode);
+       ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
 
        /*
         * The orphan list entry will now protect us from any crash which
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
new file mode 100644 (file)
index 0000000..387c47c
--- /dev/null
@@ -0,0 +1,1884 @@
+/*
+ * Copyright (c) 2012 Taobao.
+ * Written by Tao Ma <boyu.mt@taobao.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "xattr.h"
+#include "truncate.h"
+#include <linux/fiemap.h>
+
+#define EXT4_XATTR_SYSTEM_DATA "data"
+#define EXT4_MIN_INLINE_DATA_SIZE      ((sizeof(__le32) * EXT4_N_BLOCKS))
+#define EXT4_INLINE_DOTDOT_SIZE        4
+
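+/*
+ * Rough layout assumed throughout this file: inline data starts in the
+ * 60-byte i_block array (EXT4_MIN_INLINE_DATA_SIZE above) and anything
+ * beyond that spills into the value of the "system.data" in-inode
+ * xattr.  For inline dirs, the first 4 bytes of i_block hold only the
+ * parent inode number (EXT4_INLINE_DOTDOT_SIZE); "." and ".." are
+ * synthesized when the dir is read or converted to a real block.
+ */
+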
+int ext4_get_inline_size(struct inode *inode)
+{
+       if (EXT4_I(inode)->i_inline_off)
+               return EXT4_I(inode)->i_inline_size;
+
+       return 0;
+}
+
+static int get_max_inline_xattr_value_size(struct inode *inode,
+                                          struct ext4_iloc *iloc)
+{
+       struct ext4_xattr_ibody_header *header;
+       struct ext4_xattr_entry *entry;
+       struct ext4_inode *raw_inode;
+       int free, min_offs;
+
+       min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
+                       EXT4_GOOD_OLD_INODE_SIZE -
+                       EXT4_I(inode)->i_extra_isize -
+                       sizeof(struct ext4_xattr_ibody_header);
+
+       /*
+        * We need to subtract another sizeof(__u32) since an in-inode xattr
+        * needs an empty 4 bytes to indicate the gap between the xattr entry
+        * and the name/value pair.
+        */
+       if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
+               return EXT4_XATTR_SIZE(min_offs -
+                       EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) -
+                       EXT4_XATTR_ROUND - sizeof(__u32));
+
+       raw_inode = ext4_raw_inode(iloc);
+       header = IHDR(inode, raw_inode);
+       entry = IFIRST(header);
+
+       /* Compute min_offs. */
+       for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+               if (!entry->e_value_block && entry->e_value_size) {
+                       size_t offs = le16_to_cpu(entry->e_value_offs);
+                       if (offs < min_offs)
+                               min_offs = offs;
+               }
+       }
+       free = min_offs -
+               ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32);
+
+       if (EXT4_I(inode)->i_inline_off) {
+               entry = (struct ext4_xattr_entry *)
+                       ((void *)raw_inode + EXT4_I(inode)->i_inline_off);
+
+               free += le32_to_cpu(entry->e_value_size);
+               goto out;
+       }
+
+       free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA));
+
+       if (free > EXT4_XATTR_ROUND)
+               free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND);
+       else
+               free = 0;
+
+out:
+       return free;
+}
+
+/*
+ * Get the maximum size we can now store in an inode.
+ * If we can't find space for the xattr entry, don't use the extent
+ * space either, since without the entry we have no way to mark the
+ * data as inline.
+ */
+int ext4_get_max_inline_size(struct inode *inode)
+{
+       int error, max_inline_size;
+       struct ext4_iloc iloc;
+
+       if (EXT4_I(inode)->i_extra_isize == 0)
+               return 0;
+
+       error = ext4_get_inode_loc(inode, &iloc);
+       if (error) {
+               ext4_error_inode(inode, __func__, __LINE__, 0,
+                                "can't get inode location %lu",
+                                inode->i_ino);
+               return 0;
+       }
+
+       down_read(&EXT4_I(inode)->xattr_sem);
+       max_inline_size = get_max_inline_xattr_value_size(inode, &iloc);
+       up_read(&EXT4_I(inode)->xattr_sem);
+
+       brelse(iloc.bh);
+
+       if (!max_inline_size)
+               return 0;
+
+       return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE;
+}
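+
+/*
+ * Worked example (assuming a 256-byte inode, i_extra_isize of 32, no
+ * other in-inode xattrs, EXT4_XATTR_ROUND == 3 and a 16-byte
+ * struct ext4_xattr_entry): min_offs = 256 - 128 - 32 - 4 = 92 and the
+ * "system.data" entry itself costs EXT4_XATTR_LEN(4) = 20 bytes, so the
+ * value may hold EXT4_XATTR_SIZE(92 - 20 - 3 - 4) = 68 bytes, giving a
+ * maximum inline size of 68 + 60 = 128 bytes.
+ */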
+
+int ext4_has_inline_data(struct inode *inode)
+{
+       return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
+              EXT4_I(inode)->i_inline_off;
+}
+
+/*
+ * This function does not take xattr_sem, which is OK because it is
+ * currently only used in a code path coming from ext4_iget, before
+ * the new inode has been unlocked.
+ */
+int ext4_find_inline_data_nolock(struct inode *inode)
+{
+       struct ext4_xattr_ibody_find is = {
+               .s = { .not_found = -ENODATA, },
+       };
+       struct ext4_xattr_info i = {
+               .name_index = EXT4_XATTR_INDEX_SYSTEM,
+               .name = EXT4_XATTR_SYSTEM_DATA,
+       };
+       int error;
+
+       if (EXT4_I(inode)->i_extra_isize == 0)
+               return 0;
+
+       error = ext4_get_inode_loc(inode, &is.iloc);
+       if (error)
+               return error;
+
+       error = ext4_xattr_ibody_find(inode, &i, &is);
+       if (error)
+               goto out;
+
+       if (!is.s.not_found) {
+               EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+                                       (void *)ext4_raw_inode(&is.iloc));
+               EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
+                               le32_to_cpu(is.s.here->e_value_size);
+               ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+       }
+out:
+       brelse(is.iloc.bh);
+       return error;
+}
+
+static int ext4_read_inline_data(struct inode *inode, void *buffer,
+                                unsigned int len,
+                                struct ext4_iloc *iloc)
+{
+       struct ext4_xattr_entry *entry;
+       struct ext4_xattr_ibody_header *header;
+       int cp_len = 0;
+       struct ext4_inode *raw_inode;
+
+       if (!len)
+               return 0;
+
+       BUG_ON(len > EXT4_I(inode)->i_inline_size);
+
+       cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ?
+                       len : EXT4_MIN_INLINE_DATA_SIZE;
+
+       raw_inode = ext4_raw_inode(iloc);
+       memcpy(buffer, (void *)(raw_inode->i_block), cp_len);
+
+       len -= cp_len;
+       buffer += cp_len;
+
+       if (!len)
+               goto out;
+
+       header = IHDR(inode, raw_inode);
+       entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+                                           EXT4_I(inode)->i_inline_off);
+       len = min_t(unsigned int, len,
+                   (unsigned int)le32_to_cpu(entry->e_value_size));
+
+       memcpy(buffer,
+              (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len);
+       cp_len += len;
+
+out:
+       return cp_len;
+}
+
+/*
+ * Write the buffer to the inline inode.
+ * If 'create' is set, we don't need to do the extra copy in the xattr
+ * value since it is already handled by ext4_xattr_ibody_inline_set.
+ * That saves us one memcpy.
+ */
+void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
+                           void *buffer, loff_t pos, unsigned int len)
+{
+       struct ext4_xattr_entry *entry;
+       struct ext4_xattr_ibody_header *header;
+       struct ext4_inode *raw_inode;
+       int cp_len = 0;
+
+       BUG_ON(!EXT4_I(inode)->i_inline_off);
+       BUG_ON(pos + len > EXT4_I(inode)->i_inline_size);
+
+       raw_inode = ext4_raw_inode(iloc);
+       buffer += pos;
+
+       if (pos < EXT4_MIN_INLINE_DATA_SIZE) {
+               cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ?
+                        EXT4_MIN_INLINE_DATA_SIZE - pos : len;
+               memcpy((void *)raw_inode->i_block + pos, buffer, cp_len);
+
+               len -= cp_len;
+               buffer += cp_len;
+               pos += cp_len;
+       }
+
+       if (!len)
+               return;
+
+       pos -= EXT4_MIN_INLINE_DATA_SIZE;
+       header = IHDR(inode, raw_inode);
+       entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+                                           EXT4_I(inode)->i_inline_off);
+
+       memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
+              buffer, len);
+}
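+
+/*
+ * For example (sizes illustrative), a 20-byte write at pos 50 splits as
+ * above: bytes 50..59 land in the tail of i_block and the remaining 10
+ * bytes at the start of the xattr value.
+ */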
+
+static int ext4_create_inline_data(handle_t *handle,
+                                  struct inode *inode, unsigned len)
+{
+       int error;
+       void *value = NULL;
+       struct ext4_xattr_ibody_find is = {
+               .s = { .not_found = -ENODATA, },
+       };
+       struct ext4_xattr_info i = {
+               .name_index = EXT4_XATTR_INDEX_SYSTEM,
+               .name = EXT4_XATTR_SYSTEM_DATA,
+       };
+
+       error = ext4_get_inode_loc(inode, &is.iloc);
+       if (error)
+               return error;
+
+       error = ext4_journal_get_write_access(handle, is.iloc.bh);
+       if (error)
+               goto out;
+
+       if (len > EXT4_MIN_INLINE_DATA_SIZE) {
+               value = EXT4_ZERO_XATTR_VALUE;
+               len -= EXT4_MIN_INLINE_DATA_SIZE;
+       } else {
+               value = "";
+               len = 0;
+       }
+
+       /* Insert the xattr entry. */
+       i.value = value;
+       i.value_len = len;
+
+       error = ext4_xattr_ibody_find(inode, &i, &is);
+       if (error)
+               goto out;
+
+       BUG_ON(!is.s.not_found);
+
+       error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+       if (error) {
+               if (error == -ENOSPC)
+                       ext4_clear_inode_state(inode,
+                                              EXT4_STATE_MAY_INLINE_DATA);
+               goto out;
+       }
+
+       memset((void *)ext4_raw_inode(&is.iloc)->i_block,
+               0, EXT4_MIN_INLINE_DATA_SIZE);
+
+       EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+                                     (void *)ext4_raw_inode(&is.iloc));
+       EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
+       ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+       ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+       get_bh(is.iloc.bh);
+       error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+out:
+       brelse(is.iloc.bh);
+       return error;
+}
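+
+/*
+ * A note on the placeholder values above: value = "" with a zero length
+ * reserves just the xattr entry itself, while EXT4_ZERO_XATTR_VALUE is
+ * a sentinel asking the xattr code to zero-fill the value area rather
+ * than copy from a source buffer.
+ */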
+
+static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
+                                  unsigned int len)
+{
+       int error;
+       void *value = NULL;
+       struct ext4_xattr_ibody_find is = {
+               .s = { .not_found = -ENODATA, },
+       };
+       struct ext4_xattr_info i = {
+               .name_index = EXT4_XATTR_INDEX_SYSTEM,
+               .name = EXT4_XATTR_SYSTEM_DATA,
+       };
+
+       /* If the old space is ok, write the data directly. */
+       if (len <= EXT4_I(inode)->i_inline_size)
+               return 0;
+
+       error = ext4_get_inode_loc(inode, &is.iloc);
+       if (error)
+               return error;
+
+       error = ext4_xattr_ibody_find(inode, &i, &is);
+       if (error)
+               goto out;
+
+       BUG_ON(is.s.not_found);
+
+       len -= EXT4_MIN_INLINE_DATA_SIZE;
+       value = kzalloc(len, GFP_NOFS);
+       if (!value) {
+               error = -ENOMEM;
+               goto out;
+       }
+
+       error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
+                                    value, len);
+       if (error == -ENODATA)
+               goto out;
+
+       error = ext4_journal_get_write_access(handle, is.iloc.bh);
+       if (error)
+               goto out;
+
+       /* Update the xattr entry. */
+       i.value = value;
+       i.value_len = len;
+
+       error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+       if (error)
+               goto out;
+
+       EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+                                     (void *)ext4_raw_inode(&is.iloc));
+       EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
+                               le32_to_cpu(is.s.here->e_value_size);
+       ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+       get_bh(is.iloc.bh);
+       error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+out:
+       kfree(value);
+       brelse(is.iloc.bh);
+       return error;
+}
+
+int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
+                            unsigned int len)
+{
+       int ret, size;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+
+       if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
+               return -ENOSPC;
+
+       size = ext4_get_max_inline_size(inode);
+       if (size < len)
+               return -ENOSPC;
+
+       down_write(&EXT4_I(inode)->xattr_sem);
+
+       if (ei->i_inline_off)
+               ret = ext4_update_inline_data(handle, inode, len);
+       else
+               ret = ext4_create_inline_data(handle, inode, len);
+
+       up_write(&EXT4_I(inode)->xattr_sem);
+
+       return ret;
+}
+
+static int ext4_destroy_inline_data_nolock(handle_t *handle,
+                                          struct inode *inode)
+{
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       struct ext4_xattr_ibody_find is = {
+               .s = { .not_found = 0, },
+       };
+       struct ext4_xattr_info i = {
+               .name_index = EXT4_XATTR_INDEX_SYSTEM,
+               .name = EXT4_XATTR_SYSTEM_DATA,
+               .value = NULL,
+               .value_len = 0,
+       };
+       int error;
+
+       if (!ei->i_inline_off)
+               return 0;
+
+       error = ext4_get_inode_loc(inode, &is.iloc);
+       if (error)
+               return error;
+
+       error = ext4_xattr_ibody_find(inode, &i, &is);
+       if (error)
+               goto out;
+
+       error = ext4_journal_get_write_access(handle, is.iloc.bh);
+       if (error)
+               goto out;
+
+       error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+       if (error)
+               goto out;
+
+       memset((void *)ext4_raw_inode(&is.iloc)->i_block,
+               0, EXT4_MIN_INLINE_DATA_SIZE);
+
+       if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+                                     EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+               if (S_ISDIR(inode->i_mode) ||
+                   S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) {
+                       ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
+                       ext4_ext_tree_init(handle, inode);
+               }
+       }
+       ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+
+       get_bh(is.iloc.bh);
+       error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+       EXT4_I(inode)->i_inline_off = 0;
+       EXT4_I(inode)->i_inline_size = 0;
+       ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+out:
+       brelse(is.iloc.bh);
+       if (error == -ENODATA)
+               error = 0;
+       return error;
+}
+
+static int ext4_read_inline_page(struct inode *inode, struct page *page)
+{
+       void *kaddr;
+       int ret = 0;
+       size_t len;
+       struct ext4_iloc iloc;
+
+       BUG_ON(!PageLocked(page));
+       BUG_ON(!ext4_has_inline_data(inode));
+       BUG_ON(page->index);
+
+       if (!EXT4_I(inode)->i_inline_off) {
+               ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.",
+                            inode->i_ino);
+               goto out;
+       }
+
+       ret = ext4_get_inode_loc(inode, &iloc);
+       if (ret)
+               goto out;
+
+       len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode));
+       kaddr = kmap_atomic(page);
+       ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
+       flush_dcache_page(page);
+       kunmap_atomic(kaddr);
+       zero_user_segment(page, len, PAGE_CACHE_SIZE);
+       SetPageUptodate(page);
+       brelse(iloc.bh);
+
+out:
+       return ret;
+}
+
+int ext4_readpage_inline(struct inode *inode, struct page *page)
+{
+       int ret = 0;
+
+       down_read(&EXT4_I(inode)->xattr_sem);
+       if (!ext4_has_inline_data(inode)) {
+               up_read(&EXT4_I(inode)->xattr_sem);
+               return -EAGAIN;
+       }
+
+       /*
+        * Current inline data can only exist in the first page,
+        * so for all the other pages, just set them uptodate.
+        */
+       if (!page->index)
+               ret = ext4_read_inline_page(inode, page);
+       else if (!PageUptodate(page)) {
+               zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+               SetPageUptodate(page);
+       }
+
+       up_read(&EXT4_I(inode)->xattr_sem);
+
+       unlock_page(page);
+       return ret >= 0 ? 0 : ret;
+}
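+
+/*
+ * A return of -EAGAIN above tells the caller that the inode has no
+ * inline data (or lost it in the meantime) and that the regular
+ * readpage path should be used instead.
+ */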
+
+static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
+                                             struct inode *inode,
+                                             unsigned flags)
+{
+       int ret, needed_blocks;
+       handle_t *handle = NULL;
+       int retries = 0, sem_held = 0;
+       struct page *page = NULL;
+       unsigned from, to;
+       struct ext4_iloc iloc;
+
+       if (!ext4_has_inline_data(inode)) {
+               /*
+                * clear the flag so that no new write
+                * will trap here again.
+                */
+               ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+               return 0;
+       }
+
+       needed_blocks = ext4_writepage_trans_blocks(inode);
+
+       ret = ext4_get_inode_loc(inode, &iloc);
+       if (ret)
+               return ret;
+
+retry:
+       handle = ext4_journal_start(inode, needed_blocks);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               handle = NULL;
+               goto out;
+       }
+
+       /* We cannot recurse into the filesystem as the transaction is already
+        * started */
+       flags |= AOP_FLAG_NOFS;
+
+       page = grab_cache_page_write_begin(mapping, 0, flags);
+       if (!page) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       down_write(&EXT4_I(inode)->xattr_sem);
+       sem_held = 1;
+       /* If someone has already done this for us, just exit. */
+       if (!ext4_has_inline_data(inode)) {
+               ret = 0;
+               goto out;
+       }
+
+       from = 0;
+       to = ext4_get_inline_size(inode);
+       if (!PageUptodate(page)) {
+               ret = ext4_read_inline_page(inode, page);
+               if (ret < 0)
+                       goto out;
+       }
+
+       ret = ext4_destroy_inline_data_nolock(handle, inode);
+       if (ret)
+               goto out;
+
+       if (ext4_should_dioread_nolock(inode))
+               ret = __block_write_begin(page, from, to, ext4_get_block_write);
+       else
+               ret = __block_write_begin(page, from, to, ext4_get_block);
+
+       if (!ret && ext4_should_journal_data(inode)) {
+               ret = ext4_walk_page_buffers(handle, page_buffers(page),
+                                            from, to, NULL,
+                                            do_journal_get_write_access);
+       }
+
+       if (ret) {
+               unlock_page(page);
+               page_cache_release(page);
+               ext4_orphan_add(handle, inode);
+               up_write(&EXT4_I(inode)->xattr_sem);
+               sem_held = 0;
+               ext4_journal_stop(handle);
+               handle = NULL;
+               ext4_truncate_failed_write(inode);
+               /*
+                * If truncate failed early the inode might
+                * still be on the orphan list; we need to
+                * make sure the inode is removed from the
+                * orphan list in that case.
+                */
+               if (inode->i_nlink)
+                       ext4_orphan_del(NULL, inode);
+       }
+
+       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+               goto retry;
+
+       block_commit_write(page, from, to);
+out:
+       if (page) {
+               unlock_page(page);
+               page_cache_release(page);
+       }
+       if (sem_held)
+               up_write(&EXT4_I(inode)->xattr_sem);
+       if (handle)
+               ext4_journal_stop(handle);
+       brelse(iloc.bh);
+       return ret;
+}
+
+/*
+ * Try to write the data in the inode.
+ * If the inode has inline data, check whether the new write can fit in
+ * the inode as well.  If not, create a page and a handle, move the data
+ * to the page, mark it uptodate, and let the later code create an
+ * extent for it.  Returns 1 when the write will be served inline (with
+ * a locked page in *pagep and the handle left running), 0 when the
+ * caller should fall back to the normal write path, and a negative
+ * errno on failure.
+ */
+int ext4_try_to_write_inline_data(struct address_space *mapping,
+                                 struct inode *inode,
+                                 loff_t pos, unsigned len,
+                                 unsigned flags,
+                                 struct page **pagep)
+{
+       int ret;
+       handle_t *handle;
+       struct page *page;
+       struct ext4_iloc iloc;
+
+       if (pos + len > ext4_get_max_inline_size(inode))
+               goto convert;
+
+       ret = ext4_get_inode_loc(inode, &iloc);
+       if (ret)
+               return ret;
+
+       /*
+        * The write may fit in the inode,
+        * so try to reserve the space in the inode first.
+        */
+       handle = ext4_journal_start(inode, 1);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               handle = NULL;
+               goto out;
+       }
+
+       ret = ext4_prepare_inline_data(handle, inode, pos + len);
+       if (ret && ret != -ENOSPC)
+               goto out;
+
+       /* We don't have space in the inline inode, so convert it to an extent. */
+       if (ret == -ENOSPC) {
+               ext4_journal_stop(handle);
+               brelse(iloc.bh);
+               goto convert;
+       }
+
+       flags |= AOP_FLAG_NOFS;
+
+       page = grab_cache_page_write_begin(mapping, 0, flags);
+       if (!page) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       *pagep = page;
+       down_read(&EXT4_I(inode)->xattr_sem);
+       if (!ext4_has_inline_data(inode)) {
+               ret = 0;
+               unlock_page(page);
+               page_cache_release(page);
+               goto out_up_read;
+       }
+
+       if (!PageUptodate(page)) {
+               ret = ext4_read_inline_page(inode, page);
+               if (ret < 0)
+                       goto out_up_read;
+       }
+
+       ret = 1;
+       handle = NULL;
+out_up_read:
+       up_read(&EXT4_I(inode)->xattr_sem);
+out:
+       if (handle)
+               ext4_journal_stop(handle);
+       brelse(iloc.bh);
+       return ret;
+convert:
+       return ext4_convert_inline_data_to_extent(mapping,
+                                                 inode, flags);
+}
+
+int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
+                              unsigned copied, struct page *page)
+{
+       int ret;
+       void *kaddr;
+       struct ext4_iloc iloc;
+
+       if (unlikely(copied < len)) {
+               if (!PageUptodate(page)) {
+                       copied = 0;
+                       goto out;
+               }
+       }
+
+       ret = ext4_get_inode_loc(inode, &iloc);
+       if (ret) {
+               ext4_std_error(inode->i_sb, ret);
+               copied = 0;
+               goto out;
+       }
+
+       down_write(&EXT4_I(inode)->xattr_sem);
+       BUG_ON(!ext4_has_inline_data(inode));
+
+       kaddr = kmap_atomic(page);
+       ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
+       kunmap_atomic(kaddr);
+       SetPageUptodate(page);
+       /* Clear the dirty flag so that writepages won't write the page out for us. */
+       ClearPageDirty(page);
+
+       up_write(&EXT4_I(inode)->xattr_sem);
+       brelse(iloc.bh);
+out:
+       return copied;
+}
+
+struct buffer_head *
+ext4_journalled_write_inline_data(struct inode *inode,
+                                 unsigned len,
+                                 struct page *page)
+{
+       int ret;
+       void *kaddr;
+       struct ext4_iloc iloc;
+
+       ret = ext4_get_inode_loc(inode, &iloc);
+       if (ret) {
+               ext4_std_error(inode->i_sb, ret);
+               return NULL;
+       }
+
+       down_write(&EXT4_I(inode)->xattr_sem);
+       kaddr = kmap_atomic(page);
+       ext4_write_inline_data(inode, &iloc, kaddr, 0, len);
+       kunmap_atomic(kaddr);
+       up_write(&EXT4_I(inode)->xattr_sem);
+
+       return iloc.bh;
+}
+
+/*
+ * Try to make the page cache and handle ready for the inline data case.
+ * We can call this function in 2 cases:
+ * 1. The inode is created and the first write exceeds inline size.  We can
+ *    clear the inode state safely.
+ * 2. The inode has inline data, then we need to read the data, make it
+ *    uptodate and dirty so that ext4_da_writepages can handle it.  We don't
+ *    need to start the journal since the file's metadata isn't changed now.
+ */
+static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
+                                                struct inode *inode,
+                                                unsigned flags,
+                                                void **fsdata)
+{
+       int ret = 0, inline_size;
+       struct page *page;
+
+       page = grab_cache_page_write_begin(mapping, 0, flags);
+       if (!page)
+               return -ENOMEM;
+
+       down_read(&EXT4_I(inode)->xattr_sem);
+       if (!ext4_has_inline_data(inode)) {
+               ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+               goto out;
+       }
+
+       inline_size = ext4_get_inline_size(inode);
+
+       if (!PageUptodate(page)) {
+               ret = ext4_read_inline_page(inode, page);
+               if (ret < 0)
+                       goto out;
+       }
+
+       ret = __block_write_begin(page, 0, inline_size,
+                                 ext4_da_get_block_prep);
+       if (ret) {
+               ext4_truncate_failed_write(inode);
+               goto out;
+       }
+
+       SetPageDirty(page);
+       SetPageUptodate(page);
+       ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+       *fsdata = (void *)CONVERT_INLINE_DATA;
+
+out:
+       up_read(&EXT4_I(inode)->xattr_sem);
+       if (page) {
+               unlock_page(page);
+               page_cache_release(page);
+       }
+       return ret;
+}
+
+/*
+ * Prepare the write for the inline data.
+ * If the data can be written into the inode, we just read the page and
+ * make it uptodate, and start the journal.
+ * Otherwise read the page and mark it dirty so that it can be handled
+ * in writepages (the i_disksize update is left to the normal
+ * ext4_da_write_end).
+ */
+int ext4_da_write_inline_data_begin(struct address_space *mapping,
+                                   struct inode *inode,
+                                   loff_t pos, unsigned len,
+                                   unsigned flags,
+                                   struct page **pagep,
+                                   void **fsdata)
+{
+       int ret, inline_size;
+       handle_t *handle;
+       struct page *page;
+       struct ext4_iloc iloc;
+
+       ret = ext4_get_inode_loc(inode, &iloc);
+       if (ret)
+               return ret;
+
+       handle = ext4_journal_start(inode, 1);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               handle = NULL;
+               goto out;
+       }
+
+       inline_size = ext4_get_max_inline_size(inode);
+
+       ret = -ENOSPC;
+       if (inline_size >= pos + len) {
+               ret = ext4_prepare_inline_data(handle, inode, pos + len);
+               if (ret && ret != -ENOSPC)
+                       goto out;
+       }
+
+       if (ret == -ENOSPC) {
+               ret = ext4_da_convert_inline_data_to_extent(mapping,
+                                                           inode,
+                                                           flags,
+                                                           fsdata);
+               goto out;
+       }
+
+       /*
+        * We cannot recurse into the filesystem as the transaction
+        * is already started.
+        */
+       flags |= AOP_FLAG_NOFS;
+
+       page = grab_cache_page_write_begin(mapping, 0, flags);
+       if (!page) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       down_read(&EXT4_I(inode)->xattr_sem);
+       if (!ext4_has_inline_data(inode)) {
+               ret = 0;
+               goto out_release_page;
+       }
+
+       if (!PageUptodate(page)) {
+               ret = ext4_read_inline_page(inode, page);
+               if (ret < 0)
+                       goto out_release_page;
+       }
+
+       up_read(&EXT4_I(inode)->xattr_sem);
+       *pagep = page;
+       handle = NULL;
+       brelse(iloc.bh);
+       return 1;
+out_release_page:
+       up_read(&EXT4_I(inode)->xattr_sem);
+       unlock_page(page);
+       page_cache_release(page);
+out:
+       if (handle)
+               ext4_journal_stop(handle);
+       brelse(iloc.bh);
+       return ret;
+}
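+
+/*
+ * To summarize the contract above: a return of 1 hands back a locked,
+ * uptodate page in *pagep with the transaction handle left running;
+ * 0 means the inline data was converted (or is gone) and the caller
+ * should fall back to the normal delalloc write path; negative is an
+ * error.
+ */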
+
+int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+                                 unsigned len, unsigned copied,
+                                 struct page *page)
+{
+       int i_size_changed = 0;
+
+       copied = ext4_write_inline_data_end(inode, pos, len, copied, page);
+
+       /*
+        * No need to use i_size_read() here, the i_size
+        * cannot change under us because we hold i_mutex.
+        *
+        * But it's important to update i_size while still holding page lock:
+        * page writeout could otherwise come in and zero beyond i_size.
+        */
+       if (pos+copied > inode->i_size) {
+               i_size_write(inode, pos+copied);
+               i_size_changed = 1;
+       }
+       unlock_page(page);
+       page_cache_release(page);
+
+       /*
+        * Don't mark the inode dirty under page lock. First, it unnecessarily
+        * makes the holding time of page lock longer. Second, it forces lock
+        * ordering of page lock and transaction start for journaling
+        * filesystems.
+        */
+       if (i_size_changed)
+               mark_inode_dirty(inode);
+
+       return copied;
+}
+
+#ifdef INLINE_DIR_DEBUG
+void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
+                         void *inline_start, int inline_size)
+{
+       int offset;
+       unsigned short de_len;
+       struct ext4_dir_entry_2 *de = inline_start;
+       void *dlimit = inline_start + inline_size;
+
+       trace_printk("inode %lu\n", dir->i_ino);
+       offset = 0;
+       while ((void *)de < dlimit) {
+               de_len = ext4_rec_len_from_disk(de->rec_len, inline_size);
+               trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n",
+                            offset, de_len, de->name_len, de->name,
+                            de->name_len, le32_to_cpu(de->inode));
+               if (ext4_check_dir_entry(dir, NULL, de, bh,
+                                        inline_start, inline_size, offset))
+                       BUG();
+
+               offset += de_len;
+               de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
+       }
+}
+#else
+#define ext4_show_inline_dir(dir, bh, inline_start, inline_size)
+#endif
+
+/*
+ * Add a new entry into an inline dir.
+ * It will return -ENOSPC if no space is available, -EIO if the
+ * directory is corrupted, and -EEXIST if the directory entry
+ * already exists.
+ */
+static int ext4_add_dirent_to_inline(handle_t *handle,
+                                    struct dentry *dentry,
+                                    struct inode *inode,
+                                    struct ext4_iloc *iloc,
+                                    void *inline_start, int inline_size)
+{
+       struct inode    *dir = dentry->d_parent->d_inode;
+       const char      *name = dentry->d_name.name;
+       int             namelen = dentry->d_name.len;
+       unsigned short  reclen;
+       int             err;
+       struct ext4_dir_entry_2 *de;
+
+       reclen = EXT4_DIR_REC_LEN(namelen);
+       err = ext4_find_dest_de(dir, inode, iloc->bh,
+                               inline_start, inline_size,
+                               name, namelen, &de);
+       if (err)
+               return err;
+
+       err = ext4_journal_get_write_access(handle, iloc->bh);
+       if (err)
+               return err;
+       ext4_insert_dentry(inode, de, inline_size, name, namelen);
+
+       ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
+
+       /*
+        * XXX shouldn't update any times until successful
+        * completion of syscall, but too many callers depend
+        * on this.
+        *
+        * XXX similarly, too many callers depend on
+        * ext4_new_inode() setting the times, but error
+        * recovery deletes the inode, so the worst that can
+        * happen is that the times are slightly out of date
+        * and/or different from the directory change time.
+        */
+       dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+       ext4_update_dx_flag(dir);
+       dir->i_version++;
+       ext4_mark_inode_dirty(handle, dir);
+       return 1;
+}
+
+static void *ext4_get_inline_xattr_pos(struct inode *inode,
+                                      struct ext4_iloc *iloc)
+{
+       struct ext4_xattr_entry *entry;
+       struct ext4_xattr_ibody_header *header;
+
+       BUG_ON(!EXT4_I(inode)->i_inline_off);
+
+       header = IHDR(inode, ext4_raw_inode(iloc));
+       entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
+                                           EXT4_I(inode)->i_inline_off);
+
+       return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs);
+}
+
+/* Set the final de to cover the whole block. */
+static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
+{
+       struct ext4_dir_entry_2 *de, *prev_de;
+       void *limit;
+       int de_len;
+
+       de = (struct ext4_dir_entry_2 *)de_buf;
+       if (old_size) {
+               limit = de_buf + old_size;
+               do {
+                       prev_de = de;
+                       de_len = ext4_rec_len_from_disk(de->rec_len, old_size);
+                       de_buf += de_len;
+                       de = (struct ext4_dir_entry_2 *)de_buf;
+               } while (de_buf < limit);
+
+               prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size -
+                                                       old_size, new_size);
+       } else {
+               /* This area was just created, so add an empty entry covering it. */
+               de->inode = 0;
+               de->rec_len = ext4_rec_len_to_disk(new_size, new_size);
+       }
+}
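+
+/*
+ * For example (sizes illustrative), growing a 60-byte inline dir to 100
+ * bytes extends the last entry's rec_len by 40, so the entries once
+ * again cover the whole area as the dir entry format requires.
+ */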
+
+static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
+                                 struct ext4_iloc *iloc)
+{
+       int ret;
+       int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
+       int new_size = get_max_inline_xattr_value_size(dir, iloc);
+
+       if (new_size - old_size <= EXT4_DIR_REC_LEN(1))
+               return -ENOSPC;
+
+       ret = ext4_update_inline_data(handle, dir,
+                                     new_size + EXT4_MIN_INLINE_DATA_SIZE);
+       if (ret)
+               return ret;
+
+       ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size,
+                            EXT4_I(dir)->i_inline_size -
+                                               EXT4_MIN_INLINE_DATA_SIZE);
+       dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size;
+       return 0;
+}
+
+static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
+                                    struct ext4_iloc *iloc,
+                                    void *buf, int inline_size)
+{
+       ext4_create_inline_data(handle, inode, inline_size);
+       ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
+       ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+}
+
+static int ext4_finish_convert_inline_dir(handle_t *handle,
+                                         struct inode *inode,
+                                         struct buffer_head *dir_block,
+                                         void *buf,
+                                         int inline_size)
+{
+       int err, csum_size = 0, header_size = 0;
+       struct ext4_dir_entry_2 *de;
+       struct ext4_dir_entry_tail *t;
+       void *target = dir_block->b_data;
+
+       /*
+        * First create "." and ".." and then copy the dir information
+        * back to the block.
+        */
+       de = (struct ext4_dir_entry_2 *)target;
+       de = ext4_init_dot_dotdot(inode, de,
+               inode->i_sb->s_blocksize, csum_size,
+               le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
+       header_size = (void *)de - target;
+
+       memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
+               inline_size - EXT4_INLINE_DOTDOT_SIZE);
+
+       if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               csum_size = sizeof(struct ext4_dir_entry_tail);
+
+       i_size_write(inode, inode->i_sb->s_blocksize);
+       EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+       ext4_update_final_de(dir_block->b_data,
+                       inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
+                       inode->i_sb->s_blocksize - csum_size);
+
+       if (csum_size) {
+               t = EXT4_DIRENT_TAIL(dir_block->b_data,
+                                    inode->i_sb->s_blocksize);
+               initialize_dirent_tail(t, inode->i_sb->s_blocksize);
+       }
+       set_buffer_uptodate(dir_block);
+       err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+       if (err)
+               goto out;
+       set_buffer_verified(dir_block);
+out:
+       return err;
+}
+
+static int ext4_convert_inline_data_nolock(handle_t *handle,
+                                          struct inode *inode,
+                                          struct ext4_iloc *iloc)
+{
+       int error;
+       void *buf = NULL;
+       struct buffer_head *data_bh = NULL;
+       struct ext4_map_blocks map;
+       int inline_size;
+
+       inline_size = ext4_get_inline_size(inode);
+       buf = kmalloc(inline_size, GFP_NOFS);
+       if (!buf) {
+               error = -ENOMEM;
+               goto out;
+       }
+
+       error = ext4_read_inline_data(inode, buf, inline_size, iloc);
+       if (error < 0)
+               goto out;
+
+       error = ext4_destroy_inline_data_nolock(handle, inode);
+       if (error)
+               goto out;
+
+       map.m_lblk = 0;
+       map.m_len = 1;
+       map.m_flags = 0;
+       error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
+       if (error < 0)
+               goto out_restore;
+       if (!(map.m_flags & EXT4_MAP_MAPPED)) {
+               error = -EIO;
+               goto out_restore;
+       }
+
+       data_bh = sb_getblk(inode->i_sb, map.m_pblk);
+       if (!data_bh) {
+               error = -EIO;
+               goto out_restore;
+       }
+
+       lock_buffer(data_bh);
+       error = ext4_journal_get_create_access(handle, data_bh);
+       if (error) {
+               unlock_buffer(data_bh);
+               error = -EIO;
+               goto out_restore;
+       }
+       memset(data_bh->b_data, 0, inode->i_sb->s_blocksize);
+
+       if (!S_ISDIR(inode->i_mode)) {
+               memcpy(data_bh->b_data, buf, inline_size);
+               set_buffer_uptodate(data_bh);
+               error = ext4_handle_dirty_metadata(handle,
+                                                  inode, data_bh);
+       } else {
+               error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
+                                                      buf, inline_size);
+       }
+
+       unlock_buffer(data_bh);
+out_restore:
+       if (error)
+               ext4_restore_inline_data(handle, inode, iloc, buf, inline_size);
+
+out:
+       brelse(data_bh);
+       kfree(buf);
+       return error;
+}
+
+/*
+ * Try to add the new entry to the inline data.
+ * Returns 1 if the entry was added inline, 0 if the inline space is
+ * exhausted and the dir was copied out to a newly created block (the
+ * caller then adds the entry there), or a negative errno on failure.
+ */
+int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+                             struct inode *inode)
+{
+       int ret, inline_size;
+       void *inline_start;
+       struct ext4_iloc iloc;
+       struct inode *dir = dentry->d_parent->d_inode;
+
+       ret = ext4_get_inode_loc(dir, &iloc);
+       if (ret)
+               return ret;
+
+       down_write(&EXT4_I(dir)->xattr_sem);
+       if (!ext4_has_inline_data(dir))
+               goto out;
+
+       inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+                                                EXT4_INLINE_DOTDOT_SIZE;
+       inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
+
+       ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
+                                       inline_start, inline_size);
+       if (ret != -ENOSPC)
+               goto out;
+
+       /* Check whether it can be inserted into the inline xattr space. */
+       inline_size = EXT4_I(dir)->i_inline_size -
+                       EXT4_MIN_INLINE_DATA_SIZE;
+       if (!inline_size) {
+               /* Try to grow into the xattr space. */
+               ret = ext4_update_inline_dir(handle, dir, &iloc);
+               if (ret && ret != -ENOSPC)
+                       goto out;
+
+               inline_size = EXT4_I(dir)->i_inline_size -
+                               EXT4_MIN_INLINE_DATA_SIZE;
+       }
+
+       if (inline_size) {
+               inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+
+               ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
+                                               inline_start, inline_size);
+
+               if (ret != -ENOSPC)
+                       goto out;
+       }
+
+       /*
+        * The inline space is filled up, so create a new block for it.
+        * As the extent tree will be created, we have to save the inline
+        * dir first.
+        */
+       ret = ext4_convert_inline_data_nolock(handle, dir, &iloc);
+
+out:
+       ext4_mark_inode_dirty(handle, dir);
+       up_write(&EXT4_I(dir)->xattr_sem);
+       brelse(iloc.bh);
+       return ret;
+}
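+
+/*
+ * In short, the fallback order above is: the tail of i_block, then the
+ * "system.data" xattr value (growing it if possible), and finally
+ * conversion of the whole inline dir into a regular directory block.
+ */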
+
+int ext4_read_inline_dir(struct file *filp,
+                        void *dirent, filldir_t filldir,
+                        int *has_inline_data)
+{
+       int error = 0;
+       unsigned int offset, parent_ino;
+       int i, stored;
+       struct ext4_dir_entry_2 *de;
+       struct super_block *sb;
+       struct inode *inode = filp->f_path.dentry->d_inode;
+       int ret, inline_size = 0;
+       struct ext4_iloc iloc;
+       void *dir_buf = NULL;
+
+       ret = ext4_get_inode_loc(inode, &iloc);
+       if (ret)
+               return ret;
+
+       down_read(&EXT4_I(inode)->xattr_sem);
+       if (!ext4_has_inline_data(inode)) {
+               up_read(&EXT4_I(inode)->xattr_sem);
+               *has_inline_data = 0;
+               goto out;
+       }
+
+       inline_size = ext4_get_inline_size(inode);
+       dir_buf = kmalloc(inline_size, GFP_NOFS);
+       if (!dir_buf) {
+               ret = -ENOMEM;
+               up_read(&EXT4_I(inode)->xattr_sem);
+               goto out;
+       }
+
+       ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+       up_read(&EXT4_I(inode)->xattr_sem);
+       if (ret < 0)
+               goto out;
+       ret = 0;
+
+       sb = inode->i_sb;
+       stored = 0;
+       offset = filp->f_pos;
+       parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+
+       while (!error && !stored && filp->f_pos < inode->i_size) {
+revalidate:
+               /*
+                * If the version has changed since the last call to
+                * readdir(2), then we might be pointing to an invalid
+                * dirent right now.  Scan from the start of the inline
+                * dir to make sure.
+                */
+               if (filp->f_version != inode->i_version) {
+                       for (i = 0;
+                            i < inode->i_size && i < offset;) {
+                               if (!i) {
+                                       /* skip "." and ".." if needed. */
+                                       i += EXT4_INLINE_DOTDOT_SIZE;
+                                       continue;
+                               }
+                               de = (struct ext4_dir_entry_2 *)
+                                       (dir_buf + i);
+                               /* It's too expensive to do a full
+                                * dirent test each time round this
+                                * loop, but we do have to test at
+                                * least that it is non-zero.  A
+                                * failure will be detected in the
+                                * dirent test below. */
+                               if (ext4_rec_len_from_disk(de->rec_len,
+                                       inline_size) < EXT4_DIR_REC_LEN(1))
+                                       break;
+                               i += ext4_rec_len_from_disk(de->rec_len,
+                                                           inline_size);
+                       }
+                       offset = i;
+                       filp->f_pos = offset;
+                       filp->f_version = inode->i_version;
+               }
+
+               while (!error && filp->f_pos < inode->i_size) {
+                       if (filp->f_pos == 0) {
+                               error = filldir(dirent, ".", 1, 0, inode->i_ino,
+                                               DT_DIR);
+                               if (error)
+                                       break;
+                               stored++;
+
+                               error = filldir(dirent, "..", 2, 0, parent_ino,
+                                               DT_DIR);
+                               if (error)
+                                       break;
+                               stored++;
+
+                               filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE;
+                               continue;
+                       }
+
+                       de = (struct ext4_dir_entry_2 *)(dir_buf + offset);
+                       if (ext4_check_dir_entry(inode, filp, de,
+                                                iloc.bh, dir_buf,
+                                                inline_size, offset)) {
+                               ret = stored;
+                               goto out;
+                       }
+                       offset += ext4_rec_len_from_disk(de->rec_len,
+                                                        inline_size);
+                       if (le32_to_cpu(de->inode)) {
+                               /* We might block in the next section
+                                * if the data destination is
+                                * currently swapped out.  So, use a
+                                * version stamp to detect whether or
+                                * not the directory has been modified
+                                * during the copy operation.
+                                */
+                               u64 version = filp->f_version;
+
+                               error = filldir(dirent, de->name,
+                                               de->name_len,
+                                               filp->f_pos,
+                                               le32_to_cpu(de->inode),
+                                               get_dtype(sb, de->file_type));
+                               if (error)
+                                       break;
+                               if (version != filp->f_version)
+                                       goto revalidate;
+                               stored++;
+                       }
+                       filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+                                                             inline_size);
+               }
+               offset = 0;
+       }
+out:
+       kfree(dir_buf);
+       brelse(iloc.bh);
+       return ret;
+}
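+
+/*
+ * Editor's illustrative sketch (not part of this patch; helper name
+ * hypothetical): the revalidate loop above replays rec_len links from
+ * the start of the inline area until it reaches the stale offset, and
+ * resumes from the last dirent boundary it saw.  A standalone form of
+ * that replay:
+ */
+static unsigned int __maybe_unused
+ext4_inline_dir_clamp_offset(void *dir_buf, int inline_size,
+                            unsigned int stale_off, loff_t size)
+{
+       /* Skip the bytes reserved for the "." and ".." information. */
+       unsigned int i = EXT4_INLINE_DOTDOT_SIZE;
+       struct ext4_dir_entry_2 *de;
+
+       while (i < size && i < stale_off) {
+               de = (struct ext4_dir_entry_2 *)(dir_buf + i);
+               /* Stop on an impossibly short record; it is corrupt. */
+               if (ext4_rec_len_from_disk(de->rec_len, inline_size) <
+                   EXT4_DIR_REC_LEN(1))
+                       break;
+               i += ext4_rec_len_from_disk(de->rec_len, inline_size);
+       }
+       return i;       /* a dirent boundary no greater than stale_off */
+}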
+
+struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
+                                       struct ext4_dir_entry_2 **parent_de,
+                                       int *retval)
+{
+       struct ext4_iloc iloc;
+
+       *retval = ext4_get_inode_loc(inode, &iloc);
+       if (*retval)
+               return NULL;
+
+       *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+
+       return iloc.bh;
+}
+
+/*
+ * Try to create the inline data for the new dir.
+ * If it succeeds, return 0; otherwise return the error.
+ * On ENOSPC, the caller should fall back to creating a dir with the
+ * normal disk layout.
+ */
+int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent,
+                              struct inode *inode)
+{
+       int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE;
+       struct ext4_iloc iloc;
+       struct ext4_dir_entry_2 *de;
+
+       ret = ext4_get_inode_loc(inode, &iloc);
+       if (ret)
+               return ret;
+
+       ret = ext4_prepare_inline_data(handle, inode, inline_size);
+       if (ret)
+               goto out;
+
+       /*
+        * For an inline dir, we only save the inode number of ".."
+        * and create a fake dentry to cover the remaining space.
+        */
+       de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+       de->inode = cpu_to_le32(parent->i_ino);
+       de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE);
+       de->inode = 0;
+       de->rec_len = ext4_rec_len_to_disk(
+                               inline_size - EXT4_INLINE_DOTDOT_SIZE,
+                               inline_size);
+       set_nlink(inode, 2);
+       inode->i_size = EXT4_I(inode)->i_disksize = inline_size;
+out:
+       brelse(iloc.bh);
+       return ret;
+}
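+
+/*
+ * Editor's illustrative sketch (not part of this patch; helper name
+ * hypothetical): right after ext4_try_create_inline_dir(), the
+ * (usually 60-byte) i_block area holds the ".." inode number in its
+ * first EXT4_INLINE_DOTDOT_SIZE bytes, followed by one empty dentry
+ * spanning the rest.  A check of that invariant could look like:
+ */
+static int __maybe_unused
+ext4_inline_dir_is_pristine(struct ext4_dir_entry_2 *dotdot,
+                           int inline_size)
+{
+       struct ext4_dir_entry_2 *fake = (struct ext4_dir_entry_2 *)
+                       ((void *)dotdot + EXT4_INLINE_DOTDOT_SIZE);
+
+       /* ".." must be set; the fake dentry must be free and must
+        * cover everything after the dotdot area. */
+       return le32_to_cpu(dotdot->inode) != 0 && fake->inode == 0 &&
+               ext4_rec_len_from_disk(fake->rec_len, inline_size) ==
+                       inline_size - EXT4_INLINE_DOTDOT_SIZE;
+}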
+
+struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+                                       const struct qstr *d_name,
+                                       struct ext4_dir_entry_2 **res_dir,
+                                       int *has_inline_data)
+{
+       int ret;
+       struct ext4_iloc iloc;
+       void *inline_start;
+       int inline_size;
+
+       if (ext4_get_inode_loc(dir, &iloc))
+               return NULL;
+
+       down_read(&EXT4_I(dir)->xattr_sem);
+       if (!ext4_has_inline_data(dir)) {
+               *has_inline_data = 0;
+               goto out;
+       }
+
+       inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+                                               EXT4_INLINE_DOTDOT_SIZE;
+       inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
+       ret = search_dir(iloc.bh, inline_start, inline_size,
+                        dir, d_name, 0, res_dir);
+       if (ret == 1)
+               goto out_find;
+       if (ret < 0)
+               goto out;
+
+       if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE)
+               goto out;
+
+       inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+       inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;
+
+       ret = search_dir(iloc.bh, inline_start, inline_size,
+                        dir, d_name, 0, res_dir);
+       if (ret == 1)
+               goto out_find;
+
+out:
+       brelse(iloc.bh);
+       iloc.bh = NULL;
+out_find:
+       up_read(&EXT4_I(dir)->xattr_sem);
+       return iloc.bh;
+}
+
+int ext4_delete_inline_entry(handle_t *handle,
+                            struct inode *dir,
+                            struct ext4_dir_entry_2 *de_del,
+                            struct buffer_head *bh,
+                            int *has_inline_data)
+{
+       int err, inline_size;
+       struct ext4_iloc iloc;
+       void *inline_start;
+
+       err = ext4_get_inode_loc(dir, &iloc);
+       if (err)
+               return err;
+
+       down_write(&EXT4_I(dir)->xattr_sem);
+       if (!ext4_has_inline_data(dir)) {
+               *has_inline_data = 0;
+               goto out;
+       }
+
+       if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) <
+               EXT4_MIN_INLINE_DATA_SIZE) {
+               inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+                                       EXT4_INLINE_DOTDOT_SIZE;
+               inline_size = EXT4_MIN_INLINE_DATA_SIZE -
+                               EXT4_INLINE_DOTDOT_SIZE;
+       } else {
+               inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+               inline_size = ext4_get_inline_size(dir) -
+                               EXT4_MIN_INLINE_DATA_SIZE;
+       }
+
+       err = ext4_journal_get_write_access(handle, bh);
+       if (err)
+               goto out;
+
+       err = ext4_generic_delete_entry(handle, dir, de_del, bh,
+                                       inline_start, inline_size, 0);
+       if (err)
+               goto out;
+
+       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+       err = ext4_mark_inode_dirty(handle, dir);
+       if (unlikely(err))
+               goto out;
+
+       ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size);
+out:
+       up_write(&EXT4_I(dir)->xattr_sem);
+       brelse(iloc.bh);
+       if (err != -ENOENT)
+               ext4_std_error(dir->i_sb, err);
+       return err;
+}
+
+/*
+ * Get the inline dentry at offset.
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_get_inline_entry(struct inode *inode,
+                     struct ext4_iloc *iloc,
+                     unsigned int offset,
+                     void **inline_start,
+                     int *inline_size)
+{
+       void *inline_pos;
+
+       BUG_ON(offset > ext4_get_inline_size(inode));
+
+       if (offset < EXT4_MIN_INLINE_DATA_SIZE) {
+               inline_pos = (void *)ext4_raw_inode(iloc)->i_block;
+               *inline_size = EXT4_MIN_INLINE_DATA_SIZE;
+       } else {
+               inline_pos = ext4_get_inline_xattr_pos(inode, iloc);
+               offset -= EXT4_MIN_INLINE_DATA_SIZE;
+               *inline_size = ext4_get_inline_size(inode) -
+                               EXT4_MIN_INLINE_DATA_SIZE;
+       }
+
+       if (inline_start)
+               *inline_start = inline_pos;
+       return (struct ext4_dir_entry_2 *)(inline_pos + offset);
+}
+
+int empty_inline_dir(struct inode *dir, int *has_inline_data)
+{
+       int err, inline_size;
+       struct ext4_iloc iloc;
+       void *inline_pos;
+       unsigned int offset;
+       struct ext4_dir_entry_2 *de;
+       int ret = 1;
+
+       err = ext4_get_inode_loc(dir, &iloc);
+       if (err) {
+               EXT4_ERROR_INODE(dir, "error %d getting inode %lu block",
+                                err, dir->i_ino);
+               return 1;
+       }
+
+       down_read(&EXT4_I(dir)->xattr_sem);
+       if (!ext4_has_inline_data(dir)) {
+               *has_inline_data = 0;
+               goto out;
+       }
+
+       de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+       if (!le32_to_cpu(de->inode)) {
+               ext4_warning(dir->i_sb,
+                            "bad inline directory (dir #%lu) - no `..'",
+                            dir->i_ino);
+               ret = 1;
+               goto out;
+       }
+
+       offset = EXT4_INLINE_DOTDOT_SIZE;
+       while (offset < dir->i_size) {
+               de = ext4_get_inline_entry(dir, &iloc, offset,
+                                          &inline_pos, &inline_size);
+               if (ext4_check_dir_entry(dir, NULL, de,
+                                        iloc.bh, inline_pos,
+                                        inline_size, offset)) {
+                       ext4_warning(dir->i_sb,
+                                    "bad inline directory (dir #%lu) - "
+                                    "inode %u, rec_len %u, name_len %d, "
+                                    "inline size %d",
+                                    dir->i_ino, le32_to_cpu(de->inode),
+                                    le16_to_cpu(de->rec_len), de->name_len,
+                                    inline_size);
+                       ret = 1;
+                       goto out;
+               }
+               if (le32_to_cpu(de->inode)) {
+                       ret = 0;
+                       goto out;
+               }
+               offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
+       }
+
+out:
+       up_read(&EXT4_I(dir)->xattr_sem);
+       brelse(iloc.bh);
+       return ret;
+}
+
+int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
+{
+       int ret;
+
+       down_write(&EXT4_I(inode)->xattr_sem);
+       ret = ext4_destroy_inline_data_nolock(handle, inode);
+       up_write(&EXT4_I(inode)->xattr_sem);
+
+       return ret;
+}
+
+int ext4_inline_data_fiemap(struct inode *inode,
+                           struct fiemap_extent_info *fieinfo,
+                           int *has_inline)
+{
+       __u64 physical = 0;
+       __u64 length;
+       __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST;
+       int error = 0;
+       struct ext4_iloc iloc;
+
+       down_read(&EXT4_I(inode)->xattr_sem);
+       if (!ext4_has_inline_data(inode)) {
+               *has_inline = 0;
+               goto out;
+       }
+
+       error = ext4_get_inode_loc(inode, &iloc);
+       if (error)
+               goto out;
+
+       physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+       physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
+       physical += offsetof(struct ext4_inode, i_block);
+       length = i_size_read(inode);
+
+       if (physical)
+               error = fiemap_fill_next_extent(fieinfo, 0, physical,
+                                               length, flags);
+       brelse(iloc.bh);
+out:
+       up_read(&EXT4_I(inode)->xattr_sem);
+       return (error < 0 ? error : 0);
+}
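+
+/*
+ * Editor's worked example (illustrative only, values assumed): with a
+ * 4KiB block size, if iloc.bh->b_blocknr is 1000 and the raw inode
+ * starts 512 bytes into that block, the single extent reported above
+ * begins at byte
+ *
+ *     (1000 << 12) + 512 + offsetof(struct ext4_inode, i_block)
+ *
+ * i.e. the physical address of i_block itself, and its length is
+ * simply i_size.
+ */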
+
+/*
+ * Called during xattr set.  If evicting the inline data would free at
+ * least 'needed' bytes of space, create the extent tree and move the
+ * data out to an external block.
+ *
+ * We use jbd2 instead of the page cache to move the data to the first
+ * block, so that the whole operation is committed as a single
+ * transaction and the data cannot be lost to a delayed page cache
+ * writeback.
+ */
+int ext4_try_to_evict_inline_data(handle_t *handle,
+                                 struct inode *inode,
+                                 int needed)
+{
+       int error;
+       struct ext4_xattr_entry *entry;
+       struct ext4_xattr_ibody_header *header;
+       struct ext4_inode *raw_inode;
+       struct ext4_iloc iloc;
+
+       error = ext4_get_inode_loc(inode, &iloc);
+       if (error)
+               return error;
+
+       raw_inode = ext4_raw_inode(&iloc);
+       header = IHDR(inode, raw_inode);
+       entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+                                           EXT4_I(inode)->i_inline_off);
+       if (EXT4_XATTR_LEN(entry->e_name_len) +
+           EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) {
+               error = -ENOSPC;
+               goto out;
+       }
+
+       error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
+out:
+       brelse(iloc.bh);
+       return error;
+}
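+
+/*
+ * Editor's illustrative sketch (not part of this patch; helper name
+ * hypothetical): the -ENOSPC check above asks "would freeing the
+ * system.data xattr entry recover at least 'needed' bytes?".  In
+ * isolation:
+ */
+static int __maybe_unused
+ext4_inline_entry_can_free(struct ext4_xattr_entry *entry, int needed)
+{
+       /* Name header plus value payload, both rounded the same way
+        * the xattr code accounts for them. */
+       return EXT4_XATTR_LEN(entry->e_name_len) +
+               EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) >= needed;
+}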
+
+void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
+{
+       handle_t *handle;
+       int inline_size, value_len, needed_blocks;
+       size_t i_size;
+       void *value = NULL;
+       struct ext4_xattr_ibody_find is = {
+               .s = { .not_found = -ENODATA, },
+       };
+       struct ext4_xattr_info i = {
+               .name_index = EXT4_XATTR_INDEX_SYSTEM,
+               .name = EXT4_XATTR_SYSTEM_DATA,
+       };
+
+       needed_blocks = ext4_writepage_trans_blocks(inode);
+       handle = ext4_journal_start(inode, needed_blocks);
+       if (IS_ERR(handle))
+               return;
+
+       down_write(&EXT4_I(inode)->xattr_sem);
+       if (!ext4_has_inline_data(inode)) {
+               *has_inline = 0;
+               ext4_journal_stop(handle);
+               return;
+       }
+
+       if (ext4_orphan_add(handle, inode))
+               goto out;
+
+       if (ext4_get_inode_loc(inode, &is.iloc))
+               goto out;
+
+       down_write(&EXT4_I(inode)->i_data_sem);
+       i_size = inode->i_size;
+       inline_size = ext4_get_inline_size(inode);
+       EXT4_I(inode)->i_disksize = i_size;
+
+       if (i_size < inline_size) {
+               /* Clear the content in the xattr space. */
+               if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
+                       if (ext4_xattr_ibody_find(inode, &i, &is))
+                               goto out_error;
+
+                       BUG_ON(is.s.not_found);
+
+                       value_len = le32_to_cpu(is.s.here->e_value_size);
+                       value = kmalloc(value_len, GFP_NOFS);
+                       if (!value)
+                               goto out_error;
+
+                       if (ext4_xattr_ibody_get(inode, i.name_index, i.name,
+                                               value, value_len))
+                               goto out_error;
+
+                       i.value = value;
+                       i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
+                                       i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
+                       if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is))
+                               goto out_error;
+               }
+
+               /* Clear the content within i_blocks. */
+               if (i_size < EXT4_MIN_INLINE_DATA_SIZE)
+                       memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0,
+                                       EXT4_MIN_INLINE_DATA_SIZE - i_size);
+
+               EXT4_I(inode)->i_inline_size = i_size <
+                                       EXT4_MIN_INLINE_DATA_SIZE ?
+                                       EXT4_MIN_INLINE_DATA_SIZE : i_size;
+       }
+
+out_error:
+       up_write(&EXT4_I(inode)->i_data_sem);
+out:
+       brelse(is.iloc.bh);
+       up_write(&EXT4_I(inode)->xattr_sem);
+       kfree(value);
+       if (inode->i_nlink)
+               ext4_orphan_del(handle, inode);
+
+       inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+       ext4_mark_inode_dirty(handle, inode);
+       if (IS_SYNC(inode))
+               ext4_handle_sync(handle);
+
+       ext4_journal_stop(handle);
+       return;
+}
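+
+/*
+ * Editor's note (illustrative sketch, not part of this patch; helper
+ * name hypothetical): the i.value_len computation above keeps only the
+ * part of the file that no longer fits in i_block, i.e.:
+ */
+static size_t __maybe_unused
+ext4_inline_xattr_tail_len(size_t i_size)
+{
+       /* Bytes of inline data that live in the system.data xattr. */
+       return i_size > EXT4_MIN_INLINE_DATA_SIZE ?
+                       i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
+}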
+
+int ext4_convert_inline_data(struct inode *inode)
+{
+       int error, needed_blocks;
+       handle_t *handle;
+       struct ext4_iloc iloc;
+
+       if (!ext4_has_inline_data(inode)) {
+               ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+               return 0;
+       }
+
+       needed_blocks = ext4_writepage_trans_blocks(inode);
+
+       iloc.bh = NULL;
+       error = ext4_get_inode_loc(inode, &iloc);
+       if (error)
+               return error;
+
+       handle = ext4_journal_start(inode, needed_blocks);
+       if (IS_ERR(handle)) {
+               error = PTR_ERR(handle);
+               goto out_free;
+       }
+
+       down_write(&EXT4_I(inode)->xattr_sem);
+       if (!ext4_has_inline_data(inode)) {
+               up_write(&EXT4_I(inode)->xattr_sem);
+               goto out;
+       }
+
+       error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
+       up_write(&EXT4_I(inode)->xattr_sem);
+out:
+       ext4_journal_stop(handle);
+out_free:
+       brelse(iloc.bh);
+       return error;
+}
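+
+/*
+ * Editor's usage sketch (illustrative; the caller below is
+ * hypothetical and not part of this patch): paths that cannot operate
+ * on inline data force a conversion up front and only then proceed.
+ */
+static int __maybe_unused example_force_block_backed(struct inode *inode)
+{
+       int err = ext4_convert_inline_data(inode);
+
+       if (err)
+               return err;
+       /* From here on, ext4_has_inline_data(inode) is false. */
+       return 0;
+}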
index b3c243b9afa527346f9ca456258081083ece0560..cb1c1ab2720bd6c08c641879adb5ac77c11ca104 100644 (file)
@@ -483,49 +483,6 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
        return num;
 }
 
-/*
- * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
- */
-static void set_buffers_da_mapped(struct inode *inode,
-                                  struct ext4_map_blocks *map)
-{
-       struct address_space *mapping = inode->i_mapping;
-       struct pagevec pvec;
-       int i, nr_pages;
-       pgoff_t index, end;
-
-       index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-       end = (map->m_lblk + map->m_len - 1) >>
-               (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
-       pagevec_init(&pvec, 0);
-       while (index <= end) {
-               nr_pages = pagevec_lookup(&pvec, mapping, index,
-                                         min(end - index + 1,
-                                             (pgoff_t)PAGEVEC_SIZE));
-               if (nr_pages == 0)
-                       break;
-               for (i = 0; i < nr_pages; i++) {
-                       struct page *page = pvec.pages[i];
-                       struct buffer_head *bh, *head;
-
-                       if (unlikely(page->mapping != mapping) ||
-                           !PageDirty(page))
-                               break;
-
-                       if (page_has_buffers(page)) {
-                               bh = head = page_buffers(page);
-                               do {
-                                       set_buffer_da_mapped(bh);
-                                       bh = bh->b_this_page;
-                               } while (bh != head);
-                       }
-                       index++;
-               }
-               pagevec_release(&pvec);
-       }
-}
-
 /*
  * The ext4_map_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
@@ -574,7 +531,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
                up_read((&EXT4_I(inode)->i_data_sem));
 
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-               int ret = check_block_validity(inode, map);
+               int ret;
+               if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+                       /* The delayed allocated blocks may have been
+                        * allocated by fallocate and converted to
+                        * initialized by direct I/O, so we need to
+                        * handle the delayed extent here.
+                        */
+                       down_write((&EXT4_I(inode)->i_data_sem));
+                       goto delayed_mapped;
+               }
+               ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;
        }
@@ -652,12 +618,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
                ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 
-               /* If we have successfully mapped the delayed allocated blocks,
-                * set the BH_Da_Mapped bit on them. Its important to do this
-                * under the protection of i_data_sem.
-                */
-               if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
-                       set_buffers_da_mapped(inode, map);
+               if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+                       int ret;
+delayed_mapped:
+                       /* The delayed allocation blocks have been allocated. */
+                       ret = ext4_es_remove_extent(inode, map->m_lblk,
+                                                   map->m_len);
+                       if (ret < 0)
+                               retval = ret;
+               }
        }
 
        up_write((&EXT4_I(inode)->i_data_sem));
@@ -680,10 +649,13 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
        int ret = 0, started = 0;
        int dio_credits;
 
+       if (ext4_has_inline_data(inode))
+               return -ERANGE;
+
        map.m_lblk = iblock;
        map.m_len = bh->b_size >> inode->i_blkbits;
 
-       if (flags && !handle) {
+       if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
                /* Direct IO write... */
                if (map.m_len > DIO_MAX_BLOCKS)
                        map.m_len = DIO_MAX_BLOCKS;
@@ -798,13 +770,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
        return NULL;
 }
 
-static int walk_page_buffers(handle_t *handle,
-                            struct buffer_head *head,
-                            unsigned from,
-                            unsigned to,
-                            int *partial,
-                            int (*fn)(handle_t *handle,
-                                      struct buffer_head *bh))
+int ext4_walk_page_buffers(handle_t *handle,
+                          struct buffer_head *head,
+                          unsigned from,
+                          unsigned to,
+                          int *partial,
+                          int (*fn)(handle_t *handle,
+                                    struct buffer_head *bh))
 {
        struct buffer_head *bh;
        unsigned block_start, block_end;
@@ -854,8 +826,8 @@ static int walk_page_buffers(handle_t *handle,
  * is elevated.  We'll still have enough credits for the tiny quotafile
  * write.
  */
-static int do_journal_get_write_access(handle_t *handle,
-                                      struct buffer_head *bh)
+int do_journal_get_write_access(handle_t *handle,
+                               struct buffer_head *bh)
 {
        int dirty = buffer_dirty(bh);
        int ret;
@@ -878,7 +850,7 @@ static int do_journal_get_write_access(handle_t *handle,
        return ret;
 }
 
-static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh_result, int create);
 static int ext4_write_begin(struct file *file, struct address_space *mapping,
                            loff_t pos, unsigned len, unsigned flags,
@@ -902,6 +874,17 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;
 
+       if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+               ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
+                                                   flags, pagep);
+               if (ret < 0)
+                       goto out;
+               if (ret == 1) {
+                       ret = 0;
+                       goto out;
+               }
+       }
+
 retry:
        handle = ext4_journal_start(inode, needed_blocks);
        if (IS_ERR(handle)) {
@@ -919,6 +902,7 @@ retry:
                ret = -ENOMEM;
                goto out;
        }
+
        *pagep = page;
 
        if (ext4_should_dioread_nolock(inode))
@@ -927,8 +911,9 @@ retry:
                ret = __block_write_begin(page, pos, len, ext4_get_block);
 
        if (!ret && ext4_should_journal_data(inode)) {
-               ret = walk_page_buffers(handle, page_buffers(page),
-                               from, to, NULL, do_journal_get_write_access);
+               ret = ext4_walk_page_buffers(handle, page_buffers(page),
+                                            from, to, NULL,
+                                            do_journal_get_write_access);
        }
 
        if (ret) {
@@ -983,7 +968,12 @@ static int ext4_generic_write_end(struct file *file,
        struct inode *inode = mapping->host;
        handle_t *handle = ext4_journal_current_handle();
 
-       copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+       if (ext4_has_inline_data(inode))
+               copied = ext4_write_inline_data_end(inode, pos, len,
+                                                   copied, page);
+       else
+               copied = block_write_end(file, mapping, pos,
+                                        len, copied, page, fsdata);
 
        /*
         * No need to use i_size_read() here, the i_size
@@ -1134,16 +1124,21 @@ static int ext4_journalled_write_end(struct file *file,
 
        BUG_ON(!ext4_handle_valid(handle));
 
-       if (copied < len) {
-               if (!PageUptodate(page))
-                       copied = 0;
-               page_zero_new_buffers(page, from+copied, to);
-       }
+       if (ext4_has_inline_data(inode))
+               copied = ext4_write_inline_data_end(inode, pos, len,
+                                                   copied, page);
+       else {
+               if (copied < len) {
+                       if (!PageUptodate(page))
+                               copied = 0;
+                       page_zero_new_buffers(page, from+copied, to);
+               }
 
-       ret = walk_page_buffers(handle, page_buffers(page), from,
-                               to, &partial, write_end_fn);
-       if (!partial)
-               SetPageUptodate(page);
+               ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
+                                            to, &partial, write_end_fn);
+               if (!partial)
+                       SetPageUptodate(page);
+       }
        new_i_size = pos + copied;
        if (new_i_size > inode->i_size)
                i_size_write(inode, pos+copied);
@@ -1301,6 +1296,7 @@ static void ext4_da_page_release_reservation(struct page *page,
        struct inode *inode = page->mapping->host;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        int num_clusters;
+       ext4_fsblk_t lblk;
 
        head = page_buffers(page);
        bh = head;
@@ -1310,20 +1306,23 @@ static void ext4_da_page_release_reservation(struct page *page,
                if ((offset <= curr_off) && (buffer_delay(bh))) {
                        to_release++;
                        clear_buffer_delay(bh);
-                       clear_buffer_da_mapped(bh);
                }
                curr_off = next_off;
        } while ((bh = bh->b_this_page) != head);
 
+       if (to_release) {
+               lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+               ext4_es_remove_extent(inode, lblk, to_release);
+       }
+
        /* If we have released all the blocks belonging to a cluster, then we
         * need to release the reserved space for that cluster. */
        num_clusters = EXT4_NUM_B2C(sbi, to_release);
        while (num_clusters > 0) {
-               ext4_fsblk_t lblk;
                lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
                        ((num_clusters - 1) << sbi->s_cluster_bits);
                if (sbi->s_cluster_ratio == 1 ||
-                   !ext4_find_delalloc_cluster(inode, lblk, 1))
+                   !ext4_find_delalloc_cluster(inode, lblk))
                        ext4_da_release_space(inode, 1);
 
                num_clusters--;
@@ -1429,8 +1428,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
                                                clear_buffer_delay(bh);
                                                bh->b_blocknr = pblock;
                                        }
-                                       if (buffer_da_mapped(bh))
-                                               clear_buffer_da_mapped(bh);
                                        if (buffer_unwritten(bh) ||
                                            buffer_mapped(bh))
                                                BUG_ON(bh->b_blocknr != pblock);
@@ -1500,9 +1497,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
        struct pagevec pvec;
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;
+       ext4_lblk_t start, last;
 
        index = mpd->first_page;
        end   = mpd->next_page - 1;
+
+       start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+       last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+       ext4_es_remove_extent(inode, start, last - start + 1);
+
+       pagevec_init(&pvec, 0);
        while (index <= end) {
                nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
                if (nr_pages == 0)
@@ -1656,15 +1660,6 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 
                for (i = 0; i < map.m_len; i++)
                        unmap_underlying_metadata(bdev, map.m_pblk + i);
-
-               if (ext4_should_order_data(mpd->inode)) {
-                       err = ext4_jbd2_file_inode(handle, mpd->inode);
-                       if (err) {
-                               /* Only if the journal is aborted */
-                               mpd->retval = err;
-                               goto submit_io;
-                       }
-               }
        }
 
        /*
@@ -1795,7 +1790,19 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
         * file system block.
         */
        down_read((&EXT4_I(inode)->i_data_sem));
-       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+       if (ext4_has_inline_data(inode)) {
+               /*
+                * We will soon create blocks for this page, so
+                * pretend that the blocks aren't allocated yet.
+                * With bigalloc clusters, we still have to map the
+                * block to its cluster so that the reserved space is
+                * accounted properly.
+                */
+               if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
+                   ext4_find_delalloc_cluster(inode, map->m_lblk))
+                       map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+               retval = 0;
+       } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                retval = ext4_ext_map_blocks(NULL, inode, map, 0);
        else
                retval = ext4_ind_map_blocks(NULL, inode, map, 0);
@@ -1814,6 +1821,10 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
                                goto out_unlock;
                }
 
+               retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
+               if (retval)
+                       goto out_unlock;
+
                /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
                 * and it should not appear on the bh->b_state.
                 */
@@ -1842,8 +1853,8 @@ out_unlock:
  * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
  * initialized properly.
  */
-static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
-                                 struct buffer_head *bh, int create)
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+                          struct buffer_head *bh, int create)
 {
        struct ext4_map_blocks map;
        int ret = 0;
@@ -1917,15 +1928,29 @@ static int __ext4_journalled_writepage(struct page *page,
 {
        struct address_space *mapping = page->mapping;
        struct inode *inode = mapping->host;
-       struct buffer_head *page_bufs;
+       struct buffer_head *page_bufs = NULL;
        handle_t *handle = NULL;
-       int ret = 0;
-       int err;
+       int ret = 0, err = 0;
+       int inline_data = ext4_has_inline_data(inode);
+       struct buffer_head *inode_bh = NULL;
 
        ClearPageChecked(page);
-       page_bufs = page_buffers(page);
-       BUG_ON(!page_bufs);
-       walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
+
+       if (inline_data) {
+               BUG_ON(page->index != 0);
+               BUG_ON(len > ext4_get_max_inline_size(inode));
+               inode_bh = ext4_journalled_write_inline_data(inode, len, page);
+               if (inode_bh == NULL)
+                       goto out;
+       } else {
+               page_bufs = page_buffers(page);
+               if (!page_bufs) {
+                       BUG();
+                       goto out;
+               }
+               ext4_walk_page_buffers(handle, page_bufs, 0, len,
+                                      NULL, bget_one);
+       }
        /* As soon as we unlock the page, it can go away, but we have
         * references to buffers so we are safe */
        unlock_page(page);
@@ -1938,11 +1963,18 @@ static int __ext4_journalled_writepage(struct page *page,
 
        BUG_ON(!ext4_handle_valid(handle));
 
-       ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
-                               do_journal_get_write_access);
+       if (inline_data) {
+               ret = ext4_journal_get_write_access(handle, inode_bh);
+
+               err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
 
-       err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
-                               write_end_fn);
+       } else {
+               ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+                                            do_journal_get_write_access);
+
+               err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+                                            write_end_fn);
+       }
        if (ret == 0)
                ret = err;
        EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
@@ -1950,9 +1982,12 @@ static int __ext4_journalled_writepage(struct page *page,
        if (!ret)
                ret = err;
 
-       walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
+       if (!ext4_has_inline_data(inode))
+               ext4_walk_page_buffers(handle, page_bufs, 0, len,
+                                      NULL, bput_one);
        ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 out:
+       brelse(inode_bh);
        return ret;
 }
 
@@ -2029,8 +2064,8 @@ static int ext4_writepage(struct page *page,
                commit_write = 1;
        }
        page_bufs = page_buffers(page);
-       if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                             ext4_bh_delay_or_unwritten)) {
+       if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+                                  ext4_bh_delay_or_unwritten)) {
                /*
                 * We don't want to do block allocation, so redirty
                 * the page and return.  We may reach here when we do
@@ -2096,7 +2131,8 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
  * mpage_da_map_and_submit to map a single contiguous memory region
  * and then write them.
  */
-static int write_cache_pages_da(struct address_space *mapping,
+static int write_cache_pages_da(handle_t *handle,
+                               struct address_space *mapping,
                                struct writeback_control *wbc,
                                struct mpage_da_data *mpd,
                                pgoff_t *done_index)
@@ -2175,6 +2211,17 @@ static int write_cache_pages_da(struct address_space *mapping,
                        wait_on_page_writeback(page);
                        BUG_ON(PageWriteback(page));
 
+                       /*
+                        * If we have inline data and arrive here, it means that
+                        * we will soon create the block for the 1st page, so
+                        * we'd better clear the inline data here.
+                        */
+                       if (ext4_has_inline_data(inode)) {
+                               BUG_ON(ext4_test_inode_state(inode,
+                                               EXT4_STATE_MAY_INLINE_DATA));
+                               ext4_destroy_inline_data(handle, inode);
+                       }
+
                        if (mpd->next_page != page->index)
                                mpd->first_page = page->index;
                        mpd->next_page = page->index + 1;
@@ -2381,7 +2428,8 @@ retry:
                 * contiguous region of logical blocks that need
                 * blocks to be allocated by ext4 and submit them.
                 */
-               ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
+               ret = write_cache_pages_da(handle, mapping,
+                                          wbc, &mpd, &done_index);
                /*
                 * If we have a contiguous extent of pages and we
                 * haven't done the I/O yet, map the blocks and submit
@@ -2445,7 +2493,6 @@ out_writepages:
        return ret;
 }
 
-#define FALL_BACK_TO_NONDELALLOC 1
 static int ext4_nonda_switch(struct super_block *sb)
 {
        s64 free_blocks, dirty_blocks;
@@ -2502,6 +2549,19 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
        }
        *fsdata = (void *)0;
        trace_ext4_da_write_begin(inode, pos, len, flags);
+
+       if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+               ret = ext4_da_write_inline_data_begin(mapping, inode,
+                                                     pos, len, flags,
+                                                     pagep, fsdata);
+               if (ret < 0)
+                       goto out;
+               if (ret == 1) {
+                       ret = 0;
+                       goto out;
+               }
+       }
+
 retry:
        /*
         * With delayed allocation, we don't log the i_disksize update
@@ -2603,22 +2663,13 @@ static int ext4_da_write_end(struct file *file,
         * changes.  So let's piggyback the i_disksize mark_inode_dirty
         * into that.
         */
-
        new_i_size = pos + copied;
        if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
-               if (ext4_da_should_update_i_disksize(page, end)) {
+               if (ext4_has_inline_data(inode) ||
+                   ext4_da_should_update_i_disksize(page, end)) {
                        down_write(&EXT4_I(inode)->i_data_sem);
-                       if (new_i_size > EXT4_I(inode)->i_disksize) {
-                               /*
-                                * Updating i_disksize when extending file
-                                * without needing block allocation
-                                */
-                               if (ext4_should_order_data(inode))
-                                       ret = ext4_jbd2_file_inode(handle,
-                                                                  inode);
-
+                       if (new_i_size > EXT4_I(inode)->i_disksize)
                                EXT4_I(inode)->i_disksize = new_i_size;
-                       }
                        up_write(&EXT4_I(inode)->i_data_sem);
                        /* We need to mark inode dirty even if
                         * new_i_size is less that inode->i_size
@@ -2627,8 +2678,16 @@ static int ext4_da_write_end(struct file *file,
                        ext4_mark_inode_dirty(handle, inode);
                }
        }
-       ret2 = generic_write_end(file, mapping, pos, len, copied,
+
+       if (write_mode != CONVERT_INLINE_DATA &&
+           ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
+           ext4_has_inline_data(inode))
+               ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
+                                                    page);
+       else
+               ret2 = generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
+
        copied = ret2;
        if (ret2 < 0)
                ret = ret2;
@@ -2721,6 +2780,12 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
        journal_t *journal;
        int err;
 
+       /*
+        * We can get here for an inline file via the FIBMAP ioctl
+        */
+       if (ext4_has_inline_data(inode))
+               return 0;
+
        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
                        test_opt(inode->i_sb, DELALLOC)) {
                /*
@@ -2766,14 +2831,30 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 
 static int ext4_readpage(struct file *file, struct page *page)
 {
+       int ret = -EAGAIN;
+       struct inode *inode = page->mapping->host;
+
        trace_ext4_readpage(page);
-       return mpage_readpage(page, ext4_get_block);
+
+       if (ext4_has_inline_data(inode))
+               ret = ext4_readpage_inline(inode, page);
+
+       if (ret == -EAGAIN)
+               return mpage_readpage(page, ext4_get_block);
+
+       return ret;
 }
 
 static int
 ext4_readpages(struct file *file, struct address_space *mapping,
                struct list_head *pages, unsigned nr_pages)
 {
+       struct inode *inode = mapping->host;
+
+       /* If the file has inline data, no need to do readpages. */
+       if (ext4_has_inline_data(inode))
+               return 0;
+
        return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
@@ -2840,7 +2921,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
  * We allocate an uninitialized extent if blocks haven't been allocated.
  * The extent will be converted to initialized after the IO is complete.
  */
-static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh_result, int create)
 {
        ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
@@ -2850,29 +2931,12 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
 }
 
 static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
-                  struct buffer_head *bh_result, int flags)
+                  struct buffer_head *bh_result, int create)
 {
-       handle_t *handle = ext4_journal_current_handle();
-       struct ext4_map_blocks map;
-       int ret = 0;
-
-       ext4_debug("ext4_get_block_write_nolock: inode %lu, flag %d\n",
-                  inode->i_ino, flags);
-
-       flags = EXT4_GET_BLOCKS_NO_LOCK;
-
-       map.m_lblk = iblock;
-       map.m_len = bh_result->b_size >> inode->i_blkbits;
-
-       ret = ext4_map_blocks(handle, inode, &map, flags);
-       if (ret > 0) {
-               map_bh(bh_result, inode->i_sb, map.m_pblk);
-               bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
-                                       map.m_flags;
-               bh_result->b_size = inode->i_sb->s_blocksize * map.m_len;
-               ret = 0;
-       }
-       return ret;
+       ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
+                  inode->i_ino, create);
+       return _ext4_get_block(inode, iblock, bh_result,
+                              EXT4_GET_BLOCKS_NO_LOCK);
 }
 
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
@@ -2978,10 +3042,10 @@ retry:
  * fall back to buffered IO.
  *
  * For holes, we fallocate those blocks, mark them as uninitialized
- * If those blocks were preallocated, we mark sure they are splited, but
+ * If those blocks were preallocated, we make sure they are split, but
  * still keep the range to write as uninitialized.
  *
- * The unwrritten extents will be converted to written when DIO is completed.
+ * The unwritten extents will be converted to written when DIO is completed.
  * For async direct IO, since the IO may still be pending when we
  * return, we set up an end_io callback, which will do the conversion
  * when the async direct IO completes.
@@ -2999,125 +3063,120 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
        struct inode *inode = file->f_mapping->host;
        ssize_t ret;
        size_t count = iov_length(iov, nr_segs);
-
+       int overwrite = 0;
+       get_block_t *get_block_func = NULL;
+       int dio_flags = 0;
        loff_t final_size = offset + count;
-       if (rw == WRITE && final_size <= inode->i_size) {
-               int overwrite = 0;
 
-               BUG_ON(iocb->private == NULL);
+       /* Use the old path for reads and writes beyond i_size. */
+       if (rw != WRITE || final_size > inode->i_size)
+               return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
 
-               /* If we do a overwrite dio, i_mutex locking can be released */
-               overwrite = *((int *)iocb->private);
+       BUG_ON(iocb->private == NULL);
 
-               if (overwrite) {
-                       atomic_inc(&inode->i_dio_count);
-                       down_read(&EXT4_I(inode)->i_data_sem);
-                       mutex_unlock(&inode->i_mutex);
-               }
+       /* If we do an overwrite dio, i_mutex locking can be released */
+       overwrite = *((int *)iocb->private);
 
-               /*
-                * We could direct write to holes and fallocate.
-                *
-                * Allocated blocks to fill the hole are marked as uninitialized
-                * to prevent parallel buffered read to expose the stale data
-                * before DIO complete the data IO.
-                *
-                * As to previously fallocated extents, ext4 get_block
-                * will just simply mark the buffer mapped but still
-                * keep the extents uninitialized.
-                *
-                * for non AIO case, we will convert those unwritten extents
-                * to written after return back from blockdev_direct_IO.
-                *
-                * for async DIO, the conversion needs to be defered when
-                * the IO is completed. The ext4 end_io callback function
-                * will be called to take care of the conversion work.
-                * Here for async case, we allocate an io_end structure to
-                * hook to the iocb.
-                */
-               iocb->private = NULL;
-               ext4_inode_aio_set(inode, NULL);
-               if (!is_sync_kiocb(iocb)) {
-                       ext4_io_end_t *io_end =
-                               ext4_init_io_end(inode, GFP_NOFS);
-                       if (!io_end) {
-                               ret = -ENOMEM;
-                               goto retake_lock;
-                       }
-                       io_end->flag |= EXT4_IO_END_DIRECT;
-                       iocb->private = io_end;
-                       /*
-                        * we save the io structure for current async
-                        * direct IO, so that later ext4_map_blocks()
-                        * could flag the io structure whether there
-                        * is a unwritten extents needs to be converted
-                        * when IO is completed.
-                        */
-                       ext4_inode_aio_set(inode, io_end);
-               }
+       if (overwrite) {
+               atomic_inc(&inode->i_dio_count);
+               down_read(&EXT4_I(inode)->i_data_sem);
+               mutex_unlock(&inode->i_mutex);
+       }
 
-               if (overwrite)
-                       ret = __blockdev_direct_IO(rw, iocb, inode,
-                                                inode->i_sb->s_bdev, iov,
-                                                offset, nr_segs,
-                                                ext4_get_block_write_nolock,
-                                                ext4_end_io_dio,
-                                                NULL,
-                                                0);
-               else
-                       ret = __blockdev_direct_IO(rw, iocb, inode,
-                                                inode->i_sb->s_bdev, iov,
-                                                offset, nr_segs,
-                                                ext4_get_block_write,
-                                                ext4_end_io_dio,
-                                                NULL,
-                                                DIO_LOCKING);
-               if (iocb->private)
-                       ext4_inode_aio_set(inode, NULL);
+       /*
+        * We can direct-write to holes and to fallocated extents.
+        *
+        * Blocks allocated to fill a hole are marked as uninitialized
+        * to prevent a parallel buffered read from exposing stale data
+        * before the DIO completes.
+        *
+        * As for previously fallocated extents, ext4 get_block will
+        * simply mark the buffer mapped but still keep the extents
+        * uninitialized.
+        *
+        * For the non-AIO case, we convert those unwritten extents to
+        * written after returning from blockdev_direct_IO.
+        *
+        * For async DIO, the conversion is deferred until the IO
+        * completes; the ext4 end_io callback is then called to take
+        * care of the conversion work.  For the async case, we
+        * allocate an io_end structure here and hook it to the iocb.
+        */
+       iocb->private = NULL;
+       ext4_inode_aio_set(inode, NULL);
+       if (!is_sync_kiocb(iocb)) {
+               ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
+               if (!io_end) {
+                       ret = -ENOMEM;
+                       goto retake_lock;
+               }
+               io_end->flag |= EXT4_IO_END_DIRECT;
+               iocb->private = io_end;
                /*
-                * The io_end structure takes a reference to the inode,
-                * that structure needs to be destroyed and the
-                * reference to the inode need to be dropped, when IO is
-                * complete, even with 0 byte write, or failed.
-                *
-                * In the successful AIO DIO case, the io_end structure will be
-                * desctroyed and the reference to the inode will be dropped
-                * after the end_io call back function is called.
-                *
-                * In the case there is 0 byte write, or error case, since
-                * VFS direct IO won't invoke the end_io call back function,
-                * we need to free the end_io structure here.
+                * We save the io structure for the current async
+                * direct IO, so that ext4_map_blocks() can later flag
+                * in it whether there are unwritten extents that need
+                * to be converted when the IO completes.
                 */
-               if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
-                       ext4_free_io_end(iocb->private);
-                       iocb->private = NULL;
-               } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
-                                               EXT4_STATE_DIO_UNWRITTEN)) {
-                       int err;
-                       /*
-                        * for non AIO case, since the IO is already
-                        * completed, we could do the conversion right here
-                        */
-                       err = ext4_convert_unwritten_extents(inode,
-                                                            offset, ret);
-                       if (err < 0)
-                               ret = err;
-                       ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-               }
+               ext4_inode_aio_set(inode, io_end);
+       }
 
-       retake_lock:
-               /* take i_mutex locking again if we do a ovewrite dio */
-               if (overwrite) {
-                       inode_dio_done(inode);
-                       up_read(&EXT4_I(inode)->i_data_sem);
-                       mutex_lock(&inode->i_mutex);
-               }
+       if (overwrite) {
+               get_block_func = ext4_get_block_write_nolock;
+       } else {
+               get_block_func = ext4_get_block_write;
+               dio_flags = DIO_LOCKING;
+       }
+       ret = __blockdev_direct_IO(rw, iocb, inode,
+                                  inode->i_sb->s_bdev, iov,
+                                  offset, nr_segs,
+                                  get_block_func,
+                                  ext4_end_io_dio,
+                                  NULL,
+                                  dio_flags);
+
+       if (iocb->private)
+               ext4_inode_aio_set(inode, NULL);
+       /*
+        * The io_end structure takes a reference to the inode.  That
+        * structure needs to be destroyed, and the inode reference
+        * dropped, when the IO is complete, even for a 0-byte write
+        * or a failure.
+        *
+        * In the successful AIO DIO case, the io_end structure is
+        * destroyed and the inode reference is dropped after the
+        * end_io callback is called.
+        *
+        * For a 0-byte write or an error, VFS direct IO won't invoke
+        * the end_io callback, so we need to free the io_end
+        * structure here.
+        */
+       if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+               ext4_free_io_end(iocb->private);
+               iocb->private = NULL;
+       } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+                                               EXT4_STATE_DIO_UNWRITTEN)) {
+               int err;
+               /*
+                * For the non-AIO case, the IO is already completed,
+                * so we can do the conversion right here.
+                */
+               err = ext4_convert_unwritten_extents(inode,
+                                                    offset, ret);
+               if (err < 0)
+                       ret = err;
+               ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+       }
 
-               return ret;
+retake_lock:
+       /* Take i_mutex again if we did an overwrite dio */
+       if (overwrite) {
+               inode_dio_done(inode);
+               up_read(&EXT4_I(inode)->i_data_sem);
+               mutex_lock(&inode->i_mutex);
        }
 
-       /* for write the the end of file case, we fall back to old way */
-       return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+       return ret;
 }
 
 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
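/*
 * Illustrative sketch (not part of this patch): a minimal, self-contained
 * restatement of the io_end ownership rule the comment in the hunk above
 * describes. The constant and the function names here are hypothetical
 * stand-ins, not kernel symbols.
 */
#include <stdio.h>

#define EIOCBQUEUED 529	/* stand-in for the kernel's EIOCBQUEUED */

/* Who is responsible for freeing the io_end hung off iocb->private? */
static const char *io_end_owner(long ret)
{
	if (ret == -EIOCBQUEUED)
		return "end_io callback (AIO still in flight)";
	if (ret <= 0)
		return "submitter (0-byte write or error; callback never runs)";
	return "submitter (sync IO done; convert unwritten extents now)";
}

int main(void)
{
	long cases[] = { -EIOCBQUEUED, 0, -5, 4096 };
	for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		printf("ret=%ld -> freed by: %s\n", cases[i], io_end_owner(cases[i]));
	return 0;
}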
@@ -3134,6 +3193,10 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
        if (ext4_should_journal_data(inode))
                return 0;
 
+       /* Let buffer I/O handle the inline data case. */
+       if (ext4_has_inline_data(inode))
+               return 0;
+
        trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
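/*
 * Illustrative sketch (not part of this patch): the dispatch order this
 * hunk sets up in ext4_direct_IO(), hedged into plain C. "buffered"
 * models the early "return 0" cases, which make the VFS fall back to
 * buffered I/O; the parameter names are invented for illustration.
 */
static const char *dio_path(int journals_data, int has_inline_data,
			    int uses_extents)
{
	if (journals_data)
		return "buffered";	/* data journalling: no direct I/O */
	if (has_inline_data)
		return "buffered";	/* new check: inline data has no blocks */
	return uses_extents ? "ext4_ext_direct_IO" : "ext4_ind_direct_IO";
}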
@@ -3531,6 +3594,14 @@ void ext4_truncate(struct inode *inode)
        if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
                ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
 
+       if (ext4_has_inline_data(inode)) {
+               int has_inline = 1;
+
+               ext4_inline_data_truncate(inode, &has_inline);
+               if (has_inline)
+                       return;
+       }
+
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                ext4_ext_truncate(inode);
        else
@@ -3756,6 +3827,19 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
        }
 }
 
+static inline void ext4_iget_extra_inode(struct inode *inode,
+                                        struct ext4_inode *raw_inode,
+                                        struct ext4_inode_info *ei)
+{
+       __le32 *magic = (void *)raw_inode +
+                       EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
+       if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
+               ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+               ext4_find_inline_data_nolock(inode);
+       } else
+               EXT4_I(inode)->i_inline_off = 0;
+}
+
 struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 {
        struct ext4_iloc iloc;
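/*
 * Illustrative sketch (not part of this patch): a userspace version of
 * the test ext4_iget_extra_inode() performs — a little-endian 32-bit
 * magic stored right after the fixed inode area marks in-inode xattr
 * space. 0xEA020000 matches EXT4_XATTR_MAGIC; the helper names below
 * are invented.
 */
#include <stddef.h>
#include <stdint.h>

#define XATTR_MAGIC 0xEA020000u

/* Decode a little-endian u32 at an arbitrary byte offset. */
static uint32_t le32_at(const unsigned char *base, size_t off)
{
	return (uint32_t)base[off] |
	       ((uint32_t)base[off + 1] << 8) |
	       ((uint32_t)base[off + 2] << 16) |
	       ((uint32_t)base[off + 3] << 24);
}

/* 1 if the extra space after the fixed inode begins with the magic. */
static int has_in_inode_xattrs(const unsigned char *raw_inode,
			       size_t good_old_size, size_t extra_isize)
{
	return le32_at(raw_inode, good_old_size + extra_isize) == XATTR_MAGIC;
}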
@@ -3826,6 +3910,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
 
        ext4_clear_state_flags(ei);     /* Only relevant on 32-bit archs */
+       ei->i_inline_off = 0;
        ei->i_dir_start_lookup = 0;
        ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
        /* We now have enough fields to check if the inode was active or not.
@@ -3898,11 +3983,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                        ei->i_extra_isize = sizeof(struct ext4_inode) -
                                            EXT4_GOOD_OLD_INODE_SIZE;
                } else {
-                       __le32 *magic = (void *)raw_inode +
-                                       EXT4_GOOD_OLD_INODE_SIZE +
-                                       ei->i_extra_isize;
-                       if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
-                               ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+                       ext4_iget_extra_inode(inode, raw_inode, ei);
                }
        }
 
@@ -3925,17 +4006,19 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                                 ei->i_file_acl);
                ret = -EIO;
                goto bad_inode;
-       } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
-               if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-                   (S_ISLNK(inode->i_mode) &&
-                    !ext4_inode_is_fast_symlink(inode)))
-                       /* Validate extent which is part of inode */
-                       ret = ext4_ext_check_inode(inode);
-       } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-                  (S_ISLNK(inode->i_mode) &&
-                   !ext4_inode_is_fast_symlink(inode))) {
-               /* Validate block references which are part of inode */
-               ret = ext4_ind_check_inode(inode);
+       } else if (!ext4_has_inline_data(inode)) {
+               if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+                       if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+                           (S_ISLNK(inode->i_mode) &&
+                            !ext4_inode_is_fast_symlink(inode))))
+                               /* Validate extent which is part of inode */
+                               ret = ext4_ext_check_inode(inode);
+               } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+                          (S_ISLNK(inode->i_mode) &&
+                           !ext4_inode_is_fast_symlink(inode))) {
+                       /* Validate block references which are part of inode */
+                       ret = ext4_ind_check_inode(inode);
+               }
        }
        if (ret)
                goto bad_inode;
@@ -4122,9 +4205,10 @@ static int ext4_do_update_inode(handle_t *handle,
                                cpu_to_le32(new_encode_dev(inode->i_rdev));
                        raw_inode->i_block[2] = 0;
                }
-       } else
+       } else if (!ext4_has_inline_data(inode)) {
                for (block = 0; block < EXT4_N_BLOCKS; block++)
                        raw_inode->i_block[block] = ei->i_data[block];
+       }
 
        raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
        if (ei->i_extra_isize) {
@@ -4811,8 +4895,9 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
         * journal_start/journal_stop which can block and take a long time
         */
        if (page_has_buffers(page)) {
-               if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-                                       ext4_bh_unmapped)) {
+               if (!ext4_walk_page_buffers(NULL, page_buffers(page),
+                                           0, len, NULL,
+                                           ext4_bh_unmapped)) {
                        /* Wait so that we don't change page under IO */
                        wait_on_page_writeback(page);
                        ret = VM_FAULT_LOCKED;
@@ -4833,7 +4918,7 @@ retry_alloc:
        }
        ret = __block_page_mkwrite(vma, vmf, get_block);
        if (!ret && ext4_should_journal_data(inode)) {
-               if (walk_page_buffers(handle, page_buffers(page), 0,
+               if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
                          PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
                        unlock_page(page);
                        ret = VM_FAULT_SIGBUS;
index 526e55358606c83a548657449907cd94eed3c155..1bf6fe785c4fbdc0aa4b857f88fba845c9e814d8 100644 (file)
@@ -1373,7 +1373,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
        ex->fe_start += next;
 
        while (needed > ex->fe_len &&
-              (buddy = mb_find_buddy(e4b, order, &max))) {
+              mb_find_buddy(e4b, order, &max)) {
 
                if (block + 1 >= max)
                        break;
@@ -2607,9 +2607,17 @@ static void ext4_free_data_callback(struct super_block *sb,
        mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
                 entry->efd_count, entry->efd_group, entry);
 
-       if (test_opt(sb, DISCARD))
-               ext4_issue_discard(sb, entry->efd_group,
-                                  entry->efd_start_cluster, entry->efd_count);
+       if (test_opt(sb, DISCARD)) {
+               err = ext4_issue_discard(sb, entry->efd_group,
+                                        entry->efd_start_cluster,
+                                        entry->efd_count);
+               if (err && err != -EOPNOTSUPP)
+                       ext4_msg(sb, KERN_WARNING, "discard request in"
+                                " group:%d block:%d count:%d failed"
+                                " with %d", entry->efd_group,
+                                entry->efd_start_cluster,
+                                entry->efd_count, err);
+       }
 
        err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
        /* we expect to find existing buddy because it's pinned */
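/*
 * Illustrative sketch (not part of this patch): the error-filtering
 * idiom this hunk adds, standalone. Discard is best-effort, so a device
 * that simply cannot discard (-EOPNOTSUPP) is silently tolerated; any
 * other failure is reported. The function name is invented.
 */
#include <errno.h>
#include <stdio.h>

static void report_discard(int err, int group, int block, int count)
{
	if (err && err != -EOPNOTSUPP)
		fprintf(stderr,
			"discard in group:%d block:%d count:%d failed with %d\n",
			group, block, count, err);
}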
@@ -4310,8 +4318,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 repeat:
                /* allocate space in core */
                *errp = ext4_mb_regular_allocator(ac);
-               if (*errp)
+               if (*errp) {
+                       ext4_discard_allocated_blocks(ac);
                        goto errout;
+               }
 
                /* as we've just preallocated more space than
                 * user requested originally, we store allocated
@@ -4333,10 +4343,10 @@ repeat:
                        ac->ac_b_ex.fe_len = 0;
                        ac->ac_status = AC_STATUS_CONTINUE;
                        goto repeat;
-               } else if (*errp)
-               errout:
+               } else if (*errp) {
                        ext4_discard_allocated_blocks(ac);
-               else {
+                       goto errout;
+               } else {
                        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
                        ar->len = ac->ac_b_ex.fe_len;
                }
@@ -4347,6 +4357,7 @@ repeat:
                *errp = -ENOSPC;
        }
 
+errout:
        if (*errp) {
                ac->ac_b_ex.fe_len = 0;
                ar->len = 0;
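/*
 * Illustrative sketch (not part of this patch): the control-flow shape
 * these hunks converge on — one errout label reached by every failure
 * path, so the preallocated blocks are discarded in exactly one place.
 * The helpers below are stubs invented for illustration.
 */
static int regular_allocator(void) { return 0; }	/* stub */
static void discard_allocated(void) { }			/* stub */

static int allocate_with_single_exit(void)
{
	int err = regular_allocator();
	if (err)
		goto errout;
	/* ... mark the chosen blocks used, fill in the result length ... */
	return 0;
errout:
	discard_allocated();	/* every failure path releases preallocation */
	return err;
}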
@@ -4656,8 +4667,15 @@ do_more:
                 * with the group lock held. generate_buddy looks at
                 * them with the group lock held
                 */
-               if (test_opt(sb, DISCARD))
-                       ext4_issue_discard(sb, block_group, bit, count);
+               if (test_opt(sb, DISCARD)) {
+                       err = ext4_issue_discard(sb, block_group, bit, count);
+                       if (err && err != -EOPNOTSUPP)
+                               ext4_msg(sb, KERN_WARNING, "discard request in"
+                                        " group:%d block:%d count:%lu failed"
+                                        " with %d", block_group, bit, count,
+                                        err);
+               }
+
                ext4_lock_group(sb, block_group);
                mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
                mb_free_blocks(inode, &e4b, bit, count_clusters);
@@ -4851,10 +4870,11 @@ error_return:
  * one will allocate those blocks, mark them as used in the buddy bitmap. This must
  * be called under the group lock.
  */
-static void ext4_trim_extent(struct super_block *sb, int start, int count,
+static int ext4_trim_extent(struct super_block *sb, int start, int count,
                             ext4_group_t group, struct ext4_buddy *e4b)
 {
        struct ext4_free_extent ex;
+       int ret = 0;
 
        trace_ext4_trim_extent(sb, group, start, count);
 
@@ -4870,9 +4890,10 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count,
         */
        mb_mark_used(e4b, &ex);
        ext4_unlock_group(sb, group);
-       ext4_issue_discard(sb, group, start, count);
+       ret = ext4_issue_discard(sb, group, start, count);
        ext4_lock_group(sb, group);
        mb_free_blocks(NULL, e4b, start, ex.fe_len);
+       return ret;
 }
 
 /**
@@ -4901,7 +4922,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
        void *bitmap;
        ext4_grpblk_t next, count = 0, free_count = 0;
        struct ext4_buddy e4b;
-       int ret;
+       int ret = 0;
 
        trace_ext4_trim_all_free(sb, group, start, max);
 
@@ -4928,8 +4949,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
                next = mb_find_next_bit(bitmap, max + 1, start);
 
                if ((next - start) >= minblocks) {
-                       ext4_trim_extent(sb, start,
-                                        next - start, group, &e4b);
+                       ret = ext4_trim_extent(sb, start,
+                                              next - start, group, &e4b);
+                       if (ret && ret != -EOPNOTSUPP)
+                               break;
+                       ret = 0;
                        count += next - start;
                }
                free_count += next - start;
@@ -4950,8 +4974,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
                        break;
        }
 
-       if (!ret)
+       if (!ret) {
+               ret = count;
                EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+       }
 out:
        ext4_unlock_group(sb, group);
        ext4_mb_unload_buddy(&e4b);
@@ -4959,7 +4985,7 @@ out:
        ext4_debug("trimmed %d blocks in the group %d\n",
                count, group);
 
-       return count;
+       return ret;
 }
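/*
 * Illustrative sketch (not part of this patch): after this hunk,
 * ext4_trim_all_free() follows the usual kernel convention restated
 * below — a non-negative return is the number of blocks trimmed, a
 * negative return is an errno — instead of always returning the count
 * and losing any discard error. The function name is invented.
 */
static long trim_result(int err, int trimmed)
{
	return err ? err : trimmed;	/* err is 0 or a negative errno */
}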
 
 /**
index f1bb32ec01697b531df4db62bd02379fd746ae79..db8226d595faf9ec88e2db908aace1c95098cb1c 100644 (file)
@@ -14,6 +14,7 @@
 
 #include <linux/slab.h>
 #include "ext4_jbd2.h"
+#include "ext4_extents.h"
 
 /*
  * The contiguous blocks details which can be
index 292daeeed4557b5a978159f4d2217d1aded028c0..d9cc5ee42f534c9d1dcb87450493d93fbe5d320b 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
+#include "ext4_extents.h"
 
 /**
  * get_ext_path - Find an extent path for designated logical block number.
index 6d600a69fc9dedcfa92b45064cfcb85e79260946..cac44828233159bfeb1d93c502943fe82b813b2a 100644 (file)
@@ -202,13 +202,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                             struct inode *inode);
 
 /* checksumming functions */
-#define EXT4_DIRENT_TAIL(block, blocksize) \
-       ((struct ext4_dir_entry_tail *)(((void *)(block)) + \
-                                       ((blocksize) - \
-                                        sizeof(struct ext4_dir_entry_tail))))
-
-static void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
-                                  unsigned int blocksize)
+void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+                           unsigned int blocksize)
 {
        memset(t, 0, sizeof(struct ext4_dir_entry_tail));
        t->det_rec_len = ext4_rec_len_to_disk(
@@ -261,6 +256,12 @@ static __le32 ext4_dirent_csum(struct inode *inode,
        return cpu_to_le32(csum);
 }
 
+static void warn_no_space_for_csum(struct inode *inode)
+{
+       ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for "
+                    "checksum.  Please run e2fsck -D.", inode->i_ino);
+}
+
 int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
 {
        struct ext4_dir_entry_tail *t;
@@ -271,8 +272,7 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
 
        t = get_dirent_tail(inode, dirent);
        if (!t) {
-               EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
-                                "leaf for checksum.  Please run e2fsck -D.");
+               warn_no_space_for_csum(inode);
                return 0;
        }
 
@@ -294,8 +294,7 @@ static void ext4_dirent_csum_set(struct inode *inode,
 
        t = get_dirent_tail(inode, dirent);
        if (!t) {
-               EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
-                                "leaf for checksum.  Please run e2fsck -D.");
+               warn_no_space_for_csum(inode);
                return;
        }
 
@@ -303,9 +302,9 @@ static void ext4_dirent_csum_set(struct inode *inode,
                                           (void *)t - (void *)dirent);
 }
 
-static inline int ext4_handle_dirty_dirent_node(handle_t *handle,
-                                               struct inode *inode,
-                                               struct buffer_head *bh)
+int ext4_handle_dirty_dirent_node(handle_t *handle,
+                                 struct inode *inode,
+                                 struct buffer_head *bh)
 {
        ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
        return ext4_handle_dirty_metadata(handle, inode, bh);
@@ -377,8 +376,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
        count = le16_to_cpu(c->count);
        if (count_offset + (limit * sizeof(struct dx_entry)) >
            EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
-               EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
-                                "tree checksum found.  Run e2fsck -D.");
+               warn_no_space_for_csum(inode);
                return 1;
        }
        t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
@@ -408,8 +406,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
        count = le16_to_cpu(c->count);
        if (count_offset + (limit * sizeof(struct dx_entry)) >
            EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
-               EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
-                                "tree checksum.  Run e2fsck -D.");
+               warn_no_space_for_csum(inode);
                return;
        }
        t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
@@ -890,6 +887,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
                                           EXT4_DIR_REC_LEN(0));
        for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
                if (ext4_check_dir_entry(dir, NULL, de, bh,
+                               bh->b_data, bh->b_size,
                                (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
                                         + ((char *)de - bh->b_data))) {
                        /* On error, skip the f_pos to the next block. */
@@ -1007,6 +1005,15 @@ errout:
        return (err);
 }
 
+static inline int search_dirblock(struct buffer_head *bh,
+                                 struct inode *dir,
+                                 const struct qstr *d_name,
+                                 unsigned int offset,
+                                 struct ext4_dir_entry_2 **res_dir)
+{
+       return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir,
+                         d_name, offset, res_dir);
+}
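/*
 * Illustrative sketch (not part of this patch): the refactor pattern
 * used here, in miniature. The old blocksize-only entry point survives
 * as a thin wrapper over a (buffer, size) core, so inline directories
 * can reuse the same search. All names below are invented.
 */
#include <stddef.h>

static int search_core(const char *buf, size_t size, const char *name)
{
	(void)buf; (void)size; (void)name;	/* the generalized worker */
	return 0;
}

static int search_block(const char *block, size_t blocksize, const char *name)
{
	/* legacy callers keep this signature; new callers pass any buffer */
	return search_core(block, blocksize, name);
}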
 
 /*
  * Directory block splitting, compacting
@@ -1081,13 +1088,6 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
        dx_set_count(entries, count + 1);
 }
 
-static void ext4_update_dx_flag(struct inode *inode)
-{
-       if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
-                                    EXT4_FEATURE_COMPAT_DIR_INDEX))
-               ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
-}
-
 /*
  * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
  *
@@ -1107,11 +1107,13 @@ static inline int ext4_match (int len, const char * const name,
 /*
  * Returns 0 if not found, -1 on failure, and 1 on success
  */
-static inline int search_dirblock(struct buffer_head *bh,
-                                 struct inode *dir,
-                                 const struct qstr *d_name,
-                                 unsigned int offset,
-                                 struct ext4_dir_entry_2 ** res_dir)
+int search_dir(struct buffer_head *bh,
+              char *search_buf,
+              int buf_size,
+              struct inode *dir,
+              const struct qstr *d_name,
+              unsigned int offset,
+              struct ext4_dir_entry_2 **res_dir)
 {
        struct ext4_dir_entry_2 * de;
        char * dlimit;
@@ -1119,8 +1121,8 @@ static inline int search_dirblock(struct buffer_head *bh,
        const char *name = d_name->name;
        int namelen = d_name->len;
 
-       de = (struct ext4_dir_entry_2 *) bh->b_data;
-       dlimit = bh->b_data + dir->i_sb->s_blocksize;
+       de = (struct ext4_dir_entry_2 *)search_buf;
+       dlimit = search_buf + buf_size;
        while ((char *) de < dlimit) {
                /* this code is executed quadratically often */
                /* do minimal checking `by hand' */
@@ -1128,7 +1130,8 @@ static inline int search_dirblock(struct buffer_head *bh,
                if ((char *) de + namelen <= dlimit &&
                    ext4_match (namelen, name, de)) {
                        /* found a match - just to be sure, do a full check */
-                       if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
+                       if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
+                                                bh->b_size, offset))
                                return -1;
                        *res_dir = de;
                        return 1;
@@ -1144,6 +1147,21 @@ static inline int search_dirblock(struct buffer_head *bh,
        return 0;
 }
 
+static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
+                              struct ext4_dir_entry *de)
+{
+       struct super_block *sb = dir->i_sb;
+
+       if (!is_dx(dir))
+               return 0;
+       if (block == 0)
+               return 1;
+       if (de->inode == 0 &&
+           ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) ==
+                       sb->s_blocksize)
+               return 1;
+       return 0;
+}
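/*
 * Illustrative sketch (not part of this patch): the on-disk property
 * is_dx_internal_node() keys on, restated as a standalone predicate
 * (struct and names invented). Assuming the directory is already known
 * to be hashed: block 0 is the dx root, and interior index blocks
 * masquerade as one empty dirent (inode 0) whose record length spans
 * the whole block.
 */
#include <stdint.h>

struct flat_dirent {
	uint32_t inode;
	uint32_t rec_len;	/* already decoded from the disk encoding */
};

static int looks_like_dx_internal(uint32_t block,
				  const struct flat_dirent *de,
				  uint32_t blocksize)
{
	if (block == 0)
		return 1;	/* the dx root block */
	return de->inode == 0 && de->rec_len == blocksize;
}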
 
 /*
  *     ext4_find_entry()
@@ -1158,7 +1176,8 @@ static inline int search_dirblock(struct buffer_head *bh,
  */
 static struct buffer_head * ext4_find_entry (struct inode *dir,
                                        const struct qstr *d_name,