static LIST_HEAD(formats);
static DEFINE_RWLOCK(binfmt_lock);
-int __register_binfmt(struct linux_binfmt * fmt, int insert)
+void __register_binfmt(struct linux_binfmt * fmt, int insert)
{
- if (!fmt)
- return -EINVAL;
+ BUG_ON(!fmt);
write_lock(&binfmt_lock);
insert ? list_add(&fmt->lh, &formats) :
list_add_tail(&fmt->lh, &formats);
write_unlock(&binfmt_lock);
- return 0;
}
EXPORT_SYMBOL(__register_binfmt);
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
old_mm = current->mm;
- sync_mm_rss(tsk, old_mm);
+ sync_mm_rss(old_mm);
mm_release(tsk, old_mm);
if (old_mm) {
bprm->mm = NULL; /* We're using it now */
set_fs(USER_DS);
- current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
+ current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD);
flush_thread();
current->personality &= ~bprm->per_clear;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;
+ struct hugetlbfs_config {
+ uid_t uid;
+ gid_t gid;
+ umode_t mode;
+ long nr_blocks;
+ long nr_inodes;
+ struct hstate *hstate;
+ };
+
+ struct hugetlbfs_inode_info {
+ struct shared_policy policy;
+ struct inode vfs_inode;
+ };
+
+ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
+ {
+ return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
+ }
+
static struct backing_dev_info hugetlbfs_backing_dev_info = {
.name = "hugetlbfs",
.ra_pages = 0, /* No readahead */
return addr;
}
- start_addr = mm->free_area_cache;
-
- if (len <= mm->cached_hole_size)
+ if (len > mm->cached_hole_size)
+ start_addr = mm->free_area_cache;
+ else {
start_addr = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = 0;
+ }
full_search:
addr = ALIGN(start_addr, huge_page_size(h));
*/
if (start_addr != TASK_UNMAPPED_BASE) {
start_addr = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = 0;
goto full_search;
}
return -ENOMEM;
}
- if (!vma || addr + len <= vma->vm_start)
+ if (!vma || addr + len <= vma->vm_start) {
+ mm->free_area_cache = addr + len;
return addr;
+ }
+ if (addr + mm->cached_hole_size < vma->vm_start)
+ mm->cached_hole_size = vma->vm_start - addr;
addr = ALIGN(vma->vm_end, huge_page_size(h));
}
}
loff_t isize;
ssize_t retval = 0;
- mutex_lock(&inode->i_mutex);
-
/* validate length */
if (len == 0)
goto out;
- isize = i_size_read(inode);
- if (!isize)
- goto out;
-
- end_index = (isize - 1) >> huge_page_shift(h);
for (;;) {
struct page *page;
unsigned long nr, ret;
/* nr is the maximum number of bytes to copy from this page */
nr = huge_page_size(h);
+ isize = i_size_read(inode);
+ if (!isize)
+ goto out;
+ end_index = (isize - 1) >> huge_page_shift(h);
if (index >= end_index) {
if (index > end_index)
goto out;
nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
- if (nr <= offset) {
+ if (nr <= offset)
goto out;
- }
}
nr = nr - offset;
/* Find the page */
- page = find_get_page(mapping, index);
+ page = find_lock_page(mapping, index);
if (unlikely(page == NULL)) {
/*
* We have a HOLE, zero out the user-buffer for the
else
ra = 0;
} else {
+ unlock_page(page);
+
/*
* We have the page, copy it to user space buffer.
*/
ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
ret = ra;
+ page_cache_release(page);
}
if (ra < 0) {
if (retval == 0)
retval = ra;
- if (page)
- page_cache_release(page);
goto out;
}
index += offset >> huge_page_shift(h);
offset &= ~huge_page_mask(h);
- if (page)
- page_cache_release(page);
-
/* short read or no more work */
if ((ret != nr) || (len == 0))
break;
}
out:
*ppos = ((loff_t)index << huge_page_shift(h)) + offset;
- mutex_unlock(&inode->i_mutex);
return retval;
}
spin_lock(&sbinfo->stat_lock);
/* If no limits set, just report 0 for max/free/used
* blocks, like simple_statfs() */
- if (sbinfo->max_blocks >= 0) {
- buf->f_blocks = sbinfo->max_blocks;
- buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+ if (sbinfo->spool) {
+ long free_pages;
+
+ spin_lock(&sbinfo->spool->lock);
+ buf->f_blocks = sbinfo->spool->max_hpages;
+ free_pages = sbinfo->spool->max_hpages
+ - sbinfo->spool->used_hpages;
+ buf->f_bavail = buf->f_bfree = free_pages;
+ spin_unlock(&sbinfo->spool->lock);
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
}
if (sbi) {
sb->s_fs_info = NULL;
+
+ if (sbi->spool)
+ hugepage_put_subpool(sbi->spool);
+
kfree(sbi);
}
}
static int
hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
{
- struct inode * inode;
- struct dentry * root;
int ret;
struct hugetlbfs_config config;
struct hugetlbfs_sb_info *sbinfo;
sb->s_fs_info = sbinfo;
sbinfo->hstate = config.hstate;
spin_lock_init(&sbinfo->stat_lock);
- sbinfo->max_blocks = config.nr_blocks;
- sbinfo->free_blocks = config.nr_blocks;
sbinfo->max_inodes = config.nr_inodes;
sbinfo->free_inodes = config.nr_inodes;
+ sbinfo->spool = NULL;
+ if (config.nr_blocks != -1) {
+ sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
+ if (!sbinfo->spool)
+ goto out_free;
+ }
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = huge_page_size(config.hstate);
sb->s_blocksize_bits = huge_page_shift(config.hstate);
sb->s_magic = HUGETLBFS_MAGIC;
sb->s_op = &hugetlbfs_ops;
sb->s_time_gran = 1;
- inode = hugetlbfs_get_root(sb, &config);
- if (!inode)
+ sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
+ if (!sb->s_root)
goto out_free;
-
- root = d_alloc_root(inode);
- if (!root) {
- iput(inode);
- goto out_free;
- }
- sb->s_root = root;
return 0;
out_free:
+ if (sbinfo->spool)
+ kfree(sbinfo->spool);
kfree(sbinfo);
return -ENOMEM;
}
- int hugetlb_get_quota(struct address_space *mapping, long delta)
- {
- int ret = 0;
- struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
- if (sbinfo->free_blocks > -1) {
- spin_lock(&sbinfo->stat_lock);
- if (sbinfo->free_blocks - delta >= 0)
- sbinfo->free_blocks -= delta;
- else
- ret = -ENOMEM;
- spin_unlock(&sbinfo->stat_lock);
- }
-
- return ret;
- }
-
- void hugetlb_put_quota(struct address_space *mapping, long delta)
- {
- struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
- if (sbinfo->free_blocks > -1) {
- spin_lock(&sbinfo->stat_lock);
- sbinfo->free_blocks += delta;
- spin_unlock(&sbinfo->stat_lock);
- }
- }
-
static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
}
- struct file *hugetlb_file_setup(const char *name, size_t size,
- vm_flags_t acctflag,
+ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
+ size_t size, vm_flags_t acctflag,
struct user_struct **user, int creat_flags)
{
int error = -ENOMEM;
struct path path;
struct dentry *root;
struct qstr quick_string;
+ struct hstate *hstate;
+ unsigned long num_pages;
*user = NULL;
if (!hugetlbfs_vfsmount)
if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
*user = current_user();
if (user_shm_lock(size, *user)) {
- printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n");
+ task_lock(current);
+ printk_once(KERN_WARNING
+ "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
+ current->comm, current->pid);
+ task_unlock(current);
} else {
*user = NULL;
return ERR_PTR(-EPERM);
if (!inode)
goto out_dentry;
+ hstate = hstate_inode(inode);
+ size += addr & ~huge_page_mask(hstate);
+ num_pages = ALIGN(size, huge_page_size(hstate)) >>
+ huge_page_shift(hstate);
error = -ENOMEM;
- if (hugetlb_reserve_pages(inode, 0,
- size >> huge_page_shift(hstate_inode(inode)), NULL,
- acctflag))
+ if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag))
goto out_inode;
d_instantiate(path.dentry, inode);
if (error)
return error;
+ error = -ENOMEM;
hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
sizeof(struct hugetlbfs_inode_info),
0, 0, init_once);
}
error = PTR_ERR(vfsmount);
+ unregister_filesystem(&hugetlbfs_fs_type);
out:
- if (error)
- kmem_cache_destroy(hugetlbfs_inode_cachep);
+ kmem_cache_destroy(hugetlbfs_inode_cachep);
out2:
bdi_destroy(&hugetlbfs_backing_dev_info);
return error;
cond_resched();
current->total_link_count++;
- touch_atime(link->mnt, dentry);
+ touch_atime(link);
nd_set_link(nd, NULL);
error = security_inode_follow_link(link->dentry, nd);
}
EXPORT_SYMBOL(full_name_hash);
+ #ifdef CONFIG_64BIT
#define ONEBYTES 0x0101010101010101ul
#define SLASHBYTES 0x2f2f2f2f2f2f2f2ful
#define HIGHBITS 0x8080808080808080ul
+ #else
+ #define ONEBYTES 0x01010101ul
+ #define SLASHBYTES 0x2f2f2f2ful
+ #define HIGHBITS 0x80808080ul
+ #endif
/* Return the high bit set in the first byte that is a zero */
static inline unsigned long has_zero(unsigned long a)
int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int error = may_create(dir, dentry);
+ unsigned max_links = dir->i_sb->s_max_links;
if (error)
return error;
if (error)
return error;
+ if (max_links && dir->i_nlink >= max_links)
+ return -EMLINK;
+
error = dir->i_op->mkdir(dir, dentry, mode);
if (!error)
fsnotify_mkdir(dir, dentry);
int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
{
struct inode *inode = old_dentry->d_inode;
+ unsigned max_links = dir->i_sb->s_max_links;
int error;
if (!inode)
/* Make sure we don't allow creating hardlink to an unlinked file */
if (inode->i_nlink == 0)
error = -ENOENT;
+ else if (max_links && inode->i_nlink >= max_links)
+ error = -EMLINK;
else
error = dir->i_op->link(old_dentry, dir, new_dentry);
mutex_unlock(&inode->i_mutex);
{
int error = 0;
struct inode *target = new_dentry->d_inode;
+ unsigned max_links = new_dir->i_sb->s_max_links;
/*
* If we are going to change the parent - check write permissions,
if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
goto out;
+ error = -EMLINK;
+ if (max_links && !target && new_dir != old_dir &&
+ new_dir->i_nlink >= max_links)
+ goto out;
+
if (target)
shrink_dcache_parent(new_dentry);
error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
-unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *);
-unsigned long unmap_vmas(struct mmu_gather *tlb,
+void unmap_vmas(struct mmu_gather *tlb,
struct vm_area_struct *start_vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *);
!vma_growsup(vma->vm_next, addr);
}
+ extern pid_t
+ vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group);
+
extern unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len);
/*
* per-process(per-mm_struct) statistics.
*/
- static inline void set_mm_counter(struct mm_struct *mm, int member, long value)
- {
- atomic_long_set(&mm->rss_stat.count[member], value);
- }
-
- #if defined(SPLIT_RSS_COUNTING)
- unsigned long get_mm_counter(struct mm_struct *mm, int member);
- #else
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
- return atomic_long_read(&mm->rss_stat.count[member]);
- }
+ long val = atomic_long_read(&mm->rss_stat.count[member]);
+
+ #ifdef SPLIT_RSS_COUNTING
+ /*
+ * counter is updated in asynchronous manner and may go to minus.
+ * But it's never be expected number for users.
+ */
+ if (val < 0)
+ val = 0;
#endif
+ return (unsigned long)val;
+ }
static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
}
#if defined(SPLIT_RSS_COUNTING)
- void sync_mm_rss(struct task_struct *task, struct mm_struct *mm);
+ void sync_mm_rss(struct mm_struct *mm);
#else
- static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+ static inline void sync_mm_rss(struct mm_struct *mm)
{
}
#endif
extern unsigned long find_min_pfn_with_active_regions(void);
extern void free_bootmem_with_active_regions(int nid,
unsigned long max_low_pfn);
- int add_from_early_node_map(struct range *range, int az,
- int nr_range, int nid);
extern void sparse_memory_present_with_active_regions(int nid);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
struct inode *inode =
cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
- struct dentry *dentry;
if (!inode)
return -ENOMEM;
inode->i_op = &cgroup_dir_inode_operations;
/* directories start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
- dentry = d_alloc_root(inode);
- if (!dentry) {
- iput(inode);
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root)
return -ENOMEM;
- }
- sb->s_root = dentry;
/* for everything else we want ->d_op set */
sb->s_d_op = &cgroup_dops;
return 0;
rcu_assign_pointer(id->css, NULL);
rcu_assign_pointer(css->id, NULL);
- write_lock(&ss->id_lock);
+ spin_lock(&ss->id_lock);
idr_remove(&ss->idr, id->id);
- write_unlock(&ss->id_lock);
+ spin_unlock(&ss->id_lock);
kfree_rcu(id, rcu_head);
}
EXPORT_SYMBOL_GPL(free_css_id);
error = -ENOMEM;
goto err_out;
}
- write_lock(&ss->id_lock);
+ spin_lock(&ss->id_lock);
/* Don't use 0. allocates an ID of 1-65535 */
error = idr_get_new_above(&ss->idr, newid, 1, &myid);
- write_unlock(&ss->id_lock);
+ spin_unlock(&ss->id_lock);
/* Returns error when there are no free spaces for new ID.*/
if (error) {
return newid;
remove_idr:
error = -ENOSPC;
- write_lock(&ss->id_lock);
+ spin_lock(&ss->id_lock);
idr_remove(&ss->idr, myid);
- write_unlock(&ss->id_lock);
+ spin_unlock(&ss->id_lock);
err_out:
kfree(newid);
return ERR_PTR(error);
{
struct css_id *newid;
- rwlock_init(&ss->id_lock);
+ spin_lock_init(&ss->id_lock);
idr_init(&ss->idr);
newid = get_new_cssid(ss, 0);
return NULL;
BUG_ON(!ss->use_id);
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
/* fill start point for scan */
tmpid = id;
while (1) {
* scan next entry from bitmap(tree), tmpid is updated after
* idr_get_next().
*/
- read_lock(&ss->id_lock);
tmp = idr_get_next(&ss->idr, &tmpid);
- read_unlock(&ss->id_lock);
-
if (!tmp)
break;
if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
#include <linux/writeback.h>
+#include <linux/shm.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
acct_update_integrals(tsk);
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
- sync_mm_rss(tsk, tsk->mm);
+ sync_mm_rss(tsk->mm);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
hrtimer_cancel(&tsk->signal->real_timer);
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
+ security_task_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
charge = 0;
if (mpnt->vm_flags & VM_ACCOUNT) {
unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
- if (security_vm_enough_memory(len))
+ if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
goto fail_nomem;
charge = len;
}
return NULL;
}
+ static void check_mm(struct mm_struct *mm)
+ {
+ int i;
+
+ for (i = 0; i < NR_MM_COUNTERS; i++) {
+ long x = atomic_long_read(&mm->rss_stat.count[i]);
+
+ if (unlikely(x))
+ printk(KERN_ALERT "BUG: Bad rss-counter state "
+ "mm:%p idx:%d val:%ld\n", mm, i, x);
+ }
+
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ VM_BUG_ON(mm->pmd_huge_pte);
+ #endif
+ }
+
/*
* Allocate and initialize an mm_struct.
*/
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
- #ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON(mm->pmd_huge_pte);
- #endif
+ check_mm(mm);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
+ seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
#if defined(SPLIT_RSS_COUNTING)
- static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
+ void sync_mm_rss(struct mm_struct *mm)
{
int i;
for (i = 0; i < NR_MM_COUNTERS; i++) {
- if (task->rss_stat.count[i]) {
- add_mm_counter(mm, i, task->rss_stat.count[i]);
- task->rss_stat.count[i] = 0;
+ if (current->rss_stat.count[i]) {
+ add_mm_counter(mm, i, current->rss_stat.count[i]);
+ current->rss_stat.count[i] = 0;
}
}
- task->rss_stat.events = 0;
+ current->rss_stat.events = 0;
}
static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
if (unlikely(task != current))
return;
if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
- __sync_task_rss_stat(task, task->mm);
- }
-
- unsigned long get_mm_counter(struct mm_struct *mm, int member)
- {
- long val = 0;
-
- /*
- * Don't use task->mm here...for avoiding to use task_get_mm()..
- * The caller must guarantee task->mm is not invalid.
- */
- val = atomic_long_read(&mm->rss_stat.count[member]);
- /*
- * counter is updated in asynchronous manner and may go to minus.
- * But it's never be expected number for users.
- */
- if (val < 0)
- return 0;
- return (unsigned long)val;
- }
-
- void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
- {
- __sync_task_rss_stat(task, mm);
+ sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */
int i;
if (current->mm == mm)
- sync_mm_rss(current, mm);
+ sync_mm_rss(mm);
for (i = 0; i < NR_MM_COUNTERS; i++)
if (rss[i])
add_mm_counter(mm, i, rss[i]);
do {
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
- if (next-addr != HPAGE_PMD_SIZE) {
+ if (next - addr != HPAGE_PMD_SIZE) {
VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
split_huge_page_pmd(vma->vm_mm, pmd);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
- continue;
+ goto next;
/* fall through */
}
- if (pmd_none_or_clear_bad(pmd))
- continue;
+ /*
+ * Here there can be other concurrent MADV_DONTNEED or
+ * trans huge page faults running, and if the pmd is
+ * none or trans huge it can change under us. This is
+ * because MADV_DONTNEED holds the mmap_sem in read
+ * mode.
+ */
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+ goto next;
next = zap_pte_range(tlb, vma, pmd, addr, next, details);
+ next:
cond_resched();
} while (pmd++, addr = next, addr != end);
return addr;
}
-static unsigned long unmap_page_range(struct mmu_gather *tlb,
- struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- struct zap_details *details)
+static void unmap_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details)
{
pgd_t *pgd;
unsigned long next;
} while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
mem_cgroup_uncharge_end();
+}
- return addr;
+
+static void unmap_single_vma(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long start_addr,
+ unsigned long end_addr, unsigned long *nr_accounted,
+ struct zap_details *details)
+{
+ unsigned long start = max(vma->vm_start, start_addr);
+ unsigned long end;
+
+ if (start >= vma->vm_end)
+ return;
+ end = min(vma->vm_end, end_addr);
+ if (end <= vma->vm_start)
+ return;
+
+ if (vma->vm_flags & VM_ACCOUNT)
+ *nr_accounted += (end - start) >> PAGE_SHIFT;
+
+ if (unlikely(is_pfn_mapping(vma)))
+ untrack_pfn_vma(vma, 0, 0);
+
+ if (start != end) {
+ if (unlikely(is_vm_hugetlb_page(vma))) {
+ /*
+ * It is undesirable to test vma->vm_file as it
+ * should be non-null for valid hugetlb area.
+ * However, vm_file will be NULL in the error
+ * cleanup path of do_mmap_pgoff. When
+ * hugetlbfs ->mmap method fails,
+ * do_mmap_pgoff() nullifies vma->vm_file
+ * before calling this function to clean up.
+ * Since no pte has actually been setup, it is
+ * safe to do nothing in this case.
+ */
+ if (vma->vm_file)
+ unmap_hugepage_range(vma, start, end, NULL);
+ } else
+ unmap_page_range(tlb, vma, start, end, details);
+ }
}
/**
* @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
* @details: details of nonlinear truncation or shared cache invalidation
*
- * Returns the end address of the unmapping (restart addr if interrupted).
- *
* Unmap all pages in the vma list.
*
* Only addresses between `start' and `end' will be unmapped.
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-unsigned long unmap_vmas(struct mmu_gather *tlb,
+void unmap_vmas(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
{
- unsigned long start = start_addr;
struct mm_struct *mm = vma->vm_mm;
mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
- for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
- unsigned long end;
-
- start = max(vma->vm_start, start_addr);
- if (start >= vma->vm_end)
- continue;
- end = min(vma->vm_end, end_addr);
- if (end <= vma->vm_start)
- continue;
-
- if (vma->vm_flags & VM_ACCOUNT)
- *nr_accounted += (end - start) >> PAGE_SHIFT;
-
- if (unlikely(is_pfn_mapping(vma)))
- untrack_pfn_vma(vma, 0, 0);
-
- while (start != end) {
- if (unlikely(is_vm_hugetlb_page(vma))) {
- /*
- * It is undesirable to test vma->vm_file as it
- * should be non-null for valid hugetlb area.
- * However, vm_file will be NULL in the error
- * cleanup path of do_mmap_pgoff. When
- * hugetlbfs ->mmap method fails,
- * do_mmap_pgoff() nullifies vma->vm_file
- * before calling this function to clean up.
- * Since no pte has actually been setup, it is
- * safe to do nothing in this case.
- */
- if (vma->vm_file)
- unmap_hugepage_range(vma, start, end, NULL);
-
- start = end;
- } else
- start = unmap_page_range(tlb, vma, start, end, details);
- }
- }
-
+ for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
+ unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted,
+ details);
mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
- return start; /* which is now the end (or restart) address */
}
/**
* @address: starting address of pages to zap
* @size: number of bytes to zap
* @details: details of nonlinear truncation or shared cache invalidation
+ *
+ * Caller must protect the VMA list
+ */
+void zap_page_range(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size, struct zap_details *details)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+ unsigned long end = address + size;
+ unsigned long nr_accounted = 0;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, 0);
+ update_hiwater_rss(mm);
+ unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+ tlb_finish_mmu(&tlb, address, end);
+}
+
+/**
+ * zap_page_range_single - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ * @details: details of nonlinear truncation or shared cache invalidation
+ *
+ * The range must fit into one VMA.
*/
-unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
+static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
struct mm_struct *mm = vma->vm_mm;
lru_add_drain();
tlb_gather_mmu(&tlb, mm, 0);
update_hiwater_rss(mm);
- end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+ mmu_notifier_invalidate_range_start(mm, address, end);
+ unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details);
+ mmu_notifier_invalidate_range_end(mm, address, end);
tlb_finish_mmu(&tlb, address, end);
- return end;
}
/**
if (address < vma->vm_start || address + size > vma->vm_end ||
!(vma->vm_flags & VM_PFNMAP))
return -1;
- zap_page_range(vma, address, size, NULL);
+ zap_page_range_single(vma, address, size, NULL);
return 0;
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
unsigned long start_addr, unsigned long end_addr,
struct zap_details *details)
{
- zap_page_range(vma, start_addr, end_addr - start_addr, details);
+ zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}
static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
}
/*
- * Helper for vma_adjust in the split_vma insert case:
- * insert vm structure into list and rbtree and anon_vma,
- * but it has already been inserted into prio_tree earlier.
+ * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
+ * mm's list and rbtree. It has already been inserted into the prio_tree.
*/
static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
}
#endif /* CONFIG_PROC_FS */
+/*
+ * If a hint addr is less than mmap_min_addr change hint to be as
+ * low as possible but still greater than mmap_min_addr
+ */
+static inline unsigned long round_hint_to_min(unsigned long hint)
+{
+ hint &= PAGE_MASK;
+ if (((void *)hint != NULL) &&
+ (hint < mmap_min_addr))
+ return PAGE_ALIGN(mmap_min_addr);
+ return hint;
+}
+
/*
* The caller must hold down_write(¤t->mm->mmap_sem).
*/
* A dummy user value is used because we are not locking
* memory so no accounting is necessary
*/
- len = ALIGN(len, huge_page_size(&default_hstate));
- file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
- &user, HUGETLB_ANONHUGE_INODE);
+ file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
+ VM_NORESERVE, &user,
+ HUGETLB_ANONHUGE_INODE);
if (IS_ERR(file))
return PTR_ERR(file);
}
*/
if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT;
- if (security_vm_enough_memory(charged))
+ if (security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}
/*
* Is this a new hole at the lowest possible address?
*/
- if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) {
+ if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
mm->free_area_cache = addr;
- mm->cached_hole_size = ~0UL;
- }
}
/*
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
- unsigned long addr = addr0;
+ unsigned long addr = addr0, start_addr;
/* requested length too big for entire address space */
if (len > TASK_SIZE)
mm->free_area_cache = mm->mmap_base;
}
+ try_again:
/* either no address requested or can't fit in requested address hole */
- addr = mm->free_area_cache;
-
- /* make sure it can fit in the remaining address space */
- if (addr > len) {
- vma = find_vma(mm, addr-len);
- if (!vma || addr <= vma->vm_start)
- /* remember the address as a hint for next time */
- return (mm->free_area_cache = addr-len);
- }
-
- if (mm->mmap_base < len)
- goto bottomup;
+ start_addr = addr = mm->free_area_cache;
- addr = mm->mmap_base-len;
+ if (addr < len)
+ goto fail;
+ addr -= len;
do {
/*
* Lookup failure means no vma is above this address,
addr = vma->vm_start-len;
} while (len < vma->vm_start);
- bottomup:
+ fail:
+ /*
+ * if hint left us with no space for the requested
+ * mapping then try again:
+ *
+ * Note: this is different with the case of bottomup
+ * which does the fully line-search, but we use find_vma
+ * here that causes some holes skipped.
+ */
+ if (start_addr != mm->mmap_base) {
+ mm->free_area_cache = mm->mmap_base;
+ mm->cached_hole_size = 0;
+ goto try_again;
+ }
+
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
- if (security_vm_enough_memory(len >> PAGE_SHIFT))
+ if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
/* Can we just expand an old private anonymous mapping? */
struct mmu_gather tlb;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
- unsigned long end;
/* mm's last user has gone, and its about to be pulled down */
mmu_notifier_release(mm);
tlb_gather_mmu(&tlb, mm, 1);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
- end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
+ unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
- tlb_finish_mmu(&tlb, 0, end);
+ tlb_finish_mmu(&tlb, 0, -1);
/*
* Walk the list again, actually closing and freeing it,
ptent = pte_mkwrite(ptent);
ptep_modify_prot_commit(mm, addr, pte, ptent);
- } else if (PAGE_MIGRATION && !pte_file(oldpte)) {
+ } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
if (is_write_migration_entry(entry)) {
if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
VM_SHARED|VM_NORESERVE))) {
charged = nrpages;
- if (security_vm_enough_memory(charged))
+ if (security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
newflags |= VM_ACCOUNT;
}
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
return (flags & VM_NORESERVE) ?
- 0 : security_vm_enough_memory_kern(VM_ACCT(size));
+ 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}
static inline void shmem_unacct_size(unsigned long flags, loff_t size)
static inline int shmem_acct_block(unsigned long flags)
{
return (flags & VM_NORESERVE) ?
- security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0;
+ security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
}
static inline void shmem_unacct_blocks(unsigned long flags, long pages)
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;
+ #ifdef CONFIG_TMPFS_XATTR
+ static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
+ #else
+ #define shmem_initxattrs NULL
+ #endif
+
static int
shmem_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
if (inode) {
error = security_inode_init_security(inode, dir,
&dentry->d_name,
- NULL, NULL);
+ shmem_initxattrs, NULL);
if (error) {
if (error != -EOPNOTSUPP) {
iput(inode);
return -ENOSPC;
error = security_inode_init_security(inode, dir, &dentry->d_name,
- NULL, NULL);
+ shmem_initxattrs, NULL);
if (error) {
if (error != -EOPNOTSUPP) {
iput(inode);
* filesystem level, though.
*/
+ /*
+ * Allocate new xattr and copy in the value; but leave the name to callers.
+ */
+ static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size)
+ {
+ struct shmem_xattr *new_xattr;
+ size_t len;
+
+ /* wrap around? */
+ len = sizeof(*new_xattr) + size;
+ if (len <= sizeof(*new_xattr))
+ return NULL;
+
+ new_xattr = kmalloc(len, GFP_KERNEL);
+ if (!new_xattr)
+ return NULL;
+
+ new_xattr->size = size;
+ memcpy(new_xattr->value, value, size);
+ return new_xattr;
+ }
+
+ /*
+ * Callback for security_inode_init_security() for acquiring xattrs.
+ */
+ static int shmem_initxattrs(struct inode *inode,
+ const struct xattr *xattr_array,
+ void *fs_info)
+ {
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ const struct xattr *xattr;
+ struct shmem_xattr *new_xattr;
+ size_t len;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len);
+ if (!new_xattr)
+ return -ENOMEM;
+
+ len = strlen(xattr->name) + 1;
+ new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
+ GFP_KERNEL);
+ if (!new_xattr->name) {
+ kfree(new_xattr);
+ return -ENOMEM;
+ }
+
+ memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN);
+ memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
+ xattr->name, len);
+
+ spin_lock(&info->lock);
+ list_add(&new_xattr->list, &info->xattr_list);
+ spin_unlock(&info->lock);
+ }
+
+ return 0;
+ }
+
static int shmem_xattr_get(struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
return ret;
}
- static int shmem_xattr_set(struct dentry *dentry, const char *name,
+ static int shmem_xattr_set(struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
- struct inode *inode = dentry->d_inode;
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_xattr *xattr;
struct shmem_xattr *new_xattr = NULL;
- size_t len;
int err = 0;
/* value == NULL means remove */
if (value) {
- /* wrap around? */
- len = sizeof(*new_xattr) + size;
- if (len <= sizeof(*new_xattr))
- return -ENOMEM;
-
- new_xattr = kmalloc(len, GFP_KERNEL);
+ new_xattr = shmem_xattr_alloc(value, size);
if (!new_xattr)
return -ENOMEM;
kfree(new_xattr);
return -ENOMEM;
}
-
- new_xattr->size = size;
- memcpy(new_xattr->value, value, size);
}
spin_lock(&info->lock);
if (size == 0)
value = ""; /* empty EA, do not remove */
- return shmem_xattr_set(dentry, name, value, size, flags);
+ return shmem_xattr_set(dentry->d_inode, name, value, size, flags);
}
if (err)
return err;
- return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE);
+ return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
}
static bool xattr_is_trusted(const char *name)
int shmem_fill_super(struct super_block *sb, void *data, int silent)
{
struct inode *inode;
- struct dentry *root;
struct shmem_sb_info *sbinfo;
int err = -ENOMEM;
goto failed;
inode->i_uid = sbinfo->uid;
inode->i_gid = sbinfo->gid;
- root = d_alloc_root(inode);
- if (!root)
- goto failed_iput;
- sb->s_root = root;
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root)
+ goto failed;
return 0;
-failed_iput:
- iput(inode);
failed:
shmem_put_super(sb);
return err;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (unlikely(pmd_trans_huge(*pmd)))
- continue;
- if (pmd_none_or_clear_bad(pmd))
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
if (ret)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ BUG_ON(!current->mm);
+
pathname = getname(specialfile);
err = PTR_ERR(pathname);
if (IS_ERR(pathname))
spin_unlock(&swap_lock);
goto out_dput;
}
- if (!security_vm_enough_memory(p->pages))
+ if (!security_vm_enough_memory_mm(current->mm, p->pages))
vm_unacct_memory(p->pages);
else {
err = -ENOMEM;
p->flags |= SWP_SOLIDSTATE;
p->cluster_next = 1 + (random32() % p->highest_bit);
}
- if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
+ if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
p->flags |= SWP_DISCARDABLE;
}
return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
- /*
- * swap_lock prevents swap_map being freed. Don't grab an extra
- * reference on the swaphandle, it doesn't matter if it becomes unused.
- */
- int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
- {
- struct swap_info_struct *si;
- int our_page_cluster = page_cluster;
- pgoff_t target, toff;
- pgoff_t base, end;
- int nr_pages = 0;
-
- if (!our_page_cluster) /* no readahead */
- return 0;
-
- si = swap_info[swp_type(entry)];
- target = swp_offset(entry);
- base = (target >> our_page_cluster) << our_page_cluster;
- end = base + (1 << our_page_cluster);
- if (!base) /* first page is swap header */
- base++;
-
- spin_lock(&swap_lock);
- if (end > si->max) /* don't go beyond end of map */
- end = si->max;
-
- /* Count contiguous allocated slots above our target */
- for (toff = target; ++toff < end; nr_pages++) {
- /* Don't read in free or bad pages */
- if (!si->swap_map[toff])
- break;
- if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
- break;
- }
- /* Count contiguous allocated slots below our target */
- for (toff = target; --toff >= base; nr_pages++) {
- /* Don't read in free or bad pages */
- if (!si->swap_map[toff])
- break;
- if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
- break;
- }
- spin_unlock(&swap_lock);
-
- /*
- * Indicate starting offset, and return number of pages to get:
- * if only 1, say 0, since there's then no readahead to be done.
- */
- *offset = ++toff;
- return nr_pages? ++nr_pages: 0;
- }
-
/*
* add_swap_count_continuation - called when a swap count is duplicated
* beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's