Linux kernel ============ There are several guides for kernel developers and users. These guides can be rendered in a number of formats, like HTML and PDF. Please read Documentation/admin-guide/README.rst first. In order to build the documentation, use ``make htmldocs`` or ``make pdfdocs``. The formatted documentation can also be read online at: https://www.kernel.org/doc/html/latest/ There are various text files in the Documentation/ subdirectory, several of them using the Restructured Text markup notation. Please read the Documentation/process/changes.rst file, as it contains the requirements for building and running the kernel, and information about the problems which may result by upgrading your kernel.
David Hildenbrand
authored
Currently, we'd lose the userfaultfd-wp marker when PTE-mapping a huge zeropage, resulting in the next write faults in the PMD range not triggering uffd-wp events. Various actions (partial MADV_DONTNEED, partial mremap, partial munmap, partial mprotect) could trigger this. However, most importantly, un-protecting a single sub-page from the userfaultfd-wp handler when processing a uffd-wp event will PTE-map the shared huge zeropage and lose the uffd-wp bit for the remainder of the PMD. Let's properly propagate the uffd-wp bit to the PMDs. #define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <stdbool.h> #include <inttypes.h> #include <fcntl.h> #include <unistd.h> #include <errno.h> #include <poll.h> #include <pthread.h> #include <sys/mman.h> #include <sys/syscall.h> #include <sys/ioctl.h> #include <linux/userfaultfd.h> static size_t pagesize; static int uffd; static volatile bool uffd_triggered; #define barrier() __asm__ __volatile__("": : :"memory") static void uffd_wp_range(char *start, size_t size, bool wp) { struct uffdio_writeprotect uffd_writeprotect; uffd_writeprotect.range.start = (unsigned long) start; uffd_writeprotect.range.len = size; if (wp) { uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP; } else { uffd_writeprotect.mode = 0; } if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", errno); exit(1); } } static void *uffd_thread_fn(void *arg) { static struct uffd_msg msg; ssize_t nread; while (1) { struct pollfd pollfd; int nready; pollfd.fd = uffd; pollfd.events = POLLIN; nready = poll(&pollfd, 1, -1); if (nready == -1) { fprintf(stderr, "poll() failed: %d\n", errno); exit(1); } nread = read(uffd, &msg, sizeof(msg)); if (nread <= 0) continue; if (msg.event != UFFD_EVENT_PAGEFAULT || !(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP)) { printf("FAIL: wrong uffd-wp event fired\n"); exit(1); } /* un-protect the single page. */ uffd_triggered = true; uffd_wp_range((char *)(uintptr_t)msg.arg.pagefault.address, pagesize, false); } return arg; } static int setup_uffd(char *map, size_t size) { struct uffdio_api uffdio_api; struct uffdio_register uffdio_register; pthread_t thread; uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); if (uffd < 0) { fprintf(stderr, "syscall() failed: %d\n", errno); return -errno; } uffdio_api.api = UFFD_API; uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP; if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) { fprintf(stderr, "UFFDIO_API failed: %d\n", errno); return -errno; } if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) { fprintf(stderr, "UFFD_FEATURE_WRITEPROTECT missing\n"); return -ENOSYS; } uffdio_register.range.start = (unsigned long) map; uffdio_register.range.len = size; uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) { fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", errno); return -errno; } pthread_create(&thread, NULL, uffd_thread_fn, NULL); return 0; } int main(void) { const size_t size = 4 * 1024 * 1024ull; char *map, *cur; pagesize = getpagesize(); map = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0); if (map == MAP_FAILED) { fprintf(stderr, "mmap() failed\n"); return -errno; } if (madvise(map, size, MADV_HUGEPAGE)) { fprintf(stderr, "MADV_HUGEPAGE failed\n"); return -errno; } if (setup_uffd(map, size)) return 1; /* Read the whole range, populating zeropages. */ madvise(map, size, MADV_POPULATE_READ); /* Write-protect the whole range. */ uffd_wp_range(map, size, true); /* Make sure uffd-wp triggers on each page. */ for (cur = map; cur < map + size; cur += pagesize) { uffd_triggered = false; barrier(); /* Trigger a write fault. */ *cur = 1; barrier(); if (!uffd_triggered) { printf("FAIL: uffd-wp did not trigger\n"); return 1; } } printf("PASS: uffd-wp triggered\n"); return 0; } Link: https://lkml.kernel.org/r/20230302175423.589164-1-david@redhat.com Fixes: e06f1e1d ("userfaultfd: wp: enabled write protection in userfaultfd API") Signed-off-by:David Hildenbrand <david@redhat.com> Acked-by:
Peter Xu <peterx@redhat.com> Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Jerome Glisse <jglisse@redhat.com> Cc: Shaohua Li <shli@fb.com> Cc: <stable@vger.kernel.org> Signed-off-by:
Andrew Morton <akpm@linux-foundation.org>