@@ -305,6 +305,142 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
305305 return pte_young (ptep_get (ptep ));
306306}
307307
308+ static bool fault_supports_gstage_huge_mapping (struct kvm_memory_slot * memslot ,
309+ unsigned long hva )
310+ {
311+ hva_t uaddr_start , uaddr_end ;
312+ gpa_t gpa_start ;
313+ size_t size ;
314+
315+ size = memslot -> npages * PAGE_SIZE ;
316+ uaddr_start = memslot -> userspace_addr ;
317+ uaddr_end = uaddr_start + size ;
318+
319+ gpa_start = memslot -> base_gfn << PAGE_SHIFT ;
320+
321+ /*
322+ * Pages belonging to memslots that don't have the same alignment
323+ * within a PMD for userspace and GPA cannot be mapped with g-stage
324+ * PMD entries, because we'll end up mapping the wrong pages.
325+ *
326+ * Consider a layout like the following:
327+ *
328+ * memslot->userspace_addr:
329+ * +-----+--------------------+--------------------+---+
330+ * |abcde|fgh vs-stage block | vs-stage block tv|xyz|
331+ * +-----+--------------------+--------------------+---+
332+ *
333+ * memslot->base_gfn << PAGE_SHIFT:
334+ * +---+--------------------+--------------------+-----+
335+ * |abc|def g-stage block | g-stage block |tvxyz|
336+ * +---+--------------------+--------------------+-----+
337+ *
338+ * If we create those g-stage blocks, we'll end up with this incorrect
339+ * mapping:
340+ * d -> f
341+ * e -> g
342+ * f -> h
343+ */
344+ if ((gpa_start & (PMD_SIZE - 1 )) != (uaddr_start & (PMD_SIZE - 1 )))
345+ return false;
346+
347+ /*
348+ * Next, let's make sure we're not trying to map anything not covered
349+ * by the memslot. This means we have to prohibit block size mappings
350+ * for the beginning and end of a non-block aligned and non-block sized
351+ * memory slot (illustrated by the head and tail parts of the
352+ * userspace view above containing pages 'abcde' and 'xyz',
353+ * respectively).
354+ *
355+ * Note that it doesn't matter if we do the check using the
356+ * userspace_addr or the base_gfn, as both are equally aligned (per
357+ * the check above) and equally sized.
358+ */
359+ return (hva >= ALIGN (uaddr_start , PMD_SIZE )) && (hva < ALIGN_DOWN (uaddr_end , PMD_SIZE ));
360+ }
361+
/*
 * Walk the host (primary MMU) page tables for @hva and return the size of
 * the mapping backing it: PUD_SIZE or PMD_SIZE for a huge leaf at that
 * level, PAGE_SIZE otherwise (including when no mapping is present).
 *
 * This is a lockless walk; the read-once discipline below is what makes it
 * safe against concurrent promotion of the host mapping to a huge page.
 */
static int get_hva_mapping_size(struct kvm *kvm,
				unsigned long hva)
{
	int size = PAGE_SIZE;
	unsigned long flags;
	pgd_t pgd;
	p4d_t p4d;
	pud_t pud;
	pmd_t pmd;

	/*
	 * Disable IRQs to prevent concurrent tear down of host page tables,
	 * e.g. if the primary MMU promotes a P*D to a huge page and then frees
	 * the original page table.
	 */
	local_irq_save(flags);

	/*
	 * Read each entry once. As above, a non-leaf entry can be promoted to
	 * a huge page _during_ this walk. Re-reading the entry could send the
	 * walk into the weeds, e.g. p*d_leaf() returns false (sees the old
	 * value) and then p*d_offset() walks into the target huge page instead
	 * of the old page table (sees the new value).
	 */
	pgd = pgdp_get(pgd_offset(kvm->mm, hva));
	if (pgd_none(pgd))
		goto out;

	p4d = p4dp_get(p4d_offset(&pgd, hva));
	if (p4d_none(p4d) || !p4d_present(p4d))
		goto out;

	pud = pudp_get(pud_offset(&p4d, hva));
	if (pud_none(pud) || !pud_present(pud))
		goto out;

	/* Huge leaf at the PUD level: host backs @hva with a PUD-sized page. */
	if (pud_leaf(pud)) {
		size = PUD_SIZE;
		goto out;
	}

	pmd = pmdp_get(pmd_offset(&pud, hva));
	if (pmd_none(pmd) || !pmd_present(pmd))
		goto out;

	/* Huge leaf at the PMD level, i.e. a THP-backed mapping. */
	if (pmd_leaf(pmd))
		size = PMD_SIZE;

out:
	local_irq_restore(flags);
	return size;
}
414+
415+ static unsigned long transparent_hugepage_adjust (struct kvm * kvm ,
416+ struct kvm_memory_slot * memslot ,
417+ unsigned long hva ,
418+ kvm_pfn_t * hfnp , gpa_t * gpa )
419+ {
420+ kvm_pfn_t hfn = * hfnp ;
421+
422+ /*
423+ * Make sure the adjustment is done only for THP pages. Also make
424+ * sure that the HVA and GPA are sufficiently aligned and that the
425+ * block map is contained within the memslot.
426+ */
427+ if (fault_supports_gstage_huge_mapping (memslot , hva )) {
428+ int sz ;
429+
430+ sz = get_hva_mapping_size (kvm , hva );
431+ if (sz < PMD_SIZE )
432+ return sz ;
433+
434+ * gpa &= PMD_MASK ;
435+ hfn &= ~(PTRS_PER_PMD - 1 );
436+ * hfnp = hfn ;
437+
438+ return PMD_SIZE ;
439+ }
440+
441+ return PAGE_SIZE ;
442+ }
443+
308444int kvm_riscv_mmu_map (struct kvm_vcpu * vcpu , struct kvm_memory_slot * memslot ,
309445 gpa_t gpa , unsigned long hva , bool is_write ,
310446 struct kvm_gstage_mapping * out_map )
@@ -398,6 +534,10 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
398534 if (mmu_invalidate_retry (kvm , mmu_seq ))
399535 goto out_unlock ;
400536
537+ /* Check if we are backed by a THP and thus use block mapping if possible */
538+ if (vma_pagesize == PAGE_SIZE )
539+ vma_pagesize = transparent_hugepage_adjust (kvm , memslot , hva , & hfn , & gpa );
540+
401541 if (writable ) {
402542 mark_page_dirty_in_slot (kvm , memslot , gfn );
403543 ret = kvm_riscv_gstage_map_page (& gstage , pcache , gpa , hfn << PAGE_SHIFT ,
0 commit comments