forked from leanprover-community/mathlib4
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathHashing.lean
More file actions
167 lines (147 loc) · 6.41 KB
/
Hashing.lean
File metadata and controls
167 lines (147 loc) · 6.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
/-
Copyright (c) 2023 Arthur Paulino. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Authors: Arthur Paulino, Jon Eugster
-/
import Cache.IO
import Lean.Elab.ParseImportsFast
namespace Cache.Hashing
open Lean IO
open System hiding SearchPath
/--
The `HashMemo` contains all information `Cache` needs about the modules:
* the name
* its imports
* the file's hash (in `hashMap` and `cache`)
additionally, it contains the `rootHash` which reflects changes to Mathlib's
Lake project settings.
-/
structure HashMemo where
/-- Hash of mathlib's lake project settings. -/
rootHash : UInt64
/-- Maps the `.lean` file of a module to the `.lean` files of its imports. -/
depsMap : Std.HashMap Name (Array Name) := ∅
/--
For files with a valid hash (usually Mathlib and upstream),
this contains the same information as `hashMap`.
Other files have `none` here and do not appear in `hashMap`
(e.g. `.lean` source could not be found, imports a file without valid hash).
-/
cache : Std.HashMap Name (Option UInt64) := ∅
/-- Stores the hash of the module's content for modules in Mathlib or upstream. -/
hashMap : ModuleHashMap := ∅
deriving Inhabited
partial def insertDeps (hashMap : ModuleHashMap) (mod : Name) (hashMemo : HashMemo) :
ModuleHashMap :=
if hashMap.contains mod then hashMap else
match (hashMemo.depsMap[mod]?, hashMemo.hashMap[mod]?) with
| (some deps, some hash) => deps.foldl (insertDeps · · hashMemo) (hashMap.insert mod hash)
| _ => hashMap
/--
Filters the `hashMap` of a `HashMemo` so that it only contains key/value pairs such that every key:
* Belongs to the given list of module names or
* Corresponds to a module that's imported (transitively or not) by
some module in the list module names
-/
def HashMemo.filterByRootModules (hashMemo : HashMemo) (modules : List Name) :
IO ModuleHashMap := do
let mut hashMap := ∅
for mod in modules do
if hashMemo.hashMap.contains mod then
hashMap := insertDeps hashMap mod hashMemo
else IO.println s!"{mod} is not covered by the olean cache."
return hashMap
/-- We cache the hash of each file and their dependencies for later lookup -/
abbrev HashM := StateT HashMemo CacheM
/--
Read the imports from the raw file `content` and return an array of tuples
`(module name, source file)`, one per import.
Note: `fileName` is a string which is purely used for displaying an error message if
parsing imports from `content` should fail. It is intended to be the file's name.
-/
def getFileImports (content : String) (fileName : String := "") :
CacheM <| Array (Name × FilePath) := do
let sp := (← read).srcSearchPath
let res ← Lean.parseImports' content fileName
res.imports
|>.filter (isPartOfMathlibCache ·.module)
|>.mapM fun imp => do
let impSourceFile ← Lean.findLean sp imp.module
pure (imp.module, impSourceFile)
/-- Computes a canonical hash of a file's contents. -/
def hashFileContents (contents : String) : UInt64 :=
-- revert potential file transformation by git's `autocrlf`
hash contents.crlfToLf
/--
Computes the root hash, which mixes the hashes of the content of:
* `lakefile.lean`
* `lean-toolchain`
* `lake-manifest.json`
and the hash of `Lean.githash`.
-/
def getRootHash : CacheM UInt64 := do
let mathlibDepPath := (← read).mathlibDepPath
let rootFiles : List FilePath := [
mathlibDepPath / "lakefile.lean",
mathlibDepPath / "lean-toolchain",
mathlibDepPath / "lake-manifest.json"]
let hashes ← rootFiles.mapM fun path =>
hashFileContents <$> IO.FS.readFile path
return hash (rootHashGeneration :: hashes)
/--
Computes the hash of a file, which mixes:
* The root hash
* The hash of its relative path (inside its package directory)
* The hash of its content
* The hashes of the imported files that are part of `Mathlib`
Note: we pass `sourceFile` along to avoid searching for it twice
-/
partial def getHash (mod : Name) (sourceFile : FilePath) (visited : Std.HashSet Name := ∅) :
HashM <| Option UInt64 := do
if visited.contains mod then
throw <| IO.userError s!"dependency loop found involving {mod}!"
let visitedₙ := visited.insert mod
match (← get).cache[mod]? with
| some hash? => return hash?
| none =>
if !(← sourceFile.pathExists) then
IO.println s!"Warning: {sourceFile} not found. Skipping all files that depend on it."
modify fun stt => { stt with cache := stt.cache.insert mod none }
return none
let content ← IO.FS.readFile sourceFile
let fileImports ← getFileImports content mod.toString
let mut importHashes := #[]
for importHash? in
← fileImports.mapM (fun (modₙ, sourceFileₙ) => getHash modₙ sourceFileₙ visitedₙ) do
match importHash? with
| some importHash => importHashes := importHashes.push importHash
| none =>
-- one import did not have a hash: invalidate hash of this module
modify fun stt => { stt with cache := stt.cache.insert mod none }
return none
/-
TODO: Currently, the cache uses the hash of the unresolved file name
(e.g. `Mathlib/Init.lean`) which is reconstructed from the module name
(e.g. `Mathlib.Init`) in `path`. It could, however, directly use `hash mod` instead.
We can change this at any time causing a one-time cache invalidation, just as
a toolchain-bump would.
-/
let filePath := mkFilePath (mod.components.map toString) |>.withExtension "lean"
let rootHash := (← get).rootHash
let pathHash := hash filePath.components -- TODO: change to `hash mod`
let fileHash := hash <| rootHash :: pathHash :: hashFileContents content :: importHashes.toList
modifyGet fun stt =>
(some fileHash, { stt with
hashMap := stt.hashMap.insert mod fileHash
cache := stt.cache.insert mod (some fileHash)
depsMap := stt.depsMap.insert mod (fileImports.map (·.1)) })
/-- Files to start hashing from. -/
def roots : CacheM <| Array <| Name × FilePath := do
let mathlibDepPath := (← read).mathlibDepPath
return #[(`Mathlib, (mathlibDepPath / "Mathlib.lean"))]
/-- Main API to retrieve the hashes of the Lean files -/
def getHashMemo (extraRoots : Std.HashMap Name FilePath) : CacheM HashMemo :=
-- TODO: `Std.HashMap.mapM` seems not to exist yet, so we go via `.toArray`.
return (← StateT.run ((extraRoots.insertMany (← roots)).toArray.mapM fun
⟨key, val⟩ => getHash key val) { rootHash := ← getRootHash }).2
end Cache.Hashing