Skip to content

Commit 6ed1d66

Browse files
committed
feat: use leantar in lake exe cache (leanprover-community#5710)
This hooks up the new `leantar` tool from https://github.com/digama0/leangz to `lake exe cache`. It uses an olean compressor to achieve much smaller file sizes, and faster decompression. See the [Zulip discussion](https://leanprover.zulipchat.com/#narrow/stream/270676-lean4/topic/leangz.3A.20olean.20file.20compressor/near/369738648). The new files have a new file extension, `123.ltar` instead of `123.tar.gz`. This implementation does not multiplex between them, so it will be a hard reset. In fact, the new cache doesn't even know how to clear the old cache files, so you might need to do that manually. Switching between branches will still work during the interim, since the cache will have both kinds of file and use the right ones.
1 parent cf365f0 commit 6ed1d66

File tree

3 files changed

+91
-23
lines changed

3 files changed

+91
-23
lines changed

Cache/IO.lean

Lines changed: 83 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ Authors: Arthur Paulino
77
import Lean.Data.HashMap
88
import Lean.Data.RBMap
99
import Lean.Data.RBTree
10+
import Lean.Data.Json.Printer
1011

1112
/-- Removes a parent path from the beginning of a path -/
1213
def System.FilePath.withoutParent (path parent : FilePath) : FilePath :=
@@ -16,8 +17,15 @@ def System.FilePath.withoutParent (path parent : FilePath) : FilePath :=
1617
| x, [] => x
1718
mkFilePath $ aux path.components parent.components
1819

19-
def UInt64.asTarGz (n : UInt64) : String :=
20-
s!"{n}.tar.gz"
20+
def Nat.toHexDigits (n : Nat) : Nat → (res : String := "") → String
21+
| 0, s => s
22+
| len+1, s =>
23+
let b := UInt8.ofNat (n >>> (len * 8))
24+
Nat.toHexDigits n len <|
25+
s.push (Nat.digitChar (b >>> 4).toNat) |>.push (Nat.digitChar (b &&& 15).toNat)
26+
27+
def UInt64.asLTar (n : UInt64) : String :=
28+
s!"{Nat.toHexDigits n.toNat 8}.ltar"
2129

2230
namespace Cache.IO
2331

@@ -51,12 +59,23 @@ def CURLBIN :=
5159
-- change file name if we ever need a more recent version to trigger re-download
5260
IO.CACHEDIR / s!"curl-{CURLVERSION}"
5361

62+
/-- leantar version at https://github.com/digama0/leangz -/
63+
def LEANTARVERSION :=
64+
"0.1.1"
65+
66+
def LEANTARBIN :=
67+
-- change file name if we ever need a more recent version to trigger re-download
68+
IO.CACHEDIR / s!"leantar-{LEANTARVERSION}"
69+
5470
def LAKEPACKAGESDIR : FilePath :=
5571
"lake-packages"
5672

5773
def getCurl : IO String := do
5874
return if (← CURLBIN.pathExists) then CURLBIN.toString else "curl"
5975

76+
def getLeanTar : IO String := do
77+
return if (← LEANTARBIN.pathExists) then LEANTARBIN.toString else "leantar"
78+
6079
abbrev PackageDirs := Lean.RBMap String FilePath compare
6180

6281
/-- Whether this is running on Mathlib repo or not -/
@@ -136,6 +155,44 @@ def validateCurl : IO Bool := do
136155
| _ => throw $ IO.userError "Invalidly formatted version of `curl`"
137156
| _ => throw $ IO.userError "Invalidly formatted response from `curl --version`"
138157

158+
def Version := Nat × Nat × Nat
159+
deriving Inhabited, DecidableEq
160+
161+
instance : Ord Version := let _ := @lexOrd; lexOrd
162+
instance : LE Version := leOfOrd
163+
164+
def parseVersion (s : String) : Option Version := do
165+
let [maj, min, patch] := s.splitOn "." | none
166+
some (maj.toNat!, min.toNat!, patch.toNat!)
167+
168+
def validateLeanTar : IO Unit := do
169+
if (← LEANTARBIN.pathExists) then return
170+
if let some version ← some <$> runCmd "leantar" #["--version"] <|> pure none then
171+
let "leantar" :: v :: _ := version.splitOn " "
172+
| throw $ IO.userError "Invalidly formatted response from `leantar --version`"
173+
let some v := parseVersion v | throw $ IO.userError "Invalidly formatted version of `leantar`"
174+
-- currently we need exactly one version of leantar, change this to reflect compatibility
175+
if v = (parseVersion LEANTARVERSION).get! then return
176+
let win := System.Platform.getIsWindows ()
177+
let target ← if win then
178+
pure "x86_64-pc-windows-msvc"
179+
else
180+
let arch ← (·.trim) <$> runCmd "uname" #["-m"] false
181+
unless arch ∈ ["x86_64", "aarch64"] do
182+
throw $ IO.userError s!"unsupported architecture {arch}"
183+
pure <|
184+
if System.Platform.getIsOSX () then s!"{arch}-apple-darwin"
185+
else s!"{arch}-unknown-linux-gnu"
186+
IO.println s!"leantar is too old; downloading more recent version"
187+
IO.FS.createDirAll IO.CACHEDIR
188+
let ext := if win then "zip" else "tar.xz"
189+
let _ ← runCmd "curl" #[
190+
s!"https://github.com/digama0/leangz/releases/download/v{LEANTARVERSION}/leantar-v{LEANTARVERSION}-{target}.{ext}",
191+
"-L", "-o", s!"{LEANTARBIN}.{ext}"]
192+
let _ ← runCmd "tar" #["-xf", s!"{LEANTARBIN}.{ext}",
193+
"-C", IO.CACHEDIR.toString, "--strip-components=1"]
194+
let _ ← runCmd "mv" #[(IO.CACHEDIR / s!"leantar").toString, LEANTARBIN.toString]
195+
139196
/-- Recursively gets all files from a directory with a certain extension -/
140197
partial def getFilesWithExtension
141198
(fp : FilePath) (extension : String) (acc : Array FilePath := #[]) :
@@ -150,7 +207,7 @@ namespace HashMap
150207

151208
def filter (hashMap : HashMap) (set : Lean.RBTree String compare) (keep : Bool) : HashMap :=
152209
hashMap.fold (init := default) fun acc path hash =>
153-
let contains := set.contains hash.asTarGz
210+
let contains := set.contains hash.asLTar
154211
let add := if keep then contains else !contains
155212
if add then acc.insert path hash else acc
156213

@@ -169,9 +226,9 @@ Each build file also has a `Bool` indicating whether that file is required for c
169226
def mkBuildPaths (path : FilePath) : IO $ Array (FilePath × Bool) := do
170227
let packageDir ← getPackageDir path
171228
return #[
229+
(packageDir / LIBDIR / path.withExtension "trace", true),
172230
(packageDir / LIBDIR / path.withExtension "olean", true),
173231
(packageDir / LIBDIR / path.withExtension "ilean", true),
174-
(packageDir / LIBDIR / path.withExtension "trace", true),
175232
(packageDir / IRDIR / path.withExtension "c", true),
176233
(packageDir / LIBDIR / path.withExtension "extra", false)]
177234

@@ -185,21 +242,25 @@ def allExist (paths : Array (FilePath × Bool)) : IO Bool := do
185242
def packCache (hashMap : HashMap) (overwrite : Bool) : IO $ Array String := do
186243
mkDir CACHEDIR
187244
IO.println "Compressing cache"
188-
let mut acc := default
245+
let mut acc := #[]
246+
let mut tasks := #[]
189247
for (path, hash) in hashMap.toList do
190-
let zip := hash.asTarGz
248+
let zip := hash.asLTar
191249
let zipPath := CACHEDIR / zip
192250
let buildPaths ← mkBuildPaths path
193251
if ← allExist buildPaths then
194252
if overwrite || !(← zipPath.pathExists) then
195-
discard $ runCmd "tar" $ #["-I", "gzip -9", "-cf", zipPath.toString] ++
196-
((← buildPaths.filterM (·.1.pathExists)) |>.map (·.1.toString))
253+
tasks := tasks.push <| ← IO.asTask do
254+
runCmd (← getLeanTar) $ #[zipPath.toString] ++
255+
((← buildPaths.filterM (·.1.pathExists)) |>.map (·.1.toString))
197256
acc := acc.push zip
257+
for task in tasks do
258+
_ ← IO.ofExcept task.get
198259
return acc
199260

200261
/-- Gets the set of all cached files -/
201262
def getLocalCacheSet : IO $ Lean.RBTree String compare := do
202-
let paths ← getFilesWithExtension CACHEDIR "gz"
263+
let paths ← getFilesWithExtension CACHEDIR "ltar"
203264
return .fromList (paths.data.map (·.withoutParent CACHEDIR |>.toString)) _
204265

205266
def isPathFromMathlib (path : FilePath) : Bool :=
@@ -215,19 +276,22 @@ def unpackCache (hashMap : HashMap) : IO Unit := do
215276
let hashMap := hashMap.filter (← getLocalCacheSet) true
216277
let size := hashMap.size
217278
if size > 0 then
279+
let now ← IO.monoMsNow
218280
IO.println s!"Decompressing {size} file(s)"
219281
let isMathlibRoot ← isMathlibRoot
220-
hashMap.forM fun path hash => do
221-
let _ ← IO.asTask do
222-
match path.parent with
223-
| none | some path => do
224-
let packageDir ← getPackageDir path
225-
mkDir $ packageDir / LIBDIR / path
226-
mkDir $ packageDir / IRDIR / path
282+
let child ← IO.Process.spawn
283+
{ cmd := ← getLeanTar, args := #["-x", "-j", "-"], stdin := .piped }
284+
let (stdin, child) ← child.takeStdin
285+
let config : Array Lean.Json := hashMap.fold (init := #[]) fun config path hash =>
286+
let pathStr := s!"{CACHEDIR / hash.asLTar}"
227287
if isMathlibRoot || !isPathFromMathlib path then
228-
runCmd "tar" #["-xzf", s!"{CACHEDIR / hash.asTarGz}"]
288+
config.push <| .str pathStr
229289
else -- only mathlib files, when not in the mathlib4 repo, need to be redirected
230-
runCmd "tar" #["-xzf", s!"{CACHEDIR / hash.asTarGz}", "-C", mathlibDepPath.toString]
290+
config.push <| .mkObj [("file", pathStr), ("base", mathlibDepPath.toString)]
291+
stdin.putStr <| Lean.Json.compress <| .arr config
292+
let exitCode ← child.wait
293+
if exitCode != 0 then throw $ IO.userError s!"leantar failed with error code {exitCode}"
294+
IO.println s!"unpacked in {(← IO.monoMsNow) - now} ms"
231295
else IO.println "No cache files to decompress"
232296

233297
/-- Retrieves the azure token from the environment -/
@@ -241,7 +305,7 @@ instance : Ord FilePath where
241305

242306
/-- Removes all cache files except for what's in the `keep` set -/
243307
def cleanCache (keep : Lean.RBTree FilePath compare := default) : IO Unit := do
244-
for path in ← getFilesWithExtension CACHEDIR "gz" do
308+
for path in ← getFilesWithExtension CACHEDIR "ltar" do
245309
if !keep.contains path then IO.FS.removeFile path
246310

247311
end Cache.IO

Cache/Main.lean

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,15 @@ def toPaths (args : List String) : List FilePath :=
5353
def curlArgs : List String :=
5454
["get", "get!", "get-", "put", "put!", "commit", "commit!"]
5555

56+
def leanTarArgs : List String :=
57+
["get", "get!", "pack", "pack!", "unpack"]
58+
5659
open Cache IO Hashing Requests in
5760
def main (args : List String) : IO Unit := do
5861
let hashMemo ← getHashMemo
5962
let hashMap := hashMemo.hashMap
60-
let goodCurl := !(curlArgs.contains (args.headD "") && !(← validateCurl))
63+
let goodCurl ← pure !curlArgs.contains (args.headD "") <||> validateCurl
64+
if leanTarArgs.contains (args.headD "") then validateLeanTar
6165
match args with
6266
| ["get"] => getFiles hashMap false goodCurl true
6367
| ["get!"] => getFiles hashMap true goodCurl true
@@ -69,7 +73,7 @@ def main (args : List String) : IO Unit := do
6973
| ["pack!"] => discard $ packCache hashMap true
7074
| ["unpack"] => unpackCache hashMap
7175
| ["clean"] =>
72-
cleanCache $ hashMap.fold (fun acc _ hash => acc.insert $ CACHEDIR / hash.asTarGz) .empty
76+
cleanCache $ hashMap.fold (fun acc _ hash => acc.insert $ CACHEDIR / hash.asLTar) .empty
7377
| ["clean!"] => cleanCache
7478
| ["put"] => putFiles (← packCache hashMap false) false (← getToken)
7579
| ["put!"] => putFiles (← packCache hashMap false) true (← getToken)

Cache/Requests.lean

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def mkGetConfigContent (hashMap : IO.HashMap) : IO String := do
3131
-- We sort the list so that the large files in `MathlibExtras` are requested first.
3232
hashMap.toArray.qsort (fun ⟨p₁, _⟩ ⟨_, _⟩ => p₁.components.head? = "MathlibExtras")
3333
|>.foldlM (init := "") fun acc ⟨_, hash⟩ => do
34-
let fileName := hash.asTarGz
34+
let fileName := hash.asLTar
3535
-- Below we use `String.quote`, which is intended for quoting for use in Lean code
3636
-- this does not exactly match the requirements for quoting for curl:
3737
-- ```
@@ -47,7 +47,7 @@ def mkGetConfigContent (hashMap : IO.HashMap) : IO String := do
4747

4848
/-- Calls `curl` to download a single file from the server to `CACHEDIR` (`.cache`) -/
4949
def downloadFile (hash : UInt64) : IO Bool := do
50-
let fileName := hash.asTarGz
50+
let fileName := hash.asLTar
5151
let url ← mkFileURL fileName none
5252
let path := IO.CACHEDIR / fileName
5353
let out ← IO.Process.output

0 commit comments

Comments
 (0)