From f5bdac4568ed15971e7ddda19478f70463516b7b Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Fri, 8 Mar 2024 14:40:25 -0600 Subject: [PATCH 01/21] run in docker --- .gitignore | 4 +++- Dockerfile | 29 +++++++++++++++++++++++++++++ compose.yml | 29 +++++++++++++++++++++++++++++ entrypoint.sh | 18 ++++++++++++++++++ example.env | 6 ++++++ requirements.txt | 3 +-- 6 files changed, 86 insertions(+), 3 deletions(-) create mode 100644 Dockerfile create mode 100644 compose.yml create mode 100644 entrypoint.sh create mode 100644 example.env diff --git a/.gitignore b/.gitignore index 89c7a72..eee0255 100644 --- a/.gitignore +++ b/.gitignore @@ -160,4 +160,6 @@ cython_debug/ #.idea/ bittensor-subnet-template/ wandb/ -.vscode/ \ No newline at end of file +.vscode/ + +wallets \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..f040e23 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,29 @@ +FROM nvcr.io/nvidia/cuda:12.2.0-devel-ubuntu22.04 + +LABEL sponsor="Hivetrain" + +ENV DEBIAN_FRONTEND="noninteractive" + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + python3-dev \ + python3-pip \ + python3-packaging \ + python3-venv \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY requirements.txt requirements.txt + +RUN pip install -r requirements.txt && \ + pip cache purge + +COPY ./ /app + +RUN pip install -e . && \ + pip cache purge + +RUN python3 post_install.py + +ENTRYPOINT "bash ./entrypoint.sh" \ No newline at end of file diff --git a/compose.yml b/compose.yml new file mode 100644 index 0000000..205fa5f --- /dev/null +++ b/compose.yml @@ -0,0 +1,29 @@ +version: '3.9' + +services: + hivetrain: + image: ghcr.io/bit-current/distributedtraining:latest + entrypoint: bash ./entrypoint.sh + restart: 'always' + ipc: host + network_mode: host + tty: true + stdin_open: true + build: + dockerfile: Dockerfile + volumes: + - ./wallets:/root/.bittensor/wallets + deploy: + resources: + reservations: + devices: + - capabilities: ["gpu"] + count: all + environment: + NETUID: ${NETUID:-25} + WALLETNAME: ${WALLETNAME:-default} + WALLETHOTKEY: ${WALLETHOTKEY:-defaulthotkey} + DHTPORT: ${DHTPORT:-42316} + AXONPORT: ${AXONPORT:-42310} + env_file: + - .env diff --git a/entrypoint.sh b/entrypoint.sh new file mode 100644 index 0000000..cc7e0a4 --- /dev/null +++ b/entrypoint.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +cd /app/neurons + +python3 miner.py \ + --netuid ${NETUID} \ + --wallet.name ${WALLETNAME} \ + --wallet.hotkey ${WALLETHOTKEY} \ + --dht.port ${DHTPORT} \ + --dht.announce_ip ${EXTERNALIP} \ + --axon.port ${AXONPORT} \ + --axon.external_ip ${EXTERNALIP} + +# while [ true ] +# do +# echo "I'm dead." +# sleep 5 +# done \ No newline at end of file diff --git a/example.env b/example.env new file mode 100644 index 0000000..315c76d --- /dev/null +++ b/example.env @@ -0,0 +1,6 @@ +NETUID=25 +WALLETNAME='test' +WALLETHOTKEY='test' +WANDB_API_KEY='3f1fcc13170ff677e6ecb14d7ff961b0b77684eb' +DHTPORT=42316 +EXTERNALIP=104.202.156.242 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 078e268..dcdba66 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,5 +3,4 @@ bitarray==2.9.2 datasets transformers hivemind -wandb -bitarray \ No newline at end of file +wandb \ No newline at end of file From c912c1925a199ec33837e71bb132fbe5888ba402 Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Fri, 8 Mar 2024 14:41:12 -0600 Subject: [PATCH 02/21] nits --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1f4bcfd..3f381eb 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ Done : Train TINYGPT ## How Miners are Rewarded -Hivetrain uses a simmple score assignment system designed to reward users for their participation and adherence to network guidelines. The system evaluates two critical aspects of user behavior: responsiveness and loss values. By applying a set of predefined rules, we aim to foster a healthy and productive network environment where all participants are incentivized to contribute positively. Whilst maintaining network integrity with few gameable variables. +Hivetrain uses a simple score assignment system designed to reward users for their participation and adherence to network guidelines. The system evaluates two critical aspects of user behavior: responsiveness and loss values. By applying a set of predefined rules, we aim to foster a healthy and productive network environment where all participants are incentivized to contribute positively. Whilst maintaining network integrity with few gameable variables. ### 1.0 Users who actively respond to network activities and maintain their losses within an acceptable threshold are awarded a score of 1.0. This top score reflects exemplary user behavior and strict adherence to network standards, highlighting the user as a model participant. From acaf08296556a02a2f08ffb6411cd73531702de8 Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Fri, 8 Mar 2024 16:07:04 -0600 Subject: [PATCH 03/21] redact api key --- example.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example.env b/example.env index 315c76d..744e6fc 100644 --- a/example.env +++ b/example.env @@ -1,6 +1,6 @@ NETUID=25 WALLETNAME='test' WALLETHOTKEY='test' -WANDB_API_KEY='3f1fcc13170ff677e6ecb14d7ff961b0b77684eb' +WANDB_API_KEY='' DHTPORT=42316 EXTERNALIP=104.202.156.242 \ No newline at end of file From 894bf33a1797fb4a140709a4ad46e3773a589303 Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Fri, 8 Mar 2024 20:12:50 -0600 Subject: [PATCH 04/21] the code is terrible, but hivemind + lightning works --- Dockerfile | 3 + compose.yml | 1 + entrypoint.sh | 18 +-- neurons/minerz.py | 271 ++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 3 +- 5 files changed, 287 insertions(+), 9 deletions(-) create mode 100644 neurons/minerz.py diff --git a/Dockerfile b/Dockerfile index f040e23..332361d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,6 +6,7 @@ ENV DEBIAN_FRONTEND="noninteractive" RUN apt-get update \ && apt-get install -y --no-install-recommends \ + git \ python3-dev \ python3-pip \ python3-packaging \ @@ -21,6 +22,8 @@ RUN pip install -r requirements.txt && \ COPY ./ /app +RUN pip install git+https://github.com/LuciferianInk/lightning-Hivemind.git + RUN pip install -e . && \ pip cache purge diff --git a/compose.yml b/compose.yml index 205fa5f..8ae2c92 100644 --- a/compose.yml +++ b/compose.yml @@ -12,6 +12,7 @@ services: build: dockerfile: Dockerfile volumes: + - ./neurons:/app/neurons - ./wallets:/root/.bittensor/wallets deploy: resources: diff --git a/entrypoint.sh b/entrypoint.sh index cc7e0a4..a401f69 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -2,14 +2,16 @@ cd /app/neurons -python3 miner.py \ - --netuid ${NETUID} \ - --wallet.name ${WALLETNAME} \ - --wallet.hotkey ${WALLETHOTKEY} \ - --dht.port ${DHTPORT} \ - --dht.announce_ip ${EXTERNALIP} \ - --axon.port ${AXONPORT} \ - --axon.external_ip ${EXTERNALIP} +python3 minerz.py + +# python3 miner.py \ +# --netuid ${NETUID} \ +# --wallet.name ${WALLETNAME} \ +# --wallet.hotkey ${WALLETHOTKEY} \ +# --dht.port ${DHTPORT} \ +# --dht.announce_ip ${EXTERNALIP} \ +# --axon.port ${AXONPORT} \ +# --axon.external_ip ${EXTERNALIP} # while [ true ] # do diff --git a/neurons/minerz.py b/neurons/minerz.py new file mode 100644 index 0000000..c0acca5 --- /dev/null +++ b/neurons/minerz.py @@ -0,0 +1,271 @@ +import random +import numpy as np +import torch +from torch.utils.data import DataLoader, Dataset, IterableDataset +from torch.optim import AdamW +from lightning.fabric.utilities.seed import reset_seed, seed_everything +from lightning.pytorch.accelerators import TPUAccelerator +from lightning.pytorch.core.datamodule import LightningDataModule +from lightning.pytorch.callbacks import ( + ModelCheckpoint, + ModelPruning, + StochasticWeightAveraging, +) +from lightning.pytorch.trainer import Trainer +from lightning.pytorch.utilities import CombinedLoader +from lightning.pytorch import LightningModule + +from lightning_hivemind.strategy import HivemindStrategy +from functools import partial + +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + GenerationConfig, +) +from datasets import load_dataset + +# initialized and load the model +config = AutoConfig.from_pretrained( + "gpt2", + n_emb=256, + n_layer=3, + n_head=3, + n_inner=3, + torch_dtype=torch.float32 +) +model = AutoModelForCausalLM.from_config(config) +tokenizer = AutoTokenizer.from_pretrained( + "openai-community/gpt2", + cache_dir="/tmp/tokenizer", + padding="max_length", + padding_side="left", + use_fast=True, + return_overflowing_tokens=True, + truncation=True, +) +tokenizer.pad_token = tokenizer.eos_token + +# create a datamodule to wrap our remote datasets +class StreamingDataModule(LightningDataModule): + def __init__(self, tokenizer, config): + super().__init__() + self.tokenizer = tokenizer + self.config = config + self.train_data = None + # self.setup(self, self.config, stage=None) + self.train_data = StreamingDataset( + self.tokenizer, config + ) + + # def setup(self, config, stage=None): + + + def train_dataloader(self): + return DataLoader( + self.train_data, + batch_size=1, + pin_memory=True, + num_workers=2, + ) + +class StreamingDataset(IterableDataset): + def __init__(self, tokenizer, conf): + self.tokenizer = tokenizer + self.config = conf + self.dataset = load_dataset( + self.config.get("dataset", "tiiuae/falcon-refinedweb"), + split=self.config.get("split", "train"), + streaming=True, + cache_dir="/tmp/pile" + ) + + def __iter__(self): + shuffled = self.dataset.shuffle( + seed=random.randint(0, 2**31), + buffer_size=1000, + ) + + block_size = 512 + + batch = [] + for document in shuffled: + tokenized = self.tokenizer( + text=document.get(self.config.get("key", "default")), + max_length=block_size, + stride=0, + padding=True, + truncation=True, + return_overflowing_tokens=True, + return_tensors="np", + )["input_ids"] + choice = random.choice(tokenized) + if len(choice) == 0: + continue + elif len(batch) == 0: + batch = choice + else: + np.append(batch, self.tokenizer.eos_token_id) + batch = np.concatenate([batch, choice]) + if len(batch) >= block_size: + yield batch[:block_size] + batch = [] + else: + continue + +# prepare a dataset for use with training +dataset = StreamingDataModule(tokenizer, { + "dataset": "tiiuae/falcon-refinedweb", + "key": "content", + "split": "train", +}) + +# wrap the LightningModule in a custom class +class MinerTrainer(LightningModule): + """ + A training module for AIGen. + """ + + def __init__(self, model, optimizer, tokenizer, hparams): + super(MinerTrainer, self).__init__() + + self.model, self.optimizer, self.tokenizer = ( + model, + optimizer, + tokenizer, + ) + self.automatic_optimization = True + self.save_hyperparameters(hparams) + + def forward(self, inputs): + return self.model(**inputs) + + def training_step(self, batch, batch_idx): + outputs = self({"input_ids": batch, "labels": batch}) + loss = outputs[0] + return loss + + def on_train_batch_end(self, trainer, lm, outputs): + schedule = self.lr_schedulers() + step = self.global_step + + if hasattr(schedule, "current_step"): + step = schedule.current_step + elif hasattr(self.trainer.strategy.optimizers[0], "local_epoch"): + step = self.trainer.strategy.optimizers[0].local_epoch + + if hasattr(schedule, "step"): + schedule.step() + + def configure_optimizers(self): + "Create optimizer and scheduler" + + # if self.scheduler: + # return [self.optimizer], [self.scheduler()] + return [self.optimizer] + +# define the model hyperparameters +hparams = dict( + # optimizer="AdamW", + # scheduler=scheduler, + learning_rate=0.001, + weight_decay=0.1, + eps=1e-8, + warmup_steps=0, + batch_size=1, + num_steps=10000, + block_size=512 +) + +# define the hivemind strategy +initial_peers = hparams.get("initial_peers", []) +strategy = HivemindStrategy( + run_id=f"hivetrain-z", + batch_size=1, + target_batch_size=256, + initial_peers=initial_peers, + use_ipfs=True, + use_relay=True, + use_auto_relay=True, + verbose=True, + wait_timeout=30, + bootstrap_timeout=20, + matchmaking_time=45.0, + averaging_timeout=180.0, + delay_state_averaging=True, + delay_grad_averaging=True, + delay_optimizer_step=True, + offload_optimizer=True, + reuse_grad_buffers=False, + # grad_compression=Float16Compression(), + # state_averaging_compression=Float16Compression(), + # load_state_compression=NoCompression(), + scheduler_fn=partial(torch.optim.lr_scheduler.ExponentialLR, gamma=0.9999), +) + +# define training params +train_params = dict( + accelerator="auto", + strategy=strategy, + devices="auto", + max_steps=10000, + max_epochs=-1, + reload_dataloaders_every_n_epochs=1, + precision="32-true", + accumulate_grad_batches=1, # must be 1 for Hivemind training + gradient_clip_val=1.0, + gradient_clip_algorithm="norm", + benchmark=True, + # callbacks=callbacks, + # logger=loggers if loggers else False, +) + +# set weights as trainable +def get_params(model, hparams): + no_decay = ["bias", "LayerNorm.weight"] + grouped_parameters = [] + + for n, p in model.named_parameters(): + if not p.requires_grad: + continue + + if any(nd in n for nd in no_decay): + weight_decay = 0.0 + else: + weight_decay = hparams["weight_decay"] + + grouped_parameters.append( + { + "params": [p], + "weight_decay": weight_decay, + } + ) + + return grouped_parameters + +# set model parameters as trainable +params = get_params(model, hparams) + +# create the optimizer +optimizer = AdamW( + params, + lr=hparams.get("learning_rate", 0.001), + eps=hparams.get("eps", 1e-8), +) + +# Wrap the model in a pytorch-lightning module +train_model = MinerTrainer( + model, + optimizer, + tokenizer, + hparams +) + +# fit the trainer and run +model.train() +trainer = Trainer(**train_params) +trainer.fit( + train_model, + dataset +) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index dcdba66..7f2e978 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ bitarray==2.9.2 datasets transformers hivemind -wandb \ No newline at end of file +wandb +lightning>=2.1.0 From e710776c43f9c65b353971aff007cb8426af3463 Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Fri, 8 Mar 2024 21:40:44 -0600 Subject: [PATCH 05/21] make more flexible --- .gitignore | 3 +- entrypoint.sh | 2 +- example.env | 5 +- neurons/{minerz.py => miner-z.py} | 162 +++++++++++++++++++++++++----- 4 files changed, 145 insertions(+), 27 deletions(-) rename neurons/{minerz.py => miner-z.py} (62%) diff --git a/.gitignore b/.gitignore index eee0255..ab1c631 100644 --- a/.gitignore +++ b/.gitignore @@ -162,4 +162,5 @@ bittensor-subnet-template/ wandb/ .vscode/ -wallets \ No newline at end of file +wallets +lightning_logs \ No newline at end of file diff --git a/entrypoint.sh b/entrypoint.sh index a401f69..10eb0a1 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -2,7 +2,7 @@ cd /app/neurons -python3 minerz.py +python3 miner-z.py --initial_peers ${INITIAL_PEERS} # python3 miner.py \ # --netuid ${NETUID} \ diff --git a/example.env b/example.env index 744e6fc..fe2f4c0 100644 --- a/example.env +++ b/example.env @@ -3,4 +3,7 @@ WALLETNAME='test' WALLETHOTKEY='test' WANDB_API_KEY='' DHTPORT=42316 -EXTERNALIP=104.202.156.242 \ No newline at end of file +EXTERNALIP=104.202.156.242 + +CUDA_VISIBLE_DEVICES=0 +INITIAL_PEERS="/p2p/12D3KooWQUSXQuThvdTtU59U2zeT6v1Q5m1coFhnnjNvd1g1bpYf" \ No newline at end of file diff --git a/neurons/minerz.py b/neurons/miner-z.py similarity index 62% rename from neurons/minerz.py rename to neurons/miner-z.py index c0acca5..d36abb1 100644 --- a/neurons/minerz.py +++ b/neurons/miner-z.py @@ -1,3 +1,5 @@ +import os +import sys import random import numpy as np import torch @@ -14,9 +16,12 @@ from lightning.pytorch.trainer import Trainer from lightning.pytorch.utilities import CombinedLoader from lightning.pytorch import LightningModule - +from lightning.pytorch.callbacks import Callback, ProgressBar, TQDMProgressBar from lightning_hivemind.strategy import HivemindStrategy from functools import partial +from tqdm.auto import tqdm +import psutil +from math import isnan from transformers import ( AutoConfig, @@ -25,14 +30,28 @@ GenerationConfig, ) from datasets import load_dataset +import ipaddress +import re +import logging +import argparse + +logging.getLogger("lightning.pytorch").setLevel(logging.INFO) + +# capture arguments passed to this python script +parser = argparse.ArgumentParser(description="Get configs from arguments to this script.") +parser.add_argument('--initial_peers', type=list, help='Your peer. Use --initial_peers multiple times to pass multiple peers.', default=[], nargs='?') +args = parser.parse_args() + +initial_peers = args.initial_peers # initialized and load the model +block_size = 1024 config = AutoConfig.from_pretrained( "gpt2", n_emb=256, n_layer=3, n_head=3, - n_inner=3, + n_inner=1024, torch_dtype=torch.float32 ) model = AutoModelForCausalLM.from_config(config) @@ -47,6 +66,13 @@ ) tokenizer.pad_token = tokenizer.eos_token +dataset_config = { + "dataset": "tiiuae/falcon-refinedweb", + "key": "content", + "split": "train", + "block_size": block_size, +} + # create a datamodule to wrap our remote datasets class StreamingDataModule(LightningDataModule): def __init__(self, tokenizer, config): @@ -54,14 +80,10 @@ def __init__(self, tokenizer, config): self.tokenizer = tokenizer self.config = config self.train_data = None - # self.setup(self, self.config, stage=None) self.train_data = StreamingDataset( self.tokenizer, config ) - # def setup(self, config, stage=None): - - def train_dataloader(self): return DataLoader( self.train_data, @@ -87,7 +109,7 @@ def __iter__(self): buffer_size=1000, ) - block_size = 512 + block_size = self.config.get("block_size", 512) batch = [] for document in shuffled: @@ -115,11 +137,7 @@ def __iter__(self): continue # prepare a dataset for use with training -dataset = StreamingDataModule(tokenizer, { - "dataset": "tiiuae/falcon-refinedweb", - "key": "content", - "split": "train", -}) +dataset = StreamingDataModule(tokenizer, dataset_config) # wrap the LightningModule in a custom class class MinerTrainer(LightningModule): @@ -136,6 +154,7 @@ def __init__(self, model, optimizer, tokenizer, hparams): tokenizer, ) self.automatic_optimization = True + self.pbar = None self.save_hyperparameters(hparams) def forward(self, inputs): @@ -144,6 +163,9 @@ def forward(self, inputs): def training_step(self, batch, batch_idx): outputs = self({"input_ids": batch, "labels": batch}) loss = outputs[0] + self.log( + "train_loss", float(loss), on_step=True, on_epoch=False, sync_dist=True + ) return loss def on_train_batch_end(self, trainer, lm, outputs): @@ -155,40 +177,37 @@ def on_train_batch_end(self, trainer, lm, outputs): elif hasattr(self.trainer.strategy.optimizers[0], "local_epoch"): step = self.trainer.strategy.optimizers[0].local_epoch + self.log("step", int(step), on_step=True, on_epoch=False, sync_dist=True) + if hasattr(schedule, "step"): schedule.step() def configure_optimizers(self): "Create optimizer and scheduler" - - # if self.scheduler: - # return [self.optimizer], [self.scheduler()] return [self.optimizer] # define the model hyperparameters hparams = dict( - # optimizer="AdamW", - # scheduler=scheduler, learning_rate=0.001, weight_decay=0.1, eps=1e-8, warmup_steps=0, batch_size=1, num_steps=10000, - block_size=512 + target_batch_size=512, + block_size=block_size ) # define the hivemind strategy -initial_peers = hparams.get("initial_peers", []) strategy = HivemindStrategy( run_id=f"hivetrain-z", batch_size=1, - target_batch_size=256, + target_batch_size=hparams.get("num_steps") * hparams.get("target_batch_size"), initial_peers=initial_peers, use_ipfs=True, use_relay=True, use_auto_relay=True, - verbose=True, + verbose=False, wait_timeout=30, bootstrap_timeout=20, matchmaking_time=45.0, @@ -204,12 +223,31 @@ def configure_optimizers(self): scheduler_fn=partial(torch.optim.lr_scheduler.ExponentialLR, gamma=0.9999), ) +# print my peer id to console +visible_addresses = [ + str(a) + for a in strategy.dht.get_visible_maddrs() + if not ipaddress.ip_address(a.values()[0]).is_loopback +] + +my_ids = [] +pattern = r"(/p2p/.*)" +for peer in list(visible_addresses): + match = re.search(pattern, peer) + if match: + my_ids.append(match.group(1)) + +for peer in list(set(my_ids)): + print( + f"PEER-ID: {peer}" + ) + # define training params train_params = dict( accelerator="auto", strategy=strategy, devices="auto", - max_steps=10000, + max_steps=hparams.get("num_steps", 10000), max_epochs=-1, reload_dataloaders_every_n_epochs=1, precision="32-true", @@ -217,8 +255,7 @@ def configure_optimizers(self): gradient_clip_val=1.0, gradient_clip_algorithm="norm", benchmark=True, - # callbacks=callbacks, - # logger=loggers if loggers else False, + callbacks=[] ) # set weights as trainable @@ -254,6 +291,83 @@ def get_params(model, hparams): eps=hparams.get("eps", 1e-8), ) +# for logging progress +class MinerProgressBar(ProgressBar): + """A variant progress bar that works off of steps and prints periodically.""" + + def __init__(self, num_steps): + super().__init__() + self.num_steps = num_steps + self.last_step = 0 + self.prev_avg_loss = None + self.smoothing = 0.01 + + def on_train_start(self, trainer, lm): + super().on_train_start(trainer, lm) + trainer.pbar = tqdm( + # total=trainer.estimated_stepping_batches, + total=self.num_steps, + smoothing=0, + leave=True, + dynamic_ncols=True, + file=sys.stdout, + ) + + def on_train_end(self, trainer, lm): + trainer.pbar.close() + + def on_train_batch_end(self, trainer, lm, outputs, batch, batch_idx): + super().on_train_batch_end(trainer, lm, outputs, batch, batch_idx) + + step = int(trainer.callback_metrics.get("step", -1)) + if step == -1: + return + + current_loss = float(trainer.callback_metrics["train_loss"]) + + current_epoch = trainer.current_epoch + # if lm.train_len > 0: + # current_epoch += batch_idx / lm.train_len + + avg_loss = 0 + if not isnan(current_loss): + avg_loss = self.average_loss( + current_loss, self.prev_avg_loss, self.smoothing + ) + self.prev_avg_loss = avg_loss + + mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf( + "SC_PHYS_PAGES" + ) # e.g. 4015976448 + mem_gib = mem_bytes / (1024.0**3) # e.g. 3.74 + + memory = psutil.virtual_memory() + + bar = f"Loss: {avg_loss:.3f}" + + if current_epoch > 0: + bar += f", Epoch: {epoch_string}" + + if hasattr(trainer.strategy, "num_peers"): + bar += f", Peers: {trainer.strategy.num_peers}" + + if trainer.pbar.n != step: + trainer.pbar.update(step - trainer.pbar.n) + + # this is a dumb hack to make TQDM print in Docker + if random.random() < 0.01: + print() + + trainer.pbar.set_description(bar) + + def average_loss(self, current_loss, prev_avg_loss, smoothing): + if prev_avg_loss is None: + return current_loss + else: + return (smoothing * current_loss) + (1 - smoothing) * prev_avg_loss + +train_params["callbacks"].append(MinerProgressBar(hparams.get("num_steps"))) + # Wrap the model in a pytorch-lightning module train_model = MinerTrainer( model, From 450c0feea84c4095194bf92bcf19895d2b56cd8a Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Fri, 8 Mar 2024 21:51:52 -0600 Subject: [PATCH 06/21] fix bad batch calc --- example.env | 2 +- neurons/miner-z.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/example.env b/example.env index fe2f4c0..ab2986f 100644 --- a/example.env +++ b/example.env @@ -6,4 +6,4 @@ DHTPORT=42316 EXTERNALIP=104.202.156.242 CUDA_VISIBLE_DEVICES=0 -INITIAL_PEERS="/p2p/12D3KooWQUSXQuThvdTtU59U2zeT6v1Q5m1coFhnnjNvd1g1bpYf" \ No newline at end of file +INITIAL_PEERS="/p2p/12D3KooWCvMCCJDHQ7d9pfqqkxAPD6AZdAbcXPd1d9pWvQWDpqBi" \ No newline at end of file diff --git a/neurons/miner-z.py b/neurons/miner-z.py index d36abb1..03e2fe8 100644 --- a/neurons/miner-z.py +++ b/neurons/miner-z.py @@ -202,7 +202,7 @@ def configure_optimizers(self): strategy = HivemindStrategy( run_id=f"hivetrain-z", batch_size=1, - target_batch_size=hparams.get("num_steps") * hparams.get("target_batch_size"), + target_batch_size=hparams.get("target_batch_size"), initial_peers=initial_peers, use_ipfs=True, use_relay=True, @@ -247,7 +247,7 @@ def configure_optimizers(self): accelerator="auto", strategy=strategy, devices="auto", - max_steps=hparams.get("num_steps", 10000), + max_steps=hparams.get("num_steps", 10000) * hparams["target_batch_size"], max_epochs=-1, reload_dataloaders_every_n_epochs=1, precision="32-true", From f721fc671ee2a54e5b9286e6db39a391e27cd558 Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Sat, 9 Mar 2024 12:36:39 -0600 Subject: [PATCH 07/21] clean up a lot of the code --- .gitignore | 3 +- entrypoint.sh | 19 +----- neurons/miner-z.py | 159 +++++++++++++++++++++++++++------------------ 3 files changed, 99 insertions(+), 82 deletions(-) diff --git a/.gitignore b/.gitignore index ab1c631..ba1fe7d 100644 --- a/.gitignore +++ b/.gitignore @@ -163,4 +163,5 @@ wandb/ .vscode/ wallets -lightning_logs \ No newline at end of file +lightning_logs +.scale_batch_size* \ No newline at end of file diff --git a/entrypoint.sh b/entrypoint.sh index 10eb0a1..debb303 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -2,19 +2,6 @@ cd /app/neurons -python3 miner-z.py --initial_peers ${INITIAL_PEERS} - -# python3 miner.py \ -# --netuid ${NETUID} \ -# --wallet.name ${WALLETNAME} \ -# --wallet.hotkey ${WALLETHOTKEY} \ -# --dht.port ${DHTPORT} \ -# --dht.announce_ip ${EXTERNALIP} \ -# --axon.port ${AXONPORT} \ -# --axon.external_ip ${EXTERNALIP} - -# while [ true ] -# do -# echo "I'm dead." -# sleep 5 -# done \ No newline at end of file +python3 miner-z.py \ + --initial_peers ${INITIAL_PEERS} \ + --batch_size ${BATCH_SIZE} \ No newline at end of file diff --git a/neurons/miner-z.py b/neurons/miner-z.py index 03e2fe8..2024c85 100644 --- a/neurons/miner-z.py +++ b/neurons/miner-z.py @@ -1,62 +1,106 @@ +import argparse +import ipaddress +import logging import os -import sys import random +import re +import sys +from functools import partial +from math import isnan + import numpy as np +import psutil import torch -from torch.utils.data import DataLoader, Dataset, IterableDataset -from torch.optim import AdamW +from datasets import load_dataset from lightning.fabric.utilities.seed import reset_seed, seed_everything +from lightning.pytorch import LightningModule from lightning.pytorch.accelerators import TPUAccelerator -from lightning.pytorch.core.datamodule import LightningDataModule from lightning.pytorch.callbacks import ( + Callback, ModelCheckpoint, ModelPruning, + ProgressBar, StochasticWeightAveraging, + TQDMProgressBar, ) +from lightning.pytorch.core.datamodule import LightningDataModule from lightning.pytorch.trainer import Trainer from lightning.pytorch.utilities import CombinedLoader -from lightning.pytorch import LightningModule -from lightning.pytorch.callbacks import Callback, ProgressBar, TQDMProgressBar from lightning_hivemind.strategy import HivemindStrategy -from functools import partial +from torch.optim import AdamW +from torch.utils.data import DataLoader, Dataset, IterableDataset from tqdm.auto import tqdm -import psutil -from math import isnan - from transformers import ( AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, ) -from datasets import load_dataset -import ipaddress -import re -import logging -import argparse logging.getLogger("lightning.pytorch").setLevel(logging.INFO) # capture arguments passed to this python script -parser = argparse.ArgumentParser(description="Get configs from arguments to this script.") -parser.add_argument('--initial_peers', type=list, help='Your peer. Use --initial_peers multiple times to pass multiple peers.', default=[], nargs='?') +parser = argparse.ArgumentParser( + description="Get configs from arguments to this script." +) +parser.add_argument( + "--initial_peers", + type=list, + help="Your peer. Use --initial_peers multiple times to pass multiple peers.", + default=[], + nargs="?", +) + +parser.add_argument( + "--batch_size", + type=int, + help="The largest batch size able to fit on your GPU.", + default=1, + nargs="?", +) + args = parser.parse_args() +# set some basic configuration values initial_peers = args.initial_peers +batch_size = args.batch_size +block_size = 1024 +num_steps = 100_000 +target_batch_size = 8192 + +dataset_config = { + "dataset": "tiiuae/falcon-refinedweb", + "key": "content", + "split": "train", + "block_size": block_size, +} # initialized and load the model -block_size = 1024 config = AutoConfig.from_pretrained( "gpt2", - n_emb=256, - n_layer=3, - n_head=3, - n_inner=1024, - torch_dtype=torch.float32 + n_emb=block_size, + n_ctx=block_size, + n_layer=16, + n_head=16, + n_positions=block_size, + n_inner=block_size * 4, + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + summary_first_dropout=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.05, + summary_type="cls_index", + summary_proj_to_labels=True, + summary_use_proj=True, + torch_dtype=torch.bfloat16, ) + +print(config) + model = AutoModelForCausalLM.from_config(config) tokenizer = AutoTokenizer.from_pretrained( - "openai-community/gpt2", + "openai-community/gpt2", cache_dir="/tmp/tokenizer", padding="max_length", padding_side="left", @@ -66,12 +110,6 @@ ) tokenizer.pad_token = tokenizer.eos_token -dataset_config = { - "dataset": "tiiuae/falcon-refinedweb", - "key": "content", - "split": "train", - "block_size": block_size, -} # create a datamodule to wrap our remote datasets class StreamingDataModule(LightningDataModule): @@ -79,34 +117,32 @@ def __init__(self, tokenizer, config): super().__init__() self.tokenizer = tokenizer self.config = config - self.train_data = None - self.train_data = StreamingDataset( - self.tokenizer, config - ) + self.train_data = StreamingDataset(self.tokenizer, config) def train_dataloader(self): return DataLoader( self.train_data, - batch_size=1, + batch_size=batch_size, pin_memory=True, num_workers=2, ) + class StreamingDataset(IterableDataset): - def __init__(self, tokenizer, conf): + def __init__(self, tokenizer, config): self.tokenizer = tokenizer - self.config = conf + self.config = config self.dataset = load_dataset( self.config.get("dataset", "tiiuae/falcon-refinedweb"), split=self.config.get("split", "train"), streaming=True, - cache_dir="/tmp/pile" + cache_dir="/tmp/pile", ) def __iter__(self): shuffled = self.dataset.shuffle( seed=random.randint(0, 2**31), - buffer_size=1000, + buffer_size=10000, ) block_size = self.config.get("block_size", 512) @@ -136,23 +172,21 @@ def __iter__(self): else: continue + # prepare a dataset for use with training dataset = StreamingDataModule(tokenizer, dataset_config) + # wrap the LightningModule in a custom class class MinerTrainer(LightningModule): """ A training module for AIGen. """ - def __init__(self, model, optimizer, tokenizer, hparams): + def __init__(self, model, optimizer, hparams): super(MinerTrainer, self).__init__() - self.model, self.optimizer, self.tokenizer = ( - model, - optimizer, - tokenizer, - ) + self.model, self.optimizer = (model, optimizer) self.automatic_optimization = True self.pbar = None self.save_hyperparameters(hparams) @@ -186,22 +220,23 @@ def configure_optimizers(self): "Create optimizer and scheduler" return [self.optimizer] + # define the model hyperparameters hparams = dict( learning_rate=0.001, weight_decay=0.1, eps=1e-8, warmup_steps=0, - batch_size=1, - num_steps=10000, - target_batch_size=512, - block_size=block_size + batch_size=batch_size, + num_steps=num_steps, + target_batch_size=target_batch_size, + block_size=block_size, ) # define the hivemind strategy strategy = HivemindStrategy( run_id=f"hivetrain-z", - batch_size=1, + batch_size=batch_size, target_batch_size=hparams.get("target_batch_size"), initial_peers=initial_peers, use_ipfs=True, @@ -238,9 +273,7 @@ def configure_optimizers(self): my_ids.append(match.group(1)) for peer in list(set(my_ids)): - print( - f"PEER-ID: {peer}" - ) + print(f"PEER-ID: {peer}") # define training params train_params = dict( @@ -251,13 +284,14 @@ def configure_optimizers(self): max_epochs=-1, reload_dataloaders_every_n_epochs=1, precision="32-true", - accumulate_grad_batches=1, # must be 1 for Hivemind training + accumulate_grad_batches=1, # must be 1 for Hivemind training gradient_clip_val=1.0, gradient_clip_algorithm="norm", benchmark=True, - callbacks=[] + callbacks=[], ) + # set weights as trainable def get_params(model, hparams): no_decay = ["bias", "LayerNorm.weight"] @@ -281,6 +315,7 @@ def get_params(model, hparams): return grouped_parameters + # set model parameters as trainable params = get_params(model, hparams) @@ -291,6 +326,7 @@ def get_params(model, hparams): eps=hparams.get("eps", 1e-8), ) + # for logging progress class MinerProgressBar(ProgressBar): """A variant progress bar that works off of steps and prints periodically.""" @@ -366,20 +402,13 @@ def average_loss(self, current_loss, prev_avg_loss, smoothing): else: return (smoothing * current_loss) + (1 - smoothing) * prev_avg_loss + train_params["callbacks"].append(MinerProgressBar(hparams.get("num_steps"))) # Wrap the model in a pytorch-lightning module -train_model = MinerTrainer( - model, - optimizer, - tokenizer, - hparams -) +train_model = MinerTrainer(model, optimizer, hparams) # fit the trainer and run model.train() trainer = Trainer(**train_params) -trainer.fit( - train_model, - dataset -) \ No newline at end of file +trainer.fit(train_model, dataset) From e20cb581f42c40153d0a121d947c8ff648042bd2 Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Sat, 9 Mar 2024 13:53:45 -0600 Subject: [PATCH 08/21] fix logging, abstract config vars --- entrypoint.sh | 2 +- neurons/{miner-z.py => hiveminer.py} | 94 +++++++--------------------- 2 files changed, 25 insertions(+), 71 deletions(-) rename neurons/{miner-z.py => hiveminer.py} (80%) diff --git a/entrypoint.sh b/entrypoint.sh index debb303..045468b 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -2,6 +2,6 @@ cd /app/neurons -python3 miner-z.py \ +python3 hiveminer.py \ --initial_peers ${INITIAL_PEERS} \ --batch_size ${BATCH_SIZE} \ No newline at end of file diff --git a/neurons/miner-z.py b/neurons/hiveminer.py similarity index 80% rename from neurons/miner-z.py rename to neurons/hiveminer.py index 2024c85..557f63d 100644 --- a/neurons/miner-z.py +++ b/neurons/hiveminer.py @@ -9,27 +9,18 @@ from math import isnan import numpy as np -import psutil import torch from datasets import load_dataset from lightning.fabric.utilities.seed import reset_seed, seed_everything from lightning.pytorch import LightningModule from lightning.pytorch.accelerators import TPUAccelerator -from lightning.pytorch.callbacks import ( - Callback, - ModelCheckpoint, - ModelPruning, - ProgressBar, - StochasticWeightAveraging, - TQDMProgressBar, -) +from lightning.pytorch.callbacks import Callback from lightning.pytorch.core.datamodule import LightningDataModule from lightning.pytorch.trainer import Trainer from lightning.pytorch.utilities import CombinedLoader from lightning_hivemind.strategy import HivemindStrategy from torch.optim import AdamW from torch.utils.data import DataLoader, Dataset, IterableDataset -from tqdm.auto import tqdm from transformers import ( AutoConfig, AutoModelForCausalLM, @@ -66,7 +57,7 @@ batch_size = args.batch_size block_size = 1024 num_steps = 100_000 -target_batch_size = 8192 +target_batch_size = 256 dataset_config = { "dataset": "tiiuae/falcon-refinedweb", @@ -188,7 +179,6 @@ def __init__(self, model, optimizer, hparams): self.model, self.optimizer = (model, optimizer) self.automatic_optimization = True - self.pbar = None self.save_hyperparameters(hparams) def forward(self, inputs): @@ -203,18 +193,13 @@ def training_step(self, batch, batch_idx): return loss def on_train_batch_end(self, trainer, lm, outputs): - schedule = self.lr_schedulers() - step = self.global_step - - if hasattr(schedule, "current_step"): - step = schedule.current_step - elif hasattr(self.trainer.strategy.optimizers[0], "local_epoch"): - step = self.trainer.strategy.optimizers[0].local_epoch - - self.log("step", int(step), on_step=True, on_epoch=False, sync_dist=True) - - if hasattr(schedule, "step"): - schedule.step() + self.log( + "step", + int(self.trainer.current_epoch), + on_step=True, + on_epoch=False, + sync_dist=True, + ) def configure_optimizers(self): "Create optimizer and scheduler" @@ -229,7 +214,6 @@ def configure_optimizers(self): warmup_steps=0, batch_size=batch_size, num_steps=num_steps, - target_batch_size=target_batch_size, block_size=block_size, ) @@ -237,7 +221,7 @@ def configure_optimizers(self): strategy = HivemindStrategy( run_id=f"hivetrain-z", batch_size=batch_size, - target_batch_size=hparams.get("target_batch_size"), + target_batch_size=target_batch_size, initial_peers=initial_peers, use_ipfs=True, use_relay=True, @@ -280,7 +264,7 @@ def configure_optimizers(self): accelerator="auto", strategy=strategy, devices="auto", - max_steps=hparams.get("num_steps", 10000) * hparams["target_batch_size"], + max_steps=num_steps * target_batch_size, max_epochs=-1, reload_dataloaders_every_n_epochs=1, precision="32-true", @@ -288,12 +272,13 @@ def configure_optimizers(self): gradient_clip_val=1.0, gradient_clip_algorithm="norm", benchmark=True, + enable_progress_bar=False, callbacks=[], ) # set weights as trainable -def get_params(model, hparams): +def set_trainable_parameters(model, hparams): no_decay = ["bias", "LayerNorm.weight"] grouped_parameters = [] @@ -317,7 +302,7 @@ def get_params(model, hparams): # set model parameters as trainable -params = get_params(model, hparams) +params = set_trainable_parameters(model, hparams) # create the optimizer optimizer = AdamW( @@ -328,30 +313,17 @@ def get_params(model, hparams): # for logging progress -class MinerProgressBar(ProgressBar): +class MinerConsoleLogging(Callback): """A variant progress bar that works off of steps and prints periodically.""" def __init__(self, num_steps): super().__init__() self.num_steps = num_steps - self.last_step = 0 + self.num_peers = 0 + self.previous_step = None self.prev_avg_loss = None self.smoothing = 0.01 - def on_train_start(self, trainer, lm): - super().on_train_start(trainer, lm) - trainer.pbar = tqdm( - # total=trainer.estimated_stepping_batches, - total=self.num_steps, - smoothing=0, - leave=True, - dynamic_ncols=True, - file=sys.stdout, - ) - - def on_train_end(self, trainer, lm): - trainer.pbar.close() - def on_train_batch_end(self, trainer, lm, outputs, batch, batch_idx): super().on_train_batch_end(trainer, lm, outputs, batch, batch_idx) @@ -361,10 +333,6 @@ def on_train_batch_end(self, trainer, lm, outputs, batch, batch_idx): current_loss = float(trainer.callback_metrics["train_loss"]) - current_epoch = trainer.current_epoch - # if lm.train_len > 0: - # current_epoch += batch_idx / lm.train_len - avg_loss = 0 if not isnan(current_loss): avg_loss = self.average_loss( @@ -372,29 +340,15 @@ def on_train_batch_end(self, trainer, lm, outputs, batch, batch_idx): ) self.prev_avg_loss = avg_loss - mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf( - "SC_PHYS_PAGES" - ) # e.g. 4015976448 - mem_gib = mem_bytes / (1024.0**3) # e.g. 3.74 - - memory = psutil.virtual_memory() - - bar = f"Loss: {avg_loss:.3f}" - - if current_epoch > 0: - bar += f", Epoch: {epoch_string}" + output = f"Global Step: {str(step)}, Local Loss: {avg_loss:.3f}" if hasattr(trainer.strategy, "num_peers"): - bar += f", Peers: {trainer.strategy.num_peers}" - - if trainer.pbar.n != step: - trainer.pbar.update(step - trainer.pbar.n) - - # this is a dumb hack to make TQDM print in Docker - if random.random() < 0.01: - print() + output += f", Peers: {self.num_peers}" - trainer.pbar.set_description(bar) + if step != self.previous_step or self.num_peers != trainer.strategy.num_peers: + print(output) + self.previous_step = step + self.num_peers = trainer.strategy.num_peers def average_loss(self, current_loss, prev_avg_loss, smoothing): if prev_avg_loss is None: @@ -403,7 +357,7 @@ def average_loss(self, current_loss, prev_avg_loss, smoothing): return (smoothing * current_loss) + (1 - smoothing) * prev_avg_loss -train_params["callbacks"].append(MinerProgressBar(hparams.get("num_steps"))) +train_params["callbacks"].append(MinerConsoleLogging(hparams.get("num_steps"))) # Wrap the model in a pytorch-lightning module train_model = MinerTrainer(model, optimizer, hparams) From 07b99d772a979a1e985647386252770eb3d1cc36 Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Sat, 9 Mar 2024 14:20:33 -0600 Subject: [PATCH 09/21] save --- neurons/hiveminer.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/neurons/hiveminer.py b/neurons/hiveminer.py index 557f63d..fceb776 100644 --- a/neurons/hiveminer.py +++ b/neurons/hiveminer.py @@ -71,7 +71,7 @@ "gpt2", n_emb=block_size, n_ctx=block_size, - n_layer=16, + n_layer=6, n_head=16, n_positions=block_size, n_inner=block_size * 4, @@ -322,7 +322,6 @@ def __init__(self, num_steps): self.num_peers = 0 self.previous_step = None self.prev_avg_loss = None - self.smoothing = 0.01 def on_train_batch_end(self, trainer, lm, outputs, batch, batch_idx): super().on_train_batch_end(trainer, lm, outputs, batch, batch_idx) @@ -335,22 +334,20 @@ def on_train_batch_end(self, trainer, lm, outputs, batch, batch_idx): avg_loss = 0 if not isnan(current_loss): - avg_loss = self.average_loss( - current_loss, self.prev_avg_loss, self.smoothing - ) + avg_loss = self.average_loss(current_loss, self.prev_avg_loss) self.prev_avg_loss = avg_loss output = f"Global Step: {str(step)}, Local Loss: {avg_loss:.3f}" if hasattr(trainer.strategy, "num_peers"): - output += f", Peers: {self.num_peers}" + output += f", Peers: {trainer.strategy.num_peers}" if step != self.previous_step or self.num_peers != trainer.strategy.num_peers: print(output) self.previous_step = step self.num_peers = trainer.strategy.num_peers - def average_loss(self, current_loss, prev_avg_loss, smoothing): + def average_loss(self, current_loss, prev_avg_loss, smoothing=0.01): if prev_avg_loss is None: return current_loss else: From 25b2e91a7376bb69c9ac8af3b4bcc3a4529a5125 Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Sat, 9 Mar 2024 14:25:04 -0600 Subject: [PATCH 10/21] nits --- neurons/hiveminer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/neurons/hiveminer.py b/neurons/hiveminer.py index fceb776..41cd677 100644 --- a/neurons/hiveminer.py +++ b/neurons/hiveminer.py @@ -13,11 +13,9 @@ from datasets import load_dataset from lightning.fabric.utilities.seed import reset_seed, seed_everything from lightning.pytorch import LightningModule -from lightning.pytorch.accelerators import TPUAccelerator from lightning.pytorch.callbacks import Callback from lightning.pytorch.core.datamodule import LightningDataModule from lightning.pytorch.trainer import Trainer -from lightning.pytorch.utilities import CombinedLoader from lightning_hivemind.strategy import HivemindStrategy from torch.optim import AdamW from torch.utils.data import DataLoader, Dataset, IterableDataset @@ -57,7 +55,7 @@ batch_size = args.batch_size block_size = 1024 num_steps = 100_000 -target_batch_size = 256 +target_batch_size = 8 dataset_config = { "dataset": "tiiuae/falcon-refinedweb", From db0003d561efe1ba96b912a64a8cae560b1996ad Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Sat, 9 Mar 2024 16:11:16 -0600 Subject: [PATCH 11/21] fix global epoch issue --- neurons/hiveminer.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/neurons/hiveminer.py b/neurons/hiveminer.py index 41cd677..9287433 100644 --- a/neurons/hiveminer.py +++ b/neurons/hiveminer.py @@ -32,6 +32,7 @@ parser = argparse.ArgumentParser( description="Get configs from arguments to this script." ) + parser.add_argument( "--initial_peers", type=list, @@ -45,6 +46,7 @@ type=int, help="The largest batch size able to fit on your GPU.", default=1, + const=1, nargs="?", ) @@ -55,7 +57,7 @@ batch_size = args.batch_size block_size = 1024 num_steps = 100_000 -target_batch_size = 8 +target_batch_size = 256 dataset_config = { "dataset": "tiiuae/falcon-refinedweb", @@ -70,7 +72,7 @@ n_emb=block_size, n_ctx=block_size, n_layer=6, - n_head=16, + n_head=6, n_positions=block_size, n_inner=block_size * 4, resid_pdrop=0.1, @@ -117,6 +119,7 @@ def train_dataloader(self): ) +# create an iterable dataset, which loops over the streaming data class StreamingDataset(IterableDataset): def __init__(self, tokenizer, config): self.tokenizer = tokenizer @@ -190,10 +193,10 @@ def training_step(self, batch, batch_idx): ) return loss - def on_train_batch_end(self, trainer, lm, outputs): + def on_train_batch_end(self, trainer, outputs, idx): self.log( "step", - int(self.trainer.current_epoch), + int(self.trainer.strategy.optimizers[0].local_epoch), on_step=True, on_epoch=False, sync_dist=True, @@ -209,7 +212,7 @@ def configure_optimizers(self): learning_rate=0.001, weight_decay=0.1, eps=1e-8, - warmup_steps=0, + warmup_steps=10, batch_size=batch_size, num_steps=num_steps, block_size=block_size, @@ -217,7 +220,7 @@ def configure_optimizers(self): # define the hivemind strategy strategy = HivemindStrategy( - run_id=f"hivetrain-z", + run_id=f"hiveminer", batch_size=batch_size, target_batch_size=target_batch_size, initial_peers=initial_peers, @@ -323,7 +326,6 @@ def __init__(self, num_steps): def on_train_batch_end(self, trainer, lm, outputs, batch, batch_idx): super().on_train_batch_end(trainer, lm, outputs, batch, batch_idx) - step = int(trainer.callback_metrics.get("step", -1)) if step == -1: return From ba50f250bb57382d95122be3076080a4d2ec9784 Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Sat, 9 Mar 2024 16:24:45 -0600 Subject: [PATCH 12/21] fix argparse --- neurons/hiveminer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neurons/hiveminer.py b/neurons/hiveminer.py index 9287433..364de24 100644 --- a/neurons/hiveminer.py +++ b/neurons/hiveminer.py @@ -35,7 +35,7 @@ parser.add_argument( "--initial_peers", - type=list, + action="append", help="Your peer. Use --initial_peers multiple times to pass multiple peers.", default=[], nargs="?", @@ -69,7 +69,7 @@ # initialized and load the model config = AutoConfig.from_pretrained( "gpt2", - n_emb=block_size, + n_embd=block_size, n_ctx=block_size, n_layer=6, n_head=6, From 2a63f9fa6474bf189bbba7ec00de7d67b6b3d684 Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Sat, 9 Mar 2024 16:25:40 -0600 Subject: [PATCH 13/21] fix model args --- neurons/hiveminer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neurons/hiveminer.py b/neurons/hiveminer.py index 364de24..15d7d59 100644 --- a/neurons/hiveminer.py +++ b/neurons/hiveminer.py @@ -71,8 +71,8 @@ "gpt2", n_embd=block_size, n_ctx=block_size, - n_layer=6, - n_head=6, + n_layer=8, + n_head=8, n_positions=block_size, n_inner=block_size * 4, resid_pdrop=0.1, From e0a137a8f2dc155d4860097d4f6069fc937ea325 Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Sat, 9 Mar 2024 16:35:26 -0600 Subject: [PATCH 14/21] fix argparse again --- neurons/hiveminer.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/neurons/hiveminer.py b/neurons/hiveminer.py index 15d7d59..2b074c0 100644 --- a/neurons/hiveminer.py +++ b/neurons/hiveminer.py @@ -36,9 +36,9 @@ parser.add_argument( "--initial_peers", action="append", - help="Your peer. Use --initial_peers multiple times to pass multiple peers.", + help="Add a peer. Can be used multiple times to pass multiple peers.", + nargs="*", default=[], - nargs="?", ) parser.add_argument( @@ -52,8 +52,18 @@ args = parser.parse_args() + +def flatten_list(nested_list): + """Flatten a nested list.""" + if nested_list and isinstance(nested_list[0], list): + # Assumes only one level of nesting + return [item for sublist in nested_list for item in sublist] + return nested_list + + # set some basic configuration values -initial_peers = args.initial_peers +initial_peers = flatten_list(args.initial_peers) + batch_size = args.batch_size block_size = 1024 num_steps = 100_000 From 63f26a24eda94dac83adeaed0ccc701377f195f5 Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Sat, 9 Mar 2024 16:50:59 -0600 Subject: [PATCH 15/21] change hparams --- neurons/hiveminer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neurons/hiveminer.py b/neurons/hiveminer.py index 2b074c0..f46b921 100644 --- a/neurons/hiveminer.py +++ b/neurons/hiveminer.py @@ -67,7 +67,7 @@ def flatten_list(nested_list): batch_size = args.batch_size block_size = 1024 num_steps = 100_000 -target_batch_size = 256 +target_batch_size = 8192 dataset_config = { "dataset": "tiiuae/falcon-refinedweb", From 34b1e7f480f11f7cc855031ea4f502471e39ae54 Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Sat, 9 Mar 2024 17:10:01 -0600 Subject: [PATCH 16/21] new run --- compose.yml | 1 + neurons/hiveminer.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/compose.yml b/compose.yml index 8ae2c92..3c276a1 100644 --- a/compose.yml +++ b/compose.yml @@ -10,6 +10,7 @@ services: tty: true stdin_open: true build: + shm_size: '4gb' dockerfile: Dockerfile volumes: - ./neurons:/app/neurons diff --git a/neurons/hiveminer.py b/neurons/hiveminer.py index f46b921..73d25a1 100644 --- a/neurons/hiveminer.py +++ b/neurons/hiveminer.py @@ -63,6 +63,7 @@ def flatten_list(nested_list): # set some basic configuration values initial_peers = flatten_list(args.initial_peers) +print(initial_peers) batch_size = args.batch_size block_size = 1024 @@ -84,7 +85,7 @@ def flatten_list(nested_list): n_layer=8, n_head=8, n_positions=block_size, - n_inner=block_size * 4, + n_inner=block_size * 6, resid_pdrop=0.1, embd_pdrop=0.1, attn_pdrop=0.1, @@ -240,8 +241,8 @@ def configure_optimizers(self): verbose=False, wait_timeout=30, bootstrap_timeout=20, - matchmaking_time=45.0, - averaging_timeout=180.0, + matchmaking_time=60.0, + averaging_timeout=300.0, delay_state_averaging=True, delay_grad_averaging=True, delay_optimizer_step=True, From f282bf5c9154208400535de63a1c181c3d26398d Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Sat, 9 Mar 2024 17:30:12 -0600 Subject: [PATCH 17/21] add instructions --- DOCKER.md | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 DOCKER.md diff --git a/DOCKER.md b/DOCKER.md new file mode 100644 index 0000000..5879882 --- /dev/null +++ b/DOCKER.md @@ -0,0 +1,75 @@ +# how to use hivetrain for docker + +## install dependencies + +1. [Docker](https://docs.docker.com/engine/install/) +2. [Nvidia Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) + +## clone the repo +``` +git clone https://github.com/LuciferianInk/DistributedTraining.git +``` + +## move into the repo +``` +cd DistributedTraining +``` + +## checkout the dev branch +``` +git checkout docker-setup +``` + +## build the docker image +``` +docker compose build +``` + +## make a .env file +Make a file call `.env`, and place it in the root of this project. + +## make a choice +At this point, you must make one of two choices: + +### 1. bootstrap +If you intend to bootstrap a new training run. +``` +docker compose up +``` + +### 2. join +If you intend to join an existing training run, then add this environment variable to your `.env` file: +``` +INITIAL_PEERS="/p2p/12D3KooWE1fyvZHhuc2UQqAN35oXgexHKRpVqgXKo9EUQ4hguny9" +``` +After that, you may join the training run with: +``` +docker compose up +``` + +## final notes + +Your machine will print your own peer ID to the console at startup. It should look like this: +``` +PEER-ID: /p2p/12D3KooWF9KB7PVUdbct4ryCMzDjbNT1q2w5XMw9iVG6tisY4ThB +``` +If Hivemind is under-utilizing your GPU (i.e. it's not using all of your available VRAM), you may try to increase the batch size being used. To do this, add this environment variable to your `.env` file: +``` +BATCH_SIZE=2 (or 3, or whatever) +``` +You will know that training is progressing when you see output like this: +``` +hivetrain-1 | LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] +hivetrain-1 | +hivetrain-1 | | Name | Type | Params +hivetrain-1 | ------------------------------------------ +hivetrain-1 | 0 | model | GPT2LMHeadModel | 186 M +hivetrain-1 | ------------------------------------------ +hivetrain-1 | 186 M Trainable params +hivetrain-1 | 0 Non-trainable params +hivetrain-1 | 186 M Total params +hivetrain-1 | 747.418 Total estimated model params size (MB) +hivetrain-1 | Global Step: 0, Local Loss: 12.069, Peers: 0 +hivetrain-1 | Global Step: 0, Local Loss: 12.063, Peers: 1 +hivetrain-1 | Global Step: 0, Local Loss: 11.852, Peers: 2 +``` \ No newline at end of file From a3ef9c00c195f35a0f390561c0f026a0c00a2509 Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Sun, 10 Mar 2024 09:51:17 -0500 Subject: [PATCH 18/21] typo --- DOCKER.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DOCKER.md b/DOCKER.md index 5879882..f98736e 100644 --- a/DOCKER.md +++ b/DOCKER.md @@ -26,7 +26,7 @@ docker compose build ``` ## make a .env file -Make a file call `.env`, and place it in the root of this project. +Make a file called `.env`, and place it in the root of this project. ## make a choice At this point, you must make one of two choices: From 4a41dc926edc3922505217224f125313d1d433a4 Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Sun, 10 Mar 2024 10:26:31 -0500 Subject: [PATCH 19/21] isolate docker deps, add model saver callback --- .gitignore | 1 + Dockerfile | 7 ++++-- compose.yml | 1 + entrypoint.sh | 3 ++- example.env | 3 ++- neurons/hiveminer.py | 50 +++++++++++++++++++++++++++++++++++++++-- requirements.docker.txt | 2 ++ requirements.txt | 1 - 8 files changed, 61 insertions(+), 7 deletions(-) create mode 100644 requirements.docker.txt diff --git a/.gitignore b/.gitignore index ba1fe7d..280fb24 100644 --- a/.gitignore +++ b/.gitignore @@ -162,6 +162,7 @@ bittensor-subnet-template/ wandb/ .vscode/ +data wallets lightning_logs .scale_batch_size* \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 332361d..537250b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,9 +20,12 @@ COPY requirements.txt requirements.txt RUN pip install -r requirements.txt && \ pip cache purge -COPY ./ /app +COPY requirements.docker.txt requirements.docker.txt + +RUN pip install -r requirements.docker.txt && \ + pip cache purge -RUN pip install git+https://github.com/LuciferianInk/lightning-Hivemind.git +COPY ./ /app RUN pip install -e . && \ pip cache purge diff --git a/compose.yml b/compose.yml index 3c276a1..381d598 100644 --- a/compose.yml +++ b/compose.yml @@ -14,6 +14,7 @@ services: dockerfile: Dockerfile volumes: - ./neurons:/app/neurons + - ./data:/data - ./wallets:/root/.bittensor/wallets deploy: resources: diff --git a/entrypoint.sh b/entrypoint.sh index 045468b..10b16e2 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -4,4 +4,5 @@ cd /app/neurons python3 hiveminer.py \ --initial_peers ${INITIAL_PEERS} \ - --batch_size ${BATCH_SIZE} \ No newline at end of file + --batch_size ${BATCH_SIZE} \ + --save_every ${SAVE_EVERY} \ No newline at end of file diff --git a/example.env b/example.env index ab2986f..851c4fd 100644 --- a/example.env +++ b/example.env @@ -6,4 +6,5 @@ DHTPORT=42316 EXTERNALIP=104.202.156.242 CUDA_VISIBLE_DEVICES=0 -INITIAL_PEERS="/p2p/12D3KooWCvMCCJDHQ7d9pfqqkxAPD6AZdAbcXPd1d9pWvQWDpqBi" \ No newline at end of file +INITIAL_PEERS="/p2p/12D3KooWCvMCCJDHQ7d9pfqqkxAPD6AZdAbcXPd1d9pWvQWDpqBi" +SAVE_EVERY=0 \ No newline at end of file diff --git a/neurons/hiveminer.py b/neurons/hiveminer.py index 73d25a1..f50a992 100644 --- a/neurons/hiveminer.py +++ b/neurons/hiveminer.py @@ -50,6 +50,15 @@ nargs="?", ) +parser.add_argument( + "--save_every", + type=int, + help="Save the model every X global steps.", + default=0, + const=0, + nargs="?", +) + args = parser.parse_args() @@ -63,9 +72,8 @@ def flatten_list(nested_list): # set some basic configuration values initial_peers = flatten_list(args.initial_peers) -print(initial_peers) - batch_size = args.batch_size +save_every = args.save_every block_size = 1024 num_steps = 100_000 target_batch_size = 8192 @@ -365,7 +373,45 @@ def average_loss(self, current_loss, prev_avg_loss, smoothing=0.01): return (smoothing * current_loss) + (1 - smoothing) * prev_avg_loss +class MinerModelSaver(Callback): + """Periodically save the model during training.""" + + def __init__( + self, + save_every, + output_dir, + ): + super().__init__() + self.step = 0 + self.last_step = 0 + self.save_every = save_every + self.output_dir = output_dir + + @property + def save_every_check(self): + return ( + self.step > 0 + and self.save_every > 0 + and self.last_step != self.step + and self.step % self.save_every == 0 + ) + + def on_train_batch_end(self, trainer, lm, outputs, batch, batch_idx): + super().on_train_batch_end(trainer, lm, outputs, batch, batch_idx) + + self.step = int(trainer.callback_metrics.get("step", 0)) + + if self.save_every_check: + self.save_pytorch_model(trainer, lm) + + self.last_step = self.step + + def save_pytorch_model(self, trainer, lm): + lm.model.save_pretrained(self.output_dir, safe_serialization=True) + + train_params["callbacks"].append(MinerConsoleLogging(hparams.get("num_steps"))) +train_params["callbacks"].append(MinerModelSaver(save_every, "/data")) # Wrap the model in a pytorch-lightning module train_model = MinerTrainer(model, optimizer, hparams) diff --git a/requirements.docker.txt b/requirements.docker.txt new file mode 100644 index 0000000..78b062e --- /dev/null +++ b/requirements.docker.txt @@ -0,0 +1,2 @@ +lightning>=2.1.0 +git+https://github.com/LuciferianInk/lightning-Hivemind.git#egg=lightning-Hivemind \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 7f2e978..12377f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,3 @@ datasets transformers hivemind wandb -lightning>=2.1.0 From 87757128f4008fb57f09923ba00470d62fa0b938 Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Sun, 10 Mar 2024 11:03:08 -0500 Subject: [PATCH 20/21] integrate new arch libs --- Dockerfile | 10 ++- neurons/hiveminer.py | 146 +++++++++++++++++++++++++++++++++++++--- requirements.docker.txt | 3 +- 3 files changed, 145 insertions(+), 14 deletions(-) diff --git a/Dockerfile b/Dockerfile index 537250b..d7e9a48 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,11 +25,15 @@ COPY requirements.docker.txt requirements.docker.txt RUN pip install -r requirements.docker.txt && \ pip cache purge +# RUN git clone https://github.com/bit-current/NewArchScrapBook.git + +# RUN pip install -i ./NewArchScrapBook + COPY ./ /app -RUN pip install -e . && \ - pip cache purge +# RUN pip install -e . && \ +# pip cache purge -RUN python3 post_install.py +# RUN python3 post_install.py ENTRYPOINT "bash ./entrypoint.sh" \ No newline at end of file diff --git a/neurons/hiveminer.py b/neurons/hiveminer.py index f50a992..6e45447 100644 --- a/neurons/hiveminer.py +++ b/neurons/hiveminer.py @@ -11,6 +11,12 @@ import numpy as np import torch from datasets import load_dataset +from hivetrain.btt_connector import ( + BittensorNetwork, + get_validator_uids_and_addresses, + serve_axon, +) +from hivetrain.config import Configurator from lightning.fabric.utilities.seed import reset_seed, seed_everything from lightning.pytorch import LightningModule from lightning.pytorch.callbacks import Callback @@ -19,12 +25,7 @@ from lightning_hivemind.strategy import HivemindStrategy from torch.optim import AdamW from torch.utils.data import DataLoader, Dataset, IterableDataset -from transformers import ( - AutoConfig, - AutoModelForCausalLM, - AutoTokenizer, - GenerationConfig, -) +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer logging.getLogger("lightning.pytorch").setLevel(logging.INFO) @@ -74,7 +75,7 @@ def flatten_list(nested_list): initial_peers = flatten_list(args.initial_peers) batch_size = args.batch_size save_every = args.save_every -block_size = 1024 +block_size = 512 num_steps = 100_000 target_batch_size = 8192 @@ -90,10 +91,10 @@ def flatten_list(nested_list): "gpt2", n_embd=block_size, n_ctx=block_size, - n_layer=8, - n_head=8, + n_layer=2, + n_head=2, n_positions=block_size, - n_inner=block_size * 6, + n_inner=block_size * 4, resid_pdrop=0.1, embd_pdrop=0.1, attn_pdrop=0.1, @@ -410,8 +411,133 @@ def save_pytorch_model(self, trainer, lm): lm.model.save_pretrained(self.output_dir, safe_serialization=True) +class ValidationCommunicator(Callback): + """Periodically save the model during training.""" + + def __init__(self, sync_interval=600): + super().__init__() + + BittensorNetwork.initialize(Configurator.combine_configs()) + + # Now you can access wallet, subtensor, and metagraph like this: + self.wallet = BittensorNetwork.wallet + self.subtensor = BittensorNetwork.subtensor + self.metagraph = BittensorNetwork.metagraph + self.step = 0 + self.sync_interval = sync_interval + self.last_sync_time = 0 + self.validator_urls = [] + + def get_validator_uids_and_addresses( + self, metagraph: "bt.metagraph.Metagraph", vpermit_tao_limit: int = 2 + ): + """ + Check availability of all UIDs in a given subnet, returning their IP, port numbers, and hotkeys + if they are serving and have at least vpermit_tao_limit stake, along with a list of strings + formatted as 'ip:port' for each validator. + + Args: + metagraph (bt.metagraph.Metagraph): Metagraph object. + vpermit_tao_limit (int): Validator permit tao limit. + + Returns: + Tuple[List[dict], List[str]]: A tuple where the first element is a list of dicts with details + of available UIDs, including their IP, port, and hotkeys, and the + second element is a list of strings formatted as 'ip:port'. + """ + available_uid_details = [] + validator_addresses = [] # List to hold 'ip:port' strings + for uid in range(len(self.metagraph.S)): + if self.metagraph.S[uid] >= vpermit_tao_limit: + ip = self.metagraph.axons[uid].ip + port = self.metagraph.axons[uid].port + details = { + "uid": uid, + "ip": ip, + "port": port, + "hotkey": self.metagraph.hotkeys[uid], + } + available_uid_details.append(details) + validator_addresses.append( + f"{ip}:{port}" + ) # Format and add 'ip:port' to the list + + return available_uid_details, validator_addresses + + def on_train_batch_end(self, trainer, lm, outputs, batch, batch_idx): + super().on_train_batch_end(trainer, lm, outputs, batch, batch_idx) + + self.step = int(trainer.callback_metrics.get("step", 0)) + + if self.step % 100: + if self.should_sync_metagraph(): + self.resync_metagraph() + _, self.validator_urls = self.get_validator_uids_and_addresses() + message, signature, public_address = self.create_signed_message(timestamp) + + for url in self.validator_urls: + try: + requests.post( + f"http://{url}/validate_metrics", + json={"rank": rank, "checksum": checksum, "metrics": metrics}, + ) + except: + pass # FIXME log sth + requests.post() + + def create_signed_message(self, message): + """Sign a message and return the signature.""" + signature = self.wallet.hotkey.sign( + message + ).hex() # Convert bytes to hex string for easy transmission + public_address = wallet.hotkey.ss58_address + return message, signature, public_address + + def send_metrics(metrics, rank, validator_urls): + timestamp = time.time() + message, signature, public_address = create_signed_message(timestamp) + data = { + "message": message, + "signature": signature, + "public_address": public_address, + "metrics": metrics, + "rank": rank, + } + # Ensure metrics is a dictionary + if not isinstance(metrics, dict): + raise ValueError("Metrics must be provided as a dictionary.") + # Ensure validator_urls is a list + if not isinstance(validator_urls, list): + raise ValueError("validator_urls must be provided as a list.") + + def resync_metagraph(self): + self.metagraph.sync(subtensor=self.subtensor) + + def should_sync_metagraph(self): + """ + Check if enough epoch blocks have elapsed since the last checkpoint to sync. + """ + return (time.time() - self.last_sync_time) > self.sync_interval + # return ( + # self.block - self.metagraph.last_update[self.uid] + # ) > self.config.neuron.epoch_length + + def check_registered(self): + # --- Check for registration. + if not self.subtensor.is_hotkey_registered( + netuid=self.config.netuid, + hotkey_ss58=self.wallet.hotkey.ss58_address, + ): + bt.logging.error( + f"Wallet: {self.wallet} is not registered on netuid {self.config.netuid}." + f" Please register the hotkey using `btcli subnets register` before trying again" + ) + exit() + + train_params["callbacks"].append(MinerConsoleLogging(hparams.get("num_steps"))) train_params["callbacks"].append(MinerModelSaver(save_every, "/data")) +# train_params["callbacks"].append(ValidationCommunicator()) # Wrap the model in a pytorch-lightning module train_model = MinerTrainer(model, optimizer, hparams) diff --git a/requirements.docker.txt b/requirements.docker.txt index 78b062e..9141cc6 100644 --- a/requirements.docker.txt +++ b/requirements.docker.txt @@ -1,2 +1,3 @@ lightning>=2.1.0 -git+https://github.com/LuciferianInk/lightning-Hivemind.git#egg=lightning-Hivemind \ No newline at end of file +git+https://github.com/LuciferianInk/lightning-Hivemind.git#egg=lightning-Hivemind +git+https://github.com/bit-current/NewArchScrapBook.git#egg=hivetrain \ No newline at end of file From 264c617bcece85b0ad0cd9df791c9a480c93588d Mon Sep 17 00:00:00 2001 From: Luciferian Ink Date: Sun, 10 Mar 2024 12:03:31 -0500 Subject: [PATCH 21/21] fix --- Dockerfile | 9 --------- 1 file changed, 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index d7e9a48..9aff565 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,15 +25,6 @@ COPY requirements.docker.txt requirements.docker.txt RUN pip install -r requirements.docker.txt && \ pip cache purge -# RUN git clone https://github.com/bit-current/NewArchScrapBook.git - -# RUN pip install -i ./NewArchScrapBook - COPY ./ /app -# RUN pip install -e . && \ -# pip cache purge - -# RUN python3 post_install.py - ENTRYPOINT "bash ./entrypoint.sh" \ No newline at end of file