nopeanuts
diff --git a/README.md b/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/src/dalle_mini/__init__.py b/src/dalle_mini/__init__.py
Lines changed: 3 additions & 1 deletion
diff --git a/src/dalle_mini/data.py b/src/dalle_mini/data.py
Lines changed: 69 additions & 20 deletions
diff --git a/src/dalle_mini/model/__init__.py b/src/dalle_mini/model/__init__.py
Lines changed: 1 addition & 0 deletions
@@ -35,7 +35,6 @@ To generate sample predictions and understand the inference pipeline step by ste
 Join the community on the [DALLE-Pytorch Discord](https://discord.gg/xBPBXfcFHd).
 Any contribution is welcome, from reporting issues to proposing fixes/improvements or testing the model with cool prompts!
 
-
 ## Development
 
 ### Dependencies Installation
@@ -95,6 +94,7 @@ Many thanks to the people who helped make it better:
 
 - the [DALLE-Pytorch](https://discord.gg/xBPBXfcFHd) and [EleutherAI](https://www.eleuther.ai/) communities for testing and exchanging cool ideas
 - [Rohan Anil](https://github.com/rohan-anil) for adding Distributed Shampoo optimizer
+- [Katherine Crowson](https://github.com/crowsonkb) for [super conditioning](https://twitter.com/RiversHaveWings/status/1478093658716966912)
 
 ## Citing DALL·E mini
 
 
@@ -1 +1,3 @@
-__version__ = "0.0.2"
+__version__ = "0.0.3"
+
+from .model import DalleBart, DalleBartProcessor
@@ -7,7 +7,7 @@
 from braceexpand import braceexpand
 from datasets import Dataset, load_dataset
 
-from .text import TextNormalizer
+from .model.text import TextNormalizer
 
 
 @dataclass
@@ -28,6 +28,11 @@ class Dataset:
     seed_dataset: int = None
     shard_by_host: bool = False
     blank_caption_prob: float = 0.0
+    clip_score_column: str = "clip_score"
+    min_clip_score: float = None
+    max_clip_score: float = None
+    filter_column: str = None
+    filter_value: str = None
     train_dataset: Dataset = field(init=False)
     eval_dataset: Dataset = field(init=False)
     rng_dataset: jnp.ndarray = field(init=False)
@@ -36,6 +41,7 @@ class Dataset:
     def __post_init__(self):
         self.multi_hosts = jax.process_count() > 1
         # feed blank captions only in streaming mode for now
+        # otherwise dataset could be cached with same blanked captions
         if self.blank_caption_prob:
             assert (
                 self.streaming is True
@@ -107,23 +113,30 @@ def preprocess(self, tokenizer, config):
                 self.seed_dataset = np.random.get_state()[1][0]
             self.rng_dataset = jax.random.PRNGKey(self.seed_dataset)
 
-        # blank captions
-        if self.blank_caption_prob:
-            partial_blank_caption_function = partial(
-                blank_caption_function,
-                text_column=self.text_column,
-                blank_caption_prob=self.blank_caption_prob,
-            )
-            if hasattr(self, "train_dataset"):
-                self.train_dataset = (
-                    self.train_dataset.map(partial_blank_caption_function)
-                    if self.streaming
-                    else self.train_dataset.map(
-                        partial_blank_caption_function,
-                        num_proc=self.preprocessing_num_workers,
-                        load_from_cache_file=False,
-                        desc="Blanking some captions",
-                    )
+        # filter data
+        partial_filter_function = partial(
+            filter_function,
+            filter_column=self.filter_column,
+            filter_value=self.filter_value,
+            clip_score_column=self.clip_score_column,
+            min_clip_score=self.min_clip_score,
+            max_clip_score=self.max_clip_score,
+        )
+        for ds in ["train_dataset", "eval_dataset"]:
+            if hasattr(self, ds):
+                setattr(
+                    self,
+                    ds,
+                    (
+                        getattr(self, ds).filter(partial_filter_function)
+                        if self.streaming
+                        else getattr(self, ds).filter(
+                            partial_filter_function,
+                            num_proc=self.preprocessing_num_workers,
+                            load_from_cache_file=not self.overwrite_cache,
+                            desc="Filtering datasets",
+                        )
+                    ),
                 )
 
         # normalize text
@@ -151,6 +164,25 @@ def preprocess(self, tokenizer, config):
                         ),
                     )
 
+        # blank captions
+        if self.blank_caption_prob:
+            partial_blank_caption_function = partial(
+                blank_caption_function,
+                text_column=self.text_column,
+                blank_caption_prob=self.blank_caption_prob,
+            )
+            if hasattr(self, "train_dataset"):
+                self.train_dataset = (
+                    self.train_dataset.map(partial_blank_caption_function)
+                    if self.streaming
+                    else self.train_dataset.map(
+                        partial_blank_caption_function,
+                        num_proc=self.preprocessing_num_workers,
+                        load_from_cache_file=False,
+                        desc="Blanking some captions",
+                    )
+                )
+
         # preprocess
         partial_preprocess_function = partial(
             preprocess_function,
@@ -230,8 +262,8 @@ def _dataloader_datasets_streaming(
                     dataset.set_epoch(epoch)
                     epoch += 1
                 for item in dataset:
-                    for k, v in item.items():
-                        batch[k].append(v)
+                    for k in keys:
+                        batch[k].append(item[k])
                     if len(batch[keys[0]]) == batch_size:
                         batch = {k: jnp.array(v) for k, v in batch.items()}
                         yield batch
@@ -292,6 +324,23 @@ def normalize_function(example, text_column, text_normalizer):
     return example
 
 
+def filter_function(
+    example,
+    min_clip_score,
+    max_clip_score,
+    clip_score_column,
+    filter_column,
+    filter_value,
+):
+    if min_clip_score is not None and example[clip_score_column] < min_clip_score:
+        return False
+    if max_clip_score is not None and example[clip_score_column] > max_clip_score:
+        return False
+    if filter_column is not None and example[filter_column] != filter_value:
+        return False
+    return True
+
+
 def preprocess_function(
     examples,
     tokenizer,
 
@@ -1,4 +1,5 @@
 from .configuration import DalleBartConfig
 from .modeling import DalleBart
 from .partitions import set_partitions
+from .processor import DalleBartProcessor
 from .tokenizer import DalleBartTokenizer