diff --git a/.github/workflows/pr_modular_tests.yml b/.github/workflows/pr_modular_tests.yml index eec8316c5465..a64ecb7229dc 100644 --- a/.github/workflows/pr_modular_tests.yml +++ b/.github/workflows/pr_modular_tests.yml @@ -73,6 +73,7 @@ jobs: python utils/check_copies.py python utils/check_dummies.py python utils/check_support_list.py + python utils/check_forward_call_docstrings.py make deps_table_check_updated - name: Check if failure if: ${{ failure() }} diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index 27adcef2422c..668b4ca33008 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -68,6 +68,7 @@ jobs: python utils/check_copies.py python utils/check_dummies.py python utils/check_support_list.py + python utils/check_forward_call_docstrings.py make deps_table_check_updated - name: Check if failure if: ${{ failure() }} diff --git a/.github/workflows/pr_tests_gpu.yml b/.github/workflows/pr_tests_gpu.yml index 41dd7781f334..ddd7d551f2de 100644 --- a/.github/workflows/pr_tests_gpu.yml +++ b/.github/workflows/pr_tests_gpu.yml @@ -69,6 +69,7 @@ jobs: python utils/check_copies.py python utils/check_dummies.py python utils/check_support_list.py + python utils/check_forward_call_docstrings.py make deps_table_check_updated - name: Check if failure if: ${{ failure() }} diff --git a/Makefile b/Makefile index 138b0bfa5101..b104e829939f 100644 --- a/Makefile +++ b/Makefile @@ -36,6 +36,7 @@ repo-consistency: python utils/check_dummies.py python utils/check_repo.py python utils/check_inits.py + python utils/check_forward_call_docstrings.py # this target runs checks on all files @@ -74,6 +75,10 @@ fix-copies: modular-autodoctrings: python utils/modular_auto_docstring.py +# Verify forward() / __call__() arguments are documented in their docstrings +check-forward-call-docstrings: + python utils/check_forward_call_docstrings.py + # Run tests for the library test: diff --git a/src/diffusers/models/adapter.py 
b/src/diffusers/models/adapter.py index f0652c581a3e..3cf959fc3376 100644 --- a/src/diffusers/models/adapter.py +++ b/src/diffusers/models/adapter.py @@ -269,6 +269,10 @@ def forward(self, x: torch.Tensor) -> list[torch.Tensor]: each representing information extracted at a different scale from the input. The length of the list is determined by the number of downsample blocks in the Adapter, as specified by the `channels` and `num_res_blocks` parameters during initialization. + + Args: + x (`torch.Tensor`): + The input tensor to process through the adapter model. """ return self.adapter(x) diff --git a/src/diffusers/models/autoencoders/autoencoder_asym_kl.py b/src/diffusers/models/autoencoders/autoencoder_asym_kl.py index fbd9b3e459f7..1614164b400d 100644 --- a/src/diffusers/models/autoencoders/autoencoder_asym_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_asym_kl.py @@ -166,6 +166,9 @@ def forward( Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. """ x = sample posterior = self.encode(x).latent_dist diff --git a/src/diffusers/models/autoencoders/autoencoder_dc.py b/src/diffusers/models/autoencoders/autoencoder_dc.py index 02a83d79aba5..3ec59673d0a0 100644 --- a/src/diffusers/models/autoencoders/autoencoder_dc.py +++ b/src/diffusers/models/autoencoders/autoencoder_dc.py @@ -706,6 +706,12 @@ def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> DecoderOutp return DecoderOutput(sample=decoded) def forward(self, sample: torch.Tensor, return_dict: bool = True) -> torch.Tensor: + r""" + Args: + sample (`torch.Tensor`): Input sample. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. 
+ """ encoded = self.encode(sample, return_dict=False)[0] decoded = self.decode(encoded, return_dict=False)[0] if not return_dict: diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py index d2e7318f5679..2ce9b0179b30 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl.py @@ -424,6 +424,9 @@ def forward( Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. """ x = sample posterior = self.encode(x).latent_dist diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py index 9921e3932465..cf6a2e838008 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py @@ -1409,6 +1409,17 @@ def forward( return_dict: bool = True, generator: torch.Generator | None = None, ) -> torch.Tensor | torch.Tensor: + r""" + Args: + sample (`torch.Tensor`): Input sample. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. 
+ """ x = sample posterior = self.encode(x).latent_dist if sample_posterior: diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_cosmos.py b/src/diffusers/models/autoencoders/autoencoder_kl_cosmos.py index 4fe1f62890be..199c244421d5 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_cosmos.py @@ -1078,6 +1078,17 @@ def forward( return_dict: bool = True, generator: torch.Generator | None = None, ) -> tuple[torch.Tensor] | DecoderOutput: + r""" + Args: + sample (`torch.Tensor`): Input sample. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. + """ x = sample posterior = self.encode(x).latent_dist if sample_posterior: diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_flux2.py b/src/diffusers/models/autoencoders/autoencoder_kl_flux2.py index 36ce143ebd07..83b2eb0b885b 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_flux2.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_flux2.py @@ -441,6 +441,9 @@ def forward( Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. 
""" x = sample posterior = self.encode(x).latent_dist diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py index a19c267b6d36..f407d38c93e2 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py @@ -1061,6 +1061,9 @@ def forward( Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. """ x = sample posterior = self.encode(x).latent_dist diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage.py b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage.py index 6922ac853554..238ad8dd37d2 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage.py @@ -674,8 +674,13 @@ def forward( """ Args: sample (`torch.Tensor`): Input sample. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. 
""" posterior = self.encode(sample).latent_dist if sample_posterior: diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage_refiner.py b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage_refiner.py index 9f53371aadf5..f2b6d1707be2 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage_refiner.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage_refiner.py @@ -908,6 +908,9 @@ def forward( Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. """ x = sample posterior = self.encode(x).latent_dist diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanvideo15.py b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanvideo15.py index e43483b92240..374e7011a2eb 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanvideo15.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanvideo15.py @@ -941,6 +941,9 @@ def forward( Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. """ x = sample posterior = self.encode(x).latent_dist diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_kvae.py b/src/diffusers/models/autoencoders/autoencoder_kl_kvae.py index 1bd2363af448..e429eac3a4ff 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_kvae.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_kvae.py @@ -787,6 +787,9 @@ def forward( Whether to sample from the posterior. 
return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. """ x = sample posterior = self.encode(x).latent_dist diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_kvae_video.py b/src/diffusers/models/autoencoders/autoencoder_kl_kvae_video.py index 7038f45fc30e..d853ed9f5a93 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_kvae_video.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_kvae_video.py @@ -942,6 +942,17 @@ def forward( return_dict: bool = True, generator: Optional[torch.Generator] = None, ) -> Union[DecoderOutput, torch.Tensor]: + r""" + Args: + sample (`torch.Tensor`): Input sample. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. + """ x = sample posterior = self.encode(x).latent_dist if sample_posterior: diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py index a7acc105e9ec..d0104392e58a 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py @@ -1522,6 +1522,19 @@ def forward( return_dict: bool = True, generator: torch.Generator | None = None, ) -> torch.Tensor | torch.Tensor: + r""" + Args: + sample (`torch.Tensor`): Input sample. + temb (`torch.Tensor`, *optional*): + Optional timestep embedding tensor used to condition the decoder. 
+ sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. + """ x = sample posterior = self.encode(x).latent_dist if sample_posterior: diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx2.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx2.py index f4f7d46628c8..9e4bdad8fd8f 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx2.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx2.py @@ -1542,6 +1542,23 @@ def forward( return_dict: bool = True, generator: torch.Generator | None = None, ) -> torch.Tensor | torch.Tensor: + r""" + Args: + sample (`torch.Tensor`): Input sample. + temb (`torch.Tensor`, *optional*): + Optional timestep embedding tensor used to condition the decoder. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. + encoder_causal (`bool`, *optional*): + Whether the encoder should use causal convolutions. If `None`, falls back to the model default. + decoder_causal (`bool`, *optional*): + Whether the decoder should use causal convolutions. If `None`, falls back to the model default. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. 
+ """ x = sample posterior = self.encode(x, causal=encoder_causal).latent_dist if sample_posterior: diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx2_audio.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx2_audio.py index f9390dab5b74..5826519ff3de 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx2_audio.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx2_audio.py @@ -792,6 +792,17 @@ def forward( return_dict: bool = True, generator: torch.Generator | None = None, ) -> DecoderOutput | torch.Tensor: + r""" + Args: + sample (`torch.Tensor`): Input sample. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. + """ posterior = self.encode(sample).latent_dist if sample_posterior: z = posterior.sample(generator=generator) diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_magvit.py b/src/diffusers/models/autoencoders/autoencoder_kl_magvit.py index ea0e2cd00d52..1bd27c1f6fe2 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_magvit.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_magvit.py @@ -1057,6 +1057,9 @@ def forward( Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. 
""" x = sample posterior = self.encode(x).latent_dist diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py b/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py index a0f831c867b0..d353bc80acb7 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py @@ -1093,6 +1093,17 @@ def forward( return_dict: bool = True, generator: torch.Generator | None = None, ) -> torch.Tensor | torch.Tensor: + r""" + Args: + sample (`torch.Tensor`): Input sample. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. + """ x = sample posterior = self.encode(x).latent_dist if sample_posterior: diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py b/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py index eb45c3c7ee3c..f3babf3039d5 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py @@ -1043,8 +1043,13 @@ def forward( """ Args: sample (`torch.Tensor`): Input sample. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. 
""" x = sample posterior = self.encode(x).latent_dist diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py index 95d4b0b7b535..285f7ce848f5 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py @@ -287,6 +287,11 @@ def forward( Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. + num_frames (`int`, *optional*, defaults to 1): + The number of frames to decode per batch. """ x = sample posterior = self.encode(x).latent_dist diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_wan.py b/src/diffusers/models/autoencoders/autoencoder_kl_wan.py index 7ba0de0f4a18..a4e456969203 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_wan.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_wan.py @@ -1416,8 +1416,13 @@ def forward( """ Args: sample (`torch.Tensor`): Input sample. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. 
""" x = sample posterior = self.encode(x).latent_dist diff --git a/src/diffusers/models/autoencoders/autoencoder_longcat_audio_dit.py b/src/diffusers/models/autoencoders/autoencoder_longcat_audio_dit.py index 455599a30f60..c69dab831728 100644 --- a/src/diffusers/models/autoencoders/autoencoder_longcat_audio_dit.py +++ b/src/diffusers/models/autoencoders/autoencoder_longcat_audio_dit.py @@ -393,6 +393,17 @@ def forward( return_dict: bool = True, generator: torch.Generator | None = None, ) -> LongCatAudioDiTVaeDecoderOutput | tuple[torch.Tensor]: + r""" + Args: + sample (`torch.Tensor`): Input sample. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`LongCatAudioDiTVaeDecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. + """ latents = self.encode(sample, sample_posterior=sample_posterior, return_dict=True, generator=generator).latents decoded = self.decode(latents, return_dict=True).sample if not return_dict: diff --git a/src/diffusers/models/autoencoders/autoencoder_oobleck.py b/src/diffusers/models/autoencoders/autoencoder_oobleck.py index d01018213897..0e51c2c636b1 100644 --- a/src/diffusers/models/autoencoders/autoencoder_oobleck.py +++ b/src/diffusers/models/autoencoders/autoencoder_oobleck.py @@ -528,6 +528,9 @@ def forward( Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`OobleckDecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. 
""" x = sample posterior = self.encode(x).latent_dist diff --git a/src/diffusers/models/autoencoders/autoencoder_rae.py b/src/diffusers/models/autoencoders/autoencoder_rae.py index 58ea66f8d18d..432a8fe32217 100644 --- a/src/diffusers/models/autoencoders/autoencoder_rae.py +++ b/src/diffusers/models/autoencoders/autoencoder_rae.py @@ -682,6 +682,15 @@ def decode(self, z: torch.Tensor, return_dict: bool = True) -> DecoderOutput | t def forward( self, sample: torch.Tensor, return_dict: bool = True, generator: torch.Generator | None = None ) -> DecoderOutput | tuple[torch.Tensor]: + r""" + Args: + sample (`torch.Tensor`): Input sample. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. + """ latents = self.encode(sample, return_dict=False, generator=generator)[0] decoded = self.decode(latents, return_dict=False)[0] if not return_dict: diff --git a/src/diffusers/models/autoencoders/autoencoder_vidtok.py b/src/diffusers/models/autoencoders/autoencoder_vidtok.py index 4f05afb8a21d..36ce0726313e 100644 --- a/src/diffusers/models/autoencoders/autoencoder_vidtok.py +++ b/src/diffusers/models/autoencoders/autoencoder_vidtok.py @@ -1440,6 +1440,19 @@ def forward( return_dict: bool = True, generator: Optional[torch.Generator] = None, ) -> Union[torch.Tensor, DecoderOutput]: + r""" + Args: + sample (`torch.Tensor`): Input sample. + sample_posterior (`bool`, *optional*, defaults to `True`): + Whether to sample from the posterior. + encoder_mode (`bool`, *optional*, defaults to `False`): + If `True`, only run the encoder and return the encoded latent without decoding. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. 
+ generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make sampling + deterministic. + """ x = sample res = 1 if self.is_causal else 0 if self.is_causal: diff --git a/src/diffusers/models/controlnets/controlnet_flux.py b/src/diffusers/models/controlnets/controlnet_flux.py index 56482c299c05..787629b70396 100644 --- a/src/diffusers/models/controlnets/controlnet_flux.py +++ b/src/diffusers/models/controlnets/controlnet_flux.py @@ -188,8 +188,12 @@ def forward( from the embeddings of input conditions. timestep ( `torch.LongTensor`): Used to indicate denoising step. - block_controlnet_hidden_states: (`list` of `torch.Tensor`): - A list of tensors that if specified are added to the residuals of transformer blocks. + img_ids (`torch.Tensor`): + Positional ids for the image tokens. + txt_ids (`torch.Tensor`): + Positional ids for the text tokens. + guidance (`torch.Tensor`, *optional*): + Guidance scale tensor used by guidance-distilled variants of the model. joint_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in @@ -355,6 +359,35 @@ def forward( joint_attention_kwargs: dict[str, Any] | None = None, return_dict: bool = True, ) -> FluxControlNetOutput | tuple: + r""" + Args: + hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`): + Input `hidden_states`. + controlnet_cond (`list` of `torch.Tensor`): + A list of conditional input tensors, one per ControlNet. + controlnet_mode (`list` of `torch.Tensor`): + A list of mode tensors selecting the control type for each ControlNet. + conditioning_scale (`list` of `float`): + A list of scale factors applied to the ControlNet outputs. 
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): + Embeddings projected from the embeddings of input conditions. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + img_ids (`torch.Tensor`): + Positional ids for the image tokens. + txt_ids (`torch.Tensor`): + Positional ids for the text tokens. + guidance (`torch.Tensor`, *optional*): + Guidance scale tensor used by guidance-distilled variants of the model. + joint_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`FluxControlNetOutput`] instead of a plain tuple. + """ # ControlNet-Union with multiple conditions # only load one ControlNet for saving memories if len(self.nets) == 1: diff --git a/src/diffusers/models/controlnets/controlnet_qwenimage.py b/src/diffusers/models/controlnets/controlnet_qwenimage.py index cfe7c159ad89..30f98cfd59d0 100644 --- a/src/diffusers/models/controlnets/controlnet_qwenimage.py +++ b/src/diffusers/models/controlnets/controlnet_qwenimage.py @@ -286,6 +286,32 @@ def forward( joint_attention_kwargs: dict[str, Any] | None = None, return_dict: bool = True, ) -> QwenImageControlNetOutput | tuple: + r""" + Args: + hidden_states (`torch.FloatTensor`): + Input `hidden_states`. + controlnet_cond (`list` of `torch.Tensor`): + A list of conditional input tensors, one per ControlNet. + conditioning_scale (`list` of `float`): + A list of scale factors applied to the ControlNet outputs. 
+ encoder_hidden_states (`torch.Tensor`, *optional*): + Conditional embeddings (embeddings computed from the input conditions such as prompts). + encoder_hidden_states_mask (`torch.Tensor`, *optional*): + Mask for the encoder hidden states. + timestep (`torch.LongTensor`, *optional*): + Used to indicate denoising step. + img_shapes (`list` of `tuple[int, int, int]`, *optional*): + Per-sample image shapes used to construct positional encodings. + txt_seq_lens (`list` of `int`, *optional*): + Deprecated. The text sequence length is now inferred from `encoder_hidden_states` and + `encoder_hidden_states_mask`. + joint_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`QwenImageControlNetOutput`] instead of a plain tuple. + """ if txt_seq_lens is not None: deprecate( "txt_seq_lens", diff --git a/src/diffusers/models/controlnets/controlnet_sana.py b/src/diffusers/models/controlnets/controlnet_sana.py index 29e5591fa284..283e60628036 100644 --- a/src/diffusers/models/controlnets/controlnet_sana.py +++ b/src/diffusers/models/controlnets/controlnet_sana.py @@ -130,6 +130,30 @@ def forward( attention_kwargs: dict[str, Any] | None = None, return_dict: bool = True, ) -> tuple[torch.Tensor, ...] | Transformer2DModelOutput: + r""" + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, channel, height, width)`): + Input `hidden_states`. + encoder_hidden_states (`torch.Tensor`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + controlnet_cond (`torch.Tensor`): + The conditional input tensor for the ControlNet. 
+ conditioning_scale (`float`, *optional*, defaults to `1.0`): + The scale factor for ControlNet outputs. + encoder_attention_mask (`torch.Tensor`, *optional*): + Attention mask applied to `encoder_hidden_states`. + attention_mask (`torch.Tensor`, *optional*): + Attention mask applied to `hidden_states`. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + """ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension. # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward. # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias. diff --git a/src/diffusers/models/controlnets/controlnet_sd3.py b/src/diffusers/models/controlnets/controlnet_sd3.py index b8cb97adb41a..0a195ce54e67 100644 --- a/src/diffusers/models/controlnets/controlnet_sd3.py +++ b/src/diffusers/models/controlnets/controlnet_sd3.py @@ -402,6 +402,27 @@ def forward( joint_attention_kwargs: dict[str, Any] | None = None, return_dict: bool = True, ) -> SD3ControlNetOutput | tuple: + r""" + Args: + hidden_states (`torch.Tensor`): + Input `hidden_states`. + controlnet_cond (`list` of `torch.Tensor`): + A list of conditional input tensors, one per ControlNet. + conditioning_scale (`list` of `float`): + A list of scale factors applied to the ControlNet outputs. + pooled_projections (`torch.Tensor`): + Embeddings projected from the embeddings of input conditions. + encoder_hidden_states (`torch.Tensor`, *optional*): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. 
+ timestep (`torch.LongTensor`, *optional*): + Used to indicate denoising step. + joint_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`SD3ControlNetOutput`] instead of a plain tuple. + """ for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)): block_samples = controlnet( hidden_states=hidden_states, diff --git a/src/diffusers/models/controlnets/controlnet_sparsectrl.py b/src/diffusers/models/controlnets/controlnet_sparsectrl.py index 7da627fe6dd4..715d9dad2c34 100644 --- a/src/diffusers/models/controlnets/controlnet_sparsectrl.py +++ b/src/diffusers/models/controlnets/controlnet_sparsectrl.py @@ -558,8 +558,6 @@ def forward( The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. conditioning_scale (`float`, defaults to `1.0`): The scale factor for ControlNet outputs. - class_labels (`torch.Tensor`, *optional*, defaults to `None`): - Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. timestep_cond (`torch.Tensor`, *optional*, defaults to `None`): Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep @@ -568,8 +566,8 @@ def forward( An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large negative values to the attention scores corresponding to "discard" tokens. 
- added_cond_kwargs (`dict`): - Additional conditions for the Stable Diffusion XL UNet. + conditioning_mask (`torch.Tensor`, *optional*, defaults to `None`): + Optional mask indicating which frames in `controlnet_cond` are valid conditioning frames. cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`): A kwargs dictionary that if specified is passed along to the `AttnProcessor`. guess_mode (`bool`, defaults to `False`): diff --git a/src/diffusers/models/controlnets/controlnet_z_image.py b/src/diffusers/models/controlnets/controlnet_z_image.py index 85fa0d365547..a4800b255ef0 100644 --- a/src/diffusers/models/controlnets/controlnet_z_image.py +++ b/src/diffusers/models/controlnets/controlnet_z_image.py @@ -661,6 +661,23 @@ def forward( patch_size=2, f_patch_size=1, ): + r""" + Args: + x (`list` of `torch.Tensor`): + A list of input image latents, one tensor per sample in the batch. + t (`torch.Tensor`): + Timestep tensor used to indicate the denoising step. + cap_feats (`list` of `torch.Tensor`): + A list of caption (text) feature tensors, one per sample. + control_context (`list` of `torch.Tensor`): + A list of control conditioning feature tensors, one per sample. + conditioning_scale (`float`, *optional*, defaults to `1.0`): + The scale factor for ControlNet outputs. + patch_size (`int`, *optional*, defaults to `2`): + Spatial patch size used to tokenize the latent. + f_patch_size (`int`, *optional*, defaults to `1`): + Temporal (frame) patch size used to tokenize the latent. 
+ """ if ( self.t_scale is None or self.t_embedder is None diff --git a/src/diffusers/models/controlnets/multicontrolnet.py b/src/diffusers/models/controlnets/multicontrolnet.py index 705c59c0f925..c28445213172 100644 --- a/src/diffusers/models/controlnets/multicontrolnet.py +++ b/src/diffusers/models/controlnets/multicontrolnet.py @@ -44,6 +44,34 @@ def forward( guess_mode: bool = False, return_dict: bool = True, ) -> ControlNetOutput | tuple: + r""" + Args: + sample (`torch.Tensor`): + The noisy input tensor. + timestep (`torch.Tensor`, `float`, or `int`): + The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): + The encoder hidden states. + controlnet_cond (`list` of `torch.Tensor`): + A list of conditional input tensors, one per ControlNet. + conditioning_scale (`list` of `float`): + A list of scale factors applied to the ControlNet outputs. + class_labels (`torch.Tensor`, *optional*): + Optional class labels for conditioning. + timestep_cond (`torch.Tensor`, *optional*): + Additional conditional embeddings for timestep. + attention_mask (`torch.Tensor`, *optional*): + Attention mask applied to `encoder_hidden_states`. + added_cond_kwargs (`dict`, *optional*): + Additional conditions for the Stable Diffusion XL UNet. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttnProcessor`. + guess_mode (`bool`, *optional*, defaults to `False`): + In this mode, the ControlNet encoder tries its best to recognize the input content even if you remove + all prompts. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`ControlNetOutput`] instead of a plain tuple. 
+ """ for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)): down_samples, mid_sample = controlnet( sample=sample, diff --git a/src/diffusers/models/controlnets/multicontrolnet_union.py b/src/diffusers/models/controlnets/multicontrolnet_union.py index 98552f99623a..b832e138a4a6 100644 --- a/src/diffusers/models/controlnets/multicontrolnet_union.py +++ b/src/diffusers/models/controlnets/multicontrolnet_union.py @@ -47,6 +47,38 @@ def forward( guess_mode: bool = False, return_dict: bool = True, ) -> ControlNetOutput | tuple: + r""" + Args: + sample (`torch.Tensor`): + The noisy input tensor. + timestep (`torch.Tensor`, `float`, or `int`): + The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): + The encoder hidden states. + controlnet_cond (`list` of `torch.Tensor`): + A list of conditional input tensors, one per ControlNet. + control_type (`list` of `torch.Tensor`): + A list of control type tensors, one per ControlNet, indicating the active control types. + control_type_idx (`list` of `list` of `int`): + Per-ControlNet list of control type indices corresponding to `controlnet_cond`. + conditioning_scale (`list` of `float`): + A list of scale factors applied to the ControlNet outputs. + class_labels (`torch.Tensor`, *optional*): + Optional class labels for conditioning. + timestep_cond (`torch.Tensor`, *optional*): + Additional conditional embeddings for timestep. + attention_mask (`torch.Tensor`, *optional*): + Attention mask applied to `encoder_hidden_states`. + added_cond_kwargs (`dict`, *optional*): + Additional conditions for the Stable Diffusion XL UNet. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttnProcessor`. + guess_mode (`bool`, *optional*, defaults to `False`): + In this mode, the ControlNet encoder tries its best to recognize the input content even if you remove + all prompts. 
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`ControlNetOutput`] instead of a plain tuple. + """ down_block_res_samples, mid_block_res_sample = None, None for i, (image, ctype, ctype_idx, scale, controlnet) in enumerate( zip(controlnet_cond, control_type, control_type_idx, conditioning_scale, self.nets) diff --git a/src/diffusers/models/transformers/auraflow_transformer_2d.py b/src/diffusers/models/transformers/auraflow_transformer_2d.py index 3fa4df738784..ff6c0c78a53b 100644 --- a/src/diffusers/models/transformers/auraflow_transformer_2d.py +++ b/src/diffusers/models/transformers/auraflow_transformer_2d.py @@ -406,6 +406,28 @@ def forward( attention_kwargs: dict[str, Any] | None = None, return_dict: bool = True, ) -> tuple[torch.Tensor] | Transformer2DModelOutput: + """ + The [`AuraFlowTransformer2DModel`] forward method. + + Args: + hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`): + Input `hidden_states`. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. 
+ """ height, width = hidden_states.shape[-2:] # Apply patch embedding, timestep embedding, and project the caption embeddings. diff --git a/src/diffusers/models/transformers/cogvideox_transformer_3d.py b/src/diffusers/models/transformers/cogvideox_transformer_3d.py index 4b8beeeb6fe3..08299f05e1b8 100644 --- a/src/diffusers/models/transformers/cogvideox_transformer_3d.py +++ b/src/diffusers/models/transformers/cogvideox_transformer_3d.py @@ -375,6 +375,35 @@ def forward( attention_kwargs: dict[str, Any] | None = None, return_dict: bool = True, ) -> tuple[torch.Tensor] | Transformer2DModelOutput: + """ + The [`CogVideoXTransformer3DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, num_frames, channels, height, width)`): + Input `hidden_states`. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + timestep_cond (`torch.Tensor`, *optional*): + Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed + through the `self.time_embedding` layer to obtain the final timestep embeddings. + ofs (`torch.Tensor`, *optional*): + Offset embeddings used in CogVideoX-5b-I2V. + image_rotary_emb (`tuple` of `torch.Tensor`, *optional*): + Pre-computed rotary positional embeddings. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. 
+ + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ batch_size, num_frames, channels, height, width = hidden_states.shape # 1. Time embedding diff --git a/src/diffusers/models/transformers/consisid_transformer_3d.py b/src/diffusers/models/transformers/consisid_transformer_3d.py index 64a58e394366..e534f9479311 100644 --- a/src/diffusers/models/transformers/consisid_transformer_3d.py +++ b/src/diffusers/models/transformers/consisid_transformer_3d.py @@ -633,6 +633,37 @@ def forward( id_vit_hidden: torch.Tensor | None = None, return_dict: bool = True, ) -> tuple[torch.Tensor] | Transformer2DModelOutput: + """ + The [`ConsisIDTransformer3DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, num_frames, channels, height, width)`): + Input `hidden_states`. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + timestep_cond (`torch.Tensor`, *optional*): + Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed + through the `self.time_embedding` layer to obtain the final timestep embeddings. + image_rotary_emb (`tuple` of `torch.Tensor`, *optional*): + Pre-computed rotary positional embeddings. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + id_cond (`torch.Tensor`, *optional*): + The face embedding extracted by the local facial extractor used for identity conditioning. 
+ id_vit_hidden (`torch.Tensor`, *optional*): + The ViT hidden states extracted from face images used for identity conditioning. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ # fuse clip and insightface valid_face_emb = None if self.is_train_face: diff --git a/src/diffusers/models/transformers/hunyuan_transformer_2d.py b/src/diffusers/models/transformers/hunyuan_transformer_2d.py index a25aa99fb8b9..83b3797c4fc3 100644 --- a/src/diffusers/models/transformers/hunyuan_transformer_2d.py +++ b/src/diffusers/models/transformers/hunyuan_transformer_2d.py @@ -392,6 +392,8 @@ def forward( Conditional embedding indicate the style image_rotary_emb (`torch.Tensor`): The image rotary embeddings to apply on query and key tensors during attention calculation. + controlnet_block_samples (`list` of `torch.Tensor`, *optional*): + A list of tensors that if specified are added to the residuals of transformer blocks. return_dict: bool Whether to return a dictionary. """ diff --git a/src/diffusers/models/transformers/latte_transformer_3d.py b/src/diffusers/models/transformers/latte_transformer_3d.py index 32e97aff8fb7..01a1e608a927 100644 --- a/src/diffusers/models/transformers/latte_transformer_3d.py +++ b/src/diffusers/models/transformers/latte_transformer_3d.py @@ -176,7 +176,7 @@ def forward( The [`LatteTransformer3DModel`] forward method. Args: - hidden_states shape `(batch size, channel, num_frame, height, width)`: + hidden_states (`torch.Tensor` of shape `(batch size, channel, num_frame, height, width)`): Input `hidden_states`. timestep ( `torch.LongTensor`, *optional*): Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`. 
diff --git a/src/diffusers/models/transformers/lumina_nextdit2d.py b/src/diffusers/models/transformers/lumina_nextdit2d.py index 46a6753b4cb1..e4fd4ce601db 100644 --- a/src/diffusers/models/transformers/lumina_nextdit2d.py +++ b/src/diffusers/models/transformers/lumina_nextdit2d.py @@ -306,6 +306,15 @@ def forward( timestep (torch.Tensor): Tensor of diffusion timesteps of shape (N,). encoder_hidden_states (torch.Tensor): Tensor of caption features of shape (N, D). encoder_mask (torch.Tensor): Tensor of caption masks of shape (N, L). + image_rotary_emb (`torch.Tensor`): + Pre-computed rotary positional embeddings. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. """ hidden_states, mask, img_size, image_rotary_emb = self.patch_embedder(hidden_states, image_rotary_emb) image_rotary_emb = image_rotary_emb.to(hidden_states.device) diff --git a/src/diffusers/models/transformers/sana_transformer.py b/src/diffusers/models/transformers/sana_transformer.py index ff078dc695d7..633ee7ae590c 100644 --- a/src/diffusers/models/transformers/sana_transformer.py +++ b/src/diffusers/models/transformers/sana_transformer.py @@ -427,6 +427,36 @@ def forward( controlnet_block_samples: tuple[torch.Tensor] | None = None, return_dict: bool = True, ) -> tuple[torch.Tensor, ...] | Transformer2DModelOutput: + """ + The [`SanaTransformer2DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, in_channels, height, width)`): + Input `hidden_states`. 
+ encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + guidance (`torch.Tensor`, *optional*): + Guidance scale embedding. + encoder_attention_mask (`torch.Tensor`, *optional*): + Cross-attention mask applied to `encoder_hidden_states`. + attention_mask (`torch.Tensor`, *optional*): + Self-attention mask applied to `hidden_states`. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + controlnet_block_samples (`tuple` of `torch.Tensor`, *optional*): + A tuple of tensors that if specified are added to the residuals of transformer blocks. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension. # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward. # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
diff --git a/src/diffusers/models/transformers/t5_film_transformer.py b/src/diffusers/models/transformers/t5_film_transformer.py index 1ae2b1e3fedb..95526a4527ce 100644 --- a/src/diffusers/models/transformers/t5_film_transformer.py +++ b/src/diffusers/models/transformers/t5_film_transformer.py @@ -90,6 +90,18 @@ def encoder_decoder_mask(self, query_input: torch.Tensor, key_input: torch.Tenso return mask.unsqueeze(-3) def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time): + """ + The [`T5FilmDecoder`] forward method. + + Args: + encodings_and_masks (`list` of `tuple` of `torch.Tensor`): + A list of `(encoding, mask)` tuples produced by upstream encoders. The encodings are concatenated and + cross-attended to by the decoder. + decoder_input_tokens (`torch.Tensor` of shape `(batch_size, seq_length, input_dims)`): + Input tokens for the decoder. + decoder_noise_time (`torch.Tensor` of shape `(batch_size,)`): + Diffusion timesteps in `[0, 1)` used to condition the decoder. + """ batch, _, _ = decoder_input_tokens.shape assert decoder_noise_time.shape == (batch,) diff --git a/src/diffusers/models/transformers/transformer_allegro.py b/src/diffusers/models/transformers/transformer_allegro.py index 934e2787674f..abe82ab578de 100644 --- a/src/diffusers/models/transformers/transformer_allegro.py +++ b/src/diffusers/models/transformers/transformer_allegro.py @@ -312,6 +312,30 @@ def forward( image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None, return_dict: bool = True, ): + """ + The [`AllegroTransformer3DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`): + Input `hidden_states`. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + timestep (`torch.LongTensor`): + Used to indicate denoising step. 
+ attention_mask (`torch.Tensor`, *optional*): + Self-attention mask applied to `hidden_states`. + encoder_attention_mask (`torch.Tensor`, *optional*): + Cross-attention mask applied to `encoder_hidden_states`. + image_rotary_emb (`tuple` of `torch.Tensor`, *optional*): + Pre-computed rotary positional embeddings. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ batch_size, num_channels, num_frames, height, width = hidden_states.shape p_t = self.config.patch_size_t p = self.config.patch_size diff --git a/src/diffusers/models/transformers/transformer_bria.py b/src/diffusers/models/transformers/transformer_bria.py index 99b7bbfd64cf..8e79046508e9 100644 --- a/src/diffusers/models/transformers/transformer_bria.py +++ b/src/diffusers/models/transformers/transformer_bria.py @@ -608,8 +608,16 @@ def forward( from the embeddings of input conditions. timestep ( `torch.LongTensor`): Used to indicate denoising step. - block_controlnet_hidden_states: (`list` of `torch.Tensor`): + img_ids (`torch.Tensor`): + Image position ids used to compute the rotary positional embeddings. + txt_ids (`torch.Tensor`): + Text position ids used to compute the rotary positional embeddings. + guidance (`torch.Tensor`, *optional*): + Guidance scale embedding used for guidance-distilled variants of the model. + controlnet_block_samples (`list` of `torch.Tensor`, *optional*): A list of tensors that if specified are added to the residuals of transformer blocks. + controlnet_single_block_samples (`list` of `torch.Tensor`, *optional*): + A list of tensors that if specified are added to the residuals of single transformer blocks. 
attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in diff --git a/src/diffusers/models/transformers/transformer_bria_fibo.py b/src/diffusers/models/transformers/transformer_bria_fibo.py index 7ddbccfa47c5..31c826bbf6b2 100644 --- a/src/diffusers/models/transformers/transformer_bria_fibo.py +++ b/src/diffusers/models/transformers/transformer_bria_fibo.py @@ -529,10 +529,18 @@ def forward( Input `hidden_states`. encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`): Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + text_encoder_layers (`list` of `torch.Tensor`): + Per-block text encoder hidden states, one tensor per transformer block. pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected from the embeddings of input conditions. timestep ( `torch.LongTensor`): Used to indicate denoising step. + img_ids (`torch.Tensor`): + Image position ids used to compute the rotary positional embeddings. + txt_ids (`torch.Tensor`): + Text position ids used to compute the rotary positional embeddings. + guidance (`torch.Tensor`, *optional*): + Guidance scale embedding used for guidance-distilled variants of the model. joint_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in diff --git a/src/diffusers/models/transformers/transformer_chroma.py b/src/diffusers/models/transformers/transformer_chroma.py index d7cc96d018b3..8d7d9d5d6a04 100644 --- a/src/diffusers/models/transformers/transformer_chroma.py +++ b/src/diffusers/models/transformers/transformer_chroma.py @@ -498,8 +498,18 @@ def forward( Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. 
timestep ( `torch.LongTensor`): Used to indicate denoising step. - block_controlnet_hidden_states: (`list` of `torch.Tensor`): + img_ids (`torch.Tensor`): + Image position ids used to compute the rotary positional embeddings. + txt_ids (`torch.Tensor`): + Text position ids used to compute the rotary positional embeddings. + attention_mask (`torch.Tensor`, *optional*): + Mask applied to `encoder_hidden_states` during attention. + controlnet_block_samples (`list` of `torch.Tensor`, *optional*): A list of tensors that if specified are added to the residuals of transformer blocks. + controlnet_single_block_samples (`list` of `torch.Tensor`, *optional*): + A list of tensors that if specified are added to the residuals of single transformer blocks. + controlnet_blocks_repeat (`bool`, *optional*, defaults to `False`): + Whether to repeat the controlnet block samples across all transformer blocks. joint_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in diff --git a/src/diffusers/models/transformers/transformer_chronoedit.py b/src/diffusers/models/transformers/transformer_chronoedit.py index 25eb6f87a93a..b39a18a98afb 100644 --- a/src/diffusers/models/transformers/transformer_chronoedit.py +++ b/src/diffusers/models/transformers/transformer_chronoedit.py @@ -651,6 +651,30 @@ def forward( return_dict: bool = True, attention_kwargs: dict[str, Any] | None = None, ) -> torch.Tensor | dict[str, torch.Tensor]: + """ + The [`ChronoEditTransformer3DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`): + Input `hidden_states`. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. 
+ encoder_hidden_states_image (`torch.Tensor`, *optional*): + Conditional image embeddings for image-conditioned generation. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ batch_size, num_channels, num_frames, height, width = hidden_states.shape p_t, p_h, p_w = self.config.patch_size post_patch_num_frames = num_frames // p_t diff --git a/src/diffusers/models/transformers/transformer_cogview4.py b/src/diffusers/models/transformers/transformer_cogview4.py index 308e0e6cccaf..2856fffd2a63 100644 --- a/src/diffusers/models/transformers/transformer_cogview4.py +++ b/src/diffusers/models/transformers/transformer_cogview4.py @@ -713,6 +713,38 @@ def forward( attention_mask: torch.Tensor | None = None, image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | list[tuple[torch.Tensor, torch.Tensor]] | None = None, ) -> tuple[torch.Tensor] | Transformer2DModelOutput: + """ + The [`CogView4Transformer2DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, in_channels, height, width)`): + Input `hidden_states`. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + original_size (`torch.Tensor`): + Original image size conditioning. 
+ target_size (`torch.Tensor`): + Target image size conditioning. + crop_coords (`torch.Tensor`): + Crop coordinates conditioning. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + attention_mask (`torch.Tensor`, *optional*): + Mask applied to attention scores. + image_rotary_emb (`tuple` of `torch.Tensor`, *optional*): + Pre-computed rotary positional embeddings. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ batch_size, num_channels, height, width = hidden_states.shape # 1. RoPE diff --git a/src/diffusers/models/transformers/transformer_cosmos.py b/src/diffusers/models/transformers/transformer_cosmos.py index a3ecc8f53191..d901bb5809de 100644 --- a/src/diffusers/models/transformers/transformer_cosmos.py +++ b/src/diffusers/models/transformers/transformer_cosmos.py @@ -697,6 +697,34 @@ def forward( padding_mask: torch.Tensor | None = None, return_dict: bool = True, ) -> tuple[torch.Tensor] | Transformer2DModelOutput: + """ + The [`CosmosTransformer3DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`): + Input `hidden_states`. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. 
+ block_controlnet_hidden_states (`list` of `torch.Tensor`, *optional*): + A list of tensors that if specified are added to the residuals of transformer blocks. + attention_mask (`torch.Tensor`, *optional*): + Mask applied to `encoder_hidden_states` during attention. + fps (`int`, *optional*): + Frames per second of the input video used to compute the rotary positional embeddings. + condition_mask (`torch.Tensor`, *optional*): + Mask channel concatenated to `hidden_states` to indicate the conditioning region. + padding_mask (`torch.Tensor`, *optional*): + Padding mask concatenated to `hidden_states` when `concat_padding_mask` is enabled. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ batch_size, num_channels, num_frames, height, width = hidden_states.shape # 1. Concatenate padding mask if needed & prepare attention mask diff --git a/src/diffusers/models/transformers/transformer_easyanimate.py b/src/diffusers/models/transformers/transformer_easyanimate.py index a665d420c230..24c874ad40ef 100755 --- a/src/diffusers/models/transformers/transformer_easyanimate.py +++ b/src/diffusers/models/transformers/transformer_easyanimate.py @@ -469,6 +469,33 @@ def forward( control_latents: torch.Tensor | None = None, return_dict: bool = True, ) -> tuple[torch.Tensor] | Transformer2DModelOutput: + """ + The [`EasyAnimateTransformer3DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, channels, num_frames, height, width)`): + Input `hidden_states`. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + timestep_cond (`torch.Tensor`, *optional*): + Conditional embeddings for timestep. 
If provided, the embeddings will be summed with the samples passed + through the `self.time_embedding` layer to obtain the final timestep embeddings. + encoder_hidden_states (`torch.Tensor`, *optional*): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + encoder_hidden_states_t5 (`torch.Tensor`, *optional*): + Additional conditional embeddings computed from a T5 text encoder. + inpaint_latents (`torch.Tensor`, *optional*): + Latents concatenated to `hidden_states` for inpainting variants of the model. + control_latents (`torch.Tensor`, *optional*): + Latents concatenated to `hidden_states` for control variants of the model. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ batch_size, channels, video_length, height, width = hidden_states.size() p = self.config.patch_size post_patch_height = height // p diff --git a/src/diffusers/models/transformers/transformer_ernie_image.py b/src/diffusers/models/transformers/transformer_ernie_image.py index 473fc1039dc8..abb79b527589 100644 --- a/src/diffusers/models/transformers/transformer_ernie_image.py +++ b/src/diffusers/models/transformers/transformer_ernie_image.py @@ -350,6 +350,23 @@ def forward( text_lens: torch.Tensor, return_dict: bool = True, ): + """ + The [`ErnieImageTransformer2DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, in_channels, height, width)`): + Input `hidden_states`. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + text_bth (`torch.Tensor`): + Conditional text embeddings (embeddings computed from the input conditions such as prompts) to use, + shaped `(batch_size, text_length, embed_dims)`. 
+ text_lens (`torch.Tensor`): + Per-sample text sequence lengths used to build the attention mask. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + """ device, dtype = hidden_states.device, hidden_states.dtype B, C, H, W = hidden_states.shape p, Hp, Wp = self.patch_size, H // self.patch_size, W // self.patch_size diff --git a/src/diffusers/models/transformers/transformer_flux.py b/src/diffusers/models/transformers/transformer_flux.py index 78a77ebcfea9..13177bc67878 100644 --- a/src/diffusers/models/transformers/transformer_flux.py +++ b/src/diffusers/models/transformers/transformer_flux.py @@ -662,8 +662,18 @@ def forward( from the embeddings of input conditions. timestep ( `torch.LongTensor`): Used to indicate denoising step. - block_controlnet_hidden_states: (`list` of `torch.Tensor`): + img_ids (`torch.Tensor`): + Image position ids used to compute the rotary positional embeddings. + txt_ids (`torch.Tensor`): + Text position ids used to compute the rotary positional embeddings. + guidance (`torch.Tensor`, *optional*): + Guidance scale embedding used for guidance-distilled variants of the model. + controlnet_block_samples (`list` of `torch.Tensor`, *optional*): A list of tensors that if specified are added to the residuals of transformer blocks. + controlnet_single_block_samples (`list` of `torch.Tensor`, *optional*): + A list of tensors that if specified are added to the residuals of single transformer blocks. + controlnet_blocks_repeat (`bool`, *optional*, defaults to `False`): + Whether to repeat the controlnet block samples across all transformer blocks. 
joint_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in diff --git a/src/diffusers/models/transformers/transformer_flux2.py b/src/diffusers/models/transformers/transformer_flux2.py index 5c90f3a46a98..e56f18f788e9 100644 --- a/src/diffusers/models/transformers/transformer_flux2.py +++ b/src/diffusers/models/transformers/transformer_flux2.py @@ -1201,6 +1201,12 @@ def forward( Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. timestep (`torch.LongTensor`): Used to indicate denoising step. + img_ids (`torch.Tensor`): + Image position ids used to compute the rotary positional embeddings. + txt_ids (`torch.Tensor`): + Text position ids used to compute the rotary positional embeddings. + guidance (`torch.Tensor`, *optional*): + Guidance scale embedding used for guidance-distilled variants of the model. joint_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in diff --git a/src/diffusers/models/transformers/transformer_glm_image.py b/src/diffusers/models/transformers/transformer_glm_image.py index b151e9809ef2..e2d883d2fecd 100644 --- a/src/diffusers/models/transformers/transformer_glm_image.py +++ b/src/diffusers/models/transformers/transformer_glm_image.py @@ -609,6 +609,42 @@ def forward( kv_caches: GlmImageKVCache | None = None, image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | list[tuple[torch.Tensor, torch.Tensor]] | None = None, ) -> tuple[torch.Tensor] | Transformer2DModelOutput: + """ + The [`GlmImageTransformer2DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, in_channels, height, width)`): + Input `hidden_states`. 
+ encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + prior_token_id (`torch.Tensor`): + Token ids for the prior embedding lookup. + prior_token_drop (`torch.Tensor`): + Boolean mask indicating which prior embeddings should be dropped (zeroed out). + timestep (`torch.LongTensor`): + Used to indicate denoising step. + target_size (`torch.Tensor`): + Target image size conditioning. + crop_coords (`torch.Tensor`): + Crop coordinates conditioning. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + attention_mask (`torch.Tensor`, *optional*): + Mask applied to attention scores. + kv_caches (`GlmImageKVCache`, *optional*): + Pre-computed key/value caches used to speed up inference. + image_rotary_emb (`tuple` of `torch.Tensor`, *optional*): + Pre-computed rotary positional embeddings. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ batch_size, num_channels, height, width = hidden_states.shape # 1. 
RoPE diff --git a/src/diffusers/models/transformers/transformer_helios.py b/src/diffusers/models/transformers/transformer_helios.py index 922b0724c87e..c9c2a8ae0293 100644 --- a/src/diffusers/models/transformers/transformer_helios.py +++ b/src/diffusers/models/transformers/transformer_helios.py @@ -671,6 +671,42 @@ def forward( return_dict: bool = True, attention_kwargs: dict[str, Any] | None = None, ) -> torch.Tensor | dict[str, torch.Tensor]: + """ + The [`HeliosTransformer3DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`): + Input `hidden_states`. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + indices_hidden_states (`torch.Tensor`, *optional*): + Frame indices for `hidden_states` used to compute the rotary positional embeddings. + indices_latents_history_short (`torch.Tensor`, *optional*): + Frame indices for the short history latents. + indices_latents_history_mid (`torch.Tensor`, *optional*): + Frame indices for the mid history latents. + indices_latents_history_long (`torch.Tensor`, *optional*): + Frame indices for the long history latents. + latents_history_short (`torch.Tensor`, *optional*): + Short history latents conditioning. + latents_history_mid (`torch.Tensor`, *optional*): + Mid history latents conditioning. + latents_history_long (`torch.Tensor`, *optional*): + Long history latents conditioning. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. 
+ attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ # 1. Input batch_size = hidden_states.shape[0] p_t, p_h, p_w = self.config.patch_size diff --git a/src/diffusers/models/transformers/transformer_hidream_image.py b/src/diffusers/models/transformers/transformer_hidream_image.py index 6b1e4d183737..b6c0e3533657 100644 --- a/src/diffusers/models/transformers/transformer_hidream_image.py +++ b/src/diffusers/models/transformers/transformer_hidream_image.py @@ -788,6 +788,38 @@ def forward( return_dict: bool = True, **kwargs, ) -> tuple[torch.Tensor] | Transformer2DModelOutput: + """ + The [`HiDreamImageTransformer2DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, in_channels, height, width)` or `(batch_size, patch_height * patch_width, patch_size * patch_size * channels)`): + Input `hidden_states`. + timesteps (`torch.LongTensor`): + Used to indicate denoising step. + encoder_hidden_states_t5 (`torch.Tensor`): + Conditional embeddings computed from the T5 text encoder. + encoder_hidden_states_llama3 (`torch.Tensor`): + Conditional embeddings computed from the Llama3 text encoder. + pooled_embeds (`torch.Tensor`): + Pooled text embeddings used for additional conditioning. + img_ids (`torch.Tensor`, *optional*): + Image position ids for the patched hidden states. + img_sizes (`list` of `tuple` of `int`, *optional*): + Per-sample patch grid sizes used to unpatchify the output. + hidden_states_masks (`torch.Tensor`, *optional*): + Mask over patched `hidden_states`. 
+ attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ encoder_hidden_states = kwargs.get("encoder_hidden_states", None) if encoder_hidden_states is not None: diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py index 1db643a60f81..3730cc8ffa56 100644 --- a/src/diffusers/models/transformers/transformer_hunyuan_video.py +++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py @@ -1003,6 +1003,34 @@ def forward( attention_kwargs: dict[str, Any] | None = None, return_dict: bool = True, ) -> tuple[torch.Tensor] | Transformer2DModelOutput: + """ + The [`HunyuanVideoTransformer3DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`): + Input `hidden_states`. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + encoder_attention_mask (`torch.Tensor`): + Mask applied to `encoder_hidden_states` during attention. + pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`): + Embeddings projected from the embeddings of input conditions. 
+ guidance (`torch.Tensor`, *optional*): + Guidance scale embedding used for guidance-distilled variants of the model. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ batch_size, num_channels, num_frames, height, width = hidden_states.shape p, p_t = self.config.patch_size, self.config.patch_size_t post_patch_num_frames = num_frames // p_t diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video15.py b/src/diffusers/models/transformers/transformer_hunyuan_video15.py index 222b0791d650..64c18e541d7c 100644 --- a/src/diffusers/models/transformers/transformer_hunyuan_video15.py +++ b/src/diffusers/models/transformers/transformer_hunyuan_video15.py @@ -634,6 +634,38 @@ def forward( attention_kwargs: dict[str, Any] | None = None, return_dict: bool = True, ) -> tuple[torch.Tensor] | Transformer2DModelOutput: + """ + The [`HunyuanVideo15Transformer3DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`): + Input `hidden_states`. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + encoder_attention_mask (`torch.Tensor`): + Mask applied to `encoder_hidden_states` during attention. 
+ timestep_r (`torch.LongTensor`, *optional*): + Refiner timestep conditioning. + encoder_hidden_states_2 (`torch.Tensor`, *optional*): + Additional conditional embeddings computed from a second text encoder (ByT5). + encoder_attention_mask_2 (`torch.Tensor`, *optional*): + Mask applied to `encoder_hidden_states_2` during attention. + image_embeds (`torch.Tensor`, *optional*): + Image embeddings for image-conditioned generation. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ batch_size, num_channels, num_frames, height, width = hidden_states.shape p_t, p_h, p_w = self.config.patch_size_t, self.config.patch_size, self.config.patch_size post_patch_num_frames = num_frames // p_t diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py index f005c4d4cd51..9a3dbc00f4ec 100644 --- a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py +++ b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py @@ -218,6 +218,50 @@ def forward( attention_kwargs: dict[str, Any] | None = None, return_dict: bool = True, ) -> tuple[torch.Tensor] | Transformer2DModelOutput: + """ + The [`HunyuanVideoFramepackTransformer3DModel`] forward method. 
+ + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`): + Input `hidden_states`. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + encoder_attention_mask (`torch.Tensor`): + Mask applied to `encoder_hidden_states` during attention. + pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`): + Embeddings projected from the embeddings of input conditions. + image_embeds (`torch.Tensor`): + Image embeddings for image-conditioned generation. + indices_latents (`torch.Tensor`): + Frame indices for `hidden_states` used to compute the rotary positional embeddings. + guidance (`torch.Tensor`, *optional*): + Guidance scale embedding used for guidance-distilled variants of the model. + latents_clean (`torch.Tensor`, *optional*): + Clean (denoised) history latents conditioning. + indices_latents_clean (`torch.Tensor`, *optional*): + Frame indices for `latents_clean`. + latents_history_2x (`torch.Tensor`, *optional*): + 2x downsampled history latents conditioning. + indices_latents_history_2x (`torch.Tensor`, *optional*): + Frame indices for `latents_history_2x`. + latents_history_4x (`torch.Tensor`, *optional*): + 4x downsampled history latents conditioning. + indices_latents_history_4x (`torch.Tensor`, *optional*): + Frame indices for `latents_history_4x`. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). 
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ batch_size, num_channels, num_frames, height, width = hidden_states.shape p, p_t = self.config.patch_size, self.config.patch_size_t post_patch_num_frames = num_frames // p_t diff --git a/src/diffusers/models/transformers/transformer_hunyuanimage.py b/src/diffusers/models/transformers/transformer_hunyuanimage.py index a2d3d9229963..dd2176a4096f 100644 --- a/src/diffusers/models/transformers/transformer_hunyuanimage.py +++ b/src/diffusers/models/transformers/transformer_hunyuanimage.py @@ -754,6 +754,38 @@ def forward( attention_kwargs: dict[str, Any] | None = None, return_dict: bool = True, ) -> torch.Tensor | dict[str, torch.Tensor]: + """ + The [`HunyuanImageTransformer2DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` or `(batch_size, num_channels, num_frames, height, width)`): + Input `hidden_states`. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + encoder_attention_mask (`torch.Tensor`): + Mask applied to `encoder_hidden_states` during attention. + timestep_r (`torch.LongTensor`, *optional*): + Refiner timestep conditioning. + encoder_hidden_states_2 (`torch.Tensor`, *optional*): + Additional conditional embeddings computed from a second text encoder. + encoder_attention_mask_2 (`torch.Tensor`, *optional*): + Mask applied to `encoder_hidden_states_2` during attention. 
+ guidance (`torch.Tensor`, *optional*): + Guidance scale embedding used for guidance-distilled variants of the model. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ if hidden_states.ndim == 4: batch_size, channels, height, width = hidden_states.shape sizes = (height, width) diff --git a/src/diffusers/models/transformers/transformer_joyimage.py b/src/diffusers/models/transformers/transformer_joyimage.py index 3a8e496d1218..b17ddb05f799 100644 --- a/src/diffusers/models/transformers/transformer_joyimage.py +++ b/src/diffusers/models/transformers/transformer_joyimage.py @@ -526,6 +526,20 @@ def forward( encoder_hidden_states: torch.Tensor = None, return_dict: bool = True, ): + """ + The [`JoyImageEditTransformer3DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)` or `(batch_size, num_items, num_channels, num_frames, height, width)`): + Input `hidden_states`. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + encoder_hidden_states (`torch.Tensor`, *optional*): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. 
+ """ # handle multi-item input (b, n, c, t, h, w) is_multi_item = hidden_states.ndim == 6 num_items = 0 diff --git a/src/diffusers/models/transformers/transformer_longcat_audio_dit.py b/src/diffusers/models/transformers/transformer_longcat_audio_dit.py index 2a5b169ad5ee..13eec57c07bd 100644 --- a/src/diffusers/models/transformers/transformer_longcat_audio_dit.py +++ b/src/diffusers/models/transformers/transformer_longcat_audio_dit.py @@ -545,6 +545,25 @@ def forward( latent_cond: torch.Tensor | None = None, return_dict: bool = True, ) -> LongCatAudioDiTTransformerOutput | tuple[torch.Tensor]: + """ + The [`LongCatAudioDiTTransformer`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, in_channels)`): + Input `hidden_states`. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + encoder_attention_mask (`torch.BoolTensor`): + Mask applied to `encoder_hidden_states` during attention. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + attention_mask (`torch.BoolTensor`, *optional*): + Mask applied to `hidden_states` during self-attention. + latent_cond (`torch.Tensor`, *optional*): + Latent conditioning concatenated to `hidden_states`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`LongCatAudioDiTTransformerOutput`] instead of a plain tuple. 
+ """ dtype = hidden_states.dtype encoder_hidden_states = encoder_hidden_states.to(dtype) timestep = timestep.to(dtype) diff --git a/src/diffusers/models/transformers/transformer_longcat_image.py b/src/diffusers/models/transformers/transformer_longcat_image.py index 7a000fa2b2ce..fe4713ea02db 100644 --- a/src/diffusers/models/transformers/transformer_longcat_image.py +++ b/src/diffusers/models/transformers/transformer_longcat_image.py @@ -483,8 +483,12 @@ def forward( Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. timestep ( `torch.LongTensor`): Used to indicate denoising step. - block_controlnet_hidden_states: (`list` of `torch.Tensor`): - A list of tensors that if specified are added to the residuals of transformer blocks. + img_ids (`torch.Tensor`): + Image position ids used to compute the rotary positional embeddings. + txt_ids (`torch.Tensor`): + Text position ids used to compute the rotary positional embeddings. + guidance (`torch.Tensor`, *optional*): + Guidance scale embedding used for guidance-distilled variants of the model. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain tuple. diff --git a/src/diffusers/models/transformers/transformer_ltx.py b/src/diffusers/models/transformers/transformer_ltx.py index 0034d636761b..f5600a13b6db 100644 --- a/src/diffusers/models/transformers/transformer_ltx.py +++ b/src/diffusers/models/transformers/transformer_ltx.py @@ -506,6 +506,36 @@ def forward( attention_kwargs: dict[str, Any] | None = None, return_dict: bool = True, ) -> torch.Tensor: + """ + The [`LTXVideoTransformer3DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, in_channels)`): + Input `hidden_states`. 
+ encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + encoder_attention_mask (`torch.Tensor`): + Mask applied to `encoder_hidden_states` during attention. + num_frames (`int`, *optional*): + Number of frames in the video used to compute the rotary positional embeddings. + height (`int`, *optional*): + Height of the latent used to compute the rotary positional embeddings. + width (`int`, *optional*): + Width of the latent used to compute the rotary positional embeddings. + rope_interpolation_scale (`tuple` of `float` or `torch.Tensor`, *optional*): + Interpolation scale used by the rotary positional embeddings. + video_coords (`torch.Tensor`, *optional*): + Pre-computed video coordinates used by the rotary positional embeddings. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. 
+ """ image_rotary_emb = self.rope(hidden_states, num_frames, height, width, rope_interpolation_scale, video_coords) # convert encoder_attention_mask to a bias the same way we do for attention_mask diff --git a/src/diffusers/models/transformers/transformer_lumina2.py b/src/diffusers/models/transformers/transformer_lumina2.py index 03e2841f8bcb..ba822730cb32 100644 --- a/src/diffusers/models/transformers/transformer_lumina2.py +++ b/src/diffusers/models/transformers/transformer_lumina2.py @@ -465,6 +465,30 @@ def forward( attention_kwargs: dict[str, Any] | None = None, return_dict: bool = True, ) -> torch.Tensor | Transformer2DModelOutput: + """ + The [`Lumina2Transformer2DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, in_channels, height, width)`): + Input `hidden_states`. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + encoder_attention_mask (`torch.Tensor`): + Mask applied to `encoder_hidden_states` during attention. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ # 1. 
Condition, positional & patch embedding batch_size, _, height, width = hidden_states.shape diff --git a/src/diffusers/models/transformers/transformer_mochi.py b/src/diffusers/models/transformers/transformer_mochi.py index 31106e3a0476..fe46bd5f9a98 100644 --- a/src/diffusers/models/transformers/transformer_mochi.py +++ b/src/diffusers/models/transformers/transformer_mochi.py @@ -414,6 +414,26 @@ def forward( attention_kwargs: dict[str, Any] | None = None, return_dict: bool = True, ) -> torch.Tensor: + """ + The [`MochiTransformer3DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`): + Input `hidden_states`. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + encoder_attention_mask (`torch.Tensor`): + Mask applied to `encoder_hidden_states` during attention. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. 
+ """ batch_size, num_channels, num_frames, height, width = hidden_states.shape p = self.config.patch_size diff --git a/src/diffusers/models/transformers/transformer_omnigen.py b/src/diffusers/models/transformers/transformer_omnigen.py index bd8bb107e25c..dfd922e7c988 100644 --- a/src/diffusers/models/transformers/transformer_omnigen.py +++ b/src/diffusers/models/transformers/transformer_omnigen.py @@ -415,6 +415,29 @@ def forward( position_ids: torch.Tensor, return_dict: bool = True, ) -> Transformer2DModelOutput | tuple[torch.Tensor]: + """ + The [`OmniGenTransformer2DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, in_channels, height, width)`): + Input `hidden_states`. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + input_ids (`torch.Tensor`): + Multimodal text token ids used as conditioning. + input_img_latents (`list` of `torch.Tensor`): + List of latents for input images used as conditioning. + input_image_sizes (`dict` of `int` to `list` of `int`): + Mapping from sample index to the positions where input image embeddings should be placed in the + conditioning sequence. + attention_mask (`torch.Tensor`): + Attention mask for the joint multimodal sequence. + position_ids (`torch.Tensor`): + Position ids used to compute the positional embeddings. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. 
+ """ batch_size, num_channels, height, width = hidden_states.shape p = self.config.patch_size post_patch_height, post_patch_width = height // p, width // p diff --git a/src/diffusers/models/transformers/transformer_qwenimage.py b/src/diffusers/models/transformers/transformer_qwenimage.py index bdb87a385da7..2385c0b1c8c3 100644 --- a/src/diffusers/models/transformers/transformer_qwenimage.py +++ b/src/diffusers/models/transformers/transformer_qwenimage.py @@ -868,6 +868,8 @@ def forward( [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). controlnet_block_samples (*optional*): ControlNet block samples to add to the transformer blocks. + additional_t_cond (`torch.Tensor`, *optional*): + Additional timestep conditioning added to the timestep embedding. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain tuple. diff --git a/src/diffusers/models/transformers/transformer_sana_video.py b/src/diffusers/models/transformers/transformer_sana_video.py index f833c0e842c3..db1f08a73a81 100644 --- a/src/diffusers/models/transformers/transformer_sana_video.py +++ b/src/diffusers/models/transformers/transformer_sana_video.py @@ -583,6 +583,36 @@ def forward( controlnet_block_samples: tuple[torch.Tensor] | None = None, return_dict: bool = True, ) -> tuple[torch.Tensor, ...] | Transformer2DModelOutput: + """ + The [`SanaVideoTransformer3DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, in_channels, num_frames, height, width)`): + Input `hidden_states`. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + timestep (`torch.LongTensor`): + Used to indicate denoising step. 
+ guidance (`torch.Tensor`, *optional*): + Guidance scale embedding. + encoder_attention_mask (`torch.Tensor`, *optional*): + Cross-attention mask applied to `encoder_hidden_states`. + attention_mask (`torch.Tensor`, *optional*): + Self-attention mask applied to `hidden_states`. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + controlnet_block_samples (`tuple` of `torch.Tensor`, *optional*): + A tuple of tensors that if specified are added to the residuals of transformer blocks. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension. # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward. # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias. diff --git a/src/diffusers/models/transformers/transformer_skyreels_v2.py b/src/diffusers/models/transformers/transformer_skyreels_v2.py index 9067e32ea5c3..81caf6cb7141 100644 --- a/src/diffusers/models/transformers/transformer_skyreels_v2.py +++ b/src/diffusers/models/transformers/transformer_skyreels_v2.py @@ -642,6 +642,34 @@ def forward( return_dict: bool = True, attention_kwargs: dict[str, Any] | None = None, ) -> torch.Tensor | dict[str, torch.Tensor]: + """ + The [`SkyReelsV2Transformer3DModel`] forward method.
+ + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`): + Input `hidden_states`. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + encoder_hidden_states_image (`torch.Tensor`, *optional*): + Conditional image embeddings for image-conditioned generation. + enable_diffusion_forcing (`bool`, *optional*, defaults to `False`): + Whether to enable diffusion forcing (per-block causal masking). + fps (`torch.Tensor`, *optional*): + FPS conditioning embedding. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. 
+ """ batch_size, num_channels, num_frames, height, width = hidden_states.shape p_t, p_h, p_w = self.config.patch_size post_patch_num_frames = num_frames // p_t diff --git a/src/diffusers/models/transformers/transformer_wan.py b/src/diffusers/models/transformers/transformer_wan.py index 5926bbb8e713..066c9f71f3ec 100644 --- a/src/diffusers/models/transformers/transformer_wan.py +++ b/src/diffusers/models/transformers/transformer_wan.py @@ -635,6 +635,30 @@ def forward( return_dict: bool = True, attention_kwargs: dict[str, Any] | None = None, ) -> torch.Tensor | dict[str, torch.Tensor]: + """ + The [`WanTransformer3DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`): + Input `hidden_states`. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + encoder_hidden_states_image (`torch.Tensor`, *optional*): + Conditional image embeddings for image-conditioned generation. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. 
+ """ batch_size, num_channels, num_frames, height, width = hidden_states.shape p_t, p_h, p_w = self.config.patch_size post_patch_num_frames = num_frames // p_t diff --git a/src/diffusers/models/transformers/transformer_wan_animate.py b/src/diffusers/models/transformers/transformer_wan_animate.py index dfea5a71353d..be4fcefa2151 100644 --- a/src/diffusers/models/transformers/transformer_wan_animate.py +++ b/src/diffusers/models/transformers/transformer_wan_animate.py @@ -1188,6 +1188,10 @@ def forward( `self.config.motion_encoder_batch_size` if not set. return_dict (`bool`, *optional*, defaults to `True`): Whether to return the output as a dict or tuple. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). """ # Check that shapes match up diff --git a/src/diffusers/models/transformers/transformer_wan_vace.py b/src/diffusers/models/transformers/transformer_wan_vace.py index 46caaf579ffd..af40c7545d20 100644 --- a/src/diffusers/models/transformers/transformer_wan_vace.py +++ b/src/diffusers/models/transformers/transformer_wan_vace.py @@ -275,6 +275,34 @@ def forward( return_dict: bool = True, attention_kwargs: dict[str, Any] | None = None, ) -> torch.Tensor | dict[str, torch.Tensor]: + """ + The [`WanVACETransformer3DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`): + Input `hidden_states`. + timestep (`torch.LongTensor`): + Used to indicate denoising step. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. 
+ encoder_hidden_states_image (`torch.Tensor`, *optional*): + Conditional image embeddings for image-conditioned generation. + control_hidden_states (`torch.Tensor`, *optional*): + Control latents used by the VACE control branch. + control_hidden_states_scale (`torch.Tensor`, *optional*): + Per-VACE-layer scale applied to the control hidden states. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ batch_size, num_channels, num_frames, height, width = hidden_states.shape p_t, p_h, p_w = self.config.patch_size post_patch_num_frames = num_frames // p_t diff --git a/src/diffusers/models/transformers/transformer_z_image.py b/src/diffusers/models/transformers/transformer_z_image.py index ba401e7fdef1..614fb0f1210c 100644 --- a/src/diffusers/models/transformers/transformer_z_image.py +++ b/src/diffusers/models/transformers/transformer_z_image.py @@ -904,8 +904,32 @@ def forward( f_patch_size: int = 1, ): """ + The [`ZImageTransformer2DModel`] forward method. + Flow: patchify -> t_embed -> x_embed -> x_refine -> cap_embed -> cap_refine -> [siglip_embed -> siglip_refine] -> build_unified -> main_layers -> final_layer -> unpatchify + + Args: + x (`list` of `torch.Tensor` or nested `list` of `torch.Tensor`): + Input latents. A flat list when running in standard mode, or a nested list when running in omni mode. + t (`torch.Tensor`): + Used to indicate denoising step. 
+ cap_feats (`list` of `torch.Tensor` or nested `list` of `torch.Tensor`): + Conditional caption embeddings (embeddings computed from the input conditions such as prompts) to use. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + controlnet_block_samples (`dict` of `int` to `torch.Tensor`, *optional*): + A mapping from block index to tensor that if specified are added to the residuals of transformer + blocks. + siglip_feats (`list` of `list` of `torch.Tensor`, *optional*): + Optional SigLIP image features used as additional conditioning. + image_noise_mask (`list` of `list` of `int`, *optional*): + Per-image noise masks indicating noisy vs. clean tokens in omni mode. + patch_size (`int`, *optional*, defaults to 2): + Spatial patch size used to patchify the input latents. + f_patch_size (`int`, *optional*, defaults to 1): + Temporal patch size used to patchify the input latents. """ assert patch_size in self.all_patch_size and f_patch_size in self.all_f_patch_size omni_mode = isinstance(x[0], list) diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py index 5c3cfe91d5bd..30fb46095326 100644 --- a/src/diffusers/models/unets/unet_i2vgen_xl.py +++ b/src/diffusers/models/unets/unet_i2vgen_xl.py @@ -461,6 +461,10 @@ def forward( Projection embeddings of the conditioning image computed with a vision encoder. encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. + timestep_cond (`torch.Tensor`, *optional*): + Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the + timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep + embeddings. 
cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in diff --git a/src/diffusers/models/unets/unet_kandinsky3.py b/src/diffusers/models/unets/unet_kandinsky3.py index 6fa68b42ee30..7a5f5ce241be 100644 --- a/src/diffusers/models/unets/unet_kandinsky3.py +++ b/src/diffusers/models/unets/unet_kandinsky3.py @@ -147,6 +147,19 @@ def set_default_attn_processor(self): self.set_attn_processor(AttnProcessor()) def forward(self, sample, timestep, encoder_hidden_states=None, encoder_attention_mask=None, return_dict=True): + r""" + Args: + sample (`torch.Tensor`): Input sample. + timestep (`torch.Tensor`, `float`, or `int`): + The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`, *optional*): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + encoder_attention_mask (`torch.Tensor`, *optional*): + Attention mask applied to `encoder_hidden_states`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. + """ if encoder_attention_mask is not None: encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0 encoder_attention_mask = encoder_attention_mask.unsqueeze(1) diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py index 97452eff05aa..7c4201facacf 100644 --- a/src/diffusers/models/unets/unet_motion_model.py +++ b/src/diffusers/models/unets/unet_motion_model.py @@ -1191,6 +1191,10 @@ def __init__( self.up_blocks = nn.ModuleList(up_blocks) def forward(self, sample): + r""" + Args: + sample (`torch.Tensor`): Input sample. 
+ """ pass @@ -1909,6 +1913,8 @@ def forward( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + added_cond_kwargs (`dict`, *optional*): + A dictionary of additional embeddings (e.g. text and time embeddings) used to condition the model. down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*): A tuple of tensors that if specified are added to the residuals of down unet blocks. mid_block_additional_residual: (`torch.Tensor`, *optional*): diff --git a/src/diffusers/models/unets/unet_stable_cascade.py b/src/diffusers/models/unets/unet_stable_cascade.py index af98b7a1c602..dbf65b1f0b32 100644 --- a/src/diffusers/models/unets/unet_stable_cascade.py +++ b/src/diffusers/models/unets/unet_stable_cascade.py @@ -548,6 +548,28 @@ def forward( crp=None, return_dict=True, ): + r""" + Args: + sample (`torch.Tensor`): The noisy input sample. + timestep_ratio (`torch.Tensor`): + Timestep ratio used to compute the timestep embedding. + clip_text_pooled (`torch.Tensor`): + Pooled CLIP text embeddings. + clip_text (`torch.Tensor`, *optional*): + Sequence-level CLIP text embeddings. + clip_img (`torch.Tensor`, *optional*): + CLIP image embeddings. + effnet (`torch.Tensor`, *optional*): + EfficientNet feature map used as additional conditioning. + pixels (`torch.Tensor`, *optional*): + Pixel-level conditioning tensor. If `None`, a tensor of zeros is used. + sca (`torch.Tensor`, *optional*): + Optional `sca` conditioning value used to build the timestep embedding. + crp (`torch.Tensor`, *optional*): + Optional `crp` conditioning value used to build the timestep embedding. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`StableCascadeUNetOutput`] instead of a plain tuple. 
+ """ if pixels is None: pixels = sample.new_zeros(sample.size(0), 3, 8, 8) diff --git a/src/diffusers/models/unets/uvit_2d.py b/src/diffusers/models/unets/uvit_2d.py index 836d41a7f946..317abe80b1eb 100644 --- a/src/diffusers/models/unets/uvit_2d.py +++ b/src/diffusers/models/unets/uvit_2d.py @@ -149,6 +149,19 @@ def __init__( @apply_lora_scale("cross_attention_kwargs") def forward(self, input_ids, encoder_hidden_states, pooled_text_emb, micro_conds, cross_attention_kwargs=None): + r""" + Args: + input_ids (`torch.LongTensor`): + Token ids of the masked latent image tokens, with shape `(batch_size, height, width)`. + encoder_hidden_states (`torch.Tensor`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + pooled_text_emb (`torch.Tensor`): + Pooled text embeddings used for additional conditioning. + micro_conds (`torch.Tensor`): + Micro-conditioning values that are embedded and combined with `pooled_text_emb`. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor`. + """ encoder_hidden_states = self.encoder_proj(encoder_hidden_states) encoder_hidden_states = self.encoder_proj_layer_norm(encoder_hidden_states) diff --git a/src/diffusers/pipelines/ace_step/pipeline_ace_step.py b/src/diffusers/pipelines/ace_step/pipeline_ace_step.py index 9a72e113abcd..1946f148f390 100644 --- a/src/diffusers/pipelines/ace_step/pipeline_ace_step.py +++ b/src/diffusers/pipelines/ace_step/pipeline_ace_step.py @@ -854,6 +854,15 @@ def __call__( A function called every `callback_steps` steps with `(step, timestep, latents)`. callback_steps (`int`, *optional*, defaults to 1): Frequency of the callback function. + callback_on_step_end (`Callable`, *optional*): + A function that is called at the end of each denoising step during inference. 
The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. instruction (`str`, *optional*): Custom instruction text for the generation task. If not provided, it is auto-generated based on `task_type`. diff --git a/src/diffusers/pipelines/allegro/pipeline_allegro.py b/src/diffusers/pipelines/allegro/pipeline_allegro.py index e54e9ed20739..5949ed407661 100644 --- a/src/diffusers/pipelines/allegro/pipeline_allegro.py +++ b/src/diffusers/pipelines/allegro/pipeline_allegro.py @@ -797,12 +797,15 @@ def __call__( [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. + callback_on_step_end (`Callable`, *optional*): + A function that is called at the end of each denoising step during inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. 
`callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. clean_caption (`bool`, *optional*, defaults to `True`): Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to be installed. If the dependencies are not installed, the embeddings will be created from the raw diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index 4d7477bc8754..83023a8c74d9 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -618,6 +618,8 @@ def __call__( negative_prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide what to not include in image generation. If not defined, you need to pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of videos to generate per prompt. eta (`float`, *optional*, defaults to 0.0): Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. 
diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py index eb511129cc6f..be1d6d72a009 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py @@ -771,6 +771,8 @@ def __call__( negative_prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide what to not include in image generation. If not defined, you need to pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of videos to generate per prompt. eta (`float`, *optional*, defaults to 0.0): Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. @@ -830,6 +832,8 @@ def __call__( The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the `._callback_tensor_inputs` attribute of your pipeline class. + decode_chunk_size (`int`, *optional*, defaults to `16`): + The number of frames to decode at a time when calling the `decode_latents` method. Examples: diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py index f0474487bce9..2d3752527a95 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py @@ -1034,6 +1034,9 @@ def __call__( as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
For more information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. callback_on_step_end (`Callable`, *optional*): A function that calls at the end of each denoising steps during the inference. The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py index 14605307e18c..9c65999e3a17 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py @@ -761,6 +761,8 @@ def __call__( negative_prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide what to not include in image generation. If not defined, you need to pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of videos to generate per prompt. eta (`float`, *optional*, defaults to 0.0): Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. @@ -804,6 +806,9 @@ def __call__( provided to guide the model to generate similar structure outputs, where the `unet` can "fill-in-the-gaps" for interpolation videos, or a single frame could be provided for general expected structure. Must have the same length as `conditioning_frames`. 
+ guess_mode (`bool`, *optional*, defaults to `False`): + The ControlNet encoder tries to recognize the content of the input image even if you remove all + prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 4e7cd21fc25d..08c1190d9b6d 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -786,6 +786,9 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality videos at the expense of slower inference. + enforce_inference_steps (`bool`, *optional*, defaults to `False`): + Whether to enforce `num_inference_steps` denoising steps regardless of the `strength` parameter. When + `False`, the effective number of inference steps is reduced according to `strength`. timesteps (`list[int]`, *optional*): Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is @@ -802,6 +805,8 @@ def __call__( negative_prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide what to not include in image generation. If not defined, you need to pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of videos to generate per prompt. 
eta (`float`, *optional*, defaults to 0.0): Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py index 56ed5e23c1db..e383e9c631d0 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py @@ -956,6 +956,9 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality videos at the expense of slower inference. + enforce_inference_steps (`bool`, *optional*, defaults to `False`): + Whether to enforce `num_inference_steps` denoising steps regardless of the `strength` parameter. When + `False`, the effective number of inference steps is reduced according to `strength`. timesteps (`list[int]`, *optional*): Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is @@ -972,6 +975,8 @@ def __call__( negative_prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide what to not include in image generation. If not defined, you need to pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of videos to generate per prompt. eta (`float`, *optional*, defaults to 0.0): Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. 
diff --git a/src/diffusers/pipelines/bria/pipeline_bria.py b/src/diffusers/pipelines/bria/pipeline_bria.py index 95ae9ce96e7e..9b80278af21e 100644 --- a/src/diffusers/pipelines/bria/pipeline_bria.py +++ b/src/diffusers/pipelines/bria/pipeline_bria.py @@ -545,6 +545,11 @@ def __call__( list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the `._callback_tensor_inputs` attribute of your pipeline class. max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`. + clip_value (`float`, *optional*): + If set, the predicted noise is clipped to the range `[-clip_value, clip_value]` at each + denoising step. + normalize (`bool`, *optional*, defaults to `False`): + Whether to normalize the predicted noise at each denoising step. Examples: diff --git a/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo_edit.py b/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo_edit.py index c2327bbce1c7..967edff55d95 100644 --- a/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo_edit.py +++ b/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo_edit.py @@ -651,6 +651,9 @@ def __call__( image (`PIL.Image.Image` or `torch.FloatTensor`, *optional*): The image to guide the image generation. If not defined, the pipeline will generate an image from scratch. + mask (`PipelineMaskInput`, *optional*): + Optional mask defining the region of `image` to be edited. Pixels covered by the mask are regenerated + while the rest of the image is preserved. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. This is set to 1024 by default for the best results. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): @@ -711,6 +714,8 @@ def __call__( `._callback_tensor_inputs` attribute of your pipeline class. 
max_sequence_length (`int` defaults to 3000): Maximum sequence length to use with the `prompt`. do_patching (`bool`, *optional*, defaults to `False`): Whether to use patching. + _auto_resize (`bool`, *optional*, defaults to `True`): + Whether to automatically resize the input image to the preferred resolutions. Examples: Returns: [`~pipelines.flux.BriaFiboPipelineOutput`] or `tuple`: [`~pipelines.flux.BriaFiboPipelineOutput`] if diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma_img2img.py b/src/diffusers/pipelines/chroma/pipeline_chroma_img2img.py index e1f6e2f8d8af..6dad6a481c5a 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma_img2img.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma_img2img.py @@ -739,6 +739,8 @@ def __call__( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is not greater than `1`). + image (`PipelineImageInput`): + The image input for the pipeline. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. This is set to 1024 by default for the best results. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma_inpainting.py b/src/diffusers/pipelines/chroma/pipeline_chroma_inpainting.py index 52c2f7e51cf2..b8d41a948207 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma_inpainting.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma_inpainting.py @@ -807,10 +807,27 @@ def __call__( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is not greater than `1`). 
+ true_cfg_scale (`float`, *optional*, defaults to 1.0): + True classifier-free guidance (guidance scale) is enabled when `true_cfg_scale` > 1 and + `negative_prompt` is provided. + image (`PipelineImageInput`): + The image input for the pipeline. + mask_image (`PipelineImageInput`): + `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask + are repainted while black pixels are preserved. + masked_image_latents (`torch.Tensor`, *optional*): + Pre-encoded latent representation of the masked image. If not provided, it will be computed from + `mask_image` and `image`. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. This is set to 1024 by default for the best results. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The width in pixels of the generated image. This is set to 1024 by default for the best results. + padding_mask_crop (`int`, *optional*, defaults to `None`): + The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to + image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region + with the same aspect ratio of the image and contains all masked area, and then expand that area based + on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before + resizing to the original image size for inpainting. num_inference_steps (`int`, *optional*, defaults to 35): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. 
diff --git a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py index b883e10a6732..9043abcab65e 100644 --- a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py +++ b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py @@ -561,8 +561,13 @@ def __call__( of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. + use_dynamic_cfg (`bool`, *optional*, defaults to `False`): + If True, dynamically adjusts the guidance scale during inference. num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of videos to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only applies + to [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. generator (`torch.Generator` or `list[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. diff --git a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py index de5b969a9adc..e2b45a08ee90 100644 --- a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +++ b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py @@ -606,8 +606,13 @@ def __call__( of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality.
+ use_dynamic_cfg (`bool`, *optional*, defaults to `False`): + If True, dynamically adjusts the guidance scale during inference. num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of videos to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. generator (`torch.Generator` or `list[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. diff --git a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py index 9687d63bc7bf..42f5109bb877 100644 --- a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +++ b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py @@ -657,8 +657,13 @@ def __call__( of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. + use_dynamic_cfg (`bool`, *optional*, defaults to `False`): + If True, dynamically adjusts the guidance scale during inference. num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of videos to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. generator (`torch.Generator` or `list[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
diff --git a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py index e3ce8292fad6..3cd72b0c2126 100644 --- a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +++ b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py @@ -631,8 +631,13 @@ def __call__( of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. + use_dynamic_cfg (`bool`, *optional*, defaults to `False`): + If True, dynamically adjusts the guidance scale during inference. num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of videos to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. generator (`torch.Generator` or `list[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. diff --git a/src/diffusers/pipelines/cogview3/pipeline_cogview3plus.py b/src/diffusers/pipelines/cogview3/pipeline_cogview3plus.py index 8880e3a0d1e2..c433c1b54477 100644 --- a/src/diffusers/pipelines/cogview3/pipeline_cogview3plus.py +++ b/src/diffusers/pipelines/cogview3/pipeline_cogview3plus.py @@ -458,6 +458,9 @@ def __call__( the text `prompt`, usually at the expense of lower image quality. num_images_per_prompt (`int`, *optional*, defaults to `1`): The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. 
Only applies + to [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. generator (`torch.Generator` or `list[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. @@ -488,10 +491,6 @@ def __call__( return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead of a plain tuple. - attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). callback_on_step_end (`Callable`, *optional*): A function that calls at the end of each denoising steps during the inference. The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, diff --git a/src/diffusers/pipelines/cogview4/pipeline_cogview4_control.py b/src/diffusers/pipelines/cogview4/pipeline_cogview4_control.py index 6282bf4cd7a4..ba25c0ef92e6 100644 --- a/src/diffusers/pipelines/cogview4/pipeline_cogview4_control.py +++ b/src/diffusers/pipelines/cogview4/pipeline_cogview4_control.py @@ -468,6 +468,11 @@ def __call__( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + control_image (`PipelineImageInput`): + The ControlNet input condition to provide guidance to the `transformer` for generation. If the type is + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image default to `image`'s dimensions.
If height and/or + width are passed, `image` is resized accordingly. height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. If not provided, it is set to 1024. width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): diff --git a/src/diffusers/pipelines/consisid/pipeline_consisid.py b/src/diffusers/pipelines/consisid/pipeline_consisid.py index 20b779bf5aaa..801d892b0916 100644 --- a/src/diffusers/pipelines/consisid/pipeline_consisid.py +++ b/src/diffusers/pipelines/consisid/pipeline_consisid.py @@ -725,6 +725,9 @@ def __call__( more faithful image generation, while later steps reduce it for more diverse and natural results. num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of videos to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. generator (`torch.Generator` or `list[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 86fa135abff4..fb3dc94d6b56 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -1003,12 +1003,6 @@ def __call__( return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a plain tuple. - callback (`Callable`, *optional*): - A function that calls every `callback_steps` steps during inference. 
The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function is called. If not specified, the callback is called at - every step. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py index 482a6b52e19b..8cb6721149f5 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py @@ -284,8 +284,6 @@ def __call__( The height of the generated image. width (`int`, *optional*, defaults to 512): The width of the generated image. - seed (`int`, *optional*, defaults to 42): - The seed to use for random generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. @@ -300,6 +298,10 @@ def __call__( to amplify the prompt. prompt_reps (`int`, *optional*, defaults to 20): The number of times the prompt is repeated along with prompt_strength to amplify the prompt. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. 
Examples: Returns: diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py index 942bcb49083e..f27fcd8aa26f 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py @@ -1227,6 +1227,13 @@ def __call__( repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. + control_image (`PipelineImageInput` or `list[PipelineImageInput]`, *optional*): + The ControlNet input condition to provide guidance to the `unet` for generation. If the type is + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image default to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): @@ -1319,6 +1326,20 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+ controlnet_conditioning_scale (`float` or `list[float]`, *optional*, defaults to 1.0): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set + the corresponding scale as a list. + guess_mode (`bool`, *optional*, defaults to `False`): + The ControlNet encoder tries to recognize the content of the input image even if you remove all + prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. + control_guidance_start (`float` or `list[float]`, *optional*, defaults to 0.0): + The percentage of total steps at which the ControlNet starts applying. + control_guidance_end (`float` or `list[float]`, *optional*, defaults to 1.0): + The percentage of total steps at which the ControlNet stops applying. + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). original_size (`tuple[int]`, *optional*, defaults to (1024, 1024)): If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py index 4b7ca284d636..511611f036b4 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py @@ -1323,6 +1323,9 @@ def __call__( available control modes. If multiple ControlNets are specified in `init`, control_mode should be a list where each ControlNet should have its corresponding control mode list. Should reflect the order of conditions in control_image. 
+ guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). original_size (`tuple[int]`, *optional*, defaults to (1024, 1024)): If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as diff --git a/src/diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py b/src/diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py index 8882b561f0a1..ba241bf4feb6 100644 --- a/src/diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +++ b/src/diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py @@ -679,10 +679,6 @@ def __call__( guidance_scale (`float`, *optional*, defaults to 7.5): A higher guidance scale value encourages the model to generate images closely linked to the text `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - control_guidance_start (`float` or `list[float]`, *optional*, defaults to 0.0): - The percentage of total steps at which the ControlNet starts applying. - control_guidance_end (`float` or `list[float]`, *optional*, defaults to 1.0): - The percentage of total steps at which the ControlNet stops applying. control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `list[torch.Tensor]`, `list[PIL.Image.Image]`, `list[np.ndarray]`,: `list[list[torch.Tensor]]`, `list[list[np.ndarray]]` or `list[list[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is @@ -706,6 +702,10 @@ def __call__( generator (`torch.Generator` or `list[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
+ latents (`torch.Tensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. diff --git a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py index a787a34bdc01..4530a424adb4 100644 --- a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +++ b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py @@ -950,6 +950,9 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. callback_on_step_end (`Callable`, *optional*): A function that calls at the end of each denoising steps during the inference. 
The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, diff --git a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py index 96f53b16cbe8..d2890d55811c 100644 --- a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +++ b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py @@ -1122,6 +1122,9 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. callback_on_step_end (`Callable`, *optional*): A function that calls at the end of each denoising steps during the inference. The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py index 4a849f380ef2..c2c5e6d2c824 100644 --- a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py +++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py @@ -586,6 +586,10 @@ def __call__( Optional input video for Video2World conditioning. Must be `None` when `image` is provided. prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide generation. Required unless `prompt_embeds` is supplied. 
+ negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + not greater than `1`). height (`int`, defaults to `704`): The height in pixels of the generated image. width (`int`, defaults to `1280`): @@ -635,6 +639,8 @@ def __call__( Number of latent conditional frames to use for Video2World conditioning. The number of pixel frames extracted from the input video is calculated as `4 * (num_latent_conditional_frames - 1) + 1`. Set to 1 for Image2World-like behavior (single frame conditioning). + conditional_frame_timestep (`float`, *optional*, defaults to 0.0001): + Timestep value used for the conditional frames during denoising. Examples: diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_transfer.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_transfer.py index b04b921d596a..e38d926bbd28 100644 --- a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_transfer.py +++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_transfer.py @@ -615,6 +615,10 @@ def __call__( The scale factor(s) for the ControlNet outputs. A single float is broadcast to all control blocks. prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide generation. Required unless `prompt_embeds` is supplied. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + not greater than `1`). height (`int`, defaults to `704`): The height in pixels of the generated image. width (`int`, *optional*): @@ -623,6 +627,9 @@ def __call__( num_frames (`int`, *optional*): Number of output frames. Defaults to `None` to output the same number of frames as the input `controls`. 
+ num_frames_per_chunk (`int`, *optional*, defaults to `93`): + Number of frames generated per auto-regressive chunk. When the total number of frames exceeds this + value, generation is split into multiple chunks using a sliding-window approach. num_inference_steps (`int`, defaults to `36`): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. @@ -662,6 +669,8 @@ def __call__( max_sequence_length (`int`, defaults to `512`): The maximum number of tokens in the prompt. If the prompt exceeds this length, it will be truncated. If the prompt is shorter than this length, it will be padded. + conditional_frame_timestep (`float`, *optional*, defaults to 0.1): + Timestep value used for the conditional frames during denoising. Must be in the `[0, 1]` interval. num_ar_conditional_frames (`int`, *optional*, defaults to `1`): Number of frames to condition on subsequent inference loops in auto-regressive inference, i.e. for the second chunk and onwards. Only used if `num_ar_latent_conditional_frames` is `None`. diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py index f24e19eea0d4..8c6de18b3a9a 100644 --- a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py +++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py @@ -442,6 +442,10 @@ def __call__( prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. + negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + not greater than `1`). height (`int`, defaults to `768`): The height in pixels of the generated image. 
width (`int`, defaults to `1360`): @@ -482,6 +486,9 @@ def __call__( The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the `._callback_tensor_inputs` attribute of your pipeline class. + max_sequence_length (`int`, defaults to `512`): + The maximum number of tokens in the prompt. If the prompt exceeds this length, it will be truncated. If + the prompt is shorter than this length, it will be padded. Examples: diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py index bdb13af06637..2a708e1118e0 100644 --- a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py +++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py @@ -519,6 +519,10 @@ def __call__( prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. + negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + not greater than `1`). height (`int`, defaults to `704`): The height in pixels of the generated image. width (`int`, defaults to `1280`): diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py index e144d62d5933..61d9ec8f0574 100644 --- a/src/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py @@ -428,6 +428,10 @@ def __call__( prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. 
+ negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + not greater than `1`). height (`int`, defaults to `720`): The height in pixels of the generated image. width (`int`, defaults to `1280`): @@ -472,6 +476,9 @@ def __call__( The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the `._callback_tensor_inputs` attribute of your pipeline class. + max_sequence_length (`int`, defaults to `512`): + The maximum number of tokens in the prompt. If the prompt exceeds this length, it will be truncated. If + the prompt is shorter than this length, it will be padded. Examples: diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py index 377c3c05d284..bf7e28584967 100644 --- a/src/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py @@ -541,9 +541,17 @@ def __call__( The call function to the pipeline for generation. Args: + image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, *optional*): + The image to be used as a conditioning input for the video generation. + video (`list[PIL.Image.Image]`, `np.ndarray`, `torch.Tensor`, *optional*): + The video to be used as a conditioning input for the video generation. prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. + negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. 
Ignored when not using guidance (i.e., ignored if `guidance_scale` is + not greater than `1`). height (`int`, defaults to `720`): The height in pixels of the generated image. width (`int`, defaults to `1280`): @@ -558,6 +566,10 @@ def __call__( Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting `guidance_scale > 1`. + input_frames_guidance (`bool`, *optional*, defaults to `False`): + Whether to apply guidance on the conditional input frames. + augment_sigma (`float`, *optional*, defaults to 0.001): + Sigma value used to augment the conditional latents during denoising. fps (`int`, defaults to `30`): The frames per second of the generated video. num_videos_per_prompt (`int`, *optional*, defaults to 1): @@ -588,6 +600,9 @@ def __call__( The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the `._callback_tensor_inputs` attribute of your pipeline class. + max_sequence_length (`int`, defaults to `512`): + The maximum number of tokens in the prompt. If the prompt exceeds this length, it will be truncated. If + the prompt is shorter than this length, it will be padded. Examples: diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py index 9fab42916e9e..1094ecf09a01 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py @@ -745,6 +745,10 @@ def __call__( Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument in their `set_timesteps` method. 
If not defined, the default behavior when `num_inference_steps` is passed will be used. Must be in descending order. + sigmas (`list[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.5): A higher guidance scale value encourages the model to generate images closely linked to the text `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py index b6cd51c6d203..f3c35e7c8213 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -819,6 +819,10 @@ def __call__( Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. Must be in descending order. + sigmas (`list[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.5): A higher guidance scale value encourages the model to generate images closely linked to the text `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. 
diff --git a/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py index 70a65e2ef5be..4490e9678503 100644 --- a/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +++ b/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py @@ -62,6 +62,9 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. diff --git a/src/diffusers/pipelines/deprecated/pia/pipeline_pia.py b/src/diffusers/pipelines/deprecated/pia/pipeline_pia.py index cf189a1f18e2..93366d10eb9e 100644 --- a/src/diffusers/pipelines/deprecated/pia/pipeline_pia.py +++ b/src/diffusers/pipelines/deprecated/pia/pipeline_pia.py @@ -727,6 +727,8 @@ def __call__( negative_prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide what to not include in image generation. If not defined, you need to pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of videos to generate per prompt. eta (`float`, *optional*, defaults to 0.0): Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. 
diff --git a/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py b/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py index c6abdba42d3c..688b83e4085c 100644 --- a/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +++ b/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py @@ -57,6 +57,9 @@ def __call__( Args: batch_size (`int`, *optional*, defaults to 1): The number of images to generate. + num_inference_steps (`int`, *optional*, defaults to 2000): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. generator (`torch.Generator`, `optional`): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. diff --git a/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 269e7405d10d..c924bf7a1166 100644 --- a/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -137,18 +137,13 @@ def __call__( callback: Callable[[int, int, torch.Tensor], None] | None = None, callback_steps: int = 1, ) -> AudioPipelineOutput | tuple: - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) r""" The call function to the pipeline for generation. Args: input_tokens (`list[list[int]]`): + The tokenized MIDI inputs to generate audio from. Each element is a list of integer tokens produced by + the `MidiProcessor`. 
generator (`torch.Generator` or `list[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. @@ -186,6 +181,13 @@ def __call__( If `return_dict` is `True`, [`pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is returned where the first element is a list with the generated audio. """ + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) pred_mel = np.zeros([1, TARGET_FEATURE_LENGTH, self.n_dims], dtype=np.float32) full_pred_mel = np.zeros([1, 0, self.n_dims], np.float32) diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py b/src/diffusers/pipelines/deprecated/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py index ce5d3397ed47..38f5af842e1b 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py @@ -628,10 +628,6 @@ def __call__( cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - guidance_rescale (`float`, *optional*, defaults to 0.0): - Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when - using zero terminal SNR. clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. 
A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/deprecated/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py index 16b21dd66132..70a16f5d522f 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py @@ -838,6 +838,10 @@ def __call__( cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when + using zero terminal SNR. clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py index dae5e600d773..a4fef21ab82b 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py @@ -657,6 +657,9 @@ def __call__( Args: prompt (`str` or `list[str]`): The prompt or prompts to guide the image generation. + source_prompt (`str` or `list[str]`): + The prompt or prompts describing the input `image`. 
Used together with `prompt` to guide the + cycle-diffusion editing process. image (`torch.Tensor` `np.ndarray`, `PIL.Image.Image`, `list[torch.Tensor]`, `list[PIL.Image.Image]`, or `list[np.ndarray]`): `Image` or tensor representing an image batch to be used as the starting point. Can also accept image latents as `image`, but if passing latents directly it is not encoded again. @@ -686,9 +689,6 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If - not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py index 0955b6fe48a1..f88c6d8fbc30 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py @@ -903,6 +903,9 @@ def __call__( callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
+ cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. diff --git a/src/diffusers/pipelines/deprecated/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/deprecated/text_to_video_synthesis/pipeline_text_to_video_synth.py index f67008fb98c3..33d1c378fcc0 100644 --- a/src/diffusers/pipelines/deprecated/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/deprecated/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -490,8 +490,6 @@ def __call__( negative_prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide what to not include in image generation. If not defined, you need to pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. eta (`float`, *optional*, defaults to 0.0): Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. 
diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py index 8f8fb712e023..067af4c0794c 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py @@ -408,6 +408,11 @@ def __call__( Args: prompt (`str` or `list[str]`): The prompt or prompts to guide image generation. + image (`PIL.Image.Image` or `list[PIL.Image.Image]`): + The image or images to condition the generation on alongside `prompt`. + text_to_image_strength (`float`, *optional*, defaults to 0.5): + Mixing ratio between the text and image conditioning. A value of 1.0 corresponds to pure text-to-image, + while 0.0 corresponds to pure image variation. height (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`): @@ -418,9 +423,6 @@ def __call__( guidance_scale (`float`, *optional*, defaults to 7.5): A higher guidance scale value encourages the model to generate images closely linked to the text `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - negative_prompt (`str` or `list[str]`, *optional*): - The prompt or prompts to guide what to not include in image generation. If not defined, you need to - pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. 
eta (`float`, *optional*, defaults to 0.0): diff --git a/src/diffusers/pipelines/deprecated/wuerstchen/pipeline_wuerstchen.py b/src/diffusers/pipelines/deprecated/wuerstchen/pipeline_wuerstchen.py index b57fc732b5f5..b935733b744e 100644 --- a/src/diffusers/pipelines/deprecated/wuerstchen/pipeline_wuerstchen.py +++ b/src/diffusers/pipelines/deprecated/wuerstchen/pipeline_wuerstchen.py @@ -236,7 +236,7 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image_embedding (`torch.Tensor` or `list[torch.Tensor]`): + image_embeddings (`torch.Tensor` or `list[torch.Tensor]`): Image Embeddings either extracted from an image or generated by a Prior Model. prompt (`str` or `list[str]`): The prompt or prompts to guide the image generation. diff --git a/src/diffusers/pipelines/easyanimate/pipeline_easyanimate.py b/src/diffusers/pipelines/easyanimate/pipeline_easyanimate.py index 6ec8f44e6d1a..72e19a8cce1f 100755 --- a/src/diffusers/pipelines/easyanimate/pipeline_easyanimate.py +++ b/src/diffusers/pipelines/easyanimate/pipeline_easyanimate.py @@ -550,7 +550,7 @@ def __call__( r""" Generates images or video using the EasyAnimate pipeline based on the provided prompts. - Examples: + Args: prompt (`str` or `list[str]`, *optional*): Text prompts to guide the image or video generation. If not provided, use `prompt_embeds` instead. num_frames (`int`, *optional*): @@ -592,12 +592,11 @@ def __call__( Tensor names to be included in callback function calls. guidance_rescale (`float`, *optional*, defaults to 0.0): Adjusts noise levels based on guidance scale. - original_size (`tuple[int, int]`, *optional*, defaults to `(1024, 1024)`): - Original dimensions of the output. - target_size (`tuple[int, int]`, *optional*): - Desired output dimensions for calculations. - crops_coords_top_left (`tuple[int, int]`, *optional*, defaults to `(0, 0)`): - Coordinates for cropping. + timesteps (`list[int]`, *optional*): + Custom timesteps to use for the denoising process. 
If not defined, the scheduler's default schedule for + `num_inference_steps` is used. + + Examples: Returns: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: diff --git a/src/diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py b/src/diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py index 5e07996a661c..4ad3a48b70ec 100755 --- a/src/diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +++ b/src/diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py @@ -699,7 +699,7 @@ def __call__( r""" Generates images or video using the EasyAnimate pipeline based on the provided prompts. - Examples: + Args: prompt (`str` or `list[str]`, *optional*): Text prompts to guide the image or video generation. If not provided, use `prompt_embeds` instead. num_frames (`int`, *optional*): @@ -708,6 +708,12 @@ def __call__( Height of the generated image in pixels. width (`int`, *optional*): Width of the generated image in pixels. + control_video (`torch.FloatTensor`, *optional*): + Control video used to condition the generation. + control_camera_video (`torch.FloatTensor`, *optional*): + Control camera video used to condition the generation. + ref_image (`torch.FloatTensor`, *optional*): + Reference image used to condition the generation. num_inference_steps (`int`, *optional*, defaults to 50): Number of denoising steps during generation. More steps generally yield higher quality images but slow down inference. @@ -741,6 +747,11 @@ def __call__( Tensor names to be included in callback function calls. guidance_rescale (`float`, *optional*, defaults to 0.0): Adjusts noise levels based on guidance scale. + timesteps (`list[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, the scheduler's default schedule for + `num_inference_steps` is used. 
+ + Examples: Returns: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: diff --git a/src/diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py b/src/diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py index 872313898008..69bb332944d6 100755 --- a/src/diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +++ b/src/diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py @@ -819,7 +819,7 @@ def __call__( r""" The call function to the pipeline for generation with HunyuanDiT. - Examples: + Args: prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. num_frames (`int`, *optional*): @@ -886,6 +886,11 @@ def __call__( strength (`float`, *optional*, defaults to 1.0): Affects the overall styling or quality of the generated output. Values closer to 1 usually provide direct adherence to prompts. + noise_aug_strength (`float`, *optional*, defaults to 0.0563): + Strength of the noise augmentation applied to the conditioning video latents. + timesteps (`list[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, the scheduler's default schedule for + `num_inference_steps` is used. Examples: # Example usage of the function for generating images based on prompts. diff --git a/src/diffusers/pipelines/flux/pipeline_flux_control_inpaint.py b/src/diffusers/pipelines/flux/pipeline_flux_control_inpaint.py index 2d1e05493a11..cd4ee9fe7611 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_control_inpaint.py @@ -861,7 +861,7 @@ def __call__( color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. 
- mask_image_latent (`torch.Tensor`, `list[torch.Tensor]`): + masked_image_latents (`torch.Tensor`, `list[torch.Tensor]`): `Tensor` representing an image batch to mask `image` generated by VAE. If not provided, the mask latents tensor will be generated by `mask_image`. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): diff --git a/src/diffusers/pipelines/flux/pipeline_flux_controlnet.py b/src/diffusers/pipelines/flux/pipeline_flux_controlnet.py index d8dcdfcd4640..da81563e4a66 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_controlnet.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_controlnet.py @@ -722,6 +722,16 @@ def __call__( prompt_2 (`str` or `list[str]`, *optional*): The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is will be used instead + negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is + not greater than `1`). + negative_prompt_2 (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders. + true_cfg_scale (`float`, *optional*, defaults to 1.0): + True classifier-free guidance (guidance scale) is enabled when `true_cfg_scale` > 1 and + `negative_prompt` is provided. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. This is set to 1024 by default for the best results. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): @@ -772,6 +782,14 @@ def __call__( pooled_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated pooled text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`list[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of diff --git a/src/diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py b/src/diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py index fdaff9b0af8a..65b2072a7746 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py @@ -695,6 +695,10 @@ def __call__( controlnet_conditioning_scale (`float` or `list[float]`, *optional*, defaults to 1.0): The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added to the residual in the original transformer. + control_guidance_start (`float` or `list[float]`, *optional*, defaults to 0.0): + The percentage of total steps at which the ControlNet starts applying. + control_guidance_end (`float` or `list[float]`, *optional*, defaults to 1.0): + The percentage of total steps at which the ControlNet stops applying. 
num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. generator (`torch.Generator` or `list[torch.Generator]`, *optional*): diff --git a/src/diffusers/pipelines/flux/pipeline_flux_fill.py b/src/diffusers/pipelines/flux/pipeline_flux_fill.py index cf929f53fc6d..4098213cc894 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_fill.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_fill.py @@ -798,7 +798,7 @@ def __call__( color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. - mask_image_latent (`torch.Tensor`, `list[torch.Tensor]`): + masked_image_latents (`torch.Tensor`, `list[torch.Tensor]`): `Tensor` representing an image batch to mask `image` generated by VAE. If not provided, the mask latents tensor will be generated by `mask_image`. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): diff --git a/src/diffusers/pipelines/flux/pipeline_flux_img2img.py b/src/diffusers/pipelines/flux/pipeline_flux_img2img.py index cadff7736ff4..51229a1c603e 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_img2img.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_img2img.py @@ -775,6 +775,16 @@ def __call__( prompt_2 (`str` or `list[str]`, *optional*): The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is will be used instead + negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is + not greater than `1`). + negative_prompt_2 (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. 
If not defined, `negative_prompt` is used in all the text-encoders. + true_cfg_scale (`float`, *optional*, defaults to 1.0): + True classifier-free guidance (guidance scale) is enabled when `true_cfg_scale` > 1 and + `negative_prompt` is provided. image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `list[torch.Tensor]`, `list[PIL.Image.Image]`, or `list[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list @@ -819,6 +829,14 @@ def __call__( pooled_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`list[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. 
It should be a list of length same as number of diff --git a/src/diffusers/pipelines/flux/pipeline_flux_inpaint.py b/src/diffusers/pipelines/flux/pipeline_flux_inpaint.py index b8ce25a4f5a9..914274397944 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_inpaint.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_inpaint.py @@ -819,6 +819,16 @@ def __call__( prompt_2 (`str` or `list[str]`, *optional*): The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is will be used instead + negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is + not greater than `1`). + negative_prompt_2 (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders. + true_cfg_scale (`float`, *optional*, defaults to 1.0): + True classifier-free guidance (guidance scale) is enabled when `true_cfg_scale` > 1 and + `negative_prompt` is provided. image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `list[torch.Tensor]`, `list[PIL.Image.Image]`, or `list[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list @@ -832,7 +842,7 @@ def __call__( color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. 
- mask_image_latent (`torch.Tensor`, `list[torch.Tensor]`): + masked_image_latents (`torch.Tensor`, `list[torch.Tensor]`): `Tensor` representing an image batch to mask `image` generated by VAE. If not provided, the mask latents tensor will be generated by `mask_image`. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): @@ -880,6 +890,14 @@ def __call__( pooled_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`list[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of diff --git a/src/diffusers/pipelines/flux/pipeline_flux_kontext.py b/src/diffusers/pipelines/flux/pipeline_flux_kontext.py index f4bbe42ef850..efddc6cea139 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_kontext.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_kontext.py @@ -883,6 +883,8 @@ def __call__( max_area (`int`, defaults to `1024 ** 2`): The maximum area of the generated image in pixels. The height and width will be adjusted to fit this area while maintaining the aspect ratio. 
+ _auto_resize (`bool`, *optional*, defaults to `True`): + Whether to automatically resize the input image to the preferred resolutions. Examples: diff --git a/src/diffusers/pipelines/flux/pipeline_flux_kontext_inpaint.py b/src/diffusers/pipelines/flux/pipeline_flux_kontext_inpaint.py index 313682dc7e33..c85299eedcd3 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_kontext_inpaint.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_kontext_inpaint.py @@ -1104,6 +1104,8 @@ def __call__( max_area (`int`, defaults to `1024 ** 2`): The maximum area of the generated image in pixels. The height and width will be adjusted to fit this area while maintaining the aspect ratio. + _auto_resize (`bool`, *optional*, defaults to `True`): + Whether to automatically resize the input image to the preferred resolutions. Examples: diff --git a/src/diffusers/pipelines/flux/pipeline_flux_prior_redux.py b/src/diffusers/pipelines/flux/pipeline_flux_prior_redux.py index 330e2623b287..94c7bcc80782 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_prior_redux.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_prior_redux.py @@ -398,6 +398,12 @@ def __call__( Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. pooled_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated pooled text embeddings. + prompt_embeds_scale (`float` or `list[float]`, *optional*, defaults to 1.0): + Scale factor (or per-image list of scale factors) applied to the redux prompt embeddings before they + are returned. + pooled_prompt_embeds_scale (`float` or `list[float]`, *optional*, defaults to 1.0): + Scale factor (or per-image list of scale factors) applied to the redux pooled prompt embeddings before + they are returned. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.flux.FluxPriorReduxPipelineOutput`] instead of a plain tuple. 
diff --git a/src/diffusers/pipelines/glm_image/pipeline_glm_image.py b/src/diffusers/pipelines/glm_image/pipeline_glm_image.py index 859b371b2514..8794e8195771 100644 --- a/src/diffusers/pipelines/glm_image/pipeline_glm_image.py +++ b/src/diffusers/pipelines/glm_image/pipeline_glm_image.py @@ -768,14 +768,44 @@ def __call__( The width in pixels. If not provided, derived from prompt shape info. num_inference_steps (`int`, *optional*, defaults to `50`): The number of denoising steps for DiT. + timesteps (`list[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, the scheduler's default schedule for + `num_inference_steps` is used. + sigmas (`list[float]`, *optional*): + Custom sigmas to use for the denoising process. If not defined, the scheduler's default schedule is + used. guidance_scale (`float`, *optional*, defaults to `1.5`): Guidance scale for classifier-free guidance. num_images_per_prompt (`int`, *optional*, defaults to `1`): The number of images to generate per prompt. generator (`torch.Generator`, *optional*): Random generator for reproducibility. + latents (`torch.Tensor`, *optional*): + Pre-generated noisy latents to be used as inputs for image generation. + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. If not provided, embeddings are generated from `prompt`. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Used when classifier-free guidance is enabled. + prior_token_ids (`torch.Tensor`, *optional*): + Pre-generated prior token ids from `generate_prior_tokens`. If supplied, prior generation is skipped. + prior_token_image_ids (`list[torch.Tensor]`, *optional*): + Image token ids associated with `prior_token_ids`. + source_image_grid_thw (`list[torch.Tensor]`, *optional*): + Per-sample THW grid information for the source image tokens. 
+ crops_coords_top_left (`tuple[int, int]`, *optional*, defaults to `(0, 0)`): + The top-left coordinates of the crop used for conditioning embeddings. output_type (`str`, *optional*, defaults to `"pil"`): Output format: "pil", "np", or "latent". + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`GlmImagePipelineOutput`] instead of a plain tuple. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor`. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function called at the end of each denoising step. + callback_on_step_end_tensor_inputs (`list[str]`, *optional*): + Tensor inputs passed to `callback_on_step_end`. + max_sequence_length (`int`, *optional*, defaults to `2048`): + Maximum sequence length for the text encoder. Examples: diff --git a/src/diffusers/pipelines/helios/pipeline_helios.py b/src/diffusers/pipelines/helios/pipeline_helios.py index 87a8600badab..90ac654bc77c 100644 --- a/src/diffusers/pipelines/helios/pipeline_helios.py +++ b/src/diffusers/pipelines/helios/pipeline_helios.py @@ -502,6 +502,9 @@ def __call__( num_inference_steps (`int`, defaults to `50`): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. + sigmas (`list[float]`, *optional*): + Custom sigmas to use for the denoising process. If not defined, the scheduler's default schedule is + used. guidance_scale (`float`, defaults to `5.0`): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2. @@ -520,6 +523,8 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. 
+ negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. If not provided, they are generated from `negative_prompt`. output_type (`str`, *optional*, defaults to `"np"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -540,6 +545,36 @@ def __call__( max_sequence_length (`int`, defaults to `512`): The maximum sequence length of the text encoder. If the prompt is longer than this, it will be truncated. If the prompt is shorter, it will be padded to this length. + image (`PipelineImageInput`, *optional*): + Input image used for image-to-video conditioning. + image_latents (`torch.Tensor`, *optional*): + Pre-encoded image latents to use instead of `image`. + fake_image_latents (`torch.Tensor`, *optional*): + Optional fake image latents used during conditioning. + add_noise_to_image_latents (`bool`, *optional*, defaults to `True`): + Whether to add noise to the image latents prior to denoising. + image_noise_sigma_min (`float`, *optional*, defaults to `0.111`): + Minimum sigma value for noise added to image latents. + image_noise_sigma_max (`float`, *optional*, defaults to `0.135`): + Maximum sigma value for noise added to image latents. + video (`PipelineImageInput`, *optional*): + Input video used for video-to-video conditioning. + video_latents (`torch.Tensor`, *optional*): + Pre-encoded video latents to use instead of `video`. + add_noise_to_video_latents (`bool`, *optional*, defaults to `True`): + Whether to add noise to the video latents prior to denoising. + video_noise_sigma_min (`float`, *optional*, defaults to `0.111`): + Minimum sigma value for noise added to video latents. + video_noise_sigma_max (`float`, *optional*, defaults to `0.135`): + Maximum sigma value for noise added to video latents. + history_sizes (`list`, *optional*, defaults to `[16, 2, 1]`): + History window sizes used for autoregressive chunked generation. 
+ num_latent_frames_per_chunk (`int`, *optional*, defaults to `9`): + Number of latent frames produced per chunk during autoregressive generation. + keep_first_frame (`bool`, *optional*, defaults to `True`): + Whether to retain the first frame across chunks. + is_skip_first_chunk (`bool`, *optional*, defaults to `False`): + Whether to skip generation of the first chunk. Examples: diff --git a/src/diffusers/pipelines/helios/pipeline_helios_pyramid.py b/src/diffusers/pipelines/helios/pipeline_helios_pyramid.py index 1791da11b490..c187e436a857 100644 --- a/src/diffusers/pipelines/helios/pipeline_helios_pyramid.py +++ b/src/diffusers/pipelines/helios/pipeline_helios_pyramid.py @@ -568,6 +568,9 @@ def __call__( The width in pixels of the generated image. num_frames (`int`, defaults to `132`): The number of frames in the generated video. + sigmas (`list[float]`, *optional*): + Custom sigmas to use for the denoising process. If not defined, the scheduler's default schedule is + used. guidance_scale (`float`, defaults to `5.0`): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2. @@ -586,6 +589,8 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. If not provided, they are generated from `negative_prompt`. output_type (`str`, *optional*, defaults to `"np"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -606,6 +611,44 @@ def __call__( max_sequence_length (`int`, defaults to `512`): The maximum sequence length of the text encoder. If the prompt is longer than this, it will be truncated. 
If the prompt is shorter, it will be padded to this length. + image (`PipelineImageInput`, *optional*): + Input image used for image-to-video conditioning. + image_latents (`torch.Tensor`, *optional*): + Pre-encoded image latents to use instead of `image`. + fake_image_latents (`torch.Tensor`, *optional*): + Optional fake image latents used during conditioning. + add_noise_to_image_latents (`bool`, *optional*, defaults to `True`): + Whether to add noise to the image latents prior to denoising. + image_noise_sigma_min (`float`, *optional*, defaults to `0.111`): + Minimum sigma value for noise added to image latents. + image_noise_sigma_max (`float`, *optional*, defaults to `0.135`): + Maximum sigma value for noise added to image latents. + video (`PipelineImageInput`, *optional*): + Input video used for video-to-video conditioning. + video_latents (`torch.Tensor`, *optional*): + Pre-encoded video latents to use instead of `video`. + add_noise_to_video_latents (`bool`, *optional*, defaults to `True`): + Whether to add noise to the video latents prior to denoising. + video_noise_sigma_min (`float`, *optional*, defaults to `0.111`): + Minimum sigma value for noise added to video latents. + video_noise_sigma_max (`float`, *optional*, defaults to `0.135`): + Maximum sigma value for noise added to video latents. + history_sizes (`list`, *optional*, defaults to `[16, 2, 1]`): + History window sizes used for autoregressive chunked generation. + num_latent_frames_per_chunk (`int`, *optional*, defaults to `9`): + Number of latent frames produced per chunk during autoregressive generation. + keep_first_frame (`bool`, *optional*, defaults to `True`): + Whether to retain the first frame across chunks. + is_skip_first_chunk (`bool`, *optional*, defaults to `False`): + Whether to skip generation of the first chunk. + pyramid_num_inference_steps_list (`list`, *optional*, defaults to `[10, 10, 10]`): + Number of inference steps for each pyramid stage during Stage 2 generation. 
+ use_zero_init (`bool`, *optional*, defaults to `True`): + Whether to apply CFG zero-init at the start of denoising. + zero_steps (`int`, *optional*, defaults to `1`): + Number of initial steps that use CFG zero-init. + is_amplify_first_chunk (`bool`, *optional*, defaults to `False`): + Whether to amplify guidance on the first chunk (DMD-related). Examples: diff --git a/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py b/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py index 8e5e078cc2af..1c73dfacccdb 100644 --- a/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py +++ b/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py @@ -813,13 +813,18 @@ def __call__( Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will be generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. + prompt_embeds_t5 (`torch.FloatTensor`, *optional*): + Pre-generated T5 text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If + not provided, text embeddings will be generated from `prompt` input argument. + prompt_embeds_llama3 (`torch.FloatTensor`, *optional*): + Pre-generated LLaMA3 text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, text embeddings will be generated from `prompt` input argument. 
+ negative_prompt_embeds_t5 (`torch.FloatTensor`, *optional*): + Pre-generated negative T5 text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, embeddings will be generated from `negative_prompt` input argument. + negative_prompt_embeds_llama3 (`torch.FloatTensor`, *optional*): + Pre-generated negative LLaMA3 text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, embeddings will be generated from `negative_prompt` input argument. pooled_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. diff --git a/src/diffusers/pipelines/hunyuan_image/pipeline_hunyuanimage_refiner.py b/src/diffusers/pipelines/hunyuan_image/pipeline_hunyuanimage_refiner.py index 93e4deb2974a..efdb5505e604 100644 --- a/src/diffusers/pipelines/hunyuan_image/pipeline_hunyuanimage_refiner.py +++ b/src/diffusers/pipelines/hunyuan_image/pipeline_hunyuanimage_refiner.py @@ -476,6 +476,8 @@ def __call__( images that are closely linked to the text `prompt`, usually at the expense of lower image quality. For guidance distilled models, this parameter is required. For non-distilled models, this parameter will be ignored. + image (`PipelineImageInput`, *optional*): + The input image to be refined. num_images_per_prompt (`int`, *optional*, defaults to 1): height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. This is set to 1024 by default for the best results. @@ -500,10 +502,14 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
+ prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `prompt_embeds`. negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + negative_prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `negative_prompt_embeds`. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. diff --git a/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py b/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py index 1a7cae256d63..b5b4ff9bcd85 100644 --- a/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +++ b/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py @@ -568,6 +568,8 @@ def __call__( The call function to the pipeline for generation. Args: + image (`PipelineImageInput`): + The input image to condition the generation on. prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. @@ -627,6 +629,10 @@ def __call__( Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. + prompt_attention_mask (`torch.Tensor`, *optional*): + Attention mask for `prompt_embeds`. Required when `prompt_embeds` is passed directly. + negative_prompt_attention_mask (`torch.Tensor`, *optional*): + Attention mask for `negative_prompt_embeds`. Required when `negative_prompt_embeds` is passed directly. 
output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -635,9 +641,10 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. + prompt_template (`dict`, *optional*): + Template used to format the prompt before encoding. Defaults to the model's default template. + max_sequence_length (`int`, *optional*, defaults to 256): + Maximum sequence length to use for the prompt encoder. callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of each denoising step during the inference. with the following arguments: `callback_on_step_end(self: diff --git a/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py b/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py index 3c6ec39398ef..5b8cff2ca0c5 100644 --- a/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +++ b/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py @@ -582,6 +582,10 @@ def __call__( Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. + prompt_attention_mask (`torch.Tensor`, *optional*): + Attention mask for `prompt_embeds`. Required when `prompt_embeds` is passed directly. 
+ negative_prompt_attention_mask (`torch.Tensor`, *optional*): + Attention mask for `negative_prompt_embeds`. Required when `negative_prompt_embeds` is passed directly. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -590,9 +594,10 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. + prompt_template (`dict`, *optional*): + Template used to format the prompt before encoding. Defaults to the model's default template. + max_sequence_length (`int`, *optional*, defaults to 256): + Maximum sequence length to use for the prompt encoder. callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of each denoising step during the inference. with the following arguments: `callback_on_step_end(self: diff --git a/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py b/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py index f82f26eea5b9..515b530d1037 100644 --- a/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py +++ b/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py @@ -701,6 +701,8 @@ def __call__( The width in pixels of the generated image. num_frames (`int`, defaults to `129`): The number of frames in the generated video. 
+ latent_window_size (`int`, defaults to `9`): + Number of latent frames produced per Framepack sampling window. num_inference_steps (`int`, defaults to `50`): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. @@ -741,6 +743,10 @@ def __call__( Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. + prompt_attention_mask (`torch.Tensor`, *optional*): + Attention mask for `prompt_embeds`. Required when `prompt_embeds` is passed directly. + negative_prompt_attention_mask (`torch.Tensor`, *optional*): + Attention mask for `negative_prompt_embeds`. Required when `negative_prompt_embeds` is passed directly. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -749,9 +755,12 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. + prompt_template (`dict`, *optional*): + Template used to format the prompt before encoding. Defaults to the model's default template. + max_sequence_length (`int`, *optional*, defaults to 256): + Maximum sequence length to use for the prompt encoder. + sampling_type (`FramepackSamplingType`, *optional*): + The Framepack sampling strategy to use when iterating over latent windows. 
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of each denoising step during the inference. with the following arguments: `callback_on_step_end(self: diff --git a/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py b/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py index c7d43424c344..1c68be879013 100644 --- a/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +++ b/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py @@ -709,6 +709,8 @@ def __call__( The call function to the pipeline for generation. Args: + image (`PIL.Image.Image`): + The input image to condition the video generation on. prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. @@ -768,6 +770,10 @@ def __call__( Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. + prompt_attention_mask (`torch.Tensor`, *optional*): + Attention mask for `prompt_embeds`. Required when `prompt_embeds` is passed directly. + negative_prompt_attention_mask (`torch.Tensor`, *optional*): + Attention mask for `negative_prompt_embeds`. Required when `negative_prompt_embeds` is passed directly. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
return_dict (`bool`, *optional*, defaults to `True`): @@ -776,9 +782,13 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. + prompt_template (`dict`, *optional*): + Template used to format the prompt before encoding. Defaults to the model's default template. + max_sequence_length (`int`, *optional*, defaults to 256): + Maximum sequence length to use for the prompt encoder. + image_embed_interleave (`int`, *optional*): + Number of image embedding tokens to interleave with text tokens. If not provided, a sensible default is + chosen based on the transformer's `image_condition_type`. callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of each denoising step during the inference. with the following arguments: `callback_on_step_end(self: diff --git a/src/diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py b/src/diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py index b908dd5dfe83..5d656a3c370a 100644 --- a/src/diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +++ b/src/diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py @@ -624,6 +624,10 @@ def __call__( generator (`torch.Generator` or `list[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
+ latents (`torch.Tensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py index 26e163a70142..f9e772c905c8 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py @@ -266,6 +266,12 @@ def __call__( output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` (`np.array`) or `"pt"` (`torch.Tensor`). + callback (`Callable`, *optional*): + A function that is called every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. prior_callback_on_step_end (`Callable`, *optional*): @@ -524,6 +530,23 @@ def __call__( every step. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+ prior_callback_on_step_end (`Callable`, *optional*): + A function that is called at the end of each denoising step during the inference of the prior pipeline. + The function is called with the following arguments: `prior_callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. + prior_callback_on_step_end_tensor_inputs (`list`, *optional*): + The list of tensor inputs for the `prior_callback_on_step_end` function. The tensors specified in the + list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in + the `._callback_tensor_inputs` attribute of your prior pipeline class. + callback_on_step_end (`Callable`, *optional*): + A function that is called at the end of each denoising step during the inference of the decoder pipeline. + The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, + step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors + as specified by `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`list`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. Examples: diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py index 9f5340557125..5db5cd38f07e 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py @@ -179,17 +179,12 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - prompt (`str` or `list[str]`): - The prompt or prompts to guide the image generation.
hint (`torch.Tensor`): The controlnet condition. image_embeds (`torch.Tensor` or `list[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. negative_image_embeds (`torch.Tensor` or `list[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. - negative_prompt (`str` or `list[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). height (`int`, *optional*, defaults to 512): The height in pixels of the generated image. width (`int`, *optional*, defaults to 512): diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py index adbc3a5badc5..72f1d8556ec5 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -417,12 +417,14 @@ def __call__( Args: prompt (`str` or `list[str]`): The prompt or prompts to guide the image generation. + image (`torch.Tensor`, `PIL.Image.Image`, `list[torch.Tensor]` or `list[PIL.Image.Image]`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the image + embedding. Can also accept image latents as `image`, if passing latents directly, it will not be + encoded again. strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `emb`. Must be between 0 and 1. `image` + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of denoising steps depends on the amount of noise initially added. - emb (`torch.Tensor`): - The image embedding. 
negative_prompt (`str` or `list[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). diff --git a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py index 97353c95c9c7..ca8f124c74cf 100644 --- a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +++ b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py @@ -364,9 +364,6 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 25): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - timesteps (`list[int]`, *optional*): - Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` - timesteps are used. Must be in descending order. guidance_scale (`float`, *optional*, defaults to 3.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2. @@ -383,9 +380,6 @@ def __call__( The height in pixels of the generated image. width (`int`, *optional*, defaults to self.unet.config.sample_size): The width in pixels of the generated image. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only - applies to [`schedulers.DDIMScheduler`], will be ignored for others. generator (`torch.Generator` or `list[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. @@ -405,20 +399,19 @@ def __call__( [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - clean_caption (`bool`, *optional*, defaults to `True`): - Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to - be installed. If the dependencies are not installed, the embeddings will be created from the raw - prompt. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + latents (`torch.Tensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + callback_on_step_end (`Callable`, *optional*): + A function that is called at the end of each denoising step during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`.
+ callback_on_step_end_tensor_inputs (`list`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. Examples: diff --git a/src/diffusers/pipelines/kandinsky5/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky5/pipeline_kandinsky.py index 1c94a8219e2a..1ce885b21f5b 100644 --- a/src/diffusers/pipelines/kandinsky5/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky5/pipeline_kandinsky.py @@ -730,10 +730,19 @@ def __call__( A torch generator to make generation deterministic. latents (`torch.Tensor`, *optional*): Pre-generated noisy latents. - prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated text embeddings. - negative_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated negative text embeddings. + prompt_embeds_qwen (`torch.Tensor`, *optional*): + Pre-generated text embeddings from the Qwen text encoder. + prompt_embeds_clip (`torch.Tensor`, *optional*): + Pre-generated text embeddings from the CLIP text encoder. + negative_prompt_embeds_qwen (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings from the Qwen text encoder. + negative_prompt_embeds_clip (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings from the CLIP text encoder. + prompt_cu_seqlens (`torch.Tensor`, *optional*): + Cumulative sequence lengths for the Qwen prompt embeddings, used for variable-length attention. + negative_prompt_cu_seqlens (`torch.Tensor`, *optional*): + Cumulative sequence lengths for the Qwen negative prompt embeddings, used for variable-length + attention. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated video. 
return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py index 6861438e4c63..424a2c46e06b 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py @@ -737,13 +737,18 @@ def __call__( Args: prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The width in pixels of the generated image. + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `list[torch.Tensor]`, `list[PIL.Image.Image]`, or `list[np.ndarray]`): + `Image` or tensor representing an image batch to be used as the starting point. Can also accept image + latents as `image`, but if passing latents directly it is not encoded again. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. + strength (`float`, *optional*, defaults to 0.8): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. 
original_inference_steps (`int`, *optional*): The original number of inference steps use to generate a linearly-spaced timestep schedule, from which we will draw `num_inference_steps` evenly spaced timesteps from as our final timestep schedule, diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py index 458e6dbfe7d2..a4042b05c97e 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py @@ -102,6 +102,9 @@ def __call__( guidance_scale (`float`, *optional*, defaults to 1.0): A higher guidance scale value encourages the model to generate images closely linked to the text `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only + applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py index a136770b9f26..70a61fab1be2 100644 --- a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py @@ -903,12 +903,6 @@ def __call__( return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead of a plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. 
The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in @@ -935,7 +929,7 @@ def __call__( editing_prompt_embeddings (`torch.Tensor`, *optional*): Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input argument. - editing_pooled_prompt_embeddings (`torch.Tensor`, *optional*): + editing_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input argument. diff --git a/src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py b/src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py index 19720d7bbab8..4eaa858e41c1 100644 --- a/src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py +++ b/src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py @@ -495,11 +495,46 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - enable_cfg_renorm: Whether to enable cfg_renorm. Enabling cfg_renorm will improve image quality, - but it may lead to a decrease in the stability of some image outputs.. - cfg_renorm_min: The minimum value of the cfg_renorm_scale range (0-1). - cfg_renorm_min = 1.0, renorm has no effect, while cfg_renorm_min=0.0, the renorm range is larger. - enable_prompt_rewrite: whether to enable prompt rewrite. 
+ prompt (`str` or `list[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list[float]`, *optional*): + Custom sigmas to use for the denoising process. If not defined, the scheduler's default schedule is + used. + guidance_scale (`float`, *optional*, defaults to 4.5): + Classifier-free guidance scale. Values greater than 1 enable CFG. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `list[torch.Generator]`, *optional*): + A `torch.Generator` to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents to be used as inputs for image generation. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. If not provided, embeddings are generated from `prompt`. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Used when classifier-free guidance is enabled. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.LongCatImagePipelineOutput`] instead of a plain tuple. + joint_attention_kwargs (`dict`, *optional*): + Kwargs passed to the joint attention processor. + enable_cfg_renorm (`bool`, *optional*, defaults to `True`): + Whether to enable cfg_renorm. 
Enabling cfg_renorm will improve image quality, but it may lead to a + decrease in the stability of some image outputs. + cfg_renorm_min (`float`, *optional*, defaults to 0.0): + The minimum value of the cfg_renorm_scale range (0-1). `cfg_renorm_min = 1.0` disables renorm, while + `cfg_renorm_min = 0.0` widens the renorm range. + enable_prompt_rewrite (`bool`, *optional*, defaults to `True`): + Whether to enable prompt rewrite. + Examples: Returns: diff --git a/src/diffusers/pipelines/longcat_image/pipeline_longcat_image_edit.py b/src/diffusers/pipelines/longcat_image/pipeline_longcat_image_edit.py index 69d5d82f18ec..119de3946fbc 100644 --- a/src/diffusers/pipelines/longcat_image/pipeline_longcat_image_edit.py +++ b/src/diffusers/pipelines/longcat_image/pipeline_longcat_image_edit.py @@ -550,6 +550,37 @@ def __call__( r""" Function invoked when calling the pipeline for generation. + Args: + image (`PIL.Image.Image`, *optional*): + The input image to edit. + prompt (`str` or `list[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list[float]`, *optional*): + Custom sigmas to use for the denoising process. If not defined, the scheduler's default schedule is + used. + guidance_scale (`float`, *optional*, defaults to 4.5): + Classifier-free guidance scale. Values greater than 1 enable CFG. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `list[torch.Generator]`, *optional*): + A `torch.Generator` to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents to be used as inputs for image generation. 
+ prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. If not provided, embeddings are generated from `prompt`. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Used when classifier-free guidance is enabled. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.LongCatImagePipelineOutput`] instead of a plain tuple. + joint_attention_kwargs (`dict`, *optional*): + Kwargs passed to the joint attention processor. + Examples: Returns: diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx.py b/src/diffusers/pipelines/ltx/pipeline_ltx.py index e2514c3bca24..ce9177547c52 100644 --- a/src/diffusers/pipelines/ltx/pipeline_ltx.py +++ b/src/diffusers/pipelines/ltx/pipeline_ltx.py @@ -569,12 +569,17 @@ def __call__( prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. + negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). height (`int`, defaults to `512`): The height in pixels of the generated image. This is set to 480 by default for the best results. width (`int`, defaults to `704`): The width in pixels of the generated image. This is set to 848 by default for the best results. num_frames (`int`, defaults to `161`): The number of video frames to generate + frame_rate (`int`, defaults to `25`): + Target frame rate of the generated video. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. 
diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx_condition.py b/src/diffusers/pipelines/ltx/pipeline_ltx_condition.py index 539a28f56e67..28d296695998 100644 --- a/src/diffusers/pipelines/ltx/pipeline_ltx_condition.py +++ b/src/diffusers/pipelines/ltx/pipeline_ltx_condition.py @@ -906,12 +906,17 @@ def __call__( prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. + negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). height (`int`, defaults to `512`): The height in pixels of the generated image. This is set to 480 by default for the best results. width (`int`, defaults to `704`): The width in pixels of the generated image. This is set to 848 by default for the best results. num_frames (`int`, defaults to `161`): The number of video frames to generate + frame_rate (`int`, defaults to `25`): + Target frame rate of the generated video. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. @@ -931,6 +936,8 @@ def __call__( [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when using zero terminal SNR. + image_cond_noise_scale (`float`, defaults to `0.15`): + Scale of noise added to the conditioning image latents. num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of videos to generate per prompt. 
generator (`torch.Generator` or `list[torch.Generator]`, *optional*): diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py b/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py index 497f505c4dd8..81ecfce50efa 100644 --- a/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py +++ b/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py @@ -633,12 +633,17 @@ def __call__( prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. + negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). height (`int`, defaults to `512`): The height in pixels of the generated image. This is set to 480 by default for the best results. width (`int`, defaults to `704`): The width in pixels of the generated image. This is set to 848 by default for the best results. num_frames (`int`, defaults to `161`): The number of video frames to generate + frame_rate (`int`, defaults to `25`): + Target frame rate of the generated video. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py b/src/diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py index 17d4e1d8fc57..315dcc04cb30 100644 --- a/src/diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py +++ b/src/diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py @@ -253,6 +253,34 @@ def __call__( output_type: str | None = "pil", return_dict: bool = True, ): + r""" + Function invoked when calling the pipeline for latent upsampling. + + Args: + video (`list[PipelineImageInput]`, *optional*): + The input video frames to upsample. 
Mutually exclusive with `latents`. + height (`int`, defaults to `512`): + The height in pixels of the upsampled output. + width (`int`, defaults to `704`): + The width in pixels of the upsampled output. + latents (`torch.Tensor`, *optional*): + Pre-encoded video latents to upsample. Mutually exclusive with `video`. + decode_timestep (`float` or `list[float]`, defaults to `0.0`): + The timestep at which the upsampled latents are decoded. + decode_noise_scale (`float` or `list[float]`, *optional*): + Interpolation factor between random noise and denoised latents at the decode timestep. + adain_factor (`float`, defaults to `0.0`): + Strength of AdaIN statistical matching applied to the upsampled latents. + tone_map_compression_ratio (`float`, defaults to `0.0`): + Compression ratio used for tone mapping the upsampled latents. Must be in the range [0, 1]. + generator (`torch.Generator` or `list[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated video. Choose between `PIL.Image`, `np.array`, or `latent`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ltx.LTXPipelineOutput`] instead of a plain tuple. + """ self.check_inputs( video=video, height=height, diff --git a/src/diffusers/pipelines/ltx2/pipeline_ltx2.py b/src/diffusers/pipelines/ltx2/pipeline_ltx2.py index 946360445e61..ba32f6ed4c0c 100644 --- a/src/diffusers/pipelines/ltx2/pipeline_ltx2.py +++ b/src/diffusers/pipelines/ltx2/pipeline_ltx2.py @@ -857,6 +857,9 @@ def __call__( prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. + negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. 
If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). height (`int`, *optional*, defaults to `512`): The height in pixels of the generated image. This is set to 480 by default for the best results. width (`int`, *optional*, defaults to `768`): diff --git a/src/diffusers/pipelines/ltx2/pipeline_ltx2_condition.py b/src/diffusers/pipelines/ltx2/pipeline_ltx2_condition.py index 3f63add2eda4..600665966f13 100644 --- a/src/diffusers/pipelines/ltx2/pipeline_ltx2_condition.py +++ b/src/diffusers/pipelines/ltx2/pipeline_ltx2_condition.py @@ -1222,6 +1222,9 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). height (`int`, *optional*, defaults to `512`): The height in pixels of the generated image. This is set to 480 by default for the best results. width (`int`, *optional*, defaults to `768`): diff --git a/src/diffusers/pipelines/ltx2/pipeline_ltx2_hdr_lora.py b/src/diffusers/pipelines/ltx2/pipeline_ltx2_hdr_lora.py index 53ebf06c27d0..cd8dac962173 100644 --- a/src/diffusers/pipelines/ltx2/pipeline_ltx2_hdr_lora.py +++ b/src/diffusers/pipelines/ltx2/pipeline_ltx2_hdr_lora.py @@ -1135,8 +1135,10 @@ def __call__( connector_audio_embeds (`torch.Tensor`, *optional*): Optional pre-computed connector outputs for the audio modality. Used by the HDR LoRA pipeline; if supplied, will override any `prompt`/`prompt_embeds`. - decode_timestep, decode_noise_scale: + decode_timestep (`float` or `list[float]`, defaults to `0.0`): VAE-decode timestep conditioning (only used by VAE configs with `timestep_conditioning=True`). 
+ decode_noise_scale (`float` or `list[float]`, *optional*): + Interpolation factor between random noise and denoised latents at the decode timestep. use_cross_timestep (`bool`, *optional*, defaults to `False`): Whether to use cross-modality sigma for cross-attention modulation. output_type (`str`, *optional*, defaults to `"pt"`): @@ -1145,8 +1147,14 @@ def __call__( array; `"latent"` returns the raw denoised latents (skip the HDR decode). return_dict (`bool`, *optional*, defaults to `True`): Whether to return an [`LTX2PipelineOutput`] instead of a plain tuple. - attention_kwargs, callback_on_step_end, callback_on_step_end_tensor_inputs, max_sequence_length: - Standard hooks and arguments, same as [`LTX2InContextPipeline`]. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor`. + callback_on_step_end (`Callable`, *optional*): + A function called at the end of each denoising step, same as [`LTX2InContextPipeline`]. + callback_on_step_end_tensor_inputs (`list`, *optional*): + The list of tensor inputs passed to `callback_on_step_end`. + max_sequence_length (`int`, *optional*, defaults to `1024`): + Maximum sequence length to use with the `prompt`. Examples: diff --git a/src/diffusers/pipelines/ltx2/pipeline_ltx2_image2video.py b/src/diffusers/pipelines/ltx2/pipeline_ltx2_image2video.py index 997bfd9fc9dc..bf27927ec8cd 100644 --- a/src/diffusers/pipelines/ltx2/pipeline_ltx2_image2video.py +++ b/src/diffusers/pipelines/ltx2/pipeline_ltx2_image2video.py @@ -920,6 +920,9 @@ def __call__( prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. + negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). 
height (`int`, *optional*, defaults to `512`): The height in pixels of the generated image. This is set to 480 by default for the best results. width (`int`, *optional*, defaults to `768`): diff --git a/src/diffusers/pipelines/lucy/pipeline_lucy_edit.py b/src/diffusers/pipelines/lucy/pipeline_lucy_edit.py index 392af492b702..69eb2a02be5c 100644 --- a/src/diffusers/pipelines/lucy/pipeline_lucy_edit.py +++ b/src/diffusers/pipelines/lucy/pipeline_lucy_edit.py @@ -514,6 +514,9 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"np"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/lumina/pipeline_lumina.py b/src/diffusers/pipelines/lumina/pipeline_lumina.py index cc123218f4ee..1cfd9b482d8e 100644 --- a/src/diffusers/pipelines/lumina/pipeline_lumina.py +++ b/src/diffusers/pipelines/lumina/pipeline_lumina.py @@ -686,9 +686,6 @@ def __call__( The height in pixels of the generated image. width (`int`, *optional*, defaults to self.unet.config.sample_size): The width in pixels of the generated image. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only - applies to [`schedulers.DDIMScheduler`], will be ignored for others. 
generator (`torch.Generator` or `list[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. @@ -716,6 +713,10 @@ def __call__( prompt. max_sequence_length (`int` defaults to 120): Maximum sequence length to use with the `prompt`. + scaling_watershed (`float`, *optional*, defaults to 1.0): + Resolution scaling threshold used by Lumina to switch between standard and extended-context attention. + proportional_attn (`bool`, *optional*, defaults to True): + Whether to scale attention proportionally for high-resolution generation. callback_on_step_end (`Callable`, *optional*): A function that calls at the end of each denoising steps during the inference. The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, diff --git a/src/diffusers/pipelines/lumina2/pipeline_lumina2.py b/src/diffusers/pipelines/lumina2/pipeline_lumina2.py index 576d3e8d9486..8a7a8925a925 100644 --- a/src/diffusers/pipelines/lumina2/pipeline_lumina2.py +++ b/src/diffusers/pipelines/lumina2/pipeline_lumina2.py @@ -579,9 +579,6 @@ def __call__( The height in pixels of the generated image. width (`int`, *optional*, defaults to self.unet.config.sample_size): The width in pixels of the generated image. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only - applies to [`schedulers.DDIMScheduler`], will be ignored for others. generator (`torch.Generator` or `list[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
diff --git a/src/diffusers/pipelines/mochi/pipeline_mochi.py b/src/diffusers/pipelines/mochi/pipeline_mochi.py index e8acc0a75e4d..0e791b5f6b20 100644 --- a/src/diffusers/pipelines/mochi/pipeline_mochi.py +++ b/src/diffusers/pipelines/mochi/pipeline_mochi.py @@ -527,6 +527,9 @@ def __call__( prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. + negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). height (`int`, *optional*, defaults to `self.default_height`): The height in pixels of the generated image. This is set to 480 by default for the best results. width (`int`, *optional*, defaults to `self.default_width`): diff --git a/src/diffusers/pipelines/nucleusmoe_image/pipeline_nucleusmoe_image.py b/src/diffusers/pipelines/nucleusmoe_image/pipeline_nucleusmoe_image.py index 4bb5f8f532a2..f50f11c8c152 100644 --- a/src/diffusers/pipelines/nucleusmoe_image/pipeline_nucleusmoe_image.py +++ b/src/diffusers/pipelines/nucleusmoe_image/pipeline_nucleusmoe_image.py @@ -411,8 +411,10 @@ def __call__( negative_prompt (`str` or `list[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, an empty string is used when `true_cfg_scale > 1`. - true_cfg_scale (`float`, *optional*, defaults to 4.0): + guidance_scale (`float`, *optional*, defaults to 4.0): Classifier-free guidance scale. Values greater than 1 enable CFG. + return_index (`int`, *optional*): + Layer index of the text encoder output to use for the prompt embeddings. height (`int`, *optional*, defaults to `self.default_sample_size * self.vae_scale_factor`): The height in pixels of the generated image. 
width (`int`, *optional*, defaults to `self.default_sample_size * self.vae_scale_factor`): diff --git a/src/diffusers/pipelines/pag/pipeline_pag_hunyuandit.py b/src/diffusers/pipelines/pag/pipeline_pag_hunyuandit.py index 15ac665acd2b..a443a19bd952 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_hunyuandit.py @@ -640,6 +640,10 @@ def __call__( generator (`torch.Generator` or `list[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. + latents (`torch.Tensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py index f0fbef29b699..d86adccc2ccf 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py @@ -786,6 +786,9 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
callback_on_step_end (`Callable`, *optional*): A function that calls at the end of each denoising steps during the inference. The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py index 84b727dc0613..24f3d828bd81 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py @@ -779,6 +779,10 @@ def __call__( prompt_3 (`str` or `list[str]`, *optional*): The prompt or prompts to be sent to `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is will be used instead + height (`int`, *optional*, defaults to `self.default_sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. This is set to 1024 by default for the best results. + width (`int`, *optional*, defaults to `self.default_sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. This is set to 1024 by default for the best results. image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `list[torch.Tensor]`, `list[PIL.Image.Image]`, or `list[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list @@ -847,6 +851,9 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
callback_on_step_end (`Callable`, *optional*): A function that calls at the end of each denoising steps during the inference. The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py index ac13fe22723e..c15865fdd11b 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py @@ -623,6 +623,8 @@ def __call__( negative_prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide what to not include in image generation. If not defined, you need to pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of videos to generate per prompt. eta (`float`, *optional*, defaults to 0.0): Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. @@ -667,6 +669,8 @@ def __call__( The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the `._callback_tensor_inputs` attribute of your pipeline class. + decode_chunk_size (`int`, *optional*, defaults to 16): + The number of frames to decode at a time when calling `decode_latents` method. pag_scale (`float`, *optional*, defaults to 3.0): The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention guidance will not be used. 
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py index 0f6fbbd9ae16..a61b8ec14f08 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py @@ -948,10 +948,35 @@ def __call__( Args: prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `list[torch.Tensor]`, `list[PIL.Image.Image]`, or `list[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to + be masked out with `mask_image` and repainted according to `prompt`). + mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `list[torch.Tensor]`, `list[PIL.Image.Image]`, or `list[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask + are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a + single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one + color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, + H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, + 1)`, or `(H, W)`. + masked_image_latents (`torch.Tensor`, *optional*): + Pre-encoded latent of the masked image (for inpainting). height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The width in pixels of the generated image. + padding_mask_crop (`int`, *optional*, defaults to `None`): + The size of margin in the crop to be applied to the image and masking. 
If `None`, no crop is applied to + image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region + with the same aspect ratio of the image and contains all masked area, and then expand that area based + on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before + resizing to the original image size for inpainting. This is useful when the masked area is small while + the image is large and contains information irrelevant for inpainting, such as background. + strength (`float`, *optional*, defaults to 0.9999): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl.py index 2987c90626ef..bd960a64f45e 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl.py @@ -1000,6 +1000,9 @@ def __call__( as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings.
A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. callback_on_step_end (`Callable`, *optional*): A function that calls at the end of each denoising steps during the inference. The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py index 9caf50e5e333..7dadbc495a28 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py @@ -1142,6 +1142,8 @@ def __call__( repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. + masked_image_latents (`torch.Tensor`, *optional*): + Pre-encoded latent of the masked image (for inpainting). height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. This is set to 1024 by default for the best results. Anything below 512 pixels won't work well for @@ -1285,6 +1287,9 @@ def __call__( Part of SDXL's micro-conditioning as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to simulate an aesthetic score of the generated image by influencing the negative text condition. + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. 
A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py index ac0f18b51c7c..eb42547c6d93 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py @@ -536,10 +536,14 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. + prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `prompt_embeds`. negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + negative_prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `negative_prompt_embeds`. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet.py index 2afc47804a81..672d4fa8a8b7 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet.py @@ -631,6 +631,15 @@ def __call__( ignored when not using guidance distilled models. To enable traditional classifier-free guidance, please pass `true_cfg_scale > 1.0` and `negative_prompt` (even an empty negative prompt like " " should enable classifier-free guidance computations). 
+ control_guidance_start (`float` or `list[float]`, *optional*, defaults to 0.0): + The percentage of total steps at which the ControlNet starts applying. + control_guidance_end (`float` or `list[float]`, *optional*, defaults to 1.0): + The percentage of total steps at which the ControlNet stops applying. + control_image (`PipelineImageInput`, *optional*): + The ControlNet input condition to provide guidance for the generation. + controlnet_conditioning_scale (`float` or `list[float]`, *optional*, defaults to 1.0): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `transformer`. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. generator (`torch.Generator` or `list[torch.Generator]`, *optional*): @@ -643,10 +652,14 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. + prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `prompt_embeds`. negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + negative_prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `negative_prompt_embeds`. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet_inpaint.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet_inpaint.py index bba99da06bb1..ffaee10ce01c 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet_inpaint.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet_inpaint.py @@ -664,6 +664,18 @@ def __call__( of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. + control_guidance_start (`float` or `list[float]`, *optional*, defaults to 0.0): + The percentage of total steps at which the ControlNet starts applying. + control_guidance_end (`float` or `list[float]`, *optional*, defaults to 1.0): + The percentage of total steps at which the ControlNet stops applying. + control_image (`PipelineImageInput`, *optional*): + The ControlNet input condition to provide guidance for the generation. + control_mask (`PipelineImageInput`, *optional*): + The inpainting mask for the ControlNet input condition. White pixels are repainted while black pixels + are preserved. + controlnet_conditioning_scale (`float` or `list[float]`, *optional*, defaults to 1.0): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `transformer`. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. generator (`torch.Generator` or `list[torch.Generator]`, *optional*): @@ -676,10 +688,14 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
+ prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `prompt_embeds`. negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + negative_prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `negative_prompt_embeds`. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py index fdd058830e17..b41cf3688854 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py @@ -639,10 +639,14 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. + prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `prompt_embeds`. negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + negative_prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `negative_prompt_embeds`. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_inpaint.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_inpaint.py index 4415fd391b4a..423d0b02219f 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_inpaint.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_inpaint.py @@ -750,7 +750,7 @@ def __call__( color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. - mask_image_latent (`torch.Tensor`, `list[torch.Tensor]`): + masked_image_latents (`torch.Tensor`, `list[torch.Tensor]`): `Tensor` representing an image batch to mask `image` generated by VAE. If not provided, the mask latents tensor will ge generated by `mask_image`. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): @@ -799,10 +799,14 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. + prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `prompt_embeds`. negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + negative_prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `negative_prompt_embeds`. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py index 57749e6ce1c2..111694099d7a 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py @@ -608,10 +608,14 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. + prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `prompt_embeds`. negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + negative_prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `negative_prompt_embeds`. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py index 93ccdcc95c10..03741ae6eaf1 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py @@ -624,10 +624,14 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. + prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `prompt_embeds`. negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + negative_prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `negative_prompt_embeds`. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py index 80f9225697dd..8045466af2d6 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py @@ -705,7 +705,7 @@ def __call__( color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. - mask_image_latent (`torch.Tensor`, `list[torch.Tensor]`): + masked_image_latents (`torch.Tensor`, `list[torch.Tensor]`): `Tensor` representing an image batch to mask `image` generated by VAE. If not provided, the mask latents tensor will be generated by `mask_image`. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): @@ -754,10 +754,14 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. + prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `prompt_embeds`. negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + negative_prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `negative_prompt_embeds`. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py index e8dbfaafb9f0..a227e6cfb3e6 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py @@ -588,6 +588,8 @@ def __call__( enabled by setting `true_cfg_scale > 1` and a provided `negative_prompt`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. + layers (`int`, *optional*, defaults to 4): + Number of latent layers to generate for the layered output. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. @@ -617,10 +619,14 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. + prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `prompt_embeds`. negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + negative_prompt_embeds_mask (`torch.Tensor`, *optional*): + Attention mask for `negative_prompt_embeds`. 
output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. diff --git a/src/diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py b/src/diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py index a17c494e88eb..b3bd7b776d81 100644 --- a/src/diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py +++ b/src/diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py @@ -746,6 +746,15 @@ def __call__( Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `list[torch.Tensor]`, `list[PIL.Image.Image]`, or `list[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be used as the starting point. Can also + accept image latents as `image`, but if passing latents directly it is not encoded again. + strength (`float`, *optional*, defaults to 0.6): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. 
height (`int`, *optional*, defaults to self.unet.config.sample_size): diff --git a/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2.py b/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2.py index c92608fad3b6..faad0fb14086 100644 --- a/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2.py +++ b/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2.py @@ -403,6 +403,9 @@ def __call__( prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. + negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). height (`int`, defaults to `544`): The height in pixels of the generated image. width (`int`, defaults to `960`): @@ -430,6 +433,9 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"np"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_i2v.py b/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_i2v.py index 7c24b898e0bb..91c09a56fcfb 100644 --- a/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_i2v.py +++ b/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_i2v.py @@ -545,6 +545,8 @@ def __call__( image_embeds (`torch.Tensor`, *optional*): Pre-generated image embeddings. Can be used to easily tweak image inputs (weighting). If not provided, image embeddings are generated from the `image` input argument. + last_image (`torch.Tensor`, *optional*): + Optional last image for image-to-video conditioning that anchors the end of the generated video. output_type (`str`, *optional*, defaults to `"np"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py index 6a4066eb6e17..80fe41c19d4e 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py @@ -326,7 +326,7 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image_embedding (`torch.Tensor` or `list[torch.Tensor]`): + image_embeddings (`torch.Tensor` or `list[torch.Tensor]`): Image Embeddings either extracted from an image or generated by a Prior Model. prompt (`str` or `list[str]`): The prompt or prompts to guide the image generation. 
diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py index 0c5ea9ed61b4..cb339d752845 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py @@ -403,6 +403,9 @@ def __call__( Args: prompt (`str` or `list[str]`): The prompt or prompts to guide the image generation. + images (`torch.Tensor`, `PIL.Image.Image`, `list[torch.Tensor]` or `list[PIL.Image.Image]`, *optional*): + Reference image(s) used to condition the prior generation. When provided, image embeddings are derived + from the image and combined with the text prompt. height (`int`, *optional*, defaults to 1024): The height in pixels of the generated image. width (`int`, *optional*, defaults to 1024): @@ -410,6 +413,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 60): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. + timesteps (`list[float]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. guidance_scale (`float`, *optional*, defaults to 8.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://huggingface.co/papers/2207.12598). 
`decoder_guidance_scale` is defined as `w` of diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py index 0f66ca909e7d..6015e7c2cc1d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py @@ -288,8 +288,10 @@ def __call__( prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`PIL.Image.Image` or list[`PIL.Image.Image`] or `torch.Tensor`): - `Image`, or tensor representing an image batch which will be upscaled. * + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index a3e09d1ed1ad..8cc0c2bbea70 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -929,6 +929,8 @@ def __call__( color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. + masked_image_latents (`torch.Tensor`, *optional*): + Pre-encoded latent of the masked image (for inpainting). 
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index c89d593d57be..7a24e6008351 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -236,6 +236,11 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`list[torch.Tensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index 02dc483c277a..2308b780e812 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -432,9 +432,6 @@ def __call__( negative_prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide what to not include in image generation. If not defined, you need to pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only - applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. generator (`torch.Generator` or `list[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. @@ -442,6 +439,19 @@ def __call__( Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. 
+ pooled_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, pooled text embeddings are generated from the `prompt` input argument. + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt + weighting). If not provided, pooled `negative_prompt_embeds` are generated from the `negative_prompt` + input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 4befa44550b7..4dcc7fcc5718 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -568,6 +568,10 @@ def __call__( guidance_scale (`float`, *optional*, defaults to 7.5): A higher guidance scale value encourages the model to generate images closely linked to the text `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + noise_level (`int`, *optional*, defaults to 20): + The amount of noise to add to the upscaled input image. Must be in the range `[0, max_noise_level]` + where `max_noise_level` is defined by the scheduler. A higher `noise_level` adds more noise to the + input, increasing variation but reducing fidelity to the source image. negative_prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide what to not include in image generation. If not defined, you need to pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). 
diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py index 7764a79d7faf..5c05b469660f 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py @@ -885,6 +885,9 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. callback_on_step_end (`Callable`, *optional*): A function that calls at the end of each denoising steps during the inference. The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py index 7951b970cd0c..c0ab805a4ef4 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py @@ -878,6 +878,18 @@ def __call__( The height in pixels of the generated image. This is set to 1024 by default for the best results. width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): The width in pixels of the generated image. This is set to 1024 by default for the best results. 
+ image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `list[torch.Tensor]`, `list[PIL.Image.Image]`, or `list[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both + numpy array and pytorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a + list of tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or + a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. It can also accept image + latents as `image`, but if passing latents directly it is not encoded again. + strength (`float`, *optional*, defaults to 0.6): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. @@ -940,6 +952,9 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. callback_on_step_end (`Callable`, *optional*): A function that calls at the end of each denoising steps during the inference. 
The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py index d3594b868f89..321e9f8dd80e 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py @@ -982,9 +982,9 @@ def __call__( color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. - mask_image_latent (`torch.Tensor`, `list[torch.Tensor]`): - `Tensor` representing an image batch to mask `image` generated by VAE. If not provided, the mask - latents tensor will be generated by `mask_image`. + masked_image_latents (`torch.Tensor`, *optional*): + Pre-encoded latent of the masked image (for inpainting). If not provided, the masked image latents are + generated from `mask_image` and `image`. height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. This is set to 1024 by default for the best results. width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): @@ -1064,6 +1064,9 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. 
A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. callback_on_step_end (`Callable`, *optional*): A function that calls at the end of each denoising steps during the inference. The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 2f6b105702e8..8148fac123e0 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -986,6 +986,9 @@ def __call__( as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of each denoising step during the inference. 
with the following arguments: `callback_on_step_end(self: diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 8de7d4f0bb7d..7382d597102c 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -1141,6 +1141,8 @@ def __call__( repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. + masked_image_latents (`torch.Tensor`, *optional*): + Pre-encoded latent of the masked image (for inpainting). height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. This is set to 1024 by default for the best results. Anything below 512 pixels won't work well for @@ -1284,6 +1286,9 @@ def __call__( Part of SDXL's micro-conditioning as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to simulate an aesthetic score of the generated image by influencing the negative text condition. + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. 
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py index b79119a94a0c..bcd337414bac 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py @@ -731,14 +731,6 @@ def __call__( For most cases, `target_size` should be set to the desired height and width of the generated image. If not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - aesthetic_score (`float`, *optional*, defaults to 6.0): - Used to simulate an aesthetic score of the generated image by influencing the positive text condition. - Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - negative_aesthetic_score (`float`, *optional*, defaults to 2.5): - Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to - simulate an aesthetic score of the generated image by influencing the negative text condition. Examples: diff --git a/src/diffusers/pipelines/wan/pipeline_wan.py b/src/diffusers/pipelines/wan/pipeline_wan.py index 6cbe6d85de78..be2d53f17932 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan.py +++ b/src/diffusers/pipelines/wan/pipeline_wan.py @@ -442,6 +442,9 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. 
+ negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"np"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py index f669e9b1d0ec..8061f67ab6b9 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py +++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py @@ -537,6 +537,9 @@ def __call__( Args: image (`PipelineImageInput`): The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`. + last_image (`torch.Tensor`, *optional*): + Optional last frame to condition the generated video on. When provided, the model interpolates between + `image` (first frame) and `last_image` (last frame). prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. diff --git a/src/diffusers/pipelines/wan/pipeline_wan_vace.py b/src/diffusers/pipelines/wan/pipeline_wan_vace.py index c016eec1b535..b0896d382d67 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan_vace.py +++ b/src/diffusers/pipelines/wan/pipeline_wan_vace.py @@ -777,6 +777,9 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). 
If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"np"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/wan/pipeline_wan_video2video.py b/src/diffusers/pipelines/wan/pipeline_wan_video2video.py index 3d7c5297f4c4..8993475a2851 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan_video2video.py +++ b/src/diffusers/pipelines/wan/pipeline_wan_video2video.py @@ -505,18 +505,27 @@ def __call__( The call function to the pipeline for generation. Args: + video (`list[PIL.Image.Image]`): + The input video used as the starting point for video-to-video generation. The video should be provided + as a list of PIL images, a numpy array, or a torch tensor. prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds` instead. + negative_prompt (`str` or `list[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). height (`int`, defaults to `480`): The height in pixels of the generated image. width (`int`, defaults to `832`): The width in pixels of the generated image. - num_frames (`int`, defaults to `81`): - The number of frames in the generated video. num_inference_steps (`int`, defaults to `50`): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. + timesteps (`list[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. 
Must be in descending order. guidance_scale (`float`, defaults to `5.0`): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2. @@ -537,6 +546,9 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"np"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet.py b/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet.py index 1e49737bb5b0..d64999138af7 100644 --- a/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet.py +++ b/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet.py @@ -430,6 +430,14 @@ def __call__( Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. + control_image (`PipelineImageInput`): + The ControlNet input condition to provide guidance to the `transformer` for generation. If the type is + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `control_image`'s dimensions. If height + and/or width are passed, `control_image` is resized accordingly. 
+ controlnet_conditioning_scale (`float` or `list[float]`, *optional*, defaults to 0.75): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `transformer`. cfg_normalization (`bool`, *optional*, defaults to False): Whether to apply configuration normalization. cfg_truncation (`float`, *optional*, defaults to 1.0): diff --git a/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet_inpaint.py b/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet_inpaint.py index 09f9b2395458..40f368f0d070 100644 --- a/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet_inpaint.py +++ b/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet_inpaint.py @@ -439,6 +439,19 @@ def __call__( Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. + image (`PipelineImageInput`): + `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to + be masked out with `mask_image` and repainted according to `prompt`). + mask_image (`PipelineImageInput`): + `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask + are repainted while black pixels are preserved. + control_image (`PipelineImageInput`): + The ControlNet input condition to provide guidance to the `transformer` for generation. If the type is + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. + controlnet_conditioning_scale (`float` or `list[float]`, *optional*, defaults to 0.75): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `transformer`. 
cfg_normalization (`bool`, *optional*, defaults to False): Whether to apply configuration normalization. cfg_truncation (`float`, *optional*, defaults to 1.0): diff --git a/utils/check_forward_call_docstrings.py b/utils/check_forward_call_docstrings.py new file mode 100644 index 000000000000..b4679f33bcda --- /dev/null +++ b/utils/check_forward_call_docstrings.py @@ -0,0 +1,273 @@ +# coding=utf-8 +# Copyright 2026 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Check that arguments of ``forward()`` (for models) and ``__call__()`` (for +pipelines) match the method's docstring exactly: + +* every signature argument has an entry in the ``Args:`` / + ``Arguments:`` / ``Parameters:`` section, and +* every documented argument still exists in the signature + (stale entries from removed/renamed args are flagged). + +A "main" class is detected via its base classes — models inherit from +``ModelMixin`` and pipelines inherit from ``DiffusionPipeline``. Only methods +defined directly on the class are checked; inherited methods are checked when +the parent class is visited. 
+ +Run from the repository root: + + python utils/check_forward_call_docstrings.py + +Optionally restrict to specific files: + + python utils/check_forward_call_docstrings.py --paths src/diffusers/models/transformers/transformer_flux.py +""" + +from __future__ import annotations + +import argparse +import ast +import re +import sys +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[1] +MODELS_DIR = REPO_ROOT / "src" / "diffusers" / "models" +PIPELINES_DIR = REPO_ROOT / "src" / "diffusers" / "pipelines" + +MODEL_BASE = "ModelMixin" +PIPELINE_BASE = "DiffusionPipeline" + +SECTION_HEADERS = { + "Args:", + "Arguments:", + "Parameters:", + "Returns:", + "Return:", + "Yields:", + "Raises:", + "Examples:", + "Example:", + "Note:", + "Notes:", + "References:", + "See Also:", +} + +# `name (...)` or `name:` at the start of a (stripped) line. +_ARG_HEADER_RE = re.compile(r"^([A-Za-z_]\w*)\s*[(:]") + +# Pairs of (class_name, method_name) whose missing-arg errors should be +# suppressed. Use sparingly — prefer fixing the docstring. 
+IGNORE: set[tuple[str, str]] = set() + + +def _base_class_names(class_def: ast.ClassDef) -> set[str]: + """Return the textual names of base classes (best-effort).""" + names: set[str] = set() + for base in class_def.bases: + if isinstance(base, ast.Name): + names.add(base.id) + elif isinstance(base, ast.Attribute): + names.add(base.attr) + return names + + +def _find_method(class_def: ast.ClassDef, method_name: str) -> ast.FunctionDef | ast.AsyncFunctionDef | None: + for node in class_def.body: + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name == method_name: + return node + return None + + +def _signature_arg_names(func: ast.FunctionDef | ast.AsyncFunctionDef) -> list[str]: + args = func.args + collected: list[str] = [] + for a in (*args.posonlyargs, *args.args, *args.kwonlyargs): + if a.arg == "self" or a.arg == "cls": + continue + collected.append(a.arg) + return collected + + +def _extract_documented_args(docstring: str | None) -> set[str]: + """Extract argument names listed in an Args/Arguments/Parameters section. + + Assumes the docstring has been cleaned (``inspect.cleandoc`` / ``ast.get_docstring``). + The section ends at the next blank-line-followed-by-section-header or at the + end of the docstring. + """ + if not docstring: + return set() + + lines = docstring.splitlines() + + # Locate the Args/Arguments/Parameters header. + start = None + header_indent = 0 + for i, line in enumerate(lines): + stripped = line.strip() + if stripped in {"Args:", "Arguments:", "Parameters:"}: + start = i + 1 + header_indent = len(line) - len(line.lstrip()) + break + if start is None: + return set() + + # First non-empty line after the header sets the per-entry indent level. + entry_indent: int | None = None + documented: set[str] = set() + + for line in lines[start:]: + stripped = line.strip() + if not stripped: + continue + indent = len(line) - len(line.lstrip()) + + # A new section at the same (or shallower) indent ends the args block. 
+ if indent <= header_indent and stripped in SECTION_HEADERS: + break + + if entry_indent is None: + entry_indent = indent + + # Only lines at the entry indent are candidate arg headers; deeper + # indents are descriptions/continuations. + if indent != entry_indent: + continue + + match = _ARG_HEADER_RE.match(stripped) + if match: + documented.add(match.group(1)) + + return documented + + +def check_file(path: Path, kind: str) -> list[str]: + """Return a list of human-readable error strings for ``path``.""" + method_name = "forward" if kind == "model" else "__call__" + base_class = MODEL_BASE if kind == "model" else PIPELINE_BASE + + try: + tree = ast.parse(path.read_text(encoding="utf-8")) + except (SyntaxError, UnicodeDecodeError): + return [] + + errors: list[str] = [] + rel = path.relative_to(REPO_ROOT) + + for node in ast.walk(tree): + if not isinstance(node, ast.ClassDef): + continue + if base_class not in _base_class_names(node): + continue + if (node.name, method_name) in IGNORE: + continue + method = _find_method(node, method_name) + if method is None: + continue + sig_args = _signature_arg_names(method) + if not sig_args: + continue + sig_set = set(sig_args) + documented = _extract_documented_args(ast.get_docstring(method)) + missing = [a for a in sig_args if a not in documented] + stale = sorted(documented - sig_set) + if missing: + errors.append( + f"{rel}:{method.lineno}: {node.name}.{method_name} is missing " + f"docstring entries for: {', '.join(missing)}" + ) + if stale: + errors.append( + f"{rel}:{method.lineno}: {node.name}.{method_name} documents " + f"argument(s) not in the signature: {', '.join(stale)}" + ) + return errors + + +def _kind_for_path(path: Path) -> str | None: + parts = path.resolve().parts + if "pipelines" in parts: + return "pipeline" + if "models" in parts: + return "model" + return None + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--paths", + nargs="+", + 
help="Specific files to check (defaults to all of src/diffusers/{models,pipelines}).", + ) + parser.add_argument( + "--limit", + type=int, + default=None, + help=( + "Debug helper: when --paths is not given, only check the first N files " + "(in sorted order) from each of models/ and pipelines/." + ), + ) + args = parser.parse_args() + + targets: list[tuple[Path, str]] = [] + if args.paths: + for raw in args.paths: + p = Path(raw).resolve() + kind = _kind_for_path(p) + if kind is None: + print(f"Skipping {raw}: not under models/ or pipelines/.", file=sys.stderr) + continue + targets.append((p, kind)) + else: + model_files = sorted(MODELS_DIR.rglob("*.py")) + pipeline_files = sorted(PIPELINES_DIR.rglob("*.py")) + if args.limit is not None: + if args.limit < 0: + parser.error("--limit must be non-negative") + model_files = model_files[: args.limit] + pipeline_files = pipeline_files[: args.limit] + print( + f"--limit {args.limit}: checking {len(model_files)} model file(s) " + f"and {len(pipeline_files)} pipeline file(s).", + file=sys.stderr, + ) + for p in model_files: + targets.append((p, "model")) + for p in pipeline_files: + targets.append((p, "pipeline")) + + all_errors: list[str] = [] + for path, kind in targets: + all_errors.extend(check_file(path, kind)) + + if all_errors: + print("\n".join(all_errors)) + print( + f"\nFound {len(all_errors)} docstring/signature mismatch(es).", + file=sys.stderr, + ) + return 1 + + print("All forward/__call__ arguments are documented.") + return 0 + + +if __name__ == "__main__": + sys.exit(main())