Skip to content

Support encoding to file-like object #754

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 29 commits into from
Aug 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
4fd3c85
WIP
NicolasHug Jul 5, 2025
8318fbe
Add tests
NicolasHug Jul 5, 2025
10cdd5b
Linter
NicolasHug Jul 5, 2025
67962d8
Renaming
NicolasHug Jul 5, 2025
78276a2
Add tests
NicolasHug Jul 5, 2025
aa10ed1
Avoid depending on numpy for bytes conversion
NicolasHug Jul 5, 2025
dfa5bcb
Use string_view
NicolasHug Jul 5, 2025
6adb7dc
make shape a vec
NicolasHug Jul 5, 2025
8951870
dataptr is int64_t
NicolasHug Jul 5, 2025
9b6d9ee
lifetime management
NicolasHug Jul 5, 2025
b3fc714
Add contiguity check
NicolasHug Jul 5, 2025
a78ef8b
refac
NicolasHug Jul 6, 2025
fb6e463
refac
NicolasHug Jul 6, 2025
6e88ee6
mend
NicolasHug Jul 6, 2025
1c6fad8
WIP
NicolasHug Jul 6, 2025
eb1b51d
WIP
NicolasHug Jul 6, 2025
4d82cbb
bypass pybind warning
NicolasHug Jul 6, 2025
88b335a
Simplify some stuff
NicolasHug Jul 6, 2025
843ff79
Add comment
NicolasHug Jul 6, 2025
558c8f7
Fix timeout
NicolasHug Jul 6, 2025
e2a60cb
Merge branch 'main' of github.com:pytorch/torchcodec into encoding-fi…
NicolasHug Aug 4, 2025
f78eada
Fixes
NicolasHug Aug 4, 2025
f10ec36
Use dynamic cast to AVIOToTensorContext
NicolasHug Aug 4, 2025
243c960
Remove __attribute__ visibility, not needed anymore
NicolasHug Aug 4, 2025
81c430b
Merge branch 'main' of github.com:pytorch/torchcodec into encoding-fi…
NicolasHug Aug 8, 2025
7bcf8ac
Remove unnecessary m.def text
NicolasHug Aug 8, 2025
691d14b
Use wav instead of flac
NicolasHug Aug 8, 2025
bfcb314
Merge branch 'main' of github.com:pytorch/torchcodec into encoding-fi…
NicolasHug Aug 13, 2025
cc26943
Add nullptr check after dynamic_cast
NicolasHug Aug 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions src/torchcodec/_core/AVIOContextHolder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ void AVIOContextHolder::createAVIOContext(
AVIOWriteFunction write,
AVIOSeekFunction seek,
void* heldData,
bool isForWriting,
int bufferSize) {
TORCH_CHECK(
bufferSize > 0,
Expand All @@ -23,14 +24,18 @@ void AVIOContextHolder::createAVIOContext(
buffer != nullptr,
"Failed to allocate buffer of size " + std::to_string(bufferSize));

TORCH_CHECK(
(seek != nullptr) && ((write != nullptr) ^ (read != nullptr)),
"seek method must be defined, and either write or read must be defined. "
"But not both!")
TORCH_CHECK(seek != nullptr, "seek method must be defined");

if (isForWriting) {
TORCH_CHECK(write != nullptr, "write method must be defined for writing");
} else {
TORCH_CHECK(read != nullptr, "read method must be defined for reading");
}

avioContext_.reset(avioAllocContext(
buffer,
bufferSize,
/*write_flag=*/write != nullptr,
/*write_flag=*/isForWriting,
heldData,
read,
write,
Expand Down
1 change: 1 addition & 0 deletions src/torchcodec/_core/AVIOContextHolder.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class AVIOContextHolder {
AVIOWriteFunction write,
AVIOSeekFunction seek,
void* heldData,
bool isForWriting,
int bufferSize = defaultBufferSize);

private:
Expand Down
26 changes: 21 additions & 5 deletions src/torchcodec/_core/AVIOFileLikeContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,29 @@

namespace facebook::torchcodec {

AVIOFileLikeContext::AVIOFileLikeContext(py::object fileLike)
AVIOFileLikeContext::AVIOFileLikeContext(py::object fileLike, bool isForWriting)
: fileLike_{UniquePyObject(new py::object(fileLike))} {
{
// TODO: Is it necessary to acquire the GIL here? Is it maybe even
// harmful? At the moment, this is only called from within a pybind
// function, and pybind guarantees we have the GIL.
py::gil_scoped_acquire gil;
TORCH_CHECK(
py::hasattr(fileLike, "read"),
"File like object must implement a read method.");

if (isForWriting) {
TORCH_CHECK(
py::hasattr(fileLike, "write"),
"File like object must implement a write method for writing.");
} else {
TORCH_CHECK(
py::hasattr(fileLike, "read"),
"File like object must implement a read method for reading.");
}

TORCH_CHECK(
py::hasattr(fileLike, "seek"),
"File like object must implement a seek method.");
}
createAVIOContext(&read, nullptr, &seek, &fileLike_);
createAVIOContext(&read, &write, &seek, &fileLike_, isForWriting);
}

int AVIOFileLikeContext::read(void* opaque, uint8_t* buf, int buf_size) {
Expand Down Expand Up @@ -77,4 +85,12 @@ int64_t AVIOFileLikeContext::seek(void* opaque, int64_t offset, int whence) {
return py::cast<int64_t>((*fileLike)->attr("seek")(offset, whence));
}

int AVIOFileLikeContext::write(void* opaque, const uint8_t* buf, int buf_size) {
auto fileLike = static_cast<UniquePyObject*>(opaque);
py::gil_scoped_acquire gil;
py::bytes bytes_obj(reinterpret_cast<const char*>(buf), buf_size);

return py::cast<int64_t>((*fileLike)->attr("write")(bytes_obj));
}

} // namespace facebook::torchcodec
3 changes: 2 additions & 1 deletion src/torchcodec/_core/AVIOFileLikeContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@ namespace facebook::torchcodec {
// and seek calls back up to the methods on the Python object.
class AVIOFileLikeContext : public AVIOContextHolder {
public:
explicit AVIOFileLikeContext(py::object fileLike);
explicit AVIOFileLikeContext(py::object fileLike, bool isForWriting);

private:
static int read(void* opaque, uint8_t* buf, int buf_size);
static int64_t seek(void* opaque, int64_t offset, int whence);
static int write(void* opaque, const uint8_t* buf, int buf_size);

// Note that we dynamically allocate the Python object because we need to
// strictly control when its destructor is called. We must hold the GIL
Expand Down
6 changes: 4 additions & 2 deletions src/torchcodec/_core/AVIOTensorContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,14 @@ AVIOFromTensorContext::AVIOFromTensorContext(torch::Tensor data)
TORCH_CHECK(data.numel() > 0, "data must not be empty");
TORCH_CHECK(data.is_contiguous(), "data must be contiguous");
TORCH_CHECK(data.scalar_type() == torch::kUInt8, "data must be kUInt8");
createAVIOContext(&read, nullptr, &seek, &tensorContext_);
createAVIOContext(
&read, nullptr, &seek, &tensorContext_, /*isForWriting=*/false);
}

AVIOToTensorContext::AVIOToTensorContext()
: tensorContext_{torch::empty({INITIAL_TENSOR_SIZE}, {torch::kUInt8}), 0} {
createAVIOContext(nullptr, &write, &seek, &tensorContext_);
createAVIOContext(
nullptr, &write, &seek, &tensorContext_, /*isForWriting=*/true);
}

torch::Tensor AVIOToTensorContext::getOutputTensor() {
Expand Down
10 changes: 7 additions & 3 deletions src/torchcodec/_core/Encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ AudioEncoder::AudioEncoder(
const torch::Tensor& samples,
int sampleRate,
std::string_view formatName,
std::unique_ptr<AVIOToTensorContext> avioContextHolder,
std::unique_ptr<AVIOContextHolder> avioContextHolder,
const AudioStreamOptions& audioStreamOptions)
: samples_(validateSamples(samples)),
inSampleRate_(sampleRate),
Expand Down Expand Up @@ -248,9 +248,12 @@ void AudioEncoder::initializeEncoder(
torch::Tensor AudioEncoder::encodeToTensor() {
TORCH_CHECK(
avioContextHolder_ != nullptr,
"Cannot encode to tensor, avio context doesn't exist.");
"Cannot encode to tensor, avio tensor context doesn't exist.");
encode();
return avioContextHolder_->getOutputTensor();
auto avioToTensorContext =
dynamic_cast<AVIOToTensorContext*>(avioContextHolder_.get());
TORCH_CHECK(avioToTensorContext != nullptr, "Invalid AVIO context holder.");
return avioToTensorContext->getOutputTensor();
}

void AudioEncoder::encode() {
Expand Down Expand Up @@ -501,6 +504,7 @@ void AudioEncoder::maybeFlushSwrBuffers(AutoAVPacket& autoAVPacket) {
// Flushes buffered data: calls maybeFlushSwrBuffers() and then encodeFrame()
// with a null frame, using the same AutoAVPacket for both calls.
void AudioEncoder::flushBuffers() {
AutoAVPacket autoAVPacket;
maybeFlushSwrBuffers(autoAVPacket);

// NOTE(review): the null frame presumably tells the encoder to drain its
// remaining packets (the usual FFmpeg convention) -- confirm in encodeFrame().
encodeFrame(autoAVPacket, UniqueAVFrame(nullptr));
}
} // namespace facebook::torchcodec
10 changes: 6 additions & 4 deletions src/torchcodec/_core/Encoder.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#pragma once
#include <torch/types.h>
#include "src/torchcodec/_core/AVIOTensorContext.h"
#include "src/torchcodec/_core/AVIOContextHolder.h"
#include "src/torchcodec/_core/FFMPEGCommon.h"
#include "src/torchcodec/_core/StreamOptions.h"

Expand All @@ -14,13 +14,16 @@ class AudioEncoder {
int sampleRate,
std::string_view fileName,
const AudioStreamOptions& audioStreamOptions);

AudioEncoder(
const torch::Tensor& samples,
int sampleRate,
std::string_view formatName,
std::unique_ptr<AVIOToTensorContext> avioContextHolder,
std::unique_ptr<AVIOContextHolder> avioContextHolder,
const AudioStreamOptions& audioStreamOptions);

void encode();

torch::Tensor encodeToTensor();

private:
Expand Down Expand Up @@ -49,8 +52,7 @@ class AudioEncoder {

UniqueAVAudioFifo avAudioFifo_;

// Stores the AVIOContext for the output tensor buffer.
std::unique_ptr<AVIOToTensorContext> avioContextHolder_;
std::unique_ptr<AVIOContextHolder> avioContextHolder_;

bool encodeWasCalled_ = false;
int64_t lastEncodedAVFramePts_ = 0;
Expand Down
1 change: 1 addition & 0 deletions src/torchcodec/_core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
create_from_file_like,
create_from_tensor,
encode_audio_to_file,
encode_audio_to_file_like,
encode_audio_to_tensor,
get_ffmpeg_library_versions,
get_frame_at_index,
Expand Down
56 changes: 56 additions & 0 deletions src/torchcodec/_core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,62 @@ def create_from_file_like(
return _convert_to_tensor(_pybind_ops.create_from_file_like(file_like, seek_mode))


def encode_audio_to_file_like(
    samples: torch.Tensor,
    sample_rate: int,
    format: str,
    file_like: Union[io.RawIOBase, io.BufferedIOBase],
    bit_rate: Optional[int] = None,
    num_channels: Optional[int] = None,
    desired_sample_rate: Optional[int] = None,
) -> None:
    """Encode audio samples and write the result into a file-like object.

    Args:
        samples: Audio samples tensor. Must have dtype ``torch.float32``.
        sample_rate: Sample rate in Hz.
        format: Audio format (e.g., "wav", "mp3", "flac").
        file_like: File-like object that supports write() and seek() methods.
        bit_rate: Optional bit rate for encoding.
        num_channels: Optional number of output channels.
        desired_sample_rate: Optional desired sample rate for the output.

    Raises:
        ValueError: If ``samples`` does not have dtype ``torch.float32``.
    """
    assert _pybind_ops is not None

    if samples.dtype != torch.float32:
        raise ValueError(f"samples must have dtype torch.float32, got {samples.dtype}")

    # Same limitation as the decoder's create_from_file_like: handing a tensor
    # straight to the pybind function leads to a pybind error. As a workaround
    # we pass the address of the tensor's data together with its shape, and the
    # tensor is rebuilt from those on the C++ side. This relies on:
    # - the tensor being float32 (checked above);
    # - the tensor being contiguous, hence the contiguous() call below. In
    #   theory the strides could be passed as well to lift this restriction;
    # - IMPORTANT: the tensor and its underlying data staying alive for the
    #   whole duration of the call.
    #
    # Casting the tensor to a py::object would be more elegant, but casting
    # that py::object back to a tensor in C++ seems to lead to the same pybind
    # error.
    contiguous_samples = samples.contiguous()
    data_ptr = contiguous_samples.data_ptr()
    shape = list(contiguous_samples.shape)

    _pybind_ops.encode_audio_to_file_like(
        data_ptr,
        shape,
        sample_rate,
        format,
        file_like,
        bit_rate,
        num_channels,
        desired_sample_rate,
    )

    # The check itself is useless, but keep it: it references the contiguous
    # tensor after the call, to ensure the tensor is still alive while
    # encode_audio_to_file_like reads its memory through the raw pointer.
    assert contiguous_samples.is_contiguous()
Copy link
Member Author

@NicolasHug NicolasHug Jul 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hate that we have to do this but I do not see any other obvious way to keep the input samples alive for the duration of the call.
Claude is saying that we could just pass samples as a py::object. We won't be able to turn it back to a tensor (as mentioned in the code comment above), but claude claims that passing it as a parameter will ensure that pybind will keep it alive. I cannot verify this.

@scotts, any thoughts?

Copy link
Contributor

@scotts scotts Jul 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On the keep-alive part, I believe Claude is right. If we pass something as a py::object, that gets properly reference-counted which will keep the object alive. When we launder a pointer as an int, there's no reference counting.

Of course, we would ideally just pass the tensor - but we run into problems passing tensors as tensors into the pybind11 code. The next simplest thing that we probably can't do for performance reasons is to copy the tensor into either bytes or a list, and then pass those as py::object. But since samples will be large, I don't think we want to do that.

Most workarounds I can think of are worse. One that might be just as bad, but could potentially apply to both this situation and decoder creation:

  1. On the pybind11 side, we only create the AVIOFileLikeContext. We don't create the encoder or decoder. We do still accept the file-like objects, and they are still stored in the AVIOFileLikeContext.
  2. We return an int from the C++ side to the Python side where that int is a pointer to the AVIOFileLikeContext.
  3. On the PyTorch custom ops side, we have functions for create-from-file-like and encode-to-file-like that accept the int value and do a reinterpret_cast<AVIOFileLikeContext*> in the C++. Those are then passed to the decoder or encoder.

As it is right now, we're doing a lot of ugly pointer casting with tensors. The above may actually be better, as then the pybind11 code is only really concerned with creating AVIOFileLikeContext objects. It doesn't even need to know about encoders and decoders.



# ==============================
# Abstract impl for the operators. Needed by torch.compile.
# ==============================
Expand Down
40 changes: 39 additions & 1 deletion src/torchcodec/_core/pybind_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
#include <string>

#include "src/torchcodec/_core/AVIOFileLikeContext.h"
#include "src/torchcodec/_core/Encoder.h"
#include "src/torchcodec/_core/SingleStreamDecoder.h"
#include "src/torchcodec/_core/StreamOptions.h"

namespace py = pybind11;

Expand All @@ -31,19 +33,55 @@ int64_t create_from_file_like(
realSeek = seekModeFromString(seek_mode.value());
}

auto avioContextHolder = std::make_unique<AVIOFileLikeContext>(file_like);
auto avioContextHolder =
std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/false);

SingleStreamDecoder* decoder =
new SingleStreamDecoder(std::move(avioContextHolder), realSeek);
return reinterpret_cast<int64_t>(decoder);
}

void encode_audio_to_file_like(
int64_t data_ptr,
const std::vector<int64_t>& shape,
int64_t sample_rate,
std::string_view format,
py::object file_like,
std::optional<int64_t> bit_rate = std::nullopt,
std::optional<int64_t> num_channels = std::nullopt,
std::optional<int64_t> desired_sample_rate = std::nullopt) {
// We assume float32 *and* contiguity, this must be enforced by the caller.
auto tensor_options = torch::TensorOptions().dtype(torch::kFloat32);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we keep this technique, we can probably allow all dtypes by passing in the dtype from the Python side as ints. I assume the Python and C++ enums agree on values, but even if they don't, we can figure out the mapping. Ugly, but possible.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I don't think we need to support more dtypes than just float32: the input samples that the user gives us must be float32 already. This comment is just here to explicitly state the assumptions that are made within encode_audio_to_file_like.

auto samples = torch::from_blob(
reinterpret_cast<void*>(data_ptr), shape, tensor_options);

// TODO Fix implicit int conversion:
// https://github.com/pytorch/torchcodec/issues/679
// same for sample_rate parameter below
AudioStreamOptions audioStreamOptions;
audioStreamOptions.bitRate = bit_rate;
audioStreamOptions.numChannels = num_channels;
audioStreamOptions.sampleRate = desired_sample_rate;

auto avioContextHolder =
std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/true);

AudioEncoder encoder(
samples,
static_cast<int>(sample_rate),
format,
std::move(avioContextHolder),
audioStreamOptions);
encoder.encode();
}

#ifndef PYBIND_OPS_MODULE_NAME
#error PYBIND_OPS_MODULE_NAME must be defined!
#endif

// Defines the pybind11 module whose name is given by PYBIND_OPS_MODULE_NAME and
// registers the two file-like entry points under the same names as their C++
// functions.
PYBIND11_MODULE(PYBIND_OPS_MODULE_NAME, m) {
m.def("create_from_file_like", &create_from_file_like);
m.def("encode_audio_to_file_like", &encode_audio_to_file_like);
}

} // namespace facebook::torchcodec
39 changes: 39 additions & 0 deletions src/torchcodec/encoders/_audio_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,42 @@ def to_tensor(
num_channels=num_channels,
desired_sample_rate=sample_rate,
)

def to_file_like(
    self,
    file_like,
    format: str,
    *,
    bit_rate: Optional[int] = None,
    num_channels: Optional[int] = None,
    sample_rate: Optional[int] = None,
) -> None:
    """Encode samples into a file-like object.

    Args:
        file_like: A file-like object that supports ``write()`` and
            ``seek()`` methods, such as io.BytesIO(), an open file in binary
            write mode, etc. Methods must have the following signature:
            ``write(data: bytes) -> int`` and ``seek(offset: int, whence:
            int = 0) -> int``.
        format (str): The format of the encoded samples, e.g. "mp3", "wav"
            or "flac".
        bit_rate (int, optional): The output bit rate. Encoders typically
            support a finite set of bit rate values, so ``bit_rate`` will be
            matched to one of those supported values. The default is chosen
            by FFmpeg.
        num_channels (int, optional): The number of channels of the encoded
            output samples. By default, the number of channels of the input
            ``samples`` is used.
        sample_rate (int, optional): The sample rate of the encoded output.
            By default, the sample rate of the input ``samples`` is used.
    """
    # Optional output settings. The public ``sample_rate`` keyword is the
    # *output* rate, which the core op calls ``desired_sample_rate``.
    output_options = {
        "bit_rate": bit_rate,
        "num_channels": num_channels,
        "desired_sample_rate": sample_rate,
    }
    _core.encode_audio_to_file_like(
        samples=self._samples,
        sample_rate=self._sample_rate,
        format=format,
        file_like=file_like,
        **output_options,
    )
Loading
Loading