diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt
index 23481ebe4a..dae2606eaf 100644
--- a/.ci/docker/requirements.txt
+++ b/.ci/docker/requirements.txt
@@ -35,7 +35,6 @@ ax-platform>=0.4.0
 nbformat>=5.9.2
 datasets
 transformers
-torchmultimodal-nightly # needs to be updated to stable as soon as it's avaialable
 onnx
 onnxscript>=0.2.2
 onnxruntime
@@ -64,7 +63,6 @@ gym-super-mario-bros==7.4.0
 pyopengl
 gymnasium[mujoco]==0.27.0
 timm
-iopath
 pygame==2.6.0
 pycocotools
 semilearn==0.3.2
diff --git a/Makefile b/Makefile
index 9068d32b2a..6b61e36ec5 100644
--- a/Makefile
+++ b/Makefile
@@ -78,10 +78,6 @@ download:
 	wget -nv -N https://download.pytorch.org/models/resnet18-5c106cde.pth -P $(DATADIR)
 	cp $(DATADIR)/resnet18-5c106cde.pth prototype_source/data/resnet18_pretrained_float.pth
 
-	# Download vocab for beginner_source/flava_finetuning_tutorial.py
-	wget -nv -N http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz -P $(DATADIR)
-	tar $(TAROPTS) -xzf $(DATADIR)/vocab.tar.gz -C ./beginner_source/data/
-
 	# Download PennFudanPed dataset for intermediate_source/torchvision_tutorial.py
 	wget https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip -P $(DATADIR)
 	unzip -o $(DATADIR)/PennFudanPed.zip -d intermediate_source/data/
diff --git a/beginner_source/flava_finetuning_tutorial.py b/beginner_source/flava_finetuning_tutorial.py
deleted file mode 100644
index 12e20f475f..0000000000
--- a/beginner_source/flava_finetuning_tutorial.py
+++ /dev/null
@@ -1,190 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-TorchMultimodal Tutorial: Finetuning FLAVA
-============================================
-"""
-
-######################################################################
-# Multimodal AI has recently become very popular owing to its ubiquitous
-# nature, from use cases like image captioning and visual search to more
-# recent applications like image generation from text. **TorchMultimodal
-# is a library powered by Pytorch consisting of building blocks and end to
-# end examples, aiming to enable and accelerate research in
-# multimodality**.
-#
-# In this tutorial, we will demonstrate how to use a **pretrained SoTA
-# model called** `FLAVA <https://arxiv.org/pdf/2112.04482.pdf>`__ **from
-# TorchMultimodal library to finetune on a multimodal task i.e. visual
-# question answering** (VQA). The model consists of two unimodal transformer
-# based encoders for text and image and a multimodal encoder to combine
-# the two embeddings. It is pretrained using contrastive, image text matching and
-# text, image and multimodal masking losses.
-
-
-######################################################################
-# Installation
-# -----------------
-# We will use TextVQA dataset and ``bert tokenizer`` from Hugging Face for this
-# tutorial. So you need to install datasets and transformers in addition to TorchMultimodal.
-#
-# .. note::
-#
-#    When running this tutorial in Google Colab, install the required packages by
-#    creating a new cell and running the following commands:
-#
-#    .. code-block::
-#
-#       !pip install torchmultimodal-nightly
-#       !pip install datasets
-#       !pip install transformers
-#
-
-######################################################################
-# Steps
-# -----
-#
-# 1. Download the Hugging Face dataset to a directory on your computer by running the following command:
-#
-#    .. code-block::
-#
-#       wget http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz
-#       tar xf vocab.tar.gz
-#
-#    .. note::
-#       If you are running this tutorial in Google Colab, run these commands
-#       in a new cell and prepend these commands with an exclamation mark (!)
-#
-#
-# 2. For this tutorial, we treat VQA as a classification task where
-#    the inputs are images and question (text) and the output is an answer class.
-#    So we need to download the vocab file with answer classes and create the answer to
-#    label mapping.
-#
-# We also load the `textvqa
-# dataset <https://huggingface.co/datasets/textvqa>`__ containing 34602 training samples
-# (images,questions and answers) from Hugging Face
-#
-# We see there are 3997 answer classes including a class representing
-# unknown answers.
-#
-
-with open("data/vocabs/answers_textvqa_more_than_1.txt") as f:
-    vocab = f.readlines()
-
-answer_to_idx = {}
-for idx, entry in enumerate(vocab):
-    answer_to_idx[entry.strip("\n")] = idx
-print(len(vocab))
-print(vocab[:5])
-
-from datasets import load_dataset
-dataset = load_dataset("textvqa")
-
-######################################################################
-# Lets display a sample entry from the dataset:
-#
-
-import matplotlib.pyplot as plt
-import numpy as np
-idx = 5
-print("Question: ", dataset["train"][idx]["question"])
-print("Answers: " ,dataset["train"][idx]["answers"])
-im = np.asarray(dataset["train"][idx]["image"].resize((500,500)))
-plt.imshow(im)
-plt.show()
-
-
-######################################################################
-# 3. Next, we write the transform function to convert the image and text into
-# Tensors consumable by our model - For images, we use the transforms from
-# torchvision to convert to Tensor and resize to uniform sizes - For text,
-# we tokenize (and pad) them using the ``BertTokenizer`` from Hugging Face -
-# For answers (i.e. labels), we take the most frequently occurring answer
-# as the label to train with:
-#
-
-import torch
-from torchvision import transforms
-from collections import defaultdict
-from transformers import BertTokenizer
-from functools import partial
-
-def transform(tokenizer, input):
-    batch = {}
-    image_transform = transforms.Compose([transforms.ToTensor(), transforms.Resize([224,224])])
-    image = image_transform(input["image"][0].convert("RGB"))
-    batch["image"] = [image]
-
-    tokenized=tokenizer(input["question"],return_tensors='pt',padding="max_length",max_length=512)
-    batch.update(tokenized)
-
-
-    ans_to_count = defaultdict(int)
-    for ans in input["answers"][0]:
-        ans_to_count[ans] += 1
-    max_value = max(ans_to_count, key=ans_to_count.get)
-    ans_idx = answer_to_idx.get(max_value,0)
-    batch["answers"] = torch.as_tensor([ans_idx])
-    return batch
-
-tokenizer=BertTokenizer.from_pretrained("bert-base-uncased",padding="max_length",max_length=512)
-transform=partial(transform,tokenizer)
-dataset.set_transform(transform)
-
-
-######################################################################
-# 4. Finally, we import the ``flava_model_for_classification`` from
-# ``torchmultimodal``. It loads the pretrained FLAVA checkpoint by default and
-# includes a classification head.
-#
-# The model forward function passes the image through the visual encoder
-# and the question through the text encoder. The image and question
-# embeddings are then passed through the multimodal encoder. The final
-# embedding corresponding to the CLS token is passed through a MLP head
-# which finally gives the probability distribution over each possible
-# answers.
-#
-
-from torchmultimodal.models.flava.model import flava_model_for_classification
-model = flava_model_for_classification(num_classes=len(vocab))
-
-
-######################################################################
-# 5. We put together the dataset and model in a toy training loop to
-# demonstrate how to train the model for 3 iterations:
-#
-
-from torch import nn
-BATCH_SIZE = 2
-MAX_STEPS = 3
-from torch.utils.data import DataLoader
-
-train_dataloader = DataLoader(dataset["train"], batch_size= BATCH_SIZE)
-optimizer = torch.optim.AdamW(model.parameters())
-
-
-epochs = 1
-for _ in range(epochs):
-    for idx, batch in enumerate(train_dataloader):
-        optimizer.zero_grad()
-        out = model(text = batch["input_ids"], image = batch["image"], labels = batch["answers"])
-        loss = out.loss
-        loss.backward()
-        optimizer.step()
-        print(f"Loss at step {idx} = {loss}")
-        if idx >= MAX_STEPS-1:
-            break
-
-
-######################################################################
-# Conclusion
-# -------------------
-#
-# This tutorial introduced the basics around how to finetune on a
-# multimodal task using FLAVA from TorchMultimodal. Please also check out
-# other examples from the library like
-# `MDETR <https://github.com/facebookresearch/multimodal/tree/main/torchmultimodal/models/mdetr>`__
-# which is a multimodal model for object detection and
-# `Omnivore <https://github.com/facebookresearch/multimodal/blob/main/torchmultimodal/models/omnivore.py>`__
-# which is multitask model spanning image, video and 3d classification.
-#
diff --git a/index.rst b/index.rst
index 71cddc0bda..8ce8b249f8 100644
--- a/index.rst
+++ b/index.rst
@@ -818,15 +818,6 @@ Welcome to PyTorch Tutorials
    :link: advanced/sharding.html
    :tags: TorchRec,Recommender
 
-.. Multimodality
-
-.. customcarditem::
-   :header: Introduction to TorchMultimodal
-   :card_description: TorchMultimodal is a library that provides models, primitives and examples for training multimodal tasks
-   :image: _static/img/thumbnails/torchrec.png
-   :link: beginner/flava_finetuning_tutorial.html
-   :tags: TorchMultimodal
-
 .. End of tutorial card section
 
 
@@ -1102,11 +1093,3 @@ Additional Resources
 
    intermediate/torchrec_intro_tutorial
    advanced/sharding
-
-.. toctree::
-   :maxdepth: 2
-   :includehidden:
-   :hidden:
-   :caption: Multimodality
-
-   beginner/flava_finetuning_tutorial
diff --git a/redirects.py b/redirects.py
index 6e2d5f34c1..4e5dfceebf 100644
--- a/redirects.py
+++ b/redirects.py
@@ -3,7 +3,8 @@
     "beginner/ptcheat.html": "../index.html",
     "beginner/deploy_seq2seq_hybrid_frontend_tutorial.html": "../index.html",
     "recipes/bundled_inputs.html": "../index.html",
-    "intermediate/torchserve_with_ipex.html": "../index.html",
+    "intermediate/torchserve_with_ipex.html": "../index.html",
     "intermediate/torchserve_with_ipex_2.html": "../index.html",
     "recipes/torchserve_vertexai_tutorial.html": "../index.html",
+    "beginner/flava_finetuning_tutorial.html": "../index.html",
 }