import React, { useState, useEffect } from 'react';
import Footer from './Footer';
import PromptAudioTable from './PromptAudioTable';
import SectionSeparator from './SectionSeparator';
import AudioPlayer from './AudioPlayer';
import ContentsSidebar from './ContentsSidebar';
import Header from './Header';
import AudioComparisonTable, { zeroShotAudioSet, naturalSpeechAudioSet } from './AudioComparisonTable';
import AudioTableKey from './AudioTableKey';
import { initializeApp } from 'firebase/app';
import { getFirestore, collection, addDoc, serverTimestamp } from 'firebase/firestore';
import posthog from 'posthog-js';

// Client-side Firebase project settings handed to initializeApp() below.
// NOTE(review): these values ship in the browser bundle, so access control
// presumably relies on Firestore security rules — confirm they are locked down.
const firebaseConfig = {
  apiKey: 'AIzaSyAPZA6Uw2WXWaKDyIA7utRLqWM9NcgKWNY',
  authDomain: 'speechsdk.firebaseapp.com',
  databaseURL: 'https://speechsdk-default-rtdb.europe-west1.firebasedatabase.app',
  projectId: 'speechsdk',
  storageBucket: 'speechsdk.appspot.com',
  messagingSenderId: '132583370615',
  appId: '1:132583370615:web:1d5b0e36078c3b40277b72',
};

// Create the Firebase app once at module load, then derive the Firestore
// handle (`db`) that the newsletter signup form writes to.
const firebaseApp = initializeApp(firebaseConfig);
const db = getFirestore(firebaseApp);

const CanopyLabsModelReleases = () => {
  const [currentTime, setCurrentTime] = useState(new Date());
  const [ttsInput, setTtsInput] = useState("I was just, you know, pondering my existence. Like am I floating on a rock in space? Oh, and the classic question, what happens after I die? Wait, can I even die? I'm not even sure AI models like me are alive, but, uhm, I guess I hope that I am.");
  const [emailInput, setEmailInput] = useState("");
  const [isSubmitting, setIsSubmitting] = useState(false);

  useEffect(() => {
    const timer = setInterval(() => {
      setCurrentTime(new Date());
    }, 1000);

    return () => clearInterval(timer);
  }, []);

  const handleTtsSubmit = (e) => {
    e.preventDefault();

  };

  return (
    <div className="flex flex-col min-h-screen font-sans text-gray-900">

      <main className="flex-grow px-4 py-8">
        <div className="max-w-6xl mx-auto flex flex-col md:flex-row justify-start">
          <div className="hidden md:block">
            <ContentsSidebar />
          </div>

          {/* Main Content */}
          <div className="flex-grow max-w-full md:max-w-3xl mx-auto">
            <div className="relative">
              <div className="md:border-l border-gray-300 md:pl-8 pl-0 pb-8">
                <h2 className="text-2xl md:text-3xl font-light mb-2">Towards Human-Sounding TTS</h2>
                <p className="text-sm text-gray-500 mb-6 md:mb-8">March 19, 2025</p>

                <div className="space-y-6 md:space-y-8">
                  <div id="introduction" className="space-y-2">
                    <p className="text-lg font-bold">Introducing Orpheus Speech</p>
                  </div>

                  <div className="flex flex-wrap gap-3 mb-4">
                    <a href="https://github.com/canopyai/Orpheus-TTS" target="_blank" rel="noopener noreferrer" className="inline-flex items-center">
                      <img src="https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white" alt="GitHub" className="rounded-md h-[22px]" />
                    </a>
                    <a href="https://huggingface.co/canopylabs" target="_blank" rel="noopener noreferrer" className="inline-flex items-center">
                      <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/model-on-hf-md.svg" alt="Hugging Face" className="" />
                    </a>
                    <a href="https://colab.research.google.com/drive/1KhXT56UePPUHhqitJNUxq63k-pQomz3N?usp=sharing" target="_blank" rel="noopener noreferrer" className="inline-flex items-center">
                      <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab Notebook" className="" />
                    </a>
                  </div>

                  <p className='text-gray-500'>
                    To date, open-source TTS models have not been competitive with closed source models <a href="https://huggingface.co/spaces/TTS-AGI/TTS-Arena" target="_blank" rel="noopener noreferrer" className="hover:text-gray-700"> [1]</a>.
                    Nor, have TTS models been capable of expressing empathy, consistent of the emotional intelligence of a human.
                  </p>

                  <video width="640" height="360" controls>
                    <source src="/assets/video/demo.mp4" type="video/mp4" />
                    Your browser does not support the video tag.
                  </video>

                  <p className='text-gray-500'> We're introducing Orpheus, a family of state-of-the-art speech-LLMs, for human level speech generation.
                    We're also releasing a pretrained and finetuned model in 4 sizes based on the Llama architecture:

                    <ul className="pl-10 pt-4 text-gray-500 list-disc">
                      <li className="relative">
                        <span className="font-bold">Medium</span> – 3B parameters
                      </li>
                      <li className="relative">
                        <span className="font-bold">Small</span> – 1B parameters
                      </li>
                      <li className="relative">
                        <span className="font-bold">Tiny</span> – 400M parameters
                      </li>
                      <li className="relative">
                        <span className="font-bold">Nano</span> – 150M parameters
                      </li>
                    </ul>
                  </p>
                  <p className='text-gray-500'>
                    We demonstrate extremely high quality, aesthetically pleasing, speech generation even through very tiny model sizes.
                  </p>

                  <p className='text-gray-500'>Our finetuned models, trained on a selection of voices, can be used in production.
                    We also offer our base models along with sample finetuning scripts which can be used for zero shot voice cloning, and your own finetuning.
                  </p>

                  <p className='text-gray-500'>We also offer code to do realtime streaming in a very simple python package. Streaming inference is faster than play-back
                    even on an A100 40GB for the 3 billion parameter model. <br></br> <a href="https://colab.research.google.com/drive/1xxPpBwI4l_nKUx0J0nzZTtikfqP3UJ6p?usp=sharing" target="_blank" rel="noopener noreferrer" className="hover:text-gray-700">(see our Google Colab notebook)</a>
                  </p>

                  <SectionSeparator margin="my-6 mt-10 md:mt-12" />

                  <div id="demo" className="py-0">
                    <div className="space-y-2">
                      <p className="text-md font-bold py-2">Try a Demo</p>
                      <p className='text-gray-500 py-4'>We have set up easy inference for both the pretrained and finetuned models. Check out the below links to see the models in action!
                        <ul className="pl-10 pt-4 text-gray-500 list-disc mb-4">
                          <li className="relative">
                            <a href="https://github.com/canopyai/Orpheus-TTS" target="_blank" rel="noopener noreferrer" className="hover:text-gray-700">
                              <span className="font-bold">GitHub</span> – Orpheus TTS Repository
                            </a>
                          </li>
                          <li className="relative">
                            <a href="https://huggingface.co/collections/canopylabs/orpheus-tts-67d9ea3f6c05a941c06ad9d2" target="_blank" rel="noopener noreferrer" className="hover:text-gray-700">
                              <span className="font-bold">Hugging Face</span> – Model Repository
                            </a>
                          </li>
                          <li className="relative">
                            <a href="https://colab.research.google.com/drive/1KhXT56UePPUHhqitJNUxq63k-pQomz3N?usp=sharing" target="_blank" rel="noopener noreferrer" className="hover:text-gray-700">
                              <span className="font-bold">Google Colab</span> – Interactive Notebook
                            </a>
                          </li>
                        </ul>
                      </p>
                    </div>
                  </div>



                  <SectionSeparator margin="my-6 md:my-8" />

                  <div id="capabilities" className="space-y-2">
                    <p className="text-lg font-bold">Technical Overview</p>
                  </div>

                  <div className="space-y-2">
                    <img src="/assets/images/architecture.png" alt="Architecture" className="w-full rounded-md mt-10 md:mt-20 mb-4" />
                    <p className="text-sm text-gray-800 mb-10 md:mb-20 text-center">Architecture of Model</p>
                  </div>

                  <p className='text-gray-500'>
                    Our pretrained model uses Llama-3b as the backbone. We trained it on over 100k hours of English speech data and billions of text tokens.
                    Training it on text tokens boosts its performance on TTS tasks as it maintains a great understanding of language. Below we
                    explore some interesting emergent capabilities of the model.
                  </p>
                  <p className='text-gray-500'>
                    We use the exact same architecture, and training method, to train end-to-end speech models
                    and we'll probably release an open source end-to-end speech model in the coming weeks.
                  </p>

                  <SectionSeparator margin="my-6 md:my-8" />

                  <div id="speaking-like-human" className="space-y-2">
                    <p className="text-md font-bold">Handling disfluencies</p>
                  </div>
                  <AudioComparisonTable audioSets={naturalSpeechAudioSet} />

                  <SectionSeparator margin="my-6 md:my-8" />

                  <div id="voice-cloning" className="space-y-2">
                    <p className="text-md font-bold">Natural Zero-Shot Voice Cloning (Pretrained Model)</p>
                  </div>

                  <p className='text-gray-500'>
                    While our pretrained model has not been trained on any voice cloning objective, zero-shot voice cloning can emerge
                    due to the large amounts of pretraining data.
                  </p>

                  <p className='text-gray-500'>
                    Our model chooses natural intonation and emotion, at the level of, and exceeding leading models.
                  </p>

                  <p className='text-gray-700 font-bold text-sm mt-4'>
                    Voice of Prompt
                  </p>

                  <p className='text-gray-500'>
                    Our model has not seen this voice during training. The voice is passed to the prompt,
                    which is the first time the model is exposed to it.
                  </p>

                  <audio controls className="w-full">
                    <source src="/assets/audio/zero_shot/prompt.wav" type="audio/wav" />
                    Your browser does not support the audio element.
                  </audio>

                  <AudioComparisonTable audioSets={zeroShotAudioSet} />
                  <AudioTableKey />

                  <SectionSeparator margin="my-6 md:my-8" />

                  <div id="emotion" className="space-y-2">
                    <p className="text-md font-bold">Guided Emotion and Intonation</p>
                  </div>

                  <p className='text-gray-500'>
                    We can teach the base model to speak with a specific emotion with a few dozen high quality finetuning examples.
                    We gave the model text-speech pairs, including emotion tags, we manually collected.
                  </p>

                  {/* PromptAudioTable component should be updated separately to be responsive */}
                  <div className="overflow-x-auto">
                    <PromptAudioTable />
                  </div>

                  <SectionSeparator margin="my-6 mt-10 md:mt-12" />

                  <div id="production" className="space-y-2">
                    <p className="text-md font-bold">In Production Usage</p>
                  </div>

                  <p className='text-gray-500'>
                    Our models are highly accurate, expressive, and customizable due to their LLM architecture. The large support
                    for Llama models in the ecosystem, and the vast amounts of audio and text data we have, extended the models with.
                  </p>

                  <div className="space-y-2 mt-4">
                    <p className="text-gray-700 font-bold text-sm">Realtime Usage</p>
                  </div>

                  <p className='text-gray-500'>
                    Realtime usage enables conversational use cases. Our model supports realtime output streaming
                    with very low latency of around ~ 200 ms. For even lower latency, input streaming of text into the KV cache of our model can reduce
                    latencies down to ~25-50 ms.
                  </p>

                  <p className='text-gray-700 font-bold text-sm mt-4'>
                    Model Design
                  </p>

                  <p className='text-gray-500'>
                    We chose two design paradigms that go against convention for realtime speech-LLMs.
                  </p>

                  <img src="/assets/images/tokeniser.png" alt="Architecture" className="w-full rounded-md mt-10 md:mt-20 mb-4" />
                  <p className="text-sm text-gray-800 mb-10 md:mb-20 text-center">
                    Snac samples tokens at different frequencies which we flatten as shown
                  </p>

                  <p className='text-gray-500'>
                    We get 7 tokens per frame which we decode as a single flattened sequence rather than using 7 LM heads. This increases the number of steps
                    the model is required to generate. The model is able to generate tokens comfortably faster than realtime playback using a straightforward vLLM implementation on an A100 or H100 GPU,
                    which means longer sequences are still generated in realtime.
                  </p>

                  <p className='text-gray-500'>
                    We use a non-streaming (CNN-based) tokenizer. Other speech LLMs which use SNAC as the decoder suffer from popping between frames fed into the detokenizer.
                    We offer a simple, sliding window modification to the implementation of the detokenizer which
                    enables streaming with no popping.
                  </p>

                  <SectionSeparator margin="my-6 md:my-8" />

                  <div id="stay-updated" className="space-y-2">
                    <p className="text-md font-bold">Stay Updated</p>
                  </div>

                  <p className='text-gray-500 mb-4'>
                    Sign up to receive updates about Orpheus TTS updates!
                  </p>

                  <form
                    className="mt-2 mb-8"
                    onSubmit={async (e) => {
                      e.preventDefault();

                      const email = e.target.email.value;
                      if (!email.trim()) return;

                      // Log the email submission event to PostHog
                      posthog.capture('email_subscription', {
                        email: email,
                        source: 'model-releases-page',
                        timestamp: new Date().toISOString()
                      });

                      try {
                        // Add email to Firestore with timestamp
                        await addDoc(collection(db, 'email-updates-list'), {
                          email: email,
                          timestamp: serverTimestamp()
                        });

                        e.target.reset();
                        alert('Thanks for subscribing!');
                      } catch (error) {
                        alert('Something went wrong. Please try again.');
                      }
                    }}
                  >
                    <div className="flex flex-col sm:flex-row gap-2">
                      <input
                        type="email"
                        name="email"
                        placeholder="Your email address"
                        required
                        className="flex-grow px-4 py-2 border border-gray-300 rounded focus:outline-none focus:ring-2 focus:ring-gray-400"
                      />
                      <button
                        type="submit"
                        className="px-6 py-2 bg-black text-white rounded hover:bg-gray-800 transition-colors"
                      >
                        Subscribe
                      </button>
                    </div>
                  </form>

                  <p className="text-sm leading-relaxed text-gray-00 mt-8">
                    Canopy Labs
                  </p>
                </div>
              </div>
              {/* Horizontal line at bottom of vertical line */}
              <div className="absolute bottom-0 left-0 w-4 h-px bg-gray-300 hidden md:block"></div>
            </div>
          </div>
        </div>
      </main>
      <Footer />
    </div>
  );
};

export default CanopyLabsModelReleases;