Spaces:

literallybannedfromcallingbob
/

Aegis-ATIS-Demo

Sleeping

App Files Files Community

literallybannedfromcallingbob committed on Jul 4, 2025

Commit

9622166

1 Parent(s): 35a6fb3

updated

Browse files

Files changed (25) hide show

.gradio/certificate.pem +31 -0
Atis_Training.py +239 -0
__pycache__/transformer_chat.cpython-312.pyc +0 -0
atis_transformer.pt +3 -0
built_transformer/__pycache__/decoders.cpython-312.pyc +0 -0
built_transformer/__pycache__/embeddings.cpython-312.pyc +0 -0
built_transformer/__pycache__/encoder.cpython-312.pyc +0 -0
built_transformer/__pycache__/encoding_layers.cpython-312.pyc +0 -0
built_transformer/__pycache__/masking_for_attention.cpython-312.pyc +0 -0
built_transformer/__pycache__/multihead_attention.cpython-312.pyc +0 -0
built_transformer/__pycache__/positional_encodings.cpython-312.pyc +0 -0
built_transformer/__pycache__/slot_classifier.cpython-312.pyc +0 -0
built_transformer/decoders.py +74 -0
built_transformer/embeddings.py +75 -0
built_transformer/encoder.py +100 -0
built_transformer/encoding_layers.py +90 -0
built_transformer/masking_for_attention.py +12 -0
built_transformer/multihead_attention.py +72 -0
built_transformer/positional_encodings.py +68 -0
built_transformer/slot_classifier.py +67 -0
built_transformer/test_linear_function.py +8 -0
non-app.py +64 -0
requirements.txt +2 -5
tokenizer.json +0 -0
transformer_chat.py +189 -0

.gradio/certificate.pem ADDED Viewed

	@@ -0,0 +1,31 @@

+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----

Atis_Training.py ADDED Viewed

	@@ -0,0 +1,239 @@

+from datasets import load_dataset
+from tokenizers import Tokenizer
+from torch.utils.data import DataLoader, Dataset
+import torch
+from transformer_chat import TransformerChatbot
+import pandas as pd
+import random
+# Loading atis-datasets
+raw_dataset = load_dataset("tuetschek/atis", split="train")
+# Loading tokenizer from file
+tokenizer = Tokenizer.from_file('tokenizer.json')
+# Create synthetic responses for ATIS queries for training purposes
# Canned replies keyed by ATIS intent label. Built once at import time rather
# than on every call; tuples keep the shared table immutable.
_INTENT_RESPONSES = {
    'atis_flight': (
        "I can help you with flight information. What specific details do you need?",
        "I'll search for flights matching your criteria. Please provide departure and arrival cities.",
        "Let me find available flights for you. When would you like to travel?",
    ),
    'atis_flight_no': (
        "I can help you with flight number information. Please provide the flight number.",
        "Let me search for details about that flight number.",
        "I'll look up information for that specific flight.",
    ),
    'atis_airfare': (
        "I can help you find airfare information. What's your travel route?",
        "Let me search for the best airfare options for your trip.",
        "I'll check current airfare prices for your destination.",
    ),
    'atis_airline': (
        "I can help you with airline information. Which airline are you looking for?",
        "Let me provide information about that airline.",
        "I'll search for details about the airline you mentioned.",
    ),
    'atis_abbreviation': (
        "I can help you with airport abbreviations. Which abbreviation do you need?",
        "Let me explain that airport abbreviation for you.",
        "I'll provide the full name for that airport code.",
    ),
    'atis_airport': (
        "I can help you with airport information. Which airport are you looking for?",
        "Let me provide details about that airport.",
        "I'll search for information about the airport you mentioned.",
    ),
    'atis_distance': (
        "I can help you calculate distances between airports. Which airports are you interested in?",
        "Let me calculate the distance for you.",
        "I'll provide distance information between those locations.",
    ),
    'atis_ground_service': (
        "I can help you with ground transportation services. What type of service do you need?",
        "Let me find ground transportation options for you.",
        "I'll search for available ground services at your destination.",
    ),
    'atis_aircraft': (
        "I can help you with aircraft information. What type of aircraft are you looking for?",
        "Let me provide details about that aircraft type.",
        "I'll search for information about the aircraft you mentioned.",
    ),
    'atis_capacity': (
        "I can help you with capacity information. What specific capacity details do you need?",
        "Let me check the capacity for that flight or aircraft.",
        "I'll provide capacity information for your query.",
    ),
    'atis_quantity': (
        "I can help you with quantity information. What specific quantity are you looking for?",
        "Let me check the quantity for that item or service.",
        "I'll provide quantity information for your request.",
    ),
    'atis_meal': (
        "I can help you with meal information. What type of meal service are you looking for?",
        "Let me check meal options for your flight.",
        "I'll provide information about meal services available.",
    ),
    'atis_cheapest': (
        "I can help you find the cheapest options. What's your travel route?",
        "Let me search for the most affordable options for your trip.",
        "I'll find the cheapest flights or services for you.",
    ),
    'atis_restriction': (
        "I can help you with travel restrictions. What type of restrictions are you asking about?",
        "Let me check the restrictions for your travel plans.",
        "I'll provide information about travel restrictions.",
    ),
    'atis_day_name': (
        "I can help you with day information. What specific day are you looking for?",
        "Let me check the schedule for that day.",
        "I'll provide information about flights or services on that day.",
    ),
}

# Replies used when the intent has no entry in _INTENT_RESPONSES.
_FALLBACK_RESPONSES = (
    "I can help you with that. Please provide more details.",
    "Let me assist you with your request.",
    "I'll help you find the information you need.",
)

# Extra candidates offered whenever the query text mentions "flight".
_FLIGHT_EXTRA_RESPONSES = (
    "I can help you book a flight. What are your travel dates?",
    "Let me search for available flights for you.",
    "I'll help you find the best flight options.",
)


def create_response_for_intent(intent, text):
    """Pick a random synthetic reply for an ATIS query.

    Args:
        intent: Intent label used to select the candidate replies. Labels
            without an entry in the table fall back to generic replies.
        text: The user's query. If it contains "flight" (case-insensitive),
            three flight-booking replies are added to the candidates.

    Returns:
        One candidate reply string chosen with ``random.choice``.
    """
    # Copy into a fresh list so extending never touches the shared tables.
    candidates = list(_INTENT_RESPONSES.get(intent, _FALLBACK_RESPONSES))
    # For variety
    if "flight" in text.lower():
        candidates.extend(_FLIGHT_EXTRA_RESPONSES)
    return random.choice(candidates)
+# Create training data with question-answer pairs
def create_training_pairs():
    """Build tokenized question/response pairs for every row of ``raw_dataset``.

    Each row's ``text`` is the question; the response is a synthetic reply
    drawn by ``create_response_for_intent`` from the row's ``intent``. Both
    are encoded with the module-level ``tokenizer`` and wrapped as
    ``[CLS] ... [SEP]``.

    Returns:
        A list of dicts with keys ``question_ids``, ``response_ids``,
        ``question_len`` and ``response_len`` (lengths include the two
        special tokens).

    Raises:
        ValueError: If the tokenizer vocabulary lacks ``[CLS]`` or ``[SEP]``.
    """
    # Look the special tokens up once instead of four times per row.
    cls_id = tokenizer.token_to_id("[CLS]")
    sep_id = tokenizer.token_to_id("[SEP]")
    if cls_id is None or sep_id is None:
        # token_to_id reports an unknown token as None; fail here rather than
        # letting None end up inside the id lists.
        raise ValueError("tokenizer vocabulary must contain the [CLS] and [SEP] special tokens")

    training_data = []
    for item in raw_dataset:
        question = item['text']
        response = create_response_for_intent(item['intent'], question)
        # Tokenize, then add the special boundary tokens.
        question_ids = [cls_id, *tokenizer.encode(question).ids, sep_id]
        response_ids = [cls_id, *tokenizer.encode(response).ids, sep_id]
        training_data.append({
            'question_ids': question_ids,
            'response_ids': response_ids,
            'question_len': len(question_ids),
            'response_len': len(response_ids),
        })
    return training_data
+# Create custom dataset for training
class AtisGenerationDataset(Dataset):
    """Fixed-width question/response tensors for generation training.

    Args:
        training_data: Sequence of dicts with ``question_ids``,
            ``response_ids``, ``question_len`` and ``response_len``.
        tokenizer: Object exposing ``token_to_id``; used to resolve ``[PAD]``.
        max_length: Combined budget. The question and the response are each
            truncated/padded to ``max_length // 2`` ids.
    """

    def __init__(self, training_data, tokenizer, max_length=128):
        self.training_data = training_data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.training_data)

    def __getitem__(self, idx):
        """Return ``(question_ids, response_ids, question_len, response_len)`` tensors.

        The reported lengths are clamped to the per-side width so they never
        exceed the size of the returned id tensors.
        """
        item = self.training_data[idx]
        width = self.max_length // 2
        # Use the tokenizer handed to this dataset, not a module-level global.
        pad_id = self.tokenizer.token_to_id("[PAD]")

        # Slicing copies, so padding below never mutates the stored lists.
        question_ids = item['question_ids'][:width]
        response_ids = item['response_ids'][:width]
        question_ids = question_ids + [pad_id] * (width - len(question_ids))
        response_ids = response_ids + [pad_id] * (width - len(response_ids))

        return (
            torch.tensor(question_ids),
            torch.tensor(response_ids),
            torch.tensor(min(item['question_len'], width)),
            torch.tensor(min(item['response_len'], width)),
        )
# Build the tokenized question/response pairs from the loaded dataset.
print("Creating training data...")
training_data = create_training_pairs()
print(f"Created {len(training_data)} training pairs")

# Wrap the pairs in a shuffled mini-batch loader.
atis_dataset = AtisGenerationDataset(training_data, tokenizer)
dataloader = DataLoader(atis_dataset, batch_size=16, shuffle=True)

# Prepare the model with all the necessary parameters.
vocab_size = tokenizer.get_vocab_size()
model = TransformerChatbot(
    vocab_size=vocab_size,
    d_model=512,
    num_heads=8,
    d_ff=2048,
    num_encoder_layers=6,
    num_decoder_layers=6,
    num_roles=2,
    max_turns=16,
    # One slot class per distinct intent label found in the dataset.
    num_slots=len(set(item['intent'] for item in raw_dataset)),
    dropout=0.1
)

# Train on CUDA when it is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training loop for generation. Padding positions are excluded from the loss.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id("[PAD]"))
print("Starting training...")
for epoch in range(10):  # 10 epochs for fast training
    model.train()
    total_loss = 0
    for batch_idx, (question_ids, response_ids, question_lens, response_lens) in enumerate(dataloader):
        question_ids = question_ids.to(device)
        response_ids = response_ids.to(device)

        # Dummy (all-zero) roles and turns. Each side gets tensors shaped like
        # its own ids so the call stays valid even if the question and response
        # widths ever differ.
        src_roles = torch.zeros_like(question_ids)
        src_turns = torch.zeros_like(question_ids)
        tgt_roles = torch.zeros_like(response_ids)
        tgt_turns = torch.zeros_like(response_ids)

        # Forward pass.
        # NOTE(review): argument order (src, tgt, src_roles, tgt_roles,
        # src_turns, tgt_turns) is assumed from the original call - confirm
        # against TransformerChatbot.forward.
        gen_logits, slot_logits = model(
            question_ids, response_ids,
            src_roles, tgt_roles,
            src_turns, tgt_turns
        )

        # Teacher forcing: the logits at position t are scored against the
        # token at position t + 1.
        target_ids = response_ids[:, 1:]  # Remove [CLS] token
        gen_logits = gen_logits[:, :-1, :]  # Remove last position

        # Flatten for loss calculation
        gen_logits_flat = gen_logits.reshape(-1, vocab_size)
        target_ids_flat = target_ids.reshape(-1)
        loss = loss_fn(gen_logits_flat, target_ids_flat)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if batch_idx % 100 == 0:
            print(f"Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}")

    # Average the per-batch losses over the epoch.
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}")

# Save model
print("Saving model...")
torch.save(model.state_dict(), 'atis_transformer.pt')
print("Training completed!")

__pycache__/transformer_chat.cpython-312.pyc ADDED Viewed

Binary file (6.46 kB). View file

atis_transformer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c7972c60cfa6e05b74247ec320cd7f7b46871f8ee8315ea14d69cc20190b459f
+size 412902527

built_transformer/__pycache__/decoders.cpython-312.pyc ADDED Viewed

Binary file (4.75 kB). View file

built_transformer/__pycache__/embeddings.cpython-312.pyc ADDED Viewed

Binary file (3.46 kB). View file

built_transformer/__pycache__/encoder.cpython-312.pyc ADDED Viewed

Binary file (5.52 kB). View file

built_transformer/__pycache__/encoding_layers.cpython-312.pyc ADDED Viewed

Binary file (4.33 kB). View file

built_transformer/__pycache__/masking_for_attention.cpython-312.pyc ADDED Viewed

Binary file (885 Bytes). View file

built_transformer/__pycache__/multihead_attention.cpython-312.pyc ADDED Viewed

Binary file (3.87 kB). View file

built_transformer/__pycache__/positional_encodings.cpython-312.pyc ADDED Viewed

Binary file (3.15 kB). View file

built_transformer/__pycache__/slot_classifier.cpython-312.pyc ADDED Viewed

Binary file (3.33 kB). View file

built_transformer/decoders.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import torch
+import torch.nn as nn
+import math
+from .positional_encodings import PositionalEncoding
+from .encoding_layers import position_wide_feed_forward
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, then feed-forward.

    Each sub-layer is followed by dropout, a residual connection and a
    LayerNorm.
    """

    def __init__(self, dimension_for_model, num_of_heads, dim_feedforward=2048, dropout=0.1):
        '''
        dimension_for_model: model width, as used by the embeddings layer
        num_of_heads: number of heads for both attention modules
        dim_feedforward: hidden width of the feed-forward module, defaulted to 2048
        dropout: dropout probability used inside attention and after each sub-layer, defaulted to 0.1
        '''
        super().__init__()
        self.self_attn = nn.MultiheadAttention(dimension_for_model, num_of_heads, dropout=dropout)  # Self-attention over the target
        self.cross_attn = nn.MultiheadAttention(dimension_for_model, num_of_heads, dropout=dropout)  # Encoder-decoder attention
        self.ffn = nn.Sequential(
            nn.Linear(dimension_for_model, dim_feedforward),
            nn.ReLU(),
            nn.Linear(dim_feedforward, dimension_for_model),
        )
        # Layer normalizations
        self.norm1 = nn.LayerNorm(dimension_for_model)
        self.norm2 = nn.LayerNorm(dimension_for_model)
        self.norm3 = nn.LayerNorm(dimension_for_model)
        # Dropouts
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None):
        '''
        tgt: decoder input, [batch_size, tgt_len, hidden_dim]
        memory: encoder output, [batch_size, src_len, hidden_dim]
        tgt_mask: optional attn_mask forwarded to the self-attention module
        memory_mask: optional attn_mask forwarded to the cross-attention module
        Returns a tensor with the same shape as tgt.
        '''
        # The attention modules are built without batch_first, so they take
        # [seq_len, batch_size, hidden_dim]; transpose in and back out.
        memory_t = memory.transpose(0, 1)

        # Self-attention sub-layer
        query = tgt.transpose(0, 1)
        attended, _ = self.self_attn(query, query, query, attn_mask=tgt_mask)
        tgt = self.norm1(tgt + self.dropout1(attended.transpose(0, 1)))

        # Cross-attention sub-layer. The query must be the output of the
        # self-attention sub-layer above, not the original layer input.
        query = tgt.transpose(0, 1)
        attended, _ = self.cross_attn(query, memory_t, memory_t, attn_mask=memory_mask)
        tgt = self.norm2(tgt + self.dropout2(attended.transpose(0, 1)))

        # Feed-forward sub-layer
        tgt = self.norm3(tgt + self.dropout3(self.ffn(tgt)))
        return tgt
class Decoder(nn.Module):
    """Stacked decoder: token embedding, positional encoding, N decoder layers, final LayerNorm."""

    def __init__(self, vocab_size, dimension_for_model, num_layers, num_of_heads, dim_feedforward=2048, dropout=0.1, max_len=5000):
        super().__init__()
        # Token lookup table
        self.embed = nn.Embedding(vocab_size, dimension_for_model)
        # Positional encoding applied on top of the scaled embeddings
        self.pe = PositionalEncoding(dimension_for_model, dropout=dropout, max_len=max_len)
        # The stack of decoder layers
        stack = []
        for _ in range(num_layers):
            stack.append(DecoderLayer(dimension_for_model, num_of_heads, dim_feedforward, dropout))
        self.layers = nn.ModuleList(stack)
        # Final normalization
        self.norm = nn.LayerNorm(dimension_for_model)

    def forward(self, tgt_seq, memory, tgt_mask=None, memory_mask=None):
        # Scale the token embeddings by sqrt(model width), then add positions
        scale = math.sqrt(self.embed.embedding_dim)
        hidden = self.pe(self.embed(tgt_seq) * scale)
        # Run the sequence through every decoder layer in order
        for block in self.layers:
            hidden = block(hidden, memory, tgt_mask=tgt_mask, memory_mask=memory_mask)
        # Final layer normalization
        return self.norm(hidden)

built_transformer/embeddings.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import torch
+import math
+import torch.nn as nn
+# Embedding layers for converting tokens into vector space
class Embeddings(nn.Module):
    '''
    Sums token, role and turn embeddings, scales the sum and layer-normalizes it.
    char: the vocabulary size, i.e. the number of distinct token ids
    dimension_for_model: the width of every embedding vector
    num_of_roles: the number of distinct role ids
    max_turns: the number of distinct turn ids
    '''
    def __init__ (self, char, dimension_for_model, num_of_roles = 2, max_turns = 16):
        # Initializing parent class
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(char, dimension_for_model)  # look-up table for tokens
        self.lut_roles = nn.Embedding(num_of_roles, dimension_for_model)  # look-up table for roles
        self.lut_turns = nn.Embedding(max_turns, dimension_for_model)  # look-up table for turns
        self.dimension_for_model = dimension_for_model
        self.norm = nn.LayerNorm(dimension_for_model)  # normalization applied to the summed embedding

    def forward(self, x, roles, turns):
        '''
        x: a tensor of token indices
        roles: a tensor of role indices, same shape as x
        turns: a tensor of turn indices, same shape as x
        Returns the normalized embedding, with one trailing dimension of size dimension_for_model.
        '''
        var = self.lut(x)  # token content
        # Roles and turns have their own tables. Looking them up in the token
        # table would reuse token vectors and fail for ids beyond the vocabulary.
        var = var + self.lut_roles(roles)
        var = var + self.lut_turns(turns)
        # Scale by sqrt(model width), then normalize
        var = var*math.sqrt(self.dimension_for_model)
        var = self.norm(var)
        return var
if __name__ == '__main__':
    # Small character-level smoke test for the Embeddings module.
    d_model = 512  # Desired model width
    # Character-level vocabulary: lowercase letters plus the space
    characters = list("abcdefghijklmnopqrstuvwxyz ")
    char2idx = {}
    for idx, char in enumerate(characters):
        char2idx[char] = idx
    vocab = len(characters)  # Vocabulary size = number of unique characters
    # Role name -> role id
    look_up_table_roles = {'system': 0, 'user': 1}
    # Example line: two leading characters hold the position, then "role: text"
    input_str = "01 system: hello world"
    position = int(input_str[0:2].strip())
    input_str = input_str[2:]
    fields = input_str.split(':')
    role = fields[0].strip()
    conversation = fields[1].strip()
    # Keep only characters that exist in the vocabulary; every kept character
    # gets the same position id and role id
    kept = [char for char in conversation if char in char2idx]
    conversation_indices = [char2idx[char] for char in kept]
    position_indices = [position] * len(kept)
    role_indices = [look_up_table_roles[role]] * len(kept)
    # Wrap each list as a batch holding a single sequence
    conversations = torch.LongTensor([conversation_indices])
    roles = torch.LongTensor([role_indices])
    positions = torch.LongTensor([position_indices])
    # Embed and show the result
    emb = Embeddings(vocab, d_model)
    embr = emb(conversations, roles, positions)
    print("embr:", embr)

built_transformer/encoder.py ADDED Viewed

	@@ -0,0 +1,100 @@

+# Importing the necessary modules and classes
+import torch
+import torch.nn as nn
+import math
+from .positional_encodings import PositionalEncoding  #import other modules neccessary for
+from .multihead_attention import MultiHeadAttention
+from .encoding_layers import position_wide_feed_forward, Residual_layer
+from .masking_for_attention import mask
+from .embeddings import Embeddings
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each with dropout, residual and LayerNorm."""

    def __init__(self, dimension_for_model, num_of_heads, dim_feedforward, dropout = 0.1):
        '''
        dimension_for_model: model width, as chosen at the embeddings layer
        num_of_heads: number of heads for the multi-head attention module
        dim_feedforward: hidden width of the position-wise feed-forward module
        dropout: dropout probability passed to every sub-module
        '''
        super().__init__()
        # Attention sub-layer and its post-processing
        self.self_attn = MultiHeadAttention(dimension_for_model, num_of_heads, dropout)
        self.norm1 = nn.LayerNorm(dimension_for_model)
        self.dropout1 = nn.Dropout(dropout)
        # Feed-forward sub-layer and its post-processing
        self.ffn = position_wide_feed_forward(dimension_for_model, dim_feedforward, dropout)
        self.norm2 = nn.LayerNorm(dimension_for_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src: torch.Tensor, src_mask: torch.Tensor = None) -> torch.Tensor:
        # Self-attention with residual connection; the second item of the
        # returned pair is not used here
        attended, _unused = self.self_attn(src, src, src, mask=src_mask)
        src = self.norm1(src + self.dropout1(attended))
        # Feed-forward with residual connection
        return self.norm2(src + self.dropout2(self.ffn(src)))
class Encoder(nn.Module):
    """
    Stacked Transformer encoder:
      - token/role/turn embedding + positional encoding
      - N encoder layers
      - final layer norm
    """
    def __init__(self, vocab_size, dimension_of_model, num_of_heads, num_layers, dim_feedforward = 2048, dropout = 0.1, max_len = 5000, num_of_roles=2, max_turns=16):
        super().__init__()
        # Token/role/turn embeddings
        self.embed = Embeddings(vocab_size, dimension_for_model=dimension_of_model, num_of_roles=num_of_roles, max_turns=max_turns)
        # Positional encodings
        self.pe = PositionalEncoding(dimension_of_model, dropout=dropout, max_len=max_len)
        # Stacked encoder layers
        self.layers = nn.ModuleList([
            EncoderLayer(dimension_of_model, num_of_heads, dim_feedforward, dropout)
            for _ in range(num_layers)
        ])
        # Final normalization
        self.norm = nn.LayerNorm(dimension_of_model)

    def forward(self, src_ids, roles, turns, src_mask = None) -> torch.Tensor:
        """
        Args:
          src_ids: [batch_size x seq_len] input token indices
          roles:   [batch_size x seq_len] role ids
          turns:   [batch_size x seq_len] turn ids
          src_mask: optional mask handed to every encoder layer
                    (presumably [batch_size, 1, 1, seq_len] - TODO confirm against MultiHeadAttention)
        """
        # Embed tokens, roles, and turns
        x = self.embed(src_ids, roles, turns)
        # Add positional information
        x = self.pe(x)
        # Pass through each encoder layer
        for layer in self.layers:
            x = layer(x, src_mask)
        # Final layer normalization
        return self.norm(x)

    def load_state_dict(self, state_dict, strict=True):
        """
        Load weights, upgrading the old single-table embedding format on the fly.

        An old checkpoint stores the token table under ``embed.weight``. It is
        moved to ``embed.lut.weight``, and the tensors the old format lacks
        (role table, turn table, embedding LayerNorm) are filled with zeros
        (tables and bias) or ones (norm weight), shaped like this module's own
        parameters. The caller's mapping is never modified.
        """
        from collections import OrderedDict

        # Work on a shallow copy that keeps the optional ``_metadata`` attribute.
        metadata = getattr(state_dict, '_metadata', None)
        state_dict = OrderedDict(state_dict)
        if metadata is not None:
            state_dict._metadata = metadata

        # Accept the legacy key both as this module names it ('embed.weight')
        # and with the 'encoder.' prefix the original code looked for.
        # NOTE(review): 'encoder.'-prefixed keys do not match this module's own
        # parameter names, so with strict=True they are still reported as
        # unexpected - confirm how callers invoke this method.
        for prefix in ('encoder.', ''):
            legacy_key = prefix + 'embed.weight'
            if legacy_key not in state_dict:
                continue
            state_dict[prefix + 'embed.lut.weight'] = state_dict.pop(legacy_key)
            # Size the placeholders from this module's parameters; copying the
            # token table's shape would give the wrong number of rows.
            state_dict[prefix + 'embed.lut_roles.weight'] = torch.zeros_like(self.embed.lut_roles.weight)
            state_dict[prefix + 'embed.lut_turns.weight'] = torch.zeros_like(self.embed.lut_turns.weight)
            state_dict[prefix + 'embed.norm.weight'] = torch.ones_like(self.embed.norm.weight)
            state_dict[prefix + 'embed.norm.bias'] = torch.zeros_like(self.embed.norm.bias)
        return super().load_state_dict(state_dict, strict=strict)

built_transformer/encoding_layers.py ADDED Viewed

	@@ -0,0 +1,90 @@

+# Importing the necessary libraries
+import math
+import torch.nn as nn
+import torch
+#using the positional feed-forward network to expand dimension for model
class position_wide_feed_forward(nn.Module):
    """Position-wise feed-forward network: widen, activate, drop out, project back."""

    def __init__(self, dimension_for_model, dimension_for_network, dropout = 0.1):
        '''
        dimension_for_model: the model width used by the embeddings layer
        dimension_for_network: the wider hidden dimension used inside this network
        dropout: dropout probability applied to the activated hidden values
        '''
        super().__init__()
        hidden_width = dimension_for_network
        # Model width -> hidden width
        self.expansion = nn.Linear(dimension_for_model, hidden_width)
        # Dropout on the hidden values
        self.apply_dropout = nn.Dropout(dropout)
        # Non-linearity between the two projections
        self.activation = nn.ReLU()
        # Hidden width -> model width
        self.reverse_expansion = nn.Linear(hidden_width, dimension_for_model)

    def forward(self, x):
        '''
        x: the tensor the feed-forward network is applied to
        Returns a tensor whose last dimension is back at the model width.
        '''
        widened = self.expansion(x)
        activated = self.activation(widened)
        return self.reverse_expansion(self.apply_dropout(activated))
+# Using the layer_normalization to add outputs back and then normalize the layer
class Residual_layer (nn.Module):
    """Residual connection followed by layer normalization."""

    def __init__(self, dimension_for_model, dropout = 0.1):
        '''
        dimension_for_model: the model width used by the embeddings layer
        dropout: dropout probability applied to the sublayer output before it is added
        '''
        super().__init__()
        self.normalize = nn.LayerNorm(dimension_for_model)
        self.apply_dropout = nn.Dropout(dropout)

    def forward(self, input_tensor, sublayer_tensor):
        '''
        input_tensor: the running tensor entering the sublayer
        sublayer_tensor: the sublayer's output, which is added onto input_tensor
        '''
        # Only the sublayer branch goes through dropout
        dropped = self.apply_dropout(sublayer_tensor)
        summed = dropped + input_tensor
        return self.normalize(summed)
if __name__ == '__main__':
    # --- Feed-forward demo: one batch, three positions, model width 4 ---
    ffn_rows = [
        [1.0, 2.0, 3.0, 4.0],
        [0.5, 1.5, 2.5, 3.5],
        [4.0, 3.0, 2.0, 1.0],
    ]
    inp = torch.tensor([ffn_rows])
    # Dropout disabled for the demo
    ffn = position_wide_feed_forward(dimension_for_model=4, dimension_for_network=8, dropout=0.0)
    out = ffn(inp)
    # Show the input, the transformed output and its shape
    print("Input:", inp)
    print("Output:", out)
    print("Output shape:", out.shape)

    # --- Residual + norm demo ---
    residual_rows = [
        [1.0, 2.0, 3.0, 4.0],
        [4.0, 3.0, 2.0, 1.0],
        [0.5, 1.5, 2.5, 3.5],
    ]
    x = torch.tensor([residual_rows])
    # Stand-in for a sublayer output that gets added onto x
    sub = torch.tensor([[
        [0.1, 0.1, 0.1, 0.1],
        [0.2, 0.2, 0.2, 0.2],
        [0.3, 0.3, 0.3, 0.3],
    ]])
    # Dropout disabled for the demo
    layer = Residual_layer(dimension_for_model=4, dropout=0.0)
    out = layer(x, sub)
    print("Input X:\n", x)
    print("\nSublayer output:\n", sub)
    print("\nResidual+Norm output:\n", out)

built_transformer/masking_for_attention.py ADDED Viewed

	@@ -0,0 +1,12 @@

+import numpy as np
+import torch
def mask(size):
    '''
    Builds a look-ahead mask: a (1, size, size) uint8 tensor holding 1 on and below the
    diagonal and 0 above it, so a position is only allowed to see itself and earlier positions.
    size: number of tokens within the sequence
    '''
    # A lower-triangular matrix of ones is the complement of the strictly upper triangle
    allowed = np.tril(np.ones((1, size, size), dtype='uint8'))
    return torch.from_numpy(allowed)

built_transformer/multihead_attention.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import torch
+import torch.nn as nn
+import math
+class MultiHeadAttention(nn.Module):
+    def __init__(self, dimension_for_model, num_of_heads, dropout = 0.1):
+        '''
+        initializes multi-head attention module
+        dimension_for_model: the same variable as the one in the embeddings, meaning the dimensionality of the embeddings
+        num_heads:  the number of attention heads
+        dropout: as explained in positional_encodings, the dropout rate, defaulted to 0.1
+        '''
+        # Initializing the parent function
+        super(MultiHeadAttention, self).__init__()
+        assert dimension_for_model % num_of_heads == 0, "dimension_for_model must be devisible by num_of_heads"
+        self.num_of_heads = num_of_heads
+        self.dimension_for_model = dimension_for_model
+        self.d_k = dimension_for_model//num_of_heads  # This is the dimension for each head
+        # Creating linear layers for seperating data into query, key, and value
+        self.linear_query = nn.Linear(dimension_for_model, dimension_for_model)
+        self.linear_key = nn.Linear(dimension_for_model, dimension_for_model)
+        self.linear_value = nn.Linear(dimension_for_model, dimension_for_model)
+        self.linear_out = nn.Linear(dimension_for_model, dimension_for_model)  # Added linear out
+        # Adding dropout layer
+        self.dropout = nn.Dropout(dropout)
+        # Defining and applying softmax
+        self.softmax = nn.Softmax(dim = -1)
+    def forward(self, query, key, value, mask = None):
+        '''
+        Forward pass for multi-head attention.
+        query: tensors with shape batch_size, sequence_length, dimension_for_model
+        key: same as query
+        value: same as query and key
+        mask: a tensor that can be applied to attention scores
+        '''
+        batch_size = query.size(0)
+        seq_len = query.size(1)
+        # Projecting using linear layers
+        Q = self.linear_query(query)
+        K = self.linear_key(key)
+        V = self.linear_value(value)
+        # Splitting tensors into multiple heads
+        Q = Q.view(batch_size, seq_len, self.num_of_heads, self.d_k).transpose(1,2)
+        K = K.view(batch_size, seq_len, self.num_of_heads, self.d_k).transpose(1,2)
+        V = V.view(batch_size, seq_len, self.num_of_heads, self.d_k).transpose(1,2)
+        # Applying the attention calculation formula
+        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
+        # Apply mask if provided
+        if mask is not None:
+            scores = scores.masked_fill(mask == 0, -1e9)
+        # Attention weight conversion
+        attn = self.softmax(scores)
+        attn = self.dropout(attn)
+        output = torch.matmul(attn, V)
+        # Concatonating outputs for all heads
+        output = output.transpose(1,2).contiguous().view(batch_size, seq_len, self.dimension_for_model)
+        # Linear projection to combine all heads
+        output = self.linear_out(output)
+        return output, attn

built_transformer/positional_encodings.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import torch
+import math
+import torch.nn as nn
+# Positional Encoding Layer
+class PositionalEncoding(nn.Module):
+    """
+    Positional encoding layer for transformer models, with adjustments for lemmatization.
+    In lemmatization tasks (especially with character-level inputs), sequences are typically
+    much shorter. Therefore, max_len is set to a lower value to better match the expected input.
+    Parameters:
+      - dimension_for_model: Dimensionality of the embedding vectors.
+      - dropout: Dropout probability used for regularization.
+      - max_len: Maximum sequence length; lowered here (e.g., 256) since lemmatization sequences are short.
+    """
+    def __init__(self, dimension_for_model, dropout, max_len=256):
+        # Initialize the parent module.
+        super(PositionalEncoding, self).__init__()
+        # Create a dropout layer.
+        self.dropout = nn.Dropout(p=dropout)
+        # Initialize a positional encoding matrix with shape (max_len, dimension_for_model).
+        pos_enc_matrix = torch.zeros(max_len, dimension_for_model)
+        # Create a column vector of positions: 0, 1, 2, ..., max_len-1.
+        position = torch.arange(0, max_len).unsqueeze(1)
+        # Calculate scaling terms for sine and cosine functions.
+        div_term = torch.exp(torch.arange(0, dimension_for_model, 2) * -(math.log(10000.0) / dimension_for_model))
+        # For even indices in the embedding dimensions, apply sine.
+        pos_enc_matrix[:, 0::2] = torch.sin(position * div_term)
+        # For odd indices, apply cosine.
+        pos_enc_matrix[:, 1::2] = torch.cos(position * div_term)
+        # Add an extra batch dimension for easier addition to input embeddings.
+        pos_enc_matrix = pos_enc_matrix.unsqueeze(0)
+        # Register the positional encoding matrix as a buffer so it's not updated by the optimizer.
+        self.register_buffer('pe', pos_enc_matrix)
+    def forward(self, x):
+        """
+        Add positional encodings to the input tensor.
+        x: Tensor of shape [batch_size, sequence_length, dimension_for_model]
+        """
+        # Add the positional encodings to the input (slice to match the input sequence length)
+        x = x + self.pe[:, :x.size(1)].detach()
+        return self.dropout(x)
+# Example usage for a lemmatization task
+if __name__ == '__main__':
+    d_model = 512       # Embedding dimension.
+    dropout_rate = 0.1  # Dropout probability.
+    max_len = 256       # Adjusted maximum sequence length for short lemmatization inputs.
+    # Instantiate the positional encoding layer with a smaller max_len suitable for lemmatization.
+    pos_encoder = PositionalEncoding(dimension_for_model=d_model, dropout=dropout_rate, max_len=max_len)
+    # Create a dummy input: a batch of 2 sequences with length 20 (for example, character-level tokens).
+    dummy_input = torch.randn(2, 20, d_model)
+    # Apply the positional encoder.
+    encoded_output = pos_encoder(dummy_input)
+    print("Encoded output shape:", encoded_output.shape)

built_transformer/slot_classifier.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class SlotClassifier(nn.Module):
+    def __init__(
+        self,
+        input_dim: int,
+        num_slots: int,
+        hidden_dim: int = 256,
+        dropout: float = 0.1,
+        num_layers: int = 2
+    ):
+        """
+        Initialize the slot classifier.
+        input_dim: Dimension of the input features (usually dimension_of_model or d_model from transformer)
+        num_slots: Number of different slot types to classify
+        hidden_dim: Dimension of hidden layers in the MLP
+        dropout: Dropout probability for regularization
+        num_layers: Number of hidden layers in the MLP
+        """
+        super().__init__()
+        # Build MLP layers
+        layers = []
+        prev_dim = input_dim
+        # Add hidden layers
+        for _ in range(num_layers - 1):
+            layers.extend([
+                nn.Linear(prev_dim, hidden_dim),
+                nn.LayerNorm(hidden_dim),
+                nn.ReLU(),
+                nn.Dropout(dropout)
+            ])
+            prev_dim = hidden_dim
+        # Add final classification layer
+        layers.append(nn.Linear(prev_dim, num_slots))
+        self.mlp = nn.Sequential(*layers)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Forward pass of the slot classifier.
+        x: Input tensor of shape [batch_size, input_dim]
+        Usually the [CLS] token representation from the transformer
+        """
+        logits = self.mlp(x)
+        return logits
+    def predict(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Get predictions from the classifier.
+        x: Input tensor of shape [batch_size, input_dim]
+        """
+        logits = self.forward(x)
+        return torch.argmax(logits, dim=-1)
+    def get_probabilities(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Get probability distribution over slots.
+        x: Input tensor of shape [batch_size, input_dim]
+        """
+        logits = self.forward(x)
+        return F.softmax(logits, dim=-1)

built_transformer/test_linear_function.py ADDED Viewed

	@@ -0,0 +1,8 @@

+import torch
+import torch.nn as nn
+#simple test for the linear function to determine it's properties
+m = nn.Linear(20, 30)
+input = torch.randn(128, 20)
+output = m(input)
+print(output.size())
+print(output)

non-app.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import gradio as gr
+from huggingface_hub import InferenceClient
+"""
+For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
+"""
+client = InferenceClient("literallybannedfromcallingbob/Aegis-1B-Agent")
+def respond(
+    message,
+    history: list[tuple[str, str]],
+    system_message,
+    max_tokens,
+    temperature,
+    top_p,
+):
+    # Build prompt with history and system message
+    prompt = f"{system_message}\n"
+    for user, assistant in history:
+        if user:
+            prompt += f"User: {user}\n"
+        if assistant:
+            prompt += f"Assistant: {assistant}\n"
+    prompt += f"User: {message}\nAssistant:"
+    # Call the text_generation endpoint
+    response = client.text_generation(
+        prompt,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        stream=True,
+    )
+    output = ""
+    for r in response:
+        output += r.token.text
+        yield output
+"""
+For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
+"""
+demo = gr.ChatInterface(
+    respond,
+    additional_inputs=[
+        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
+        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(
+            minimum=0.1,
+            maximum=1.0,
+            value=0.95,
+            step=0.05,
+            label="Top-p (nucleus sampling)",
+        ),
+    ],
+    title="Transformer Chatbot Demo (currently trained with ATIS dataset)",
+    description="Ask flight-related questions and get an answer."
+)
+if __name__ == "__main__":
+    demo.launch()

requirements.txt CHANGED Viewed

@@ -1,5 +1,2 @@
-huggingface_hub==0.25.2
-torch
-transformers
-tokenizers
-datasets


1	+ gradio
2	+ huggingface_hub

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

transformer_chat.py ADDED Viewed

	@@ -0,0 +1,189 @@

+import torch
+import torch.nn as nn
+import math
+# Import necessary layers
+from built_transformer.embeddings import Embeddings
+from built_transformer.encoder import Encoder, EncoderLayer
+from built_transformer.decoders import Decoder, DecoderLayer
+from built_transformer.positional_encodings import PositionalEncoding
+from built_transformer.slot_classifier import SlotClassifier
+class TransformerChatbot(nn.Module):
+    """
+    Unified Transformer-based chatbot model that combines:
+    - Joint token/role/turn embeddings
+    - Encoder-decoder architecture with attention
+    - Slot-filling classification
+    - Generation capabilities
+    """
+    def __init__(
+        self,
+        vocab_size: int,
+        d_model: int = 512,
+        num_heads: int = 8,
+        d_ff: int = 2048,
+        num_encoder_layers: int = 6,
+        num_decoder_layers: int = 6,
+        num_roles: int = 2,
+        max_turns: int = 16,
+        num_slots: int = 4,
+        dropout: float = 0.1,
+        max_len: int = 5000
+    ):
+        """
+        Build the embedding, encoder, decoder and output heads, then initialise the weights.
+        Args:
+            vocab_size: size of the token vocabulary; also the width of the generation logits
+            d_model: model/embedding dimension shared by all submodules
+            num_heads: number of attention heads passed to the encoder and decoder
+            d_ff: feed-forward dimension passed to the encoder and decoder
+            num_encoder_layers: number of layers in the encoder stack
+            num_decoder_layers: number of layers in the decoder stack
+            num_roles: number of distinct role IDs
+            max_turns: number of distinct turn IDs
+            num_slots: number of classes produced by the slot classifier
+            dropout: dropout probability passed to the positional encoding, encoder and decoder
+            max_len: maximum sequence length passed to the positional encoding, encoder and decoder
+        """
+        super().__init__()
+        # Embeddings for tokens, roles, and turns
+        self.embed = Embeddings(
+            char=vocab_size, # Fixed type and name mismatch
+            dimension_for_model=d_model,
+            num_of_roles=num_roles,
+            max_turns=max_turns
+        )
+        # Positional encoding
+        self.pos_enc = PositionalEncoding(d_model, dropout, max_len)
+        # Encoder stack
+        # NOTE(review): the encoder also receives vocab_size, num_of_roles and max_turns, so it
+        # presumably builds its own embeddings separate from self.embed — confirm in encoder.py
+        self.encoder = Encoder(
+            vocab_size=vocab_size,
+            dimension_of_model=d_model,
+            num_of_heads=num_heads,
+            num_layers=num_encoder_layers,
+            dim_feedforward=d_ff,
+            dropout=dropout,
+            max_len=max_len,
+            num_of_roles=num_roles,
+            max_turns=max_turns
+        )
+        # Decoder stack
+        # NOTE(review): keyword is dimension_for_model here but dimension_of_model for Encoder —
+        # verify both match the respective constructor signatures
+        self.decoder = Decoder(
+            vocab_size=vocab_size,
+            dimension_for_model=d_model,
+            num_layers=num_decoder_layers,
+            num_of_heads=num_heads,
+            dim_feedforward=d_ff,
+            dropout=dropout,
+            max_len=max_len
+        )
+        # Output projections
+        # out_proj maps decoder features to vocabulary logits; slot_classifier uses its default hidden size
+        self.out_proj = nn.Linear(d_model, vocab_size)
+        self.slot_classifier = SlotClassifier(d_model, num_slots)
+        # Initialize parameters
+        # Must run last so every submodule registered above is covered
+        self._init_parameters()
+    def _init_parameters(self):
+        #Initialize parameters with Xavier uniform initialization
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+    def load_state_dict(self, state_dict, strict=True):
+        # Check if this is an old model format (has encoder.embed.weight), since previous versions uses different weights
+        if 'encoder.embed.weight' in state_dict:
+            # This is an old model, we need to adapt the weights
+            old_embed_weight = state_dict['encoder.embed.weight']
+            # Copy the old embedding weights to the new structure
+            state_dict['encoder.embed.lut.weight'] = old_embed_weight
+            # Initialize role and turn embeddings with correct sizes
+            state_dict['encoder.embed.lut_roles.weight'] = torch.zeros(2, old_embed_weight.size(1))  # 2 roles
+            state_dict['encoder.embed.lut_turns.weight'] = torch.zeros(16, old_embed_weight.size(1))  # 16 turns
+            state_dict['encoder.embed.norm.weight'] = torch.ones(old_embed_weight.size(1))
+            state_dict['encoder.embed.norm.bias'] = torch.zeros(old_embed_weight.size(1))
+            # Remove the old key
+            del state_dict['encoder.embed.weight']
+        return super().load_state_dict(state_dict, strict=strict)
+    def encode(self, src_tokens, src_roles, src_turns, src_mask=None):
+        """
+        Encode source sequences with role and turn information.
+        Args:
+            src_tokens: [B, S] token IDs
+            src_roles:  [B, S] role IDs
+            src_turns:  [B, S] turn IDs
+            src_mask:   padding mask [B, 1, 1, S]
+        Returns:
+            enc_out: [B, S, d_model]
+        """
+        # Pass through encoder (embedding and positional encoding handled inside)
+        return self.encoder(src_tokens, src_roles, src_turns, src_mask)
+    def decode(
+        self,
+        tgt_tokens,
+        enc_out,
+        tgt_roles,
+        tgt_turns,
+        src_mask=None,
+        tgt_mask=None
+    ):
+        """
+        Decode target sequences with encoder context.
+        Args:
+            tgt_tokens: [B, T] target token IDs
+            enc_out:    [B, S, d_model] encoder output
+            tgt_roles:  [B, T] target role IDs
+            tgt_turns:  [B, T] target turn IDs
+            src_mask:   [B, 1, 1, S] source mask
+            tgt_mask:   [B, 1, T, T] target mask
+        Returns:
+            logits: [B, T, vocab_size]
+        """
+        # Combine embeddings
+        y = self.embed(tgt_tokens, tgt_roles, tgt_turns)
+        y = self.pos_enc(y)
+        # Pass through decoder
+        dec_out = self.decoder(tgt_tokens, enc_out, tgt_mask, src_mask)
+        return self.out_proj(dec_out)
+    def forward(
+        self,
+        src_tokens,
+        tgt_tokens,
+        src_roles,
+        tgt_roles,
+        src_turns,
+        tgt_turns,
+        src_mask=None,
+        tgt_mask=None
+    ):
+        """
+        Full forward pass combining encoding, decoding, and slot classification.
+        Args:
+            src_tokens: [B, S] source token IDs
+            tgt_tokens: [B, T] target token IDs
+            src_roles:  [B, S] source role IDs
+            tgt_roles:  [B, T] target role IDs
+            src_turns:  [B, S] source turn IDs
+            tgt_turns:  [B, T] target turn IDs
+            src_mask:   [B, 1, 1, S] source mask
+            tgt_mask:   [B, 1, T, T] target mask
+        Returns:
+            gen_logits: [B, T, vocab_size] generation logits
+            slot_logits: [B, num_slots] slot classification logits
+        """
+        # Encode source sequence
+        enc_out = self.encode(src_tokens, src_roles, src_turns, src_mask)
+        # Decode target sequence
+        gen_logits = self.decode(
+            tgt_tokens,
+            enc_out,
+            tgt_roles,
+            tgt_turns,
+            src_mask,
+            tgt_mask
+        )
+        # Use first position of encoder output for slot classification
+        cls_rep = enc_out[:, 0, :]
+        slot_logits = self.slot_classifier(cls_rep)
+        return gen_logits, slot_logits