literallybannedfromcallingbob commited on
Commit
9622166
·
1 Parent(s): 35a6fb3
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
Atis_Training.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from tokenizers import Tokenizer
3
+ from torch.utils.data import DataLoader, Dataset
4
+ import torch
5
+ from transformer_chat import TransformerChatbot
6
+ import pandas as pd
7
+ import random
8
+
9
+ # Loading atis-datasets
10
+ raw_dataset = load_dataset("tuetschek/atis", split="train")
11
+
12
+ # Loading tokenizer from file
13
+ tokenizer = Tokenizer.from_file('tokenizer.json')
14
+
15
+ # Create synthetic responses for ATIS queries for training purposes
16
# Canned replies keyed by intent label; every intent has three interchangeable phrasings.
_INTENT_REPLIES = {
    'atis_flight': (
        "I can help you with flight information. What specific details do you need?",
        "I'll search for flights matching your criteria. Please provide departure and arrival cities.",
        "Let me find available flights for you. When would you like to travel?",
    ),
    'atis_flight_no': (
        "I can help you with flight number information. Please provide the flight number.",
        "Let me search for details about that flight number.",
        "I'll look up information for that specific flight.",
    ),
    'atis_airfare': (
        "I can help you find airfare information. What's your travel route?",
        "Let me search for the best airfare options for your trip.",
        "I'll check current airfare prices for your destination.",
    ),
    'atis_airline': (
        "I can help you with airline information. Which airline are you looking for?",
        "Let me provide information about that airline.",
        "I'll search for details about the airline you mentioned.",
    ),
    'atis_abbreviation': (
        "I can help you with airport abbreviations. Which abbreviation do you need?",
        "Let me explain that airport abbreviation for you.",
        "I'll provide the full name for that airport code.",
    ),
    'atis_airport': (
        "I can help you with airport information. Which airport are you looking for?",
        "Let me provide details about that airport.",
        "I'll search for information about the airport you mentioned.",
    ),
    'atis_distance': (
        "I can help you calculate distances between airports. Which airports are you interested in?",
        "Let me calculate the distance for you.",
        "I'll provide distance information between those locations.",
    ),
    'atis_ground_service': (
        "I can help you with ground transportation services. What type of service do you need?",
        "Let me find ground transportation options for you.",
        "I'll search for available ground services at your destination.",
    ),
    'atis_aircraft': (
        "I can help you with aircraft information. What type of aircraft are you looking for?",
        "Let me provide details about that aircraft type.",
        "I'll search for information about the aircraft you mentioned.",
    ),
    'atis_capacity': (
        "I can help you with capacity information. What specific capacity details do you need?",
        "Let me check the capacity for that flight or aircraft.",
        "I'll provide capacity information for your query.",
    ),
    'atis_quantity': (
        "I can help you with quantity information. What specific quantity are you looking for?",
        "Let me check the quantity for that item or service.",
        "I'll provide quantity information for your request.",
    ),
    'atis_meal': (
        "I can help you with meal information. What type of meal service are you looking for?",
        "Let me check meal options for your flight.",
        "I'll provide information about meal services available.",
    ),
    'atis_cheapest': (
        "I can help you find the cheapest options. What's your travel route?",
        "Let me search for the most affordable options for your trip.",
        "I'll find the cheapest flights or services for you.",
    ),
    'atis_restriction': (
        "I can help you with travel restrictions. What type of restrictions are you asking about?",
        "Let me check the restrictions for your travel plans.",
        "I'll provide information about travel restrictions.",
    ),
    'atis_day_name': (
        "I can help you with day information. What specific day are you looking for?",
        "Let me check the schedule for that day.",
        "I'll provide information about flights or services on that day.",
    ),
}

# Used when the intent label has no dedicated entry above.
_FALLBACK_REPLIES = (
    "I can help you with that. Please provide more details.",
    "Let me assist you with your request.",
    "I'll help you find the information you need.",
)

# Extra phrasings appended whenever the user text mentions "flight".
_FLIGHT_EXTRA_REPLIES = (
    "I can help you book a flight. What are your travel dates?",
    "Let me search for available flights for you.",
    "I'll help you find the best flight options.",
)


def create_response_for_intent(intent, text):
    """Pick a random synthetic reply for an ATIS query.

    Args:
        intent: Intent label; labels without a dedicated entry fall back to
            three generic replies.
        text: The user's query. If it contains "flight" (case-insensitive),
            three flight-booking replies are added to the candidate pool.

    Returns:
        One reply string chosen with ``random.choice`` from the candidates.
    """
    # Copy into a fresh list so the shared tables are never modified.
    candidates = list(_INTENT_REPLIES.get(intent, _FALLBACK_REPLIES))
    if "flight" in text.lower():
        candidates += _FLIGHT_EXTRA_REPLIES
    return random.choice(candidates)
112
+
113
+ # Create training data with question-answer pairs
114
def create_training_pairs():
    """Build one tokenized question/response record per example in ``raw_dataset``.

    For every dataset item a synthetic reply is generated from its intent and
    text, then both the question and the reply are tokenized and wrapped as
    ``[CLS] ... [SEP]``.

    Returns:
        A list of dicts with keys ``question_ids``, ``response_ids``,
        ``question_len`` and ``response_len`` (lengths include the two
        special tokens).
    """
    # The special-token ids do not change between examples, so look them up once.
    cls_id = tokenizer.token_to_id("[CLS]")
    sep_id = tokenizer.token_to_id("[SEP]")

    def wrap(sentence):
        # Tokenize and surround with the start/end markers.
        return [cls_id] + tokenizer.encode(sentence).ids + [sep_id]

    pairs = []
    for example in raw_dataset:
        user_text = example['text']
        reply = create_response_for_intent(example['intent'], user_text)

        q_ids = wrap(user_text)
        r_ids = wrap(reply)

        pairs.append({
            'question_ids': q_ids,
            'response_ids': r_ids,
            'question_len': len(q_ids),
            'response_len': len(r_ids),
        })
    return pairs
138
+
139
+ # Create custom dataset for training
140
class AtisGenerationDataset(Dataset):
    """Dataset of fixed-length (question, response) token-id pairs.

    Each record in ``training_data`` is a dict with ``question_ids``,
    ``response_ids``, ``question_len`` and ``response_len``. Both sequences
    are truncated and right-padded with the ``[PAD]`` id to
    ``max_length // 2`` tokens.

    Args:
        training_data: Sequence of record dicts as described above.
        tokenizer: Object exposing ``token_to_id(str)``; used to resolve ``[PAD]``.
        max_length: Combined budget for question plus response; each side
            receives half of it.
    """

    def __init__(self, training_data, tokenizer, max_length=128):
        self.training_data = training_data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.training_data)

    @staticmethod
    def _fit(ids, length, pad_id):
        """Return a new list holding ``ids`` truncated/padded to exactly ``length``."""
        clipped = list(ids[:length])  # copy, so the stored record is never modified
        return clipped + [pad_id] * (length - len(clipped))

    def __getitem__(self, idx):
        """Return ``(question_ids, response_ids, question_len, response_len)`` tensors.

        The two length tensors are clamped to the truncated sequence length so
        they never point past the end of the returned id tensors.
        """
        item = self.training_data[idx]
        half = self.max_length // 2
        # Use the tokenizer handed to this instance rather than a module-level global.
        pad_id = self.tokenizer.token_to_id("[PAD]")

        question_ids = self._fit(item['question_ids'], half, pad_id)
        response_ids = self._fit(item['response_ids'], half, pad_id)

        return (
            torch.tensor(question_ids),
            torch.tensor(response_ids),
            torch.tensor(min(item['question_len'], half)),
            torch.tensor(min(item['response_len'], half)),
        )
166
+
167
+ # Create training data
168
# ---- Data ----------------------------------------------------------------
print("Creating training data...")
training_data = create_training_pairs()
print(f"Created {len(training_data)} training pairs")

# Wrap the pairs in a Dataset and serve them in shuffled mini-batches.
atis_dataset = AtisGenerationDataset(training_data, tokenizer)
dataloader = DataLoader(atis_dataset, batch_size=16, shuffle=True)

# ---- Model ---------------------------------------------------------------
vocab_size = tokenizer.get_vocab_size()
model = TransformerChatbot(
    vocab_size=vocab_size,
    d_model=512,
    num_heads=8,
    d_ff=2048,
    num_encoder_layers=6,
    num_decoder_layers=6,
    num_roles=2,
    max_turns=16,
    # One output per distinct intent label found in the training split.
    num_slots=len({item['intent'] for item in raw_dataset}),
    dropout=0.1,
)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# ---- Optimisation --------------------------------------------------------
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# Padding positions contribute nothing to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id("[PAD]"))

print("Starting training...")
for epoch in range(10):  # 10 epochs for fast training
    model.train()
    total_loss = 0
    for batch_idx, batch in enumerate(dataloader):
        question_ids, response_ids, question_lens, response_lens = batch
        question_ids = question_ids.to(device)
        response_ids = response_ids.to(device)

        batch_size, seq_len = question_ids.shape

        # Role and turn ids are all zero: every example is a single-turn exchange.
        roles = torch.zeros_like(question_ids)
        turns = torch.zeros_like(question_ids)

        optimizer.zero_grad()
        gen_logits, slot_logits = model(
            question_ids, response_ids,
            roles, roles,
            turns, turns
        )

        # Teacher forcing: the logit at position t is scored against token t+1,
        # so drop the leading [CLS] from the targets and the last logit position.
        target_ids = response_ids[:, 1:]
        gen_logits = gen_logits[:, :-1, :]
        gen_logits_flat = gen_logits.reshape(-1, vocab_size)
        target_ids_flat = target_ids.reshape(-1)
        loss = loss_fn(gen_logits_flat, target_ids_flat)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if batch_idx % 100 == 0:
            print(f"Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}")
    # Mean loss over all batches in this epoch.
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}")

# ---- Persist the trained weights ------------------------------------------
print("Saving model...")
torch.save(model.state_dict(), 'atis_transformer.pt')
print("Training completed!")
__pycache__/transformer_chat.cpython-312.pyc ADDED
Binary file (6.46 kB). View file
 
atis_transformer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7972c60cfa6e05b74247ec320cd7f7b46871f8ee8315ea14d69cc20190b459f
3
+ size 412902527
built_transformer/__pycache__/decoders.cpython-312.pyc ADDED
Binary file (4.75 kB). View file
 
built_transformer/__pycache__/embeddings.cpython-312.pyc ADDED
Binary file (3.46 kB). View file
 
built_transformer/__pycache__/encoder.cpython-312.pyc ADDED
Binary file (5.52 kB). View file
 
built_transformer/__pycache__/encoding_layers.cpython-312.pyc ADDED
Binary file (4.33 kB). View file
 
built_transformer/__pycache__/masking_for_attention.cpython-312.pyc ADDED
Binary file (885 Bytes). View file
 
built_transformer/__pycache__/multihead_attention.cpython-312.pyc ADDED
Binary file (3.87 kB). View file
 
built_transformer/__pycache__/positional_encodings.cpython-312.pyc ADDED
Binary file (3.15 kB). View file
 
built_transformer/__pycache__/slot_classifier.cpython-312.pyc ADDED
Binary file (3.33 kB). View file
 
built_transformer/decoders.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import math
4
+ from .positional_encodings import PositionalEncoding
5
+ from .encoding_layers import position_wide_feed_forward
6
+
7
class DecoderLayer(nn.Module):
    """One transformer decoder block: self-attention, cross-attention, feed-forward.

    Each sub-layer is followed by dropout, a residual connection and a
    LayerNorm (post-norm arrangement).

    Args:
        dimension_for_model: Width of the token representations.
        num_of_heads: Number of attention heads in both attention modules.
        dim_feedforward: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability used inside attention and after each sub-layer.
    """

    def __init__(self, dimension_for_model, num_of_heads, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(dimension_for_model, num_of_heads, dropout=dropout)  # masked self-attention
        self.cross_attn = nn.MultiheadAttention(dimension_for_model, num_of_heads, dropout=dropout)  # encoder-decoder attention
        self.ffn = nn.Sequential(
            nn.Linear(dimension_for_model, dim_feedforward),
            nn.ReLU(),
            nn.Linear(dim_feedforward, dimension_for_model),
        )

        # Layer normalizations, one per sub-layer
        self.norm1 = nn.LayerNorm(dimension_for_model)
        self.norm2 = nn.LayerNorm(dimension_for_model)
        self.norm3 = nn.LayerNorm(dimension_for_model)
        # Dropouts, one per sub-layer
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None):
        """Run the block.

        Args:
            tgt: Decoder states, batch-first ``[batch, tgt_len, dim]``.
            memory: Encoder output, batch-first ``[batch, src_len, dim]``.
            tgt_mask: Optional ``attn_mask`` forwarded to self-attention.
            memory_mask: Optional ``attn_mask`` forwarded to cross-attention.

        Returns:
            Tensor with the same shape as ``tgt``.
        """
        # The attention modules were built without batch_first, so they take
        # sequence-first input; transpose on the way in and back on the way out.
        memory_t = memory.transpose(0, 1)

        # Masked self-attention
        tgt_t = tgt.transpose(0, 1)
        attended, _ = self.self_attn(tgt_t, tgt_t, tgt_t, attn_mask=tgt_mask)
        tgt = self.norm1(tgt + self.dropout1(attended.transpose(0, 1)))

        # Cross-attention: the query must be the *updated* decoder state.
        # Previously the stale pre-self-attention input was used as the query.
        query_t = tgt.transpose(0, 1)
        attended, _ = self.cross_attn(query_t, memory_t, memory_t, attn_mask=memory_mask)
        tgt = self.norm2(tgt + self.dropout2(attended.transpose(0, 1)))

        # Position-wise feed-forward
        tgt = self.norm3(tgt + self.dropout3(self.ffn(tgt)))

        return tgt
57
+
58
class Decoder(nn.Module):
    """Stack of ``DecoderLayer`` blocks over embedded, position-encoded target tokens.

    Args:
        vocab_size: Number of rows in the target token embedding table.
        dimension_for_model: Width of the token representations.
        num_layers: How many ``DecoderLayer`` blocks to stack.
        num_of_heads: Attention heads per block.
        dim_feedforward: Hidden width of each block's feed-forward sub-layer.
        dropout: Dropout probability passed to the positional encoding and the blocks.
        max_len: Maximum length handed to ``PositionalEncoding``.
    """

    def __init__(self, vocab_size, dimension_for_model, num_layers, num_of_heads, dim_feedforward=2048, dropout=0.1, max_len=5000):
        super().__init__()
        # Token id -> vector lookup
        self.embed = nn.Embedding(vocab_size, dimension_for_model)
        # Position information added on top of the token vectors
        self.pe = PositionalEncoding(dimension_for_model, dropout=dropout, max_len=max_len)
        blocks = [
            DecoderLayer(dimension_for_model, num_of_heads, dim_feedforward, dropout)
            for _ in range(num_layers)
        ]
        self.layers = nn.ModuleList(blocks)
        self.norm = nn.LayerNorm(dimension_for_model)

    def forward(self, tgt_seq, memory, tgt_mask=None, memory_mask=None):
        """Decode ``tgt_seq`` against the encoder output ``memory``.

        Both masks are passed unchanged to every decoder layer. The result is
        layer-normalized before being returned.
        """
        # Scale embeddings by sqrt(d_model) before adding positional information.
        scale = math.sqrt(self.embed.embedding_dim)
        hidden = self.pe(self.embed(tgt_seq) * scale)
        for block in self.layers:
            hidden = block(hidden, memory, tgt_mask=tgt_mask, memory_mask=memory_mask)
        return self.norm(hidden)
built_transformer/embeddings.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import math
3
+ import torch.nn as nn
4
+ # Embedding layers that convert token, role and turn ids into vectors
5
class Embeddings(nn.Module):
    """Sum of token, role and turn embeddings, scaled and layer-normalized.

    Args:
        char: Number of distinct token ids (rows of the token table).
        dimension_for_model: Width of every embedding vector.
        num_of_roles: Number of distinct role ids (rows of the role table).
        max_turns: Number of distinct turn ids (rows of the turn table).
    """

    def __init__(self, char, dimension_for_model, num_of_roles=2, max_turns=16):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(char, dimension_for_model)  # token lookup table
        self.lut_roles = nn.Embedding(num_of_roles, dimension_for_model)  # role lookup table
        self.lut_turns = nn.Embedding(max_turns, dimension_for_model)  # turn lookup table
        self.dimension_for_model = dimension_for_model
        self.norm = nn.LayerNorm(dimension_for_model)

    def forward(self, x, roles, turns):
        """Embed a batch of token ids together with their role and turn ids.

        Args:
            x: Tensor of token indices.
            roles: Tensor of role indices, same shape as ``x``.
            turns: Tensor of turn indices, same shape as ``x``.

        Returns:
            Tensor shaped like ``x`` with a trailing ``dimension_for_model`` axis.
        """
        # Each id kind is looked up in its own table. Roles and turns were
        # previously looked up in the token table, which left lut_roles and
        # lut_turns unused and raised IndexError for ids >= vocabulary size.
        # NOTE(review): checkpoints trained with the old lookup stored
        # role/turn information in the token table and may need retraining.
        var = self.lut(x) + self.lut_roles(roles) + self.lut_turns(turns)

        # Scale by sqrt(d_model), then normalize
        var = var * math.sqrt(self.dimension_for_model)
        var = self.norm(var)
        return var
34
+
35
if __name__ == '__main__':
    # Small smoke test: embed one character-level line of dialogue.
    d_model = 512  # embedding width

    # Character-level vocabulary: lowercase letters plus the space.
    characters = list("abcdefghijklmnopqrstuvwxyz ")
    char2idx = {char: idx for idx, char in enumerate(characters)}
    vocab = len(characters)

    # Speaker name -> role id
    look_up_table_roles = {'system': 0, 'user': 1}

    # Line format: two-digit number, speaker name, colon, utterance.
    input_str = "01 system: hello world"

    # The leading two characters carry the number; the rest is "speaker: text".
    position = int(input_str[0:2].strip())
    input_str = input_str[2:]
    parts = input_str.split(':')
    role = parts[0].strip()
    conversation = parts[1].strip()

    # Keep only characters the vocabulary knows; every kept character gets
    # the same number and the same role id.
    kept = [char for char in conversation if char in char2idx]
    conversation_indices = [char2idx[char] for char in kept]
    position_indices = [position] * len(kept)
    role_indices = [look_up_table_roles[role]] * len(kept)

    # Batch of one sequence for each id kind.
    conversations = torch.LongTensor([conversation_indices])
    roles = torch.LongTensor([role_indices])
    positions = torch.LongTensor([position_indices])

    # Build the embedding layer and run the batch through it.
    emb = Embeddings(vocab, d_model)
    embr = emb(conversations, roles, positions)

    print("embr:", embr)
75
+
built_transformer/encoder.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Importing the necessary modules
2
+ import torch
3
+ import torch.nn as nn
4
+ import math
5
+ from .positional_encodings import PositionalEncoding #import other modules neccessary for
6
+ from .multihead_attention import MultiHeadAttention
7
+ from .encoding_layers import position_wide_feed_forward, Residual_layer
8
+ from .masking_for_attention import mask
9
+ from .embeddings import Embeddings
10
+
11
class EncoderLayer(nn.Module):
    """One transformer encoder block: self-attention followed by a feed-forward net.

    Both sub-layers use dropout, a residual connection and a LayerNorm.

    Args:
        dimension_for_model: Width of the token representations.
        num_of_heads: Number of heads in the self-attention module.
        dim_feedforward: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability used throughout the block.
    """

    def __init__(self, dimension_for_model, num_of_heads, dim_feedforward, dropout = 0.1):
        super().__init__()
        # Attention sub-layer
        self.self_attn = MultiHeadAttention(dimension_for_model, num_of_heads, dropout)
        self.norm1 = nn.LayerNorm(dimension_for_model)
        self.dropout1 = nn.Dropout(dropout)
        # Feed-forward sub-layer
        self.ffn = position_wide_feed_forward(dimension_for_model, dim_feedforward, dropout)
        self.norm2 = nn.LayerNorm(dimension_for_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src: torch.Tensor, src_mask: torch.Tensor = None) -> torch.Tensor:
        """Apply self-attention then feed-forward to ``src``; ``src_mask`` is passed to attention."""
        attended, _ = self.self_attn(src, src, src, mask=src_mask)
        src = self.norm1(src + self.dropout1(attended))
        transformed = self.ffn(src)
        return self.norm2(src + self.dropout2(transformed))
39
+
40
+
41
class Encoder(nn.Module):
    """
    Stacked Transformer encoder:
      - token/role/turn embedding + positional encoding
      - N encoder layers
      - final layer norm
    """
    def __init__(self, vocab_size, dimension_of_model, num_of_heads, num_layers, dim_feedforward = 2048, dropout = 0.1, max_len = 5000, num_of_roles=2, max_turns=16):
        super().__init__()
        # Token/role/turn embeddings
        self.embed = Embeddings(vocab_size, dimension_for_model=dimension_of_model, num_of_roles=num_of_roles, max_turns=max_turns)
        # Positional encodings
        self.pe = PositionalEncoding(dimension_of_model, dropout=dropout, max_len=max_len)
        # Stacked encoder layers
        self.layers = nn.ModuleList([
            EncoderLayer(dimension_of_model, num_of_heads, dim_feedforward, dropout)
            for _ in range(num_layers)
        ])
        # Final normalization
        self.norm = nn.LayerNorm(dimension_of_model)

    def forward(self, src_ids, roles, turns, src_mask = None) -> torch.Tensor:
        """
        Args:
            src_ids: [batch_size x seq_len] input token indices
            roles:   [batch_size x seq_len] role ids
            turns:   [batch_size x seq_len] turn ids
            src_mask: mask passed to every layer's attention; positions where
                it equals 0 are excluded (e.g. [batch_size, 1, 1, seq_len] to
                hide padding tokens)
        """
        # Embed tokens, roles, and turns
        x = self.embed(src_ids, roles, turns)
        # Add positional information
        x = self.pe(x)
        # Pass through each encoder layer
        for layer in self.layers:
            x = layer(x, src_mask)
        # Final layer normalization
        return self.norm(x)

    def load_state_dict(self, state_dict, strict=True):
        """
        Load weights, upgrading checkpoints that predate the role/turn embeddings.

        A legacy checkpoint stores a single ``embed.weight`` token table. It is
        mapped onto ``embed.lut.weight``, and neutral values are supplied for
        the newer parameters: zeros for the role/turn tables, identity for the
        embedding LayerNorm. Both the bare key and the ``encoder.``-prefixed
        key are recognised; generated keys keep the prefix of the legacy key.

        The caller's mapping is left untouched.

        NOTE(review): ``nn.Module.load_state_dict`` on a *parent* module does
        not call this override for its children, so the upgrade only runs when
        this method is invoked on the encoder directly. Confirm against the
        call site in the parent model.
        """
        suffix = 'embed.weight'
        legacy_keys = [key for key in (suffix, 'encoder.' + suffix) if key in state_dict]

        if legacy_keys:
            # Work on a shallow copy (tensors are shared) and keep any
            # versioning metadata PyTorch attached to the original mapping.
            metadata = getattr(state_dict, '_metadata', None)
            state_dict = state_dict.copy()
            if metadata is not None:
                state_dict._metadata = metadata

            for key in legacy_keys:
                prefix = key[:-len(suffix)]
                old_embed_weight = state_dict.pop(key)
                width = old_embed_weight.size(1)

                state_dict[prefix + 'embed.lut.weight'] = old_embed_weight
                # Placeholders take the shape of the tables they fill; sizing
                # them like the vocabulary table causes a size mismatch.
                state_dict[prefix + 'embed.lut_roles.weight'] = old_embed_weight.new_zeros(
                    self.embed.lut_roles.weight.shape)
                state_dict[prefix + 'embed.lut_turns.weight'] = old_embed_weight.new_zeros(
                    self.embed.lut_turns.weight.shape)
                state_dict[prefix + 'embed.norm.weight'] = old_embed_weight.new_ones(width)
                state_dict[prefix + 'embed.norm.bias'] = old_embed_weight.new_zeros(width)

        return super().load_state_dict(state_dict, strict=strict)
100
+
built_transformer/encoding_layers.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Importing the necessary libraries
2
+ import math
3
+ import torch.nn as nn
4
+ import torch
5
+
6
+ #using the positional feed-forward network to expand dimension for model
7
class position_wide_feed_forward(nn.Module):
    """Two-layer feed-forward network applied to the last axis of its input.

    Pipeline: Linear(model -> network) -> ReLU -> Dropout -> Linear(network -> model).

    Args:
        dimension_for_model: Input and output width.
        dimension_for_network: Hidden (expanded) width.
        dropout: Dropout probability applied after the activation.
    """

    def __init__(self, dimension_for_model, dimension_for_network, dropout = 0.1):
        super().__init__()
        # Widen the representation to the hidden size
        self.expansion = nn.Linear(dimension_for_model, dimension_for_network)
        # Randomly zero hidden activations during training
        self.apply_dropout = nn.Dropout(dropout)
        # Non-linearity between the two projections
        self.activation = nn.ReLU()
        # Project back down to the model width
        self.reverse_expansion = nn.Linear(dimension_for_network, dimension_for_model)

    def forward(self, x):
        """Return the feed-forward transform of ``x`` (same shape as ``x``)."""
        hidden = self.expansion(x)
        hidden = self.activation(hidden)
        hidden = self.apply_dropout(hidden)
        return self.reverse_expansion(hidden)
30
+ # Using the layer_normalization to add outputs back and then normalize the layer
31
+
32
class Residual_layer (nn.Module):
    """Residual connection followed by layer normalization.

    Computes ``LayerNorm(input + Dropout(sublayer_output))``.

    Args:
        dimension_for_model: Width of the last axis being normalized.
        dropout: Dropout probability applied to the sub-layer output only.
    """

    def __init__(self, dimension_for_model, dropout = 0.1):
        super().__init__()
        self.normalize = nn.LayerNorm(dimension_for_model)
        self.apply_dropout = nn.Dropout(dropout)

    def forward(self, input_tensor, sublayer_tensor):
        """Add the (dropped-out) sub-layer output to the input and normalize.

        Args:
            input_tensor: The running representation entering the sub-layer.
            sublayer_tensor: What the sub-layer produced; same shape as ``input_tensor``.
        """
        dropped = self.apply_dropout(sublayer_tensor)
        return self.normalize(dropped + input_tensor)
49
+
50
+
51
+
52
if __name__ == '__main__':
    # --- Feed-forward smoke test -------------------------------------------
    inp = torch.tensor([[
        [1.0, 2.0, 3.0, 4.0],
        [0.5, 1.5, 2.5, 3.5],
        [4.0, 3.0, 2.0, 1.0],
    ]])

    # Dropout disabled so the printed output only depends on the random weights.
    ffn = position_wide_feed_forward(dimension_for_model=4, dimension_for_network=8, dropout=0.0)
    out = ffn(inp)

    # Show the input, the result and the (unchanged) shape.
    print("Input:", inp)
    print("Output:", out)
    print("Output shape:", out.shape)

    # --- Residual + norm smoke test ----------------------------------------
    x = torch.tensor([[
        [1.0, 2.0, 3.0, 4.0],
        [4.0, 3.0, 2.0, 1.0],
        [0.5, 1.5, 2.5, 3.5],
    ]])

    # Stand-in for a sub-layer's output that gets added to x.
    sub = torch.tensor([[
        [0.1, 0.1, 0.1, 0.1],
        [0.2, 0.2, 0.2, 0.2],
        [0.3, 0.3, 0.3, 0.3],
    ]])

    # Dropout disabled here as well.
    layer = Residual_layer(dimension_for_model=4, dropout=0.0)
    out = layer(x, sub)

    print("Input X:\n", x)
    print("\nSublayer output:\n", sub)
    print("\nResidual+Norm output:\n", out)
88
+
89
+
90
+
built_transformer/masking_for_attention.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+
4
def mask(size):
    '''
    Build a look-ahead mask so that a position cannot attend to later positions.

    size: number of tokens in the sequence

    Returns a uint8 tensor of shape (1, size, size) holding 1 on and below the
    main diagonal (allowed) and 0 above it (blocked).
    '''
    # Start from all ones and keep only the lower triangle, diagonal included.
    allowed = np.tril(np.ones((1, size, size), dtype='uint8'))
    return torch.from_numpy(allowed)
built_transformer/multihead_attention.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import math
4
+
5
class MultiHeadAttention(nn.Module):
    '''
    Scaled dot-product attention split across several heads.

    dimension_for_model: width of the input/output representations
    num_of_heads: number of attention heads; must divide dimension_for_model
    dropout: dropout probability applied to the attention weights, defaulted to 0.1
    '''

    def __init__(self, dimension_for_model, num_of_heads, dropout = 0.1):
        super(MultiHeadAttention, self).__init__()
        assert dimension_for_model % num_of_heads == 0, "dimension_for_model must be devisible by num_of_heads"

        self.num_of_heads = num_of_heads
        self.dimension_for_model = dimension_for_model
        self.d_k = dimension_for_model//num_of_heads  # width of each head

        # Projections producing query, key and value, plus the output projection
        self.linear_query = nn.Linear(dimension_for_model, dimension_for_model)
        self.linear_key = nn.Linear(dimension_for_model, dimension_for_model)
        self.linear_value = nn.Linear(dimension_for_model, dimension_for_model)
        self.linear_out = nn.Linear(dimension_for_model, dimension_for_model)

        # Dropout on the attention weights
        self.dropout = nn.Dropout(dropout)
        # Softmax over the key axis
        self.softmax = nn.Softmax(dim = -1)

    def _split_heads(self, projected, batch_size):
        '''Reshape [batch, len, dim] into [batch, heads, len, d_k]; len is inferred per tensor.'''
        return projected.view(batch_size, -1, self.num_of_heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask = None):
        '''
        Forward pass for multi-head attention.

        query: tensor of shape [batch_size, query_len, dimension_for_model]
        key:   tensor of shape [batch_size, key_len, dimension_for_model]
        value: tensor of shape [batch_size, key_len, dimension_for_model]
        mask:  optional tensor broadcastable to [batch_size, heads, query_len, key_len];
               positions where it equals 0 are excluded from attention

        Returns (output, attn): output is [batch_size, query_len, dimension_for_model],
        attn is the post-dropout attention weights [batch_size, heads, query_len, key_len].
        '''
        batch_size = query.size(0)

        # Project, then split into heads. Each tensor's own sequence length is
        # used, so key/value may be longer or shorter than the query.
        Q = self._split_heads(self.linear_query(query), batch_size)
        K = self._split_heads(self.linear_key(key), batch_size)
        V = self._split_heads(self.linear_value(value), batch_size)

        # Scaled dot-product scores
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        # Masked positions get a very negative score so softmax sends them to ~0
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        attn = self.softmax(scores)
        attn = self.dropout(attn)

        output = torch.matmul(attn, V)

        # Merge the heads back into one vector per query position
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.dimension_for_model)

        # Final linear projection over the concatenated heads
        output = self.linear_out(output)

        return output, attn
built_transformer/positional_encodings.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import math
3
+ import torch.nn as nn
4
+
5
+ # Positional Encoding Layer
6
class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding layer for transformer models.

    A fixed table of sine/cosine values is built once in ``__init__`` and
    added to the input embeddings in ``forward``; dropout is applied to the
    sum. The default ``max_len`` is kept small (256) because lemmatization
    inputs (especially character-level ones) are typically short.

    Parameters:
    - dimension_for_model: Dimensionality of the embedding vectors (even or odd).
    - dropout: Dropout probability used for regularization.
    - max_len: Maximum sequence length covered by the precomputed table.
    """
    def __init__(self, dimension_for_model, dropout, max_len=256):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Table of shape (max_len, dimension_for_model), filled below.
        pos_enc_matrix = torch.zeros(max_len, dimension_for_model)

        # Column vector of positions: 0, 1, 2, ..., max_len-1.
        position = torch.arange(0, max_len).unsqueeze(1)

        # One frequency per sine column: 10000^(-2i / dimension_for_model).
        div_term = torch.exp(torch.arange(0, dimension_for_model, 2) * -(math.log(10000.0) / dimension_for_model))
        angles = position * div_term

        # Even embedding indices get sine.
        pos_enc_matrix[:, 0::2] = torch.sin(angles)
        # Odd embedding indices get cosine. For an odd dimension_for_model
        # there is one cosine column fewer than sine columns, so the angle
        # table is trimmed to match (a no-op when the dimension is even).
        pos_enc_matrix[:, 1::2] = torch.cos(angles[:, :dimension_for_model // 2])

        # Add a leading batch dimension so the table broadcasts over batches.
        pos_enc_matrix = pos_enc_matrix.unsqueeze(0)

        # Registered as a buffer: saved with the module and moved by .to(),
        # but not a parameter, so the optimizer never updates it.
        self.register_buffer('pe', pos_enc_matrix)

    def forward(self, x):
        """
        Add positional encodings to the input tensor and apply dropout.
        x: Tensor of shape [batch_size, sequence_length, dimension_for_model]
        Returns a tensor of the same shape as x.
        """
        # Slice the table to the input's sequence length. The buffer does not
        # require grad, so no explicit detach is needed.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)
51
+
52
+
53
+ # Example usage for a lemmatization task
54
if __name__ == '__main__':
    # Demo settings: embedding width, dropout probability, and the table
    # length used for short lemmatization inputs.
    d_model, dropout_rate, max_len = 512, 0.1, 256

    # Build the layer with the settings above.
    pos_encoder = PositionalEncoding(
        dimension_for_model=d_model,
        dropout=dropout_rate,
        max_len=max_len,
    )

    # Random stand-in batch: 2 sequences, 20 positions each, d_model features.
    dummy_input = torch.randn(2, 20, d_model)

    # Run the batch through the encoder and report the resulting shape.
    encoded_output = pos_encoder(dummy_input)
    print("Encoded output shape:", encoded_output.shape)
built_transformer/slot_classifier.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
class SlotClassifier(nn.Module):
    """MLP head that maps a feature vector to slot-type logits."""

    def __init__(
        self,
        input_dim: int,
        num_slots: int,
        hidden_dim: int = 256,
        dropout: float = 0.1,
        num_layers: int = 2
    ):
        """
        Build the classifier.
        input_dim: Size of the last dimension of the input features
        num_slots: Number of slot types, i.e. the size of the output logits
        hidden_dim: Width of every hidden block
        dropout: Dropout probability used inside each hidden block
        num_layers: Total number of Linear layers; ``num_layers - 1`` hidden
                    blocks (Linear -> LayerNorm -> ReLU -> Dropout) are
                    followed by one final Linear projection to num_slots
        """
        super().__init__()

        # Feature widths seen along the stack: the input width, then one
        # hidden_dim entry per hidden block.
        widths = [input_dim] + [hidden_dim] * (num_layers - 1)

        modules = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            modules.append(nn.Linear(fan_in, fan_out))
            modules.append(nn.LayerNorm(fan_out))
            modules.append(nn.ReLU())
            modules.append(nn.Dropout(dropout))

        # Final projection from the last width to the slot logits.
        modules.append(nn.Linear(widths[-1], num_slots))

        self.mlp = nn.Sequential(*modules)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Compute raw slot logits.
        x: Input tensor whose last dimension is input_dim, e.g. [batch_size, input_dim]
        Returns the unnormalised logits with last dimension num_slots.
        """
        return self.mlp(x)

    def predict(self, x: torch.Tensor) -> torch.Tensor:
        """
        Return the index of the highest-scoring slot for each input.
        x: Input tensor whose last dimension is input_dim
        """
        return self.forward(x).argmax(dim=-1)

    def get_probabilities(self, x: torch.Tensor) -> torch.Tensor:
        """
        Return the softmax distribution over slots for each input.
        x: Input tensor whose last dimension is input_dim
        """
        return torch.softmax(self.forward(x), dim=-1)
built_transformer/test_linear_function.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
# Simple smoke test of nn.Linear to inspect its properties:
# a 20 -> 30 linear layer applied to a batch of 128 random vectors.
m = nn.Linear(20, 30)
# Named `batch` rather than `input` so the builtin input() is not shadowed.
batch = torch.randn(128, 20)
output = m(batch)
# Prints the output size, torch.Size([128, 30]), followed by the values.
print(output.size())
print(output)
non-app.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from huggingface_hub import InferenceClient
3
+
4
+ """
5
+ For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
+ """
7
# Module-level Hugging Face inference client for the
# "literallybannedfromcallingbob/Aegis-1B-Agent" model repo; respond() sends
# its text-generation requests through this object.
client = InferenceClient("literallybannedfromcallingbob/Aegis-1B-Agent")
8
+
9
+
10
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """
    Stream a chatbot reply for the Gradio chat interface.

    message: the latest user message
    history: earlier (user, assistant) turns; empty entries are skipped
    system_message: text placed on the first line of the prompt
    max_tokens: forwarded as ``max_new_tokens``
    temperature, top_p: sampling settings forwarded to the endpoint

    Yields the reply accumulated so far after every streamed chunk, so each
    yielded value is the full text received up to that point.
    """
    # Build a plain-text prompt: system line, prior turns, then the new
    # message followed by an open "Assistant:" cue for the model to complete.
    prompt_lines = [f"{system_message}"]
    for user, assistant in history:
        if user:
            prompt_lines.append(f"User: {user}")
        if assistant:
            prompt_lines.append(f"Assistant: {assistant}")
    prompt_lines.append(f"User: {message}")
    prompt_lines.append("Assistant:")
    prompt = "\n".join(prompt_lines)

    # Call the text_generation endpoint
    response = client.text_generation(
        prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    )

    output = ""
    for chunk in response:
        # Without details=True the stream yields plain strings; with details
        # it yields objects exposing `.token.text`. Accept both so the
        # str case does not fail with AttributeError.
        piece = chunk if isinstance(chunk, str) else chunk.token.text
        output += piece
        yield output
39
+
40
+
41
+ """
42
+ For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
43
+ """
44
# Chat UI built around respond(). The widgets in `additional_inputs` are
# listed in the same order as respond's extra parameters
# (system_message, max_tokens, temperature, top_p).
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        # Text placed on the first line of the prompt.
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        # Upper bound on generated tokens (1-2048, default 512).
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        # Sampling temperature (0.1-4.0, default 0.7).
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        # Nucleus-sampling threshold (0.1-1.0, default 0.95).
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    title="Transformer Chatbot Demo (currently trained with ATIS dataset)",
    description="Ask flight-related questions and get an answer."
)


# Launch the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()
requirements.txt CHANGED
@@ -1,5 +1,2 @@
1
- huggingface_hub==0.25.2
2
- torch
3
- transformers
4
- tokenizers
5
- datasets
 
1
+ gradio
2
+ huggingface_hub
 
 
 
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
transformer_chat.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import math
4
+
5
+ # Import necessary layers
6
+ from built_transformer.embeddings import Embeddings
7
+ from built_transformer.encoder import Encoder, EncoderLayer
8
+ from built_transformer.decoders import Decoder, DecoderLayer
9
+ from built_transformer.positional_encodings import PositionalEncoding
10
+ from built_transformer.slot_classifier import SlotClassifier
11
+
12
class TransformerChatbot(nn.Module):
    """
    Unified Transformer-based chatbot model that combines:
    - Joint token/role/turn embeddings
    - Encoder-decoder architecture with attention
    - Slot-filling classification
    - Generation capabilities
    """
    def __init__(
        self,
        vocab_size: int,
        d_model: int = 512,
        num_heads: int = 8,
        d_ff: int = 2048,
        num_encoder_layers: int = 6,
        num_decoder_layers: int = 6,
        num_roles: int = 2,
        max_turns: int = 16,
        num_slots: int = 4,
        dropout: float = 0.1,
        max_len: int = 5000
    ):
        """
        Build the embeddings, encoder/decoder stacks and output heads.

        vocab_size: vocabulary size used by the embeddings and output projection
        d_model: model width shared by all sub-modules
        num_heads: attention heads per layer
        d_ff: feed-forward width inside encoder/decoder layers
        num_encoder_layers / num_decoder_layers: depth of each stack
        num_roles / max_turns: sizes of the role and turn embedding tables
        num_slots: number of slot classes predicted by the slot head
        dropout: dropout probability passed to the sub-modules
        max_len: maximum sequence length for positional encodings
        """
        super().__init__()

        # Embeddings for tokens, roles, and turns.
        # NOTE: together with `pos_enc` this module is registered on the model
        # (and therefore present in checkpoints); keep both even though
        # decode() does not call them.
        self.embed = Embeddings(
            char=vocab_size,  # Fixed type and name mismatch
            dimension_for_model=d_model,
            num_of_roles=num_roles,
            max_turns=max_turns
        )

        # Positional encoding
        self.pos_enc = PositionalEncoding(d_model, dropout, max_len)

        # Encoder stack
        self.encoder = Encoder(
            vocab_size=vocab_size,
            dimension_of_model=d_model,
            num_of_heads=num_heads,
            num_layers=num_encoder_layers,
            dim_feedforward=d_ff,
            dropout=dropout,
            max_len=max_len,
            num_of_roles=num_roles,
            max_turns=max_turns
        )

        # Decoder stack
        self.decoder = Decoder(
            vocab_size=vocab_size,
            dimension_for_model=d_model,
            num_layers=num_decoder_layers,
            num_of_heads=num_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            max_len=max_len
        )

        # Output projections
        self.out_proj = nn.Linear(d_model, vocab_size)
        self.slot_classifier = SlotClassifier(d_model, num_slots)

        # Initialize parameters
        self._init_parameters()

    def _init_parameters(self):
        """Apply Xavier-uniform initialization to every parameter with more than one dimension."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def load_state_dict(self, state_dict, strict=True, **kwargs):
        """
        Load weights, transparently adapting checkpoints in the old format.

        Old checkpoints store the encoder token embedding under
        'encoder.embed.weight'. For those, the weight is moved to
        'encoder.embed.lut.weight' and neutral values are synthesised for the
        role/turn embedding tables (zeros) and the embedding LayerNorm
        (weight ones, bias zeros).

        The caller's ``state_dict`` is never modified; the adaptation works on
        a copy. Extra keyword arguments are forwarded unchanged to
        ``nn.Module.load_state_dict``.
        """
        legacy_key = 'encoder.embed.weight'
        if legacy_key in state_dict:
            from collections import OrderedDict

            # Work on a copy so the caller's mapping keeps its original keys;
            # carry over the `_metadata` attribute that nn.Module reads.
            metadata = getattr(state_dict, '_metadata', None)
            state_dict = OrderedDict(state_dict)
            if metadata is not None:
                state_dict._metadata = metadata

            old_embed_weight = state_dict.pop(legacy_key)
            width = old_embed_weight.size(1)
            current = self.state_dict()

            def _filled(key, fallback_shape, fill):
                # Size the synthetic tensor from this model's own entry so it
                # matches the configured num_roles / max_turns; fall back to
                # the historical fixed sizes if the key is absent.
                reference = current.get(key)
                if reference is not None:
                    return torch.full_like(reference, fill)
                return old_embed_weight.new_full(fallback_shape, fill)

            # Copy the old embedding weights to the new structure
            state_dict['encoder.embed.lut.weight'] = old_embed_weight
            # Role and turn tables start at zero; LayerNorm starts as identity
            state_dict['encoder.embed.lut_roles.weight'] = _filled(
                'encoder.embed.lut_roles.weight', (2, width), 0.0)
            state_dict['encoder.embed.lut_turns.weight'] = _filled(
                'encoder.embed.lut_turns.weight', (16, width), 0.0)
            state_dict['encoder.embed.norm.weight'] = _filled(
                'encoder.embed.norm.weight', (width,), 1.0)
            state_dict['encoder.embed.norm.bias'] = _filled(
                'encoder.embed.norm.bias', (width,), 0.0)

        return super().load_state_dict(state_dict, strict=strict, **kwargs)

    def encode(self, src_tokens, src_roles, src_turns, src_mask=None):
        """
        Encode source sequences with role and turn information.
        Args:
            src_tokens: [B, S] token IDs
            src_roles: [B, S] role IDs
            src_turns: [B, S] turn IDs
            src_mask: padding mask [B, 1, 1, S]
        Returns:
            enc_out: [B, S, d_model]
        """
        # Pass through encoder (embedding and positional encoding handled inside)
        return self.encoder(src_tokens, src_roles, src_turns, src_mask)

    def decode(
        self,
        tgt_tokens,
        enc_out,
        tgt_roles,
        tgt_turns,
        src_mask=None,
        tgt_mask=None
    ):
        """
        Decode target sequences with encoder context.
        Args:
            tgt_tokens: [B, T] target token IDs
            enc_out: [B, S, d_model] encoder output
            tgt_roles: [B, T] target role IDs (accepted for interface
                compatibility; not consumed by the decoder call)
            tgt_turns: [B, T] target turn IDs (accepted for interface
                compatibility; not consumed by the decoder call)
            src_mask: [B, 1, 1, S] source mask
            tgt_mask: [B, 1, T, T] target mask
        Returns:
            logits: [B, T, vocab_size]
        """
        # The decoder is given the raw token IDs (presumably it embeds them
        # internally - confirm against Decoder). The joint embedding that used
        # to be computed here was never passed on, so that dead work has been
        # removed; the returned logits are unaffected.
        dec_out = self.decoder(tgt_tokens, enc_out, tgt_mask, src_mask)
        return self.out_proj(dec_out)

    def forward(
        self,
        src_tokens,
        tgt_tokens,
        src_roles,
        tgt_roles,
        src_turns,
        tgt_turns,
        src_mask=None,
        tgt_mask=None
    ):
        """
        Full forward pass combining encoding, decoding, and slot classification.
        Args:
            src_tokens: [B, S] source token IDs
            tgt_tokens: [B, T] target token IDs
            src_roles: [B, S] source role IDs
            tgt_roles: [B, T] target role IDs
            src_turns: [B, S] source turn IDs
            tgt_turns: [B, T] target turn IDs
            src_mask: [B, 1, 1, S] source mask
            tgt_mask: [B, 1, T, T] target mask
        Returns:
            gen_logits: [B, T, vocab_size] generation logits
            slot_logits: [B, num_slots] slot classification logits
        """
        # Encode source sequence
        enc_out = self.encode(src_tokens, src_roles, src_turns, src_mask)

        # Decode target sequence
        gen_logits = self.decode(
            tgt_tokens,
            enc_out,
            tgt_roles,
            tgt_turns,
            src_mask,
            tgt_mask
        )

        # Use first position of encoder output for slot classification
        cls_rep = enc_out[:, 0, :]
        slot_logits = self.slot_classifier(cls_rep)

        return gen_logits, slot_logits