Commit ·
9622166
1
Parent(s): 35a6fb3
updated
Browse files- .gradio/certificate.pem +31 -0
- Atis_Training.py +239 -0
- __pycache__/transformer_chat.cpython-312.pyc +0 -0
- atis_transformer.pt +3 -0
- built_transformer/__pycache__/decoders.cpython-312.pyc +0 -0
- built_transformer/__pycache__/embeddings.cpython-312.pyc +0 -0
- built_transformer/__pycache__/encoder.cpython-312.pyc +0 -0
- built_transformer/__pycache__/encoding_layers.cpython-312.pyc +0 -0
- built_transformer/__pycache__/masking_for_attention.cpython-312.pyc +0 -0
- built_transformer/__pycache__/multihead_attention.cpython-312.pyc +0 -0
- built_transformer/__pycache__/positional_encodings.cpython-312.pyc +0 -0
- built_transformer/__pycache__/slot_classifier.cpython-312.pyc +0 -0
- built_transformer/decoders.py +74 -0
- built_transformer/embeddings.py +75 -0
- built_transformer/encoder.py +100 -0
- built_transformer/encoding_layers.py +90 -0
- built_transformer/masking_for_attention.py +12 -0
- built_transformer/multihead_attention.py +72 -0
- built_transformer/positional_encodings.py +68 -0
- built_transformer/slot_classifier.py +67 -0
- built_transformer/test_linear_function.py +8 -0
- non-app.py +64 -0
- requirements.txt +2 -5
- tokenizer.json +0 -0
- transformer_chat.py +189 -0
.gradio/certificate.pem
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-----BEGIN CERTIFICATE-----
|
| 2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
| 3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
| 4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
| 5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
| 6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
| 7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
| 8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
| 9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
| 10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
| 11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
| 12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
| 13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
| 14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
| 15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
| 16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
| 17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
| 18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
| 19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
| 20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
| 21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
| 22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
| 23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
| 24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
| 25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
| 26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
| 27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
| 28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
| 29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
| 30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
| 31 |
+
-----END CERTIFICATE-----
|
Atis_Training.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets import load_dataset
|
| 2 |
+
from tokenizers import Tokenizer
|
| 3 |
+
from torch.utils.data import DataLoader, Dataset
|
| 4 |
+
import torch
|
| 5 |
+
from transformer_chat import TransformerChatbot
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import random
|
| 8 |
+
|
| 9 |
+
# Loading atis-datasets
|
| 10 |
+
raw_dataset = load_dataset("tuetschek/atis", split="train")
|
| 11 |
+
|
| 12 |
+
# Loading tokenizer from file
|
| 13 |
+
tokenizer = Tokenizer.from_file('tokenizer.json')
|
| 14 |
+
|
| 15 |
+
# Create synthetic responses for ATIS queries for training purposes
|
| 16 |
+
# Canned replies keyed by ATIS intent label. Built once at import time and
# stored as tuples so no call can accidentally mutate the shared table.
_INTENT_RESPONSES = {
    'atis_flight': (
        "I can help you with flight information. What specific details do you need?",
        "I'll search for flights matching your criteria. Please provide departure and arrival cities.",
        "Let me find available flights for you. When would you like to travel?",
    ),
    'atis_flight_no': (
        "I can help you with flight number information. Please provide the flight number.",
        "Let me search for details about that flight number.",
        "I'll look up information for that specific flight.",
    ),
    'atis_airfare': (
        "I can help you find airfare information. What's your travel route?",
        "Let me search for the best airfare options for your trip.",
        "I'll check current airfare prices for your destination.",
    ),
    'atis_airline': (
        "I can help you with airline information. Which airline are you looking for?",
        "Let me provide information about that airline.",
        "I'll search for details about the airline you mentioned.",
    ),
    'atis_abbreviation': (
        "I can help you with airport abbreviations. Which abbreviation do you need?",
        "Let me explain that airport abbreviation for you.",
        "I'll provide the full name for that airport code.",
    ),
    'atis_airport': (
        "I can help you with airport information. Which airport are you looking for?",
        "Let me provide details about that airport.",
        "I'll search for information about the airport you mentioned.",
    ),
    'atis_distance': (
        "I can help you calculate distances between airports. Which airports are you interested in?",
        "Let me calculate the distance for you.",
        "I'll provide distance information between those locations.",
    ),
    'atis_ground_service': (
        "I can help you with ground transportation services. What type of service do you need?",
        "Let me find ground transportation options for you.",
        "I'll search for available ground services at your destination.",
    ),
    'atis_aircraft': (
        "I can help you with aircraft information. What type of aircraft are you looking for?",
        "Let me provide details about that aircraft type.",
        "I'll search for information about the aircraft you mentioned.",
    ),
    'atis_capacity': (
        "I can help you with capacity information. What specific capacity details do you need?",
        "Let me check the capacity for that flight or aircraft.",
        "I'll provide capacity information for your query.",
    ),
    'atis_quantity': (
        "I can help you with quantity information. What specific quantity are you looking for?",
        "Let me check the quantity for that item or service.",
        "I'll provide quantity information for your request.",
    ),
    'atis_meal': (
        "I can help you with meal information. What type of meal service are you looking for?",
        "Let me check meal options for your flight.",
        "I'll provide information about meal services available.",
    ),
    'atis_cheapest': (
        "I can help you find the cheapest options. What's your travel route?",
        "Let me search for the most affordable options for your trip.",
        "I'll find the cheapest flights or services for you.",
    ),
    'atis_restriction': (
        "I can help you with travel restrictions. What type of restrictions are you asking about?",
        "Let me check the restrictions for your travel plans.",
        "I'll provide information about travel restrictions.",
    ),
    'atis_day_name': (
        "I can help you with day information. What specific day are you looking for?",
        "Let me check the schedule for that day.",
        "I'll provide information about flights or services on that day.",
    ),
}

# Used when the intent label is not a key of _INTENT_RESPONSES.
_FALLBACK_RESPONSES = (
    "I can help you with that. Please provide more details.",
    "Let me assist you with your request.",
    "I'll help you find the information you need.",
)

# Extra candidates offered whenever the query text mentions "flight".
_FLIGHT_EXTRA_RESPONSES = (
    "I can help you book a flight. What are your travel dates?",
    "Let me search for available flights for you.",
    "I'll help you find the best flight options.",
)


def create_response_for_intent(intent, text):
    """Pick a random synthetic response for an ATIS query.

    Args:
        intent: Intent label of the query (e.g. ``'atis_flight'``). Labels
            that are not in the response table fall back to generic replies.
        text: The user's query. If it contains "flight" (case-insensitive),
            three flight-booking replies are added to the candidates.

    Returns:
        One response string chosen with ``random.choice``.
    """
    # Copy into a fresh list so extending it never touches the shared tables.
    candidates = list(_INTENT_RESPONSES.get(intent, _FALLBACK_RESPONSES))

    # For variety
    if "flight" in text.lower():
        candidates.extend(_FLIGHT_EXTRA_RESPONSES)

    return random.choice(candidates)
|
| 112 |
+
|
| 113 |
+
# Create training data with question-answer pairs
|
| 114 |
+
def create_training_pairs():
    """Build tokenised question/response pairs from the ATIS dataset.

    Iterates over the module-level ``raw_dataset``; for every item a synthetic
    response is generated from its intent, and both the question and the
    response are encoded with the module-level ``tokenizer`` and wrapped in
    ``[CLS]`` ... ``[SEP]``.

    Returns:
        A list of dicts with keys ``'question_ids'``, ``'response_ids'``
        (lists of token ids) and ``'question_len'``, ``'response_len'``
        (their lengths including the two special tokens).

    Raises:
        ValueError: If the tokenizer vocabulary lacks ``[CLS]`` or ``[SEP]``.
    """
    # The special-token ids never change, so look them up once.
    cls_id = tokenizer.token_to_id("[CLS]")
    sep_id = tokenizer.token_to_id("[SEP]")
    if cls_id is None or sep_id is None:
        # token_to_id returns None for unknown tokens; fail here rather than
        # letting None leak into the id lists and break tensor creation later.
        raise ValueError("tokenizer vocabulary must contain [CLS] and [SEP]")

    training_data = []
    for item in raw_dataset:
        question = item['text']
        response = create_response_for_intent(item['intent'], question)

        # Tokenize, then add the specially defined tokens around each sequence
        question_ids = [cls_id, *tokenizer.encode(question).ids, sep_id]
        response_ids = [cls_id, *tokenizer.encode(response).ids, sep_id]

        training_data.append({
            'question_ids': question_ids,
            'response_ids': response_ids,
            'question_len': len(question_ids),
            'response_len': len(response_ids),
        })

    return training_data
|
| 138 |
+
|
| 139 |
+
# Create custom dataset for training
|
| 140 |
+
class AtisGenerationDataset(Dataset):
    """Dataset of fixed-length question/response token-id tensors.

    Args:
        training_data: Sequence of dicts with the keys ``'question_ids'``,
            ``'response_ids'``, ``'question_len'`` and ``'response_len'``.
        tokenizer: Object exposing ``token_to_id``; used to resolve ``[PAD]``.
        max_length: Combined budget; each of question and response is
            truncated/padded to ``max_length // 2`` tokens.
    """

    def __init__(self, training_data, tokenizer, max_length=128):
        self.training_data = training_data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.training_data)

    def __getitem__(self, idx):
        """Return ``(question_ids, response_ids, question_len, response_len)``.

        The two id tensors each have ``max_length // 2`` entries. The two
        length tensors carry the values stored in the item, i.e. they are not
        reduced when a sequence is truncated.
        """
        item = self.training_data[idx]
        half = self.max_length // 2
        # Use the tokenizer handed to this dataset, not a module-level global.
        pad_id = self.tokenizer.token_to_id("[PAD]")

        # Slicing copies, so the stored lists are never modified by padding.
        question_ids = item['question_ids'][:half]
        response_ids = item['response_ids'][:half]

        # Pad with PAD token up to the fixed length
        question_ids = question_ids + [pad_id] * (half - len(question_ids))
        response_ids = response_ids + [pad_id] * (half - len(response_ids))

        return (
            torch.tensor(question_ids),
            torch.tensor(response_ids),
            torch.tensor(item['question_len']),
            torch.tensor(item['response_len']),
        )
|
| 166 |
+
|
| 167 |
+
# Build the tokenised (question, response) pairs
print("Creating training data...")
training_data = create_training_pairs()
print(f"Created {len(training_data)} training pairs")

# Wrap the pairs in a Dataset and serve them in shuffled batches of 16
atis_dataset = AtisGenerationDataset(training_data, tokenizer)
dataloader = DataLoader(atis_dataset, batch_size=16, shuffle=True)

# Instantiate the model; num_slots is the number of distinct intent labels
vocab_size = tokenizer.get_vocab_size()
model = TransformerChatbot(
    vocab_size=vocab_size,
    d_model=512,
    num_heads=8,
    d_ff=2048,
    num_encoder_layers=6,
    num_decoder_layers=6,
    num_roles=2,
    max_turns=16,
    num_slots=len(set(item['intent'] for item in raw_dataset)),
    dropout=0.1,
)

# Train on the GPU when CUDA is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimiser and loss; padded target positions do not contribute to the loss
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id("[PAD]"))

print("Starting training...")
for epoch in range(10):  # 10 epochs for fast training
    model.train()
    total_loss = 0
    for batch_idx, batch in enumerate(dataloader):
        question_ids, response_ids, question_lens, response_lens = batch
        question_ids = question_ids.to(device)
        response_ids = response_ids.to(device)

        batch_size, seq_len = question_ids.shape

        # Dummy roles and turns: all zeros, shaped like the question batch
        roles = torch.zeros_like(question_ids)
        turns = torch.zeros_like(question_ids)

        # Forward pass
        gen_logits, slot_logits = model(
            question_ids, response_ids,
            roles, roles,
            turns, turns,
        )

        # Teacher forcing: the logits at position t are scored against the
        # response token at position t + 1, so drop the last logit position
        # and the leading [CLS] target, then flatten for the loss.
        shifted_logits = gen_logits[:, :-1, :].reshape(-1, vocab_size)
        shifted_targets = response_ids[:, 1:].reshape(-1)
        loss = loss_fn(shifted_logits, shifted_targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if batch_idx % 100 == 0:
            print(f"Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}")

    # Mean loss over the batches of this epoch
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}")

# Persist the trained weights
print("Saving model...")
torch.save(model.state_dict(), 'atis_transformer.pt')
print("Training completed!")
|
__pycache__/transformer_chat.cpython-312.pyc
ADDED
|
Binary file (6.46 kB). View file
|
|
|
atis_transformer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c7972c60cfa6e05b74247ec320cd7f7b46871f8ee8315ea14d69cc20190b459f
|
| 3 |
+
size 412902527
|
built_transformer/__pycache__/decoders.cpython-312.pyc
ADDED
|
Binary file (4.75 kB). View file
|
|
|
built_transformer/__pycache__/embeddings.cpython-312.pyc
ADDED
|
Binary file (3.46 kB). View file
|
|
|
built_transformer/__pycache__/encoder.cpython-312.pyc
ADDED
|
Binary file (5.52 kB). View file
|
|
|
built_transformer/__pycache__/encoding_layers.cpython-312.pyc
ADDED
|
Binary file (4.33 kB). View file
|
|
|
built_transformer/__pycache__/masking_for_attention.cpython-312.pyc
ADDED
|
Binary file (885 Bytes). View file
|
|
|
built_transformer/__pycache__/multihead_attention.cpython-312.pyc
ADDED
|
Binary file (3.87 kB). View file
|
|
|
built_transformer/__pycache__/positional_encodings.cpython-312.pyc
ADDED
|
Binary file (3.15 kB). View file
|
|
|
built_transformer/__pycache__/slot_classifier.cpython-312.pyc
ADDED
|
Binary file (3.33 kB). View file
|
|
|
built_transformer/decoders.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import math
|
| 4 |
+
from .positional_encodings import PositionalEncoding
|
| 5 |
+
from .encoding_layers import position_wide_feed_forward
|
| 6 |
+
|
| 7 |
+
class DecoderLayer(nn.Module):
    """Single Transformer decoder layer.

    Applies, in order: self-attention over the target, cross-attention from
    the target to the encoder output, and a two-layer feed-forward network.
    Each sub-layer is followed by dropout, a residual connection and layer
    normalisation.

    Args:
        dimension_for_model: Model (embedding) dimension.
        num_of_heads: Number of attention heads.
        dim_feedforward: Hidden size of the feed-forward network.
        dropout: Dropout probability used in the attention modules and after
            each sub-layer.
    """

    def __init__(self, dimension_for_model, num_of_heads, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        # Masked self-attention over the target sequence
        self.self_attn = nn.MultiheadAttention(dimension_for_model, num_of_heads, dropout=dropout)
        # Encoder-decoder attention
        self.cross_attn = nn.MultiheadAttention(dimension_for_model, num_of_heads, dropout=dropout)
        self.ffn = nn.Sequential(
            nn.Linear(dimension_for_model, dim_feedforward),
            nn.ReLU(),
            nn.Linear(dim_feedforward, dimension_for_model),
        )

        # Layer normalizations
        self.norm1 = nn.LayerNorm(dimension_for_model)
        self.norm2 = nn.LayerNorm(dimension_for_model)
        self.norm3 = nn.LayerNorm(dimension_for_model)
        # Dropouts
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None):
        """Run the layer.

        Args:
            tgt: Target activations, ``[batch_size, seq_len, hidden_dim]``.
            memory: Encoder output, batch-first like ``tgt``.
            tgt_mask: Optional ``attn_mask`` for the self-attention.
            memory_mask: Optional ``attn_mask`` for the cross-attention.

        Returns:
            Tensor with the same shape as ``tgt``.
        """
        # The attention modules are built without batch_first, so they take
        # [seq_len, batch_size, hidden_dim]; transpose in and back out.
        memory_t = memory.transpose(0, 1)

        # Masked self-attention
        query = tgt.transpose(0, 1)
        attn_out, _ = self.self_attn(query, query, query, attn_mask=tgt_mask)
        tgt = self.norm1(tgt + self.dropout1(attn_out.transpose(0, 1)))

        # Cross-attention with encoder output. The queries must come from the
        # self-attention result, not from the layer's raw input.
        query = tgt.transpose(0, 1)
        attn_out, _ = self.cross_attn(query, memory_t, memory_t, attn_mask=memory_mask)
        tgt = self.norm2(tgt + self.dropout2(attn_out.transpose(0, 1)))

        # Feed-forward
        tgt = self.norm3(tgt + self.dropout3(self.ffn(tgt)))

        return tgt
|
| 57 |
+
|
| 58 |
+
class Decoder(nn.Module):
    """Stack of ``DecoderLayer`` modules over embedded target tokens.

    Pipeline: token embedding scaled by sqrt(model dimension), the project's
    ``PositionalEncoding`` module, ``num_layers`` decoder layers, and a final
    layer normalisation.
    """

    def __init__(self, vocab_size, dimension_for_model, num_layers, num_of_heads, dim_feedforward=2048, dropout=0.1, max_len=5000):
        super().__init__()
        # Token-id -> vector lookup table
        self.embed = nn.Embedding(vocab_size, dimension_for_model)
        # Positional information (implementation lives in positional_encodings)
        self.pe = PositionalEncoding(dimension_for_model, dropout=dropout, max_len=max_len)
        decoder_layers = [
            DecoderLayer(dimension_for_model, num_of_heads, dim_feedforward, dropout)
            for _ in range(num_layers)
        ]
        self.layers = nn.ModuleList(decoder_layers)
        self.norm = nn.LayerNorm(dimension_for_model)

    def forward(self, tgt_seq, memory, tgt_mask=None, memory_mask=None):
        """Decode ``tgt_seq`` (token ids) against the encoder output ``memory``.

        ``tgt_mask`` and ``memory_mask`` are forwarded unchanged to every layer.
        """
        # Embed the token ids and scale by sqrt(embedding dimension)
        scale = math.sqrt(self.embed.embedding_dim)
        hidden = self.pe(self.embed(tgt_seq) * scale)
        # Run the decoder layers in order
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, memory, tgt_mask=tgt_mask, memory_mask=memory_mask)
        # Final layer normalization
        return self.norm(hidden)
|
built_transformer/embeddings.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import math
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
# Embedding layers that convert token, role and turn ids into vectors
|
| 5 |
+
class Embeddings(nn.Module):
    """Combined token, role and turn embeddings.

    Three lookup tables map token ids, role ids and turn ids to vectors of
    size ``dimension_for_model``. ``forward`` sums the three vectors, scales
    the sum by sqrt(dimension_for_model) and applies layer normalisation.

    Args:
        char: Number of rows in the token lookup table (vocabulary size).
        dimension_for_model: Size of every embedding vector.
        num_of_roles: Number of rows in the role lookup table.
        max_turns: Number of rows in the turn lookup table.
    """

    def __init__ (self, char, dimension_for_model, num_of_roles = 2, max_turns = 16):
        super().__init__()
        self.lut = nn.Embedding(char, dimension_for_model)  # token lookup table
        self.lut_roles = nn.Embedding(num_of_roles, dimension_for_model)  # role lookup table
        self.lut_turns = nn.Embedding(max_turns, dimension_for_model)  # turn lookup table
        self.dimension_for_model = dimension_for_model
        self.norm = nn.LayerNorm(dimension_for_model)

    def forward(self, x, roles, turns):
        """Embed a batch of ids.

        Args:
            x: Tensor of token indices.
            roles: Tensor of role indices, same shape as ``x``.
            turns: Tensor of turn indices, same shape as ``x``.

        Returns:
            Tensor shaped like ``x`` with a trailing ``dimension_for_model`` axis.
        """
        # Each id kind must go through its OWN table: roles and turns were
        # previously looked up in the token table, leaving lut_roles and
        # lut_turns unused and failing for ids >= the vocabulary size.
        var = self.lut(x)
        var = var + self.lut_roles(roles)
        var = var + self.lut_turns(turns)

        # Scale, then normalize
        var = var * math.sqrt(self.dimension_for_model)
        return self.norm(var)
|
| 34 |
+
|
| 35 |
+
if __name__ == '__main__':
    # Smoke test: embed one hard-coded chat line at character level.
    d_model = 512  # embedding dimension used for the demo

    # Character-level vocabulary: lowercase letters plus the space
    alphabet = "abcdefghijklmnopqrstuvwxyz "
    char_to_index = {ch: i for i, ch in enumerate(alphabet)}
    vocab_size = len(alphabet)

    # Speaker name -> role id
    role_ids = {'system': 0, 'user': 1}

    # Example line, formatted as "<2-digit turn> <role>: <utterance>"
    line = "01 system: hello world"

    # Split the line into turn number, speaker and utterance
    turn = int(line[:2].strip())
    speaker, utterance = (part.strip() for part in line[2:].split(':')[:2])

    # Characters outside the vocabulary are dropped; the role id and turn
    # number are repeated once per kept character.
    token_ids = [char_to_index[ch] for ch in utterance if ch in char_to_index]
    turn_ids = [turn] * len(token_ids)
    speaker_ids = [role_ids[speaker]] * len(token_ids)

    # Wrap each list in an outer list to form a batch of one sequence
    conversations = torch.LongTensor([token_ids])
    roles = torch.LongTensor([speaker_ids])
    positions = torch.LongTensor([turn_ids])

    # Build the embedding layer and run it on the example
    emb = Embeddings(vocab_size, d_model)
    embr = emb(conversations, roles, positions)

    print("embr:", embr)
|
| 75 |
+
|
built_transformer/encoder.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Import the necessary modules
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import math
|
| 5 |
+
from .positional_encodings import PositionalEncoding #import other modules neccessary for
|
| 6 |
+
from .multihead_attention import MultiHeadAttention
|
| 7 |
+
from .encoding_layers import position_wide_feed_forward, Residual_layer
|
| 8 |
+
from .masking_for_attention import mask
|
| 9 |
+
from .embeddings import Embeddings
|
| 10 |
+
|
| 11 |
+
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention then feed-forward.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalisation.

    Args:
        dimension_for_model: Model dimension used by the embeddings.
        num_of_heads: Number of heads passed to ``MultiHeadAttention``.
        dim_feedforward: Inner dimension passed to ``position_wide_feed_forward``.
        dropout: Dropout probability.
    """

    def __init__(self, dimension_for_model, num_of_heads, dim_feedforward, dropout = 0.1):
        super().__init__()
        # Attention sub-layer (project MultiHeadAttention module)
        self.self_attn = MultiHeadAttention(dimension_for_model, num_of_heads, dropout)
        self.norm1 = nn.LayerNorm(dimension_for_model)
        self.dropout1 = nn.Dropout(dropout)
        # Feed-forward sub-layer (project position_wide_feed_forward module)
        self.ffn = position_wide_feed_forward(dimension_for_model, dim_feedforward, dropout)
        self.norm2 = nn.LayerNorm(dimension_for_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src: torch.Tensor, src_mask: torch.Tensor = None) -> torch.Tensor:
        """Apply the layer to ``src``; ``src_mask`` is handed to the attention as ``mask``."""
        # Self-attention block (the attention's second return value is unused)
        attended, _ = self.self_attn(src, src, src, mask=src_mask)
        hidden = self.norm1(src + self.dropout1(attended))
        # Feed-forward block
        return self.norm2(hidden + self.dropout2(self.ffn(hidden)))
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class Encoder(nn.Module):
    """
    Stacked Transformer encoder:
      - token/role/turn embedding + positional encoding
      - N encoder layers
      - final layer norm
    """

    def __init__(self, vocab_size, dimension_of_model, num_of_heads, num_layers, dim_feedforward = 2048, dropout = 0.1, max_len = 5000, num_of_roles=2, max_turns=16):
        super().__init__()
        # Token/role/turn embeddings
        self.embed = Embeddings(vocab_size, dimension_for_model=dimension_of_model, num_of_roles=num_of_roles, max_turns=max_turns)
        # Positional encodings (implementation lives in positional_encodings)
        self.pe = PositionalEncoding(dimension_of_model, dropout=dropout, max_len=max_len)
        # Stacked encoder layers
        self.layers = nn.ModuleList([
            EncoderLayer(dimension_of_model, num_of_heads, dim_feedforward, dropout)
            for _ in range(num_layers)
        ])
        # Final normalization
        self.norm = nn.LayerNorm(dimension_of_model)

    def forward(self, src_ids, roles, turns, src_mask = None) -> torch.Tensor:
        """
        Args:
            src_ids: [batch_size x seq_len] input token indices
            roles: [batch_size x seq_len] role ids
            turns: [batch_size x seq_len] turn ids
            src_mask: [batch_size, 1, 1, seq_len] mask to prevent attending to padding tokens

        Returns:
            The layer-normalised output of the last encoder layer.
        """
        # Embed tokens, roles, and turns
        x = self.embed(src_ids, roles, turns)
        # Add positional information
        x = self.pe(x)
        # Pass through each encoder layer
        for layer in self.layers:
            x = layer(x, src_mask)
        # Final layer normalization
        return self.norm(x)

    def _upgrade_legacy_embedding(self, state_dict):
        """Return ``state_dict`` with a legacy single embedding table expanded.

        Old checkpoints stored one table under ``embed.weight``; the current
        ``Embeddings`` module has ``lut``, ``lut_roles``, ``lut_turns`` and
        ``norm``. The old table becomes ``lut``; role/turn tables are zero and
        the norm is weight 1 / bias 0. The caller's dict is not modified.
        """
        import copy

        # '' matches keys relative to this module (what this module's own
        # state_dict uses); 'encoder.' is the prefix the original code checked.
        # NOTE(review): 'encoder.'-prefixed keys do not match this module's own
        # parameter names; presumably that case belongs to the parent model's
        # loader -- confirm and drop the prefix here if so.
        for prefix in ('encoder.', ''):
            legacy_key = f'{prefix}embed.weight'
            if legacy_key in state_dict:
                break
        else:
            return state_dict  # already in the current format

        # Shallow copy (keeps an OrderedDict's _metadata attribute) so the
        # caller's mapping is left untouched.
        upgraded = copy.copy(state_dict)
        old_embed_weight = upgraded.pop(legacy_key)

        base = f'{prefix}embed.'
        upgraded[base + 'lut.weight'] = old_embed_weight
        # Size the new tensors from this module's own parameters: the role and
        # turn tables have num_of_roles / max_turns rows, not vocab_size rows.
        upgraded[base + 'lut_roles.weight'] = torch.zeros_like(self.embed.lut_roles.weight)
        upgraded[base + 'lut_turns.weight'] = torch.zeros_like(self.embed.lut_turns.weight)
        upgraded[base + 'norm.weight'] = torch.ones_like(self.embed.norm.weight)
        upgraded[base + 'norm.bias'] = torch.zeros_like(self.embed.norm.bias)
        return upgraded

    def load_state_dict(self, state_dict, strict=True):
        """
        Custom state dict loading to handle backward compatibility with old model format
        """
        state_dict = self._upgrade_legacy_embedding(state_dict)
        return super().load_state_dict(state_dict, strict=strict)
|
| 100 |
+
|
built_transformer/encoding_layers.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Including necessary libraries
|
| 2 |
+
import math
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch
|
| 5 |
+
|
| 6 |
+
#using the positional feed-forward network to expand dimension for model
|
| 7 |
+
class position_wide_feed_forward(nn.Module):
    '''
    Position-wise feed-forward network: expand, activate, apply dropout, project back.
    '''

    def __init__(self, dimension_for_model, dimension_for_network, dropout = 0.1):
        '''
        Build the two linear projections and the layers applied between them.
        dimension_for_model: feature size of the incoming (and outgoing) tensor
        dimension_for_network: feature size of the hidden, expanded representation
        dropout: dropout probability applied to the activated hidden representation
        '''
        super().__init__()
        # Registration order is kept as-is: it fixes the order of parameters and
        # of the random initialisation of the two linear layers.
        self.expansion = nn.Linear(dimension_for_model, dimension_for_network)
        self.apply_dropout = nn.Dropout(dropout)
        self.activation = nn.ReLU()
        self.reverse_expansion = nn.Linear(dimension_for_network, dimension_for_model)

    def forward(self, x):
        '''
        Apply the feed-forward network to the last dimension of x.
        x: the tensor the network is applied to
        '''
        widened = self.expansion(x)
        activated = self.activation(widened)
        regularised = self.apply_dropout(activated)
        return self.reverse_expansion(regularised)
|
| 30 |
+
# Using the layer_normalization to add outputs back and then normalize the layer
|
| 31 |
+
|
| 32 |
+
class Residual_layer(nn.Module):
    '''
    Residual connection followed by layer normalization.
    '''

    def __init__(self, dimension_for_model, dropout = 0.1):
        '''
        Create the normalization and dropout layers.
        dimension_for_model: size of the last dimension that LayerNorm normalizes over
        dropout: dropout probability applied to the sublayer output before it is added
        '''
        super().__init__()
        self.normalize = nn.LayerNorm(dimension_for_model)
        self.apply_dropout = nn.Dropout(dropout)

    def forward(self, input_tensor, sublayer_tensor):
        '''
        Add the (dropped-out) sublayer output to its input and normalize the sum.
        input_tensor: the tensor that entered the sublayer
        sublayer_tensor: the tensor produced by the sublayer
        '''
        # Dropout touches only the sublayer branch; the skip path stays intact.
        dropped = self.apply_dropout(sublayer_tensor)
        return self.normalize(dropped + input_tensor)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
if __name__ == '__main__':
    # ---- Feed-forward network smoke test (dropout disabled) ----
    ffn_input = torch.tensor([[
        [1.0, 2.0, 3.0, 4.0],
        [0.5, 1.5, 2.5, 3.5],
        [4.0, 3.0, 2.0, 1.0]
    ]])
    feed_forward = position_wide_feed_forward(
        dimension_for_model=4,
        dimension_for_network=8,
        dropout=0.0,
    )
    ffn_output = feed_forward(ffn_input)

    # Show the values and confirm the shape is preserved.
    print("Input:", ffn_input)
    print("Output:", ffn_output)
    print("Output shape:", ffn_output.shape)

    # ---- Residual + LayerNorm smoke test (dropout disabled) ----
    residual_input = torch.tensor([[[1.0, 2.0, 3.0, 4.0],
                                    [4.0, 3.0, 2.0, 1.0],
                                    [0.5, 1.5, 2.5, 3.5]]])

    # Stand-in for a sublayer output that gets added to the input.
    sublayer_output = torch.tensor([[[0.1, 0.1, 0.1, 0.1],
                                     [0.2, 0.2, 0.2, 0.2],
                                     [0.3, 0.3, 0.3, 0.3]]])

    residual_block = Residual_layer(dimension_for_model=4, dropout=0.0)
    normalized = residual_block(residual_input, sublayer_output)

    print("Input X:\n", residual_input)
    print("\nSublayer output:\n", sublayer_output)
    print("\nResidual+Norm output:\n", normalized)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
|
built_transformer/masking_for_attention.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
|
| 4 |
+
def mask(size):
    '''
    Build a look-ahead mask so that a token cannot attend to tokens that come after it.
    size: number of tokens within the sequence
    Returns a uint8 tensor of shape (1, size, size) holding 1 on and below the
    diagonal (visible positions) and 0 above it (future positions).
    '''
    everything_visible = torch.ones((1, size, size), dtype=torch.uint8)
    # Keeping the lower triangle equals "1 - strictly upper triangle".
    return torch.tril(everything_visible)
|
built_transformer/multihead_attention.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import math
|
| 4 |
+
|
| 5 |
+
class MultiHeadAttention(nn.Module):
    '''
    Multi-head scaled dot-product attention.

    Works for self-attention and for cross-attention, where key/value may have
    a different sequence length than query.
    '''

    def __init__(self, dimension_for_model, num_of_heads, dropout = 0.1):
        '''
        initializes multi-head attention module
        dimension_for_model: dimensionality of the embeddings; must be divisible by num_of_heads
        num_of_heads: the number of attention heads
        dropout: dropout rate applied to the attention weights, defaulted to 0.1
        '''
        super(MultiHeadAttention, self).__init__()
        assert dimension_for_model % num_of_heads == 0, "dimension_for_model must be devisible by num_of_heads"

        self.num_of_heads = num_of_heads
        self.dimension_for_model = dimension_for_model
        self.d_k = dimension_for_model//num_of_heads  # Feature size handled by each head

        # Linear projections for query, key and value, plus the output projection
        self.linear_query = nn.Linear(dimension_for_model, dimension_for_model)
        self.linear_key = nn.Linear(dimension_for_model, dimension_for_model)
        self.linear_value = nn.Linear(dimension_for_model, dimension_for_model)
        self.linear_out = nn.Linear(dimension_for_model, dimension_for_model)

        self.dropout = nn.Dropout(dropout)
        # Softmax over the key positions
        self.softmax = nn.Softmax(dim = -1)

    def _split_heads(self, projected):
        '''Reshape [batch, length, dimension_for_model] into [batch, heads, length, d_k].'''
        # -1 lets every tensor keep its own sequence length (needed for cross-attention).
        return projected.view(projected.size(0), -1, self.num_of_heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask = None):
        '''
        Forward pass for multi-head attention.
        query: tensor with shape batch_size, query_length, dimension_for_model
        key: tensor with shape batch_size, key_length, dimension_for_model
        value: tensor with the same shape as key
        mask: optional tensor broadcastable to the attention scores
              (batch_size, num_of_heads, query_length, key_length); positions
              where it equals 0 are excluded from attention
        Returns (output, attn): output has the shape of query, attn has shape
        batch_size, num_of_heads, query_length, key_length
        '''
        batch_size = query.size(0)

        # Project, then split into heads
        Q = self._split_heads(self.linear_query(query))
        K = self._split_heads(self.linear_key(key))
        V = self._split_heads(self.linear_value(value))

        # Scaled dot-product scores
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # Use the most negative value of the score dtype: a literal -1e9
            # cannot be represented in float16 and makes masked_fill raise.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attn = self.softmax(scores)
        attn = self.dropout(attn)

        output = torch.matmul(attn, V)

        # Merge the heads back: [batch, heads, q_len, d_k] -> [batch, q_len, dimension_for_model]
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.dimension_for_model)

        # Final linear projection that mixes the heads
        output = self.linear_out(output)

        return output, attn
|
built_transformer/positional_encodings.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import math
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
|
| 5 |
+
# Positional Encoding Layer
|
| 6 |
+
class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding layer for transformer models.

    A fixed table of sine/cosine values is precomputed for positions
    0 .. max_len-1 and added to the input embeddings, followed by dropout.

    Parameters:
    - dimension_for_model: Dimensionality of the embedding vectors (even or odd).
    - dropout: Dropout probability used for regularization.
    - max_len: Number of positions in the precomputed table (default 256).
    """
    def __init__(self, dimension_for_model, dropout, max_len=256):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Table of shape (max_len, dimension_for_model), filled below.
        pos_enc_matrix = torch.zeros(max_len, dimension_for_model)

        # Column vector of positions: 0, 1, 2, ..., max_len-1.
        position = torch.arange(0, max_len).unsqueeze(1)

        # One frequency per sine/cosine pair.
        div_term = torch.exp(torch.arange(0, dimension_for_model, 2) * -(math.log(10000.0) / dimension_for_model))
        angles = position * div_term

        # Even feature indices take the sine.
        pos_enc_matrix[:, 0::2] = torch.sin(angles)
        # Odd feature indices take the cosine. With an odd dimension there is
        # one odd slot fewer than even slots, so the last frequency is dropped.
        pos_enc_matrix[:, 1::2] = torch.cos(angles[:, : dimension_for_model // 2])

        # Leading batch dimension so the table broadcasts over the batch.
        pos_enc_matrix = pos_enc_matrix.unsqueeze(0)

        # Buffer: saved with the module and moved with it, but never optimized.
        self.register_buffer('pe', pos_enc_matrix)

    def forward(self, x):
        """
        Add positional encodings to the input tensor.
        x: Tensor of shape [batch_size, sequence_length, dimension_for_model]
        """
        # Slice the table to the input's sequence length before adding.
        x = x + self.pe[:, :x.size(1)].detach()
        return self.dropout(x)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# Example usage for a lemmatization task
|
| 54 |
+
if __name__ == '__main__':
    # Demo configuration.
    d_model = 512
    dropout_rate = 0.1
    max_len = 256

    pos_encoder = PositionalEncoding(
        dimension_for_model=d_model,
        dropout=dropout_rate,
        max_len=max_len,
    )

    # Batch of 2 random sequences, 20 positions each.
    dummy_input = torch.randn(2, 20, d_model)

    # The encoder keeps the input shape.
    encoded_output = pos_encoder(dummy_input)
    print("Encoded output shape:", encoded_output.shape)
|
built_transformer/slot_classifier.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
|
| 5 |
+
class SlotClassifier(nn.Module):
    """MLP head that maps a feature vector to slot-type logits."""

    def __init__(
        self,
        input_dim: int,
        num_slots: int,
        hidden_dim: int = 256,
        dropout: float = 0.1,
        num_layers: int = 2
    ):
        """
        Initialize the slot classifier.
        input_dim: Dimension of the input features
        num_slots: Number of different slot types to classify
        hidden_dim: Dimension of hidden layers in the MLP
        dropout: Dropout probability for regularization
        num_layers: Total number of linear layers; num_layers - 1 of them are
            hidden blocks (Linear -> LayerNorm -> ReLU -> Dropout)
        """
        super().__init__()

        stack = []
        in_features = input_dim
        # One hidden block per entry; an empty list when num_layers <= 1.
        for out_features in [hidden_dim] * (num_layers - 1):
            stack.append(nn.Linear(in_features, out_features))
            stack.append(nn.LayerNorm(out_features))
            stack.append(nn.ReLU())
            stack.append(nn.Dropout(dropout))
            in_features = out_features

        # Final projection onto the slot classes.
        stack.append(nn.Linear(in_features, num_slots))

        self.mlp = nn.Sequential(*stack)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the slot classifier.
        x: Input tensor of shape [batch_size, input_dim]
        Returns the unnormalized slot logits.
        """
        return self.mlp(x)

    def predict(self, x: torch.Tensor) -> torch.Tensor:
        """
        Get predictions from the classifier.
        x: Input tensor of shape [batch_size, input_dim]
        Returns the index of the highest-scoring slot for each row.
        """
        return self.forward(x).argmax(dim=-1)

    def get_probabilities(self, x: torch.Tensor) -> torch.Tensor:
        """
        Get probability distribution over slots.
        x: Input tensor of shape [batch_size, input_dim]
        Returns the softmax of the logits over the last dimension.
        """
        return F.softmax(self.forward(x), dim=-1)
|
built_transformer/test_linear_function.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
# Simple smoke test for nn.Linear to inspect the shape and values it produces.
m = nn.Linear(20, 30)  # maps 20 input features to 30 output features
sample_input = torch.randn(128, 20)  # named so the builtin `input` is not shadowed
output = m(sample_input)
print(output.size())
print(output)
|
non-app.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from huggingface_hub import InferenceClient
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
|
| 6 |
+
"""
|
| 7 |
+
# Module-level Inference API client bound to the hosted model; respond() sends its prompts through it.
client = InferenceClient("literallybannedfromcallingbob/Aegis-1B-Agent")
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """
    Stream a chat reply for `message` from the text-generation endpoint.

    The system message, the previous (user, assistant) turns and the new user
    message are flattened into one plain-text prompt that ends with
    "Assistant:".

    Args:
        message: the new user message.
        history: earlier turns as (user, assistant) pairs; empty entries are skipped.
        system_message: text placed at the very top of the prompt.
        max_tokens: forwarded as max_new_tokens.
        temperature: sampling temperature.
        top_p: nucleus-sampling threshold.

    Yields:
        The reply accumulated so far, once per streamed chunk.
    """
    # Collect the pieces and join once instead of growing a string with +=.
    prompt_parts = [f"{system_message}\n"]
    for user, assistant in history:
        if user:
            prompt_parts.append(f"User: {user}\n")
        if assistant:
            prompt_parts.append(f"Assistant: {assistant}\n")
    prompt_parts.append(f"User: {message}\nAssistant:")
    prompt = "".join(prompt_parts)

    # Call the text_generation endpoint
    response = client.text_generation(
        prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    )

    output = ""
    for chunk in response:
        # Without details=True the stream yields plain strings; detail objects
        # (which carry the text in .token.text) are accepted as well.
        output += chunk if isinstance(chunk, str) else chunk.token.text
        yield output
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
"""
|
| 42 |
+
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
|
| 43 |
+
"""
|
| 44 |
+
# Chat UI wired to respond(). The additional inputs are listed in the same
# order as respond()'s extra parameters: system_message, max_tokens,
# temperature, top_p.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        # System message placed at the top of every prompt.
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        # Upper bound on the number of generated tokens.
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        # Sampling temperature.
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        # Nucleus-sampling threshold.
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    title="Transformer Chatbot Demo (currently trained with ATIS dataset)",
    description="Ask flight-related questions and get an answer."
)


# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
requirements.txt
CHANGED
|
@@ -1,5 +1,2 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
transformers
|
| 4 |
-
tokenizers
|
| 5 |
-
datasets
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
huggingface_hub
|
|
|
|
|
|
|
|
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
transformer_chat.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import math
|
| 4 |
+
|
| 5 |
+
# Import necessary layers
|
| 6 |
+
from built_transformer.embeddings import Embeddings
|
| 7 |
+
from built_transformer.encoder import Encoder, EncoderLayer
|
| 8 |
+
from built_transformer.decoders import Decoder, DecoderLayer
|
| 9 |
+
from built_transformer.positional_encodings import PositionalEncoding
|
| 10 |
+
from built_transformer.slot_classifier import SlotClassifier
|
| 11 |
+
|
| 12 |
+
class TransformerChatbot(nn.Module):
    """
    Unified Transformer-based chatbot model that combines:
    - Joint token/role/turn embeddings
    - Encoder-decoder architecture with attention
    - Slot-filling classification
    - Generation capabilities
    """
    def __init__(
        self,
        vocab_size: int,
        d_model: int = 512,
        num_heads: int = 8,
        d_ff: int = 2048,
        num_encoder_layers: int = 6,
        num_decoder_layers: int = 6,
        num_roles: int = 2,
        max_turns: int = 16,
        num_slots: int = 4,
        dropout: float = 0.1,
        max_len: int = 5000
    ):
        """
        Build the embeddings, encoder, decoder and both output heads.

        Args:
            vocab_size: number of token ids; also the size of the generation head.
            d_model: model (embedding) dimension.
            num_heads: attention heads per layer.
            d_ff: hidden size of the feed-forward sublayers.
            num_encoder_layers: number of encoder layers.
            num_decoder_layers: number of decoder layers.
            num_roles: number of distinct role ids.
            max_turns: number of distinct turn ids.
            num_slots: number of slot classes predicted by the slot head.
            dropout: dropout probability shared by all sub-modules.
            max_len: number of positions covered by the positional encodings.
        """
        super().__init__()

        # Embeddings for tokens, roles, and turns.
        # NOTE: `embed` and `pos_enc` are not used by encode()/decode() (the
        # encoder and decoder receive raw ids); they stay registered so that
        # existing checkpoints keep loading.
        self.embed = Embeddings(
            char=vocab_size,
            dimension_for_model=d_model,
            num_of_roles=num_roles,
            max_turns=max_turns
        )

        # Positional encoding
        self.pos_enc = PositionalEncoding(d_model, dropout, max_len)

        # Encoder stack
        self.encoder = Encoder(
            vocab_size=vocab_size,
            dimension_of_model=d_model,
            num_of_heads=num_heads,
            num_layers=num_encoder_layers,
            dim_feedforward=d_ff,
            dropout=dropout,
            max_len=max_len,
            num_of_roles=num_roles,
            max_turns=max_turns
        )

        # Decoder stack
        self.decoder = Decoder(
            vocab_size=vocab_size,
            dimension_for_model=d_model,
            num_layers=num_decoder_layers,
            num_of_heads=num_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            max_len=max_len
        )

        # Output projections
        self.out_proj = nn.Linear(d_model, vocab_size)
        self.slot_classifier = SlotClassifier(d_model, num_slots)

        # Initialize parameters
        self._init_parameters()

    def _init_parameters(self):
        """Apply Xavier-uniform initialization to every parameter with more than one dimension."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def load_state_dict(self, state_dict, strict=True, **kwargs):
        """
        Load weights, upgrading checkpoints saved in the old embedding format.

        Old checkpoints store a single 'encoder.embed.weight' table. It becomes
        the token table of the new embedding structure; the role/turn tables
        are zero-filled and the embedding LayerNorm is set to identity.

        Args:
            state_dict: mapping of parameter names to tensors. It is never
                modified; the upgrade works on a shallow copy.
            strict: forwarded to nn.Module.load_state_dict.
            **kwargs: any further keyword arguments supported by the installed
                PyTorch version (e.g. ``assign``), forwarded unchanged.

        Returns:
            Whatever nn.Module.load_state_dict returns.
        """
        # Local import so the fix does not depend on the file's import header.
        import copy

        legacy_key = 'encoder.embed.weight'
        if legacy_key in state_dict:
            # copy.copy keeps the mapping type (and any `_metadata` attribute)
            # while leaving the caller's checkpoint dict untouched.
            state_dict = copy.copy(state_dict)
            old_embed_weight = state_dict.pop(legacy_key)
            width = old_embed_weight.size(1)
            expected = self.state_dict()

            def zero_table(key, fallback_rows):
                # Match the row count of the table this model actually has, so
                # models built with a non-default num_roles/max_turns load too.
                reference = expected.get(key)
                rows = fallback_rows if reference is None else reference.size(0)
                return torch.zeros(rows, width)

            state_dict['encoder.embed.lut.weight'] = old_embed_weight
            state_dict['encoder.embed.lut_roles.weight'] = zero_table('encoder.embed.lut_roles.weight', 2)
            state_dict['encoder.embed.lut_turns.weight'] = zero_table('encoder.embed.lut_turns.weight', 16)
            state_dict['encoder.embed.norm.weight'] = torch.ones(width)
            state_dict['encoder.embed.norm.bias'] = torch.zeros(width)

        return super().load_state_dict(state_dict, strict=strict, **kwargs)

    def encode(self, src_tokens, src_roles, src_turns, src_mask=None):
        """
        Encode source sequences with role and turn information.
        Args:
            src_tokens: [B, S] token IDs
            src_roles: [B, S] role IDs
            src_turns: [B, S] turn IDs
            src_mask: padding mask [B, 1, 1, S]
        Returns:
            enc_out: [B, S, d_model]
        """
        # Embedding and positional encoding are handled inside the encoder.
        return self.encoder(src_tokens, src_roles, src_turns, src_mask)

    def decode(
        self,
        tgt_tokens,
        enc_out,
        tgt_roles,
        tgt_turns,
        src_mask=None,
        tgt_mask=None
    ):
        """
        Decode target sequences with encoder context.
        Args:
            tgt_tokens: [B, T] target token IDs
            enc_out: [B, S, d_model] encoder output
            tgt_roles: [B, T] target role IDs (accepted for interface
                compatibility; not consumed, the decoder gets token ids only)
            tgt_turns: [B, T] target turn IDs (accepted for interface
                compatibility; not consumed)
            src_mask: [B, 1, 1, S] source mask
            tgt_mask: [B, 1, T, T] target mask
        Returns:
            logits: [B, T, vocab_size]
        """
        # The decoder is fed the raw token ids. The previous version also built
        # a role/turn embedding here and then discarded it, which only cost
        # compute, so that dead computation is gone.
        dec_out = self.decoder(tgt_tokens, enc_out, tgt_mask, src_mask)
        return self.out_proj(dec_out)

    def forward(
        self,
        src_tokens,
        tgt_tokens,
        src_roles,
        tgt_roles,
        src_turns,
        tgt_turns,
        src_mask=None,
        tgt_mask=None
    ):
        """
        Full forward pass combining encoding, decoding, and slot classification.
        Args:
            src_tokens: [B, S] source token IDs
            tgt_tokens: [B, T] target token IDs
            src_roles: [B, S] source role IDs
            tgt_roles: [B, T] target role IDs
            src_turns: [B, S] source turn IDs
            tgt_turns: [B, T] target turn IDs
            src_mask: [B, 1, 1, S] source mask
            tgt_mask: [B, 1, T, T] target mask
        Returns:
            gen_logits: [B, T, vocab_size] generation logits
            slot_logits: [B, num_slots] slot classification logits
        """
        # Encode source sequence
        enc_out = self.encode(src_tokens, src_roles, src_turns, src_mask)

        # Decode target sequence
        gen_logits = self.decode(
            tgt_tokens,
            enc_out,
            tgt_roles,
            tgt_turns,
            src_mask,
            tgt_mask
        )

        # The first encoder position is the summary vector for slot classification.
        cls_rep = enc_out[:, 0, :]
        slot_logits = self.slot_classifier(cls_rep)

        return gen_logits, slot_logits
|