@sooftware Could you kindly explain why the output lengths and the target lengths are so different? :/ (I also get negative floats in the outputs.) An example is shown below.
The outputs have shape [32, 490, 16121] (where 16121 is the size of my vocabulary). What does the 490 dimension represent?
Also, the outputs are probabilities, right?
(outputs)
tensor([[[-9.7001, -9.6490, -9.6463,  ..., -9.6936, -9.6430, -9.7431],
         [-9.6997, -9.6487, -9.6470,  ..., -9.6903, -9.6450, -9.7416],
         [-9.6999, -9.6477, -9.6479,  ..., -9.6898, -9.6453, -9.7417],
         ...,
         [-9.7006, -9.6449, -9.6513,  ..., -9.6889, -9.6477, -9.7405],
         [-9.7003, -9.6448, -9.6512,  ..., -9.6893, -9.6477, -9.7410],
         [-9.7007, -9.6453, -9.6513,  ..., -9.6892, -9.6466, -9.7403]],
        [[-9.6844, -9.6316, -9.6387,  ..., -9.6880, -9.6269, -9.7657],
         [-9.6834, -9.6299, -9.6404,  ..., -9.6872, -9.6283, -9.7642],
         [-9.6834, -9.6334, -9.6387,  ..., -9.6864, -9.6290, -9.7616],
         ...,
         [-9.6840, -9.6299, -9.6431,  ..., -9.6830, -9.6304, -9.7608],
         [-9.6838, -9.6297, -9.6428,  ..., -9.6834, -9.6303, -9.7609],
         [-9.6842, -9.6300, -9.6428,  ..., -9.6837, -9.6292, -9.7599]],
        [[-9.6966, -9.6386, -9.6458,  ..., -9.6896, -9.6375, -9.7521],
         [-9.6974, -9.6374, -9.6462,  ..., -9.6890, -9.6369, -9.7516],
         [-9.6974, -9.6405, -9.6456,  ..., -9.6876, -9.6378, -9.7491],
         ...,
         [-9.6978, -9.6336, -9.6493,  ..., -9.6851, -9.6419, -9.7490],
         [-9.6971, -9.6334, -9.6487,  ..., -9.6863, -9.6411, -9.7501],
         [-9.6972, -9.6338, -9.6489,  ..., -9.6867, -9.6396, -9.7497]],
        ...,
        [[-9.7005, -9.6249, -9.6588,  ..., -9.6762, -9.6557, -9.7555],
         [-9.7028, -9.6266, -9.6597,  ..., -9.6765, -9.6574, -9.7542],
         [-9.7016, -9.6240, -9.6605,  ..., -9.6761, -9.6576, -9.7553],
         ...,
         [-9.7036, -9.6237, -9.6624,  ..., -9.6728, -9.6590, -9.7524],
         [-9.7034, -9.6235, -9.6620,  ..., -9.6735, -9.6589, -9.7530],
         [-9.7038, -9.6240, -9.6622,  ..., -9.6738, -9.6582, -9.7524]],
        [[-9.7058, -9.6305, -9.6566,  ..., -9.6739, -9.6557, -9.7466],
         [-9.7061, -9.6273, -9.6569,  ..., -9.6774, -9.6564, -9.7499],
         [-9.7046, -9.6280, -9.6576,  ..., -9.6772, -9.6575, -9.7498],
         ...,
         [-9.7060, -9.6263, -9.6609,  ..., -9.6714, -9.6561, -9.7461],
         [-9.7055, -9.6262, -9.6605,  ..., -9.6723, -9.6558, -9.7469],
         [-9.7058, -9.6270, -9.6606,  ..., -9.6725, -9.6552, -9.7460]],
        [[-9.7101, -9.6312, -9.6570,  ..., -9.6736, -9.6551, -9.7420],
         [-9.7102, -9.6307, -9.6579,  ..., -9.6733, -9.6576, -9.7418],
         [-9.7078, -9.6281, -9.6598,  ..., -9.6704, -9.6596, -9.7418],
         ...,
         [-9.7084, -9.6288, -9.6605,  ..., -9.6706, -9.6588, -9.7399],
         [-9.7081, -9.6286, -9.6600,  ..., -9.6714, -9.6584, -9.7406],
         [-9.7085, -9.6291, -9.6601,  ..., -9.6717, -9.6577, -9.7398]]],
       device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
(output_lengths)
tensor([312, 260, 315, 320, 317, 275, 308, 291, 272, 300, 262, 227, 303, 252,
        298, 256, 303, 251, 284, 259, 263, 286, 209, 262, 166, 194, 149, 212,
        121, 114, 110,  57], device='cuda:0', dtype=torch.int32)
(target_lengths)
tensor([57, 55, 54, 50, 49, 49, 49, 48, 48, 47, 43, 42, 41, 40, 40, 39, 37, 37,
        36, 36, 36, 35, 34, 33, 29, 27, 26, 24, 20, 19, 17,  9])
I am using the following code for training and evaluation:
import torch
import time
import sys
from google.colab import output
import torch.nn as nn
from conformer import Conformer
import torchmetrics
import random
# Use the GPU when PyTorch reports one as available; otherwise run on the CPU.
cuda = torch.cuda.is_available()
if cuda:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Device:', device)
################################################################################
def train_model(model, optimizer, criterion, loader, metric):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: Callable invoked as ``model(inputs, input_lengths)`` that
            returns ``(outputs, output_lengths)``.
        optimizer: Optimizer; ``zero_grad()`` and ``step()`` are called once
            per batch.
        criterion: Loss invoked as ``criterion(outputs.transpose(0, 1),
            targets, output_lengths, target_lengths)``.
        loader: Iterable yielding ``(audio, audio_len, translations,
            translation_len)`` tensor batches.
        metric: Unused here; kept so the signature matches ``eval_model``.

    Returns:
        Sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    running_loss = 0.0
    num_batches = 0
    for audio, audio_len, translations, translation_len in loader:
        # Reorder the batch so the targets are in descending length order.
        # Indexing with the sort permutation works for any batch size and any
        # padding width; preallocating fixed-size buffers would break (or
        # silently zero-pad) on a short final batch or a different padding.
        target_lengths, order = torch.sort(translation_len, descending=True)
        sorted_audio = audio[order].to(torch.float)
        input_lengths = audio_len[order].to(torch.int)
        sorted_translations = translations[order].to(torch.int)

        # Transpose inputs from (batch, dim, seq_len) to (batch, seq_len, dim).
        inputs = sorted_audio.to(device).transpose(1, 2)
        targets = sorted_translations.to(device)

        optimizer.zero_grad()

        # Forward propagate.
        outputs, output_lengths = model(inputs, input_lengths)

        # The criterion is given the outputs with the first two axes swapped.
        loss = criterion(outputs.transpose(0, 1), targets, output_lengths, target_lengths)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1
        output.clear(output_tags='some_outputs')

    if num_batches == 0:
        # Fail with a clear message instead of a NameError/ZeroDivisionError.
        raise ValueError("loader yielded no batches")
    return running_loss / num_batches
################################################################################
def eval_model(model, optimizer, criterion, loader, metric):
    """Evaluate the model over ``loader`` and return mean loss and mean WER.

    Args:
        model: Callable invoked as ``model(inputs, input_lengths)`` that
            returns ``(outputs, output_lengths)``.
        optimizer: Unused here; kept so the signature matches ``train_model``.
        criterion: Loss invoked as ``criterion(outputs.transpose(0, 1),
            targets, output_lengths, target_lengths)``.
        loader: Iterable yielding ``(audio, audio_len, translations,
            translation_len)`` tensor batches.
        metric: Word-error-rate metric; currently unused (see TODO below).

    Returns:
        ``(loss_per_epoch, wer_per_epoch)``: the per-batch averages. The WER
        is always ``0.0`` for now because no WER is accumulated.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    running_loss = 0.0
    wer_calc = 0.0
    num_batches = 0

    # Pick one batch whose lengths get printed for inspection. Draw the index
    # from the loader's real size so it can actually be hit; a fixed upper
    # bound may exceed the number of batches.
    try:
        expected_batches = len(loader)
    except TypeError:
        expected_batches = 0
    sample_index = random.randrange(expected_batches) if expected_batches > 0 else 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for i, (audio, audio_len, translations, translation_len) in enumerate(loader):
            # Reorder the batch so the targets are in descending length order.
            # Indexing with the sort permutation works for any batch size and
            # any padding width.
            target_lengths, order = torch.sort(translation_len, descending=True)
            sorted_audio = audio[order].to(torch.float)
            input_lengths = audio_len[order].to(torch.int)
            sorted_translations = translations[order].to(torch.int)

            # Transpose inputs from (batch, dim, seq_len) to (batch, seq_len, dim).
            inputs = sorted_audio.to(device).transpose(1, 2)
            targets = sorted_translations.to(device)

            # Forward propagate.
            outputs, output_lengths = model(inputs, input_lengths)

            # The criterion is given the outputs with the first two axes swapped.
            loss = criterion(outputs.transpose(0, 1), targets, output_lengths, target_lengths)

            if i == sample_index:
                print(output_lengths)
                print(target_lengths)

            # TODO: decode outputs/targets to words and accumulate the WER via
            # `metric` into wer_calc; no decoding helper is defined in this
            # file, so wer_calc stays at 0.0.

            running_loss += loss.item()
            num_batches += 1
            output.clear(output_tags='some_outputs')

    if num_batches == 0:
        # Fail with a clear message instead of a NameError/ZeroDivisionError.
        raise ValueError("loader yielded no batches")
    loss_per_epoch = running_loss / num_batches
    wer_per_epoch = wer_calc / num_batches
    return loss_per_epoch, wer_per_epoch
################################################################################
def train_eval_model(epochs):
    """Build a Conformer model, then train and validate it for ``epochs`` epochs.

    Each epoch runs ``train_model`` on ``train_loader`` followed by
    ``eval_model`` on ``test_loader`` and prints the resulting loss (and, for
    validation, the WER).

    Args:
        epochs: Number of train + validation passes to run.
    """
    # Conformer model init.
    model = nn.DataParallel(
        Conformer(num_classes=16121, input_dim=201, encoder_dim=32, num_encoder_layers=1)
    ).to(device)
    # Optimizers specified in the torch.optim package.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
    # Loss function.
    criterion = nn.CTCLoss().to(device)
    # Metrics init.
    metric = torchmetrics.WordErrorRate()

    for epoch in range(epochs):
        print("Epoch", epoch + 1)

        # TRAINING
        # The training pass must actually run; otherwise validation only ever
        # measures the randomly initialised model.
        model.train()
        print("Training")
        epoch_loss = train_model(model=model, optimizer=optimizer, criterion=criterion, loader=train_loader, metric=metric)
        print(f'Loss: {epoch_loss:.3f}')

        # EVALUATION
        model.eval()
        print("Validation")
        epoch_val_loss, epoch_val_wer = eval_model(model=model, optimizer=optimizer, criterion=criterion, loader=test_loader, metric=metric)

        print(f'Loss: {epoch_val_loss:.3f}')
        print(f'WER: {epoch_val_wer:.3f}')
################################################################################
def metrics_calculation(metric, predictions, targets):
    """Print both inputs, then return ``metric(predictions, targets)``.

    Args:
        metric: Callable taking ``(predictions, targets)``.
        predictions: Predicted values; printed first and passed to ``metric``.
        targets: Reference values; printed second and passed to ``metric``.

    Returns:
        Whatever ``metric`` returns for the given pair.
    """
    for shown in (predictions, targets):
        print(shown)
    return metric(predictions, targets)
# Script entry point: build the model and run a single epoch.
train_eval_model(1)