hdallatorre committed
Commit f1ff02b
1 Parent(s): d65529d

Update README.md

Files changed (1)
  1. README.md +11 -3
README.md CHANGED
@@ -42,9 +42,14 @@ import torch
 tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-50m-multi-species", trust_remote_code=True)
 model = AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-50m-multi-species", trust_remote_code=True)
 
+# Choose the length to which the input sequences are padded. By default, the
+# model max length is chosen, but feel free to decrease it as the time taken to
+# obtain the embeddings increases significantly with it.
+max_length = tokenizer.model_max_length
+
 # Create a dummy dna sequence and tokenize it
-sequences = ['ATTCTG' * 9]
-tokens_ids = tokenizer.batch_encode_plus(sequences, return_tensors="pt")["input_ids"]
+sequences = ["ATTCCGATTCCGATTCCG", "ATTTCTCTCTCTCTCTGAGATCGATCGATCGAT"]
+tokens_ids = tokenizer.batch_encode_plus(sequences, return_tensors="pt", padding="max_length", max_length = max_length)["input_ids"]
 
 # Compute the embeddings
 attention_mask = tokens_ids != tokenizer.pad_token_id
@@ -60,8 +65,11 @@ embeddings = torch_outs['hidden_states'][-1].detach().numpy()
 print(f"Embeddings shape: {embeddings.shape}")
 print(f"Embeddings per token: {embeddings}")
 
+# Add embed dimension axis
+attention_mask = torch.unsqueeze(attention_mask, dim=-1)
+
 # Compute mean embeddings per sequence
-mean_sequence_embeddings = torch.sum(attention_mask.unsqueeze(-1)*embeddings, axis=-2)/torch.sum(attention_mask, axis=-1)
+mean_sequence_embeddings = torch.sum(attention_mask*embeddings, axis=-2)/torch.sum(attention_mask, axis=1)
 print(f"Mean sequence embeddings: {mean_sequence_embeddings}")
 ```
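
The pooling change in the second hunk is easiest to verify on toy tensors. The sketch below is a minimal, self-contained check of that logic, assuming embeddings of shape (batch, seq_len, hidden) and a 0/1 padding mask of shape (batch, seq_len); the shapes and values are made up for illustration and are not taken from the README.

```python
import torch

# Toy check of the masked mean pooling introduced by this commit.
# All shapes and values here are illustrative assumptions.
batch, seq_len, hidden = 2, 5, 4
embeddings = torch.randn(batch, seq_len, hidden)
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])  # 1 = real token, 0 = padding

# Add the embed dimension so the mask broadcasts over hidden features.
attention_mask = torch.unsqueeze(attention_mask, dim=-1)  # (batch, seq_len, 1)

# Zero out padding positions, sum over the sequence axis, and divide each
# sequence's sum by its own number of real tokens.
mean_sequence_embeddings = torch.sum(attention_mask * embeddings, axis=-2) / torch.sum(attention_mask, axis=1)
print(mean_sequence_embeddings.shape)  # torch.Size([2, 4])
```

Dividing by torch.sum(attention_mask, axis=1), which has shape (batch, 1) after the unsqueeze, normalises each sequence by its own token count, so sequences padded to different degrees get comparable mean embeddings.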