hdallatorre committed
Commit f1ff02b
1 Parent(s): d65529d

Update README.md

Files changed (1)
  1. README.md +11 -3
README.md CHANGED
@@ -42,9 +42,14 @@ import torch
 tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-50m-multi-species", trust_remote_code=True)
 model = AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-50m-multi-species", trust_remote_code=True)
 
+# Choose the length to which the input sequences are padded. By default, the
+# model max length is chosen, but feel free to decrease it as the time taken to
+# obtain the embeddings increases significantly with it.
+max_length = tokenizer.model_max_length
+
 # Create a dummy dna sequence and tokenize it
-sequences = ['ATTCTG' * 9]
-tokens_ids = tokenizer.batch_encode_plus(sequences, return_tensors="pt")["input_ids"]
+sequences = ["ATTCCGATTCCGATTCCG", "ATTTCTCTCTCTCTCTGAGATCGATCGATCGAT"]
+tokens_ids = tokenizer.batch_encode_plus(sequences, return_tensors="pt", padding="max_length", max_length = max_length)["input_ids"]
 
 # Compute the embeddings
 attention_mask = tokens_ids != tokenizer.pad_token_id
@@ -60,8 +65,11 @@ embeddings = torch_outs['hidden_states'][-1].detach().numpy()
 print(f"Embeddings shape: {embeddings.shape}")
 print(f"Embeddings per token: {embeddings}")
 
+# Add embed dimension axis
+attention_mask = torch.unsqueeze(attention_mask, dim=-1)
+
 # Compute mean embeddings per sequence
-mean_sequence_embeddings = torch.sum(attention_mask.unsqueeze(-1)*embeddings, axis=-2)/torch.sum(attention_mask, axis=-1)
+mean_sequence_embeddings = torch.sum(attention_mask*embeddings, axis=-2)/torch.sum(attention_mask, axis=1)
 print(f"Mean sequence embeddings: {mean_sequence_embeddings}")
 ```
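
The pooling change in the second hunk is easiest to verify on toy tensors. The sketch below is a minimal, self-contained check of that logic, assuming embeddings of shape (batch, seq_len, hidden) and a 0/1 padding mask of shape (batch, seq_len); the shapes and values are made up for illustration and are not taken from the README.

```python
import torch

# Toy check of the masked mean pooling introduced by this commit.
# All shapes and values here are illustrative assumptions.
batch, seq_len, hidden = 2, 5, 4
embeddings = torch.randn(batch, seq_len, hidden)
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])  # 1 = real token, 0 = padding

# Add the embed dimension so the mask broadcasts over hidden features.
attention_mask = torch.unsqueeze(attention_mask, dim=-1)  # (batch, seq_len, 1)

# Zero out padding positions, sum over the sequence axis, and divide each
# sequence's sum by its own number of real tokens.
mean_sequence_embeddings = torch.sum(attention_mask * embeddings, axis=-2) / torch.sum(attention_mask, axis=1)
print(mean_sequence_embeddings.shape)  # torch.Size([2, 4])
```

Dividing by torch.sum(attention_mask, axis=1), which has shape (batch, 1) after the unsqueeze, normalises each sequence by its own token count, so sequences padded to different degrees get comparable mean embeddings.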