hdallatorre committed
Commit 99c473e
1 Parent(s): 4e68cd2

Update README.md

Files changed (1)
  1. README.md +39 -9
README.md CHANGED
@@ -42,29 +42,59 @@ A small snippet of code is given here in order to retrieve both logits and embeddings
  from transformers import AutoTokenizer, AutoModel
  import torch
 
- tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/segment_nt_30kb_multi_species", use_auth_token=hf_token, trust_remote_code=True)
- model = AutoModel.from_pretrained("InstaDeepAI/segment_nt_30kb_multi_species", use_auth_token=hf_token, trust_remote_code=True)
-
+ features = [
+     "protein_coding_gene",
+     "lncRNA",
+     "exon",
+     "intron",
+     "splice_donor",
+     "splice_acceptor",
+     "5UTR",
+     "3UTR",
+     "CTCF-bound",
+     "polyA_signal",
+     "enhancer_Tissue_specific",
+     "enhancer_Tissue_invariant",
+     "promoter_Tissue_specific",
+     "promoter_Tissue_invariant",
+ ]
+
+ tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/segment_nt_30kb_multi_species", trust_remote_code=True)
+ model = AutoModel.from_pretrained("InstaDeepAI/segment_nt_30kb_multi_species", trust_remote_code=True)
 
  # Choose the length to which the input sequences are padded. By default, the
  # model max length is chosen, but feel free to decrease it as the time taken to
  # obtain the embeddings increases significantly with it.
- max_length = tokenizer.model_max_length
+ # The number of DNA tokens (excluding the prepended CLS token) needs to be divisible by
+ # 2 to the power of the number of downsampling blocks, i.e. 4.
+ max_length = 12 + 1
+
+ assert (max_length - 1) % 4 == 0, (
+     "The number of DNA tokens (excluding the prepended CLS token) needs to be divisible by "
+     "2 to the power of the number of downsampling blocks, i.e. 4.")
 
  # Create a dummy DNA sequence and tokenize it
  sequences = ["ATTCCGATTCCGATTCCG", "ATTTCTCTCTCTCTCTGAGATCGATCGATCGAT"]
- tokens_ids = tokenizer.batch_encode_plus(sequences, return_tensors="pt", padding="max_length", max_length=max_length)["input_ids"]
+ tokens = tokenizer.batch_encode_plus(sequences, return_tensors="pt", padding="max_length", max_length=max_length)["input_ids"]
 
- # Compute the embeddings
- attention_mask = torch_tokens != tokenizer.pad_token_id
+ # Infer
+ attention_mask = tokens != tokenizer.pad_token_id
  outs = model(
-     torch_tokens,
+     tokens,
      attention_mask=attention_mask,
      output_hidden_states=True
  )
 
- logits = outs.logits.detach().numpy()
+ # Obtain the logits over the genomic features
+ logits = outs.logits.detach()
+ # Transform them into probabilities
  probabilities = torch.nn.functional.softmax(logits, dim=-1)
+ print(f"Probabilities shape: {probabilities.shape}")
+
+ # Get probabilities associated with intron
+ idx_intron = features.index("intron")
+ probabilities_intron = probabilities[:, :, idx_intron]
+ print(f"Intron probabilities shape: {probabilities_intron.shape}")
  ```
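
A note on the new length constraint: the added comments and assert encode a rule (the DNA token count, excluding the prepended CLS token, must be a multiple of 2 to the power of the number of downsampling blocks, i.e. 4) that readers may want to apply to their own sequences. Below is a minimal sketch of that arithmetic; `valid_max_length` and `num_downsampling_blocks` are hypothetical names, not part of this commit or the model's API.

```python
# Minimal sketch (not from the commit): pick the smallest valid `max_length`
# for a given number of DNA tokens, under the constraint stated in the new
# comments: (max_length - 1) must be divisible by 2 ** num_downsampling_blocks.
def valid_max_length(num_dna_tokens: int, num_downsampling_blocks: int = 2) -> int:
    factor = 2 ** num_downsampling_blocks  # 4 for this checkpoint, per the comment
    padded_tokens = -(-num_dna_tokens // factor) * factor  # ceil to a multiple of factor
    return padded_tokens + 1  # + 1 for the CLS token the tokenizer prepends

assert valid_max_length(12) == 13  # matches `max_length = 12 + 1` in the snippet
assert valid_max_length(10) == 13  # 10 DNA tokens round up to 12, then + 1 for CLS
```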
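
Similarly, the new snippet looks a feature up by its position in the `features` list. A small wrapper can make that lookup reusable across several features; this is a sketch, not part of the commit: `feature_probabilities` is a hypothetical helper, and the dummy tensor below only mimics the indexing layout the snippet relies on (features on the last axis).

```python
import torch

# Dummy stand-ins (illustrative only): in the snippet above, `features` has 14
# entries and `probabilities` comes from the model; here both are faked so the
# helper can run on its own.
features = ["protein_coding_gene", "lncRNA", "exon", "intron"]
probabilities = torch.rand(2, 13, len(features))

def feature_probabilities(probabilities, features, names):
    # features.index(...) raises ValueError on an unknown name, which is safer
    # than silently reading the wrong column.
    return {name: probabilities[:, :, features.index(name)] for name in names}

selected = feature_probabilities(probabilities, features, ["exon", "intron"])
print({name: t.shape for name, t in selected.items()})
```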