Araby committed on
Commit
4a14093
1 Parent(s): f04d012

Upload 2 files

Browse files
Files changed (2) hide show
  1. main.py +42 -0
  2. requirements.txt +64 -0
main.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
from transformers import GPT2TokenizerFast, AutoModelForCausalLM
from arabert.preprocess import ArabertPreprocessor

# Number of tokens the prompt is padded/truncated to. The generated
# sequence begins at this offset, so the same constant is used when
# slicing the output for decoding.
MAX_INPUT_TOKENS = 150


@st.cache_resource
def load_components():
    """Load and cache the tokenizer, model, and Arabic preprocessor.

    Streamlit reruns the whole script on every interaction;
    st.cache_resource makes the expensive model/tokenizer load happen
    once per server process instead of once per rerun.
    """
    model_name = "malmarjeh/gpt2"
    tokenizer = GPT2TokenizerFast.from_pretrained("aubmindlab/aragpt2-base")
    model = AutoModelForCausalLM.from_pretrained(model_name)
    preprocessor = ArabertPreprocessor(model_name=model_name)
    # Register the pad token once at load time rather than on every
    # button click (the original re-added it inside the handler).
    tokenizer.add_special_tokens({'pad_token': '<pad>'})
    return tokenizer, model, preprocessor


tokenizer, model, preprocessor = load_components()

# Streamlit UI
st.title('Arabic Text Summarizer')
text = st.text_area("Paste your Arabic text here:")

if st.button('Summarize'):
    if text:
        # Preprocess and tokenize input text.
        processed_text = preprocessor.preprocess(text)
        formatted_text = '\n النص: ' + processed_text + ' \n الملخص: \n '
        # truncation=True is required: without it, inputs longer than
        # MAX_INPUT_TOKENS are NOT cut to max_length, which breaks the
        # fixed-offset slice used below when decoding.
        tokens = tokenizer.batch_encode_plus(
            [formatted_text],
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=MAX_INPUT_TOKENS,
        )

        # Generate summary with beam search; repetition_penalty curbs
        # the copy-loops GPT-2-style models tend to produce.
        output = model.generate(
            input_ids=tokens['input_ids'],
            repetition_penalty=2.0,
            num_beams=5,
            max_length=600,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
        )

        # Decode only the tokens generated after the padded prompt.
        result = tokenizer.decode(
            output[0][MAX_INPUT_TOKENS:], skip_special_tokens=True
        ).strip()
        st.subheader("Original Text")
        st.write(text)
        st.subheader("Summarized Text")
        st.write(result)
    else:
        st.warning("Please enter Arabic text to summarize.")
requirements.txt ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.1.2
2
+ arabert==1.0.1
3
+ attrs==23.1.0
4
+ blinker==1.6.3
5
+ cachetools==5.3.1
6
+ certifi==2023.7.22
7
+ charset-normalizer==3.3.0
8
+ click==8.1.7
9
+ emoji==1.4.2
10
+ farasapy==0.0.14
11
+ filelock==3.12.4
12
+ fsspec==2023.9.2
13
+ gitdb==4.0.10
14
+ GitPython==3.1.37
15
+ huggingface-hub==0.17.3
16
+ idna==3.4
17
+ importlib-metadata==6.8.0
18
+ Jinja2==3.1.2
19
+ jsonschema==4.19.1
20
+ jsonschema-specifications==2023.7.1
21
+ markdown-it-py==3.0.0
22
+ MarkupSafe==2.1.3
23
+ mdurl==0.1.2
24
+ mpmath==1.3.0
25
+ networkx==3.1
26
+ numpy==1.26.0
27
+ packaging==23.2
28
+ pandas==2.1.1
29
+ Pillow==10.0.1
30
+ protobuf==4.24.4
31
+ PyArabic==0.6.15
32
+ pyarrow==13.0.0
33
+ pydeck==0.8.1b0
34
+ Pygments==2.16.1
35
+ python-dateutil==2.8.2
36
+ pytz==2023.3.post1
37
+ PyYAML==6.0.1
38
+ referencing==0.30.2
39
+ regex==2023.10.3
40
+ requests==2.31.0
41
+ rich==13.6.0
42
+ rpds-py==0.10.4
43
+ safetensors==0.4.0
44
+ sentencepiece==0.1.99
45
+ six==1.16.0
46
+ smmap==5.0.1
47
+ streamlit==1.27.2
48
+ sympy==1.12
49
+ tenacity==8.2.3
50
+ tokenizers==0.14.1
51
+ toml==0.10.2
52
+ toolz==0.12.0
53
+ torch==2.1.0
54
+ torchaudio==2.1.0
55
+ torchvision==0.16.0
56
+ tornado==6.3.3
57
+ tqdm==4.66.1
58
+ transformers==4.34.0
59
+ typing_extensions==4.8.0
60
+ tzdata==2023.3
61
+ tzlocal==5.1
62
+ urllib3==2.0.6
63
+ validators==0.22.0
64
+ zipp==3.17.0