feat: 2023-2025 dottorandi (#132)

* fix: indentation
* feat: add dottorandi 2023
* chore: minor update
* fix: lint
UNICT-DMI · Nov 4, 2023 · 14e0229 · 14e0229
1 parent d6133a9
commit 14e0229
Show file tree

Hide file tree

Showing 29 changed files with 31,996 additions and 6,125 deletions.
diff --git a/.eslintrc.json b/.eslintrc.json
@@ -35,7 +35,10 @@
         "named": "never",
         "asyncArrow": "always"
       }
-    ]
+    ],
+    "multiline-ternary": "off",
+    "comma-dangle": "off",
+    "indent": "off"
   },
   "settings": {
     "react": {

diff --git a/jsonParser/CdlInf.py b/jsonParser/CdlInf.py
@@ -1,31 +1,42 @@
 from sys import flags
+from typing import List
 from Target import Target
 import json
 
 @Target.register
-class CdlInf(Target):
+class CdlInf(Target): # or PhD
 
     def __control_quorum(self, text, quorum) -> bool:
         if not quorum:
             return False
+
         if not self.word_not_in_control("NO QUORUM", text):
             text[self.i] = text[self.i].replace("NO QUORUM", " ").strip()
             return False
+
         return True
 
     def __find_name_department(self, text, quorum) -> str:
         self.word_not_in_update("ELEZIONI", text)
         self.i += 1
+
         quorum[0] = self.__control_quorum(text, quorum[0])
         nome = text[self.i]
+
         self.i += 1
         while self.word_not_in_control("BIENNIO", text):
             if len(text[self.i]) > 5:
                 quorum[0] = self.__control_quorum(text, quorum[0])
                 nome += "\n" + text[self.i]
             self.i += 1
+
         return nome.strip()
 
+    def __seggi_da_assegnare_v2(self, text: List[str]) -> int:
+        voti_idx = text.index("VOTI")
+        seggi = [s for s in text[:voti_idx] if self.is_integer(s)][0]
+        return int(seggi)
+
     def __seggi_da_assegnare(self, text) -> int:
         split_text = text[self.i].split()
         try:
@@ -38,42 +49,95 @@ def __seggi_da_assegnare(self, text) -> int:
                 split_text = text[self.i].split()
             return int(split_text[len(split_text)-1])
 
-    def __get_candidati(self, text, eletti, non_eletti, current_quorum) -> bool:
+    def __get_year(self, text: List[str]) -> str:
+        biennio = text[2].replace("BIENNIO ", "")
+        year = biennio[:biennio.find("/")]
+        return year
+
+    def __remove_perc_or_comma(self, value: str) -> str:
+        return value.replace(" ", "").replace("%", "").replace(",", ".")
+
+    def __get_type_v2(self, type: str, text: List[str]) -> str:
+        type_idx = text.index(type)
+        value = self.__remove_perc_or_comma(text[type_idx+2])
+        return value
+
+    def __get_candidati(self, text: List[str], eletti, non_eletti, current_quorum) -> bool:
+        year = self.__get_year(text)
+
         self.word_not_in_update("CANDIDATI", text)
         self.i += 1
         quorum = current_quorum
-        while self.word_not_in_control("SCHEDE", text):
-            split_text = text[self.i].split()
-            if len(split_text) > 0:
-                if self.is_integer(split_text[0]):
-                    split_text.pop(0)
-                if len(split_text) <= 0:
-                    self.i += 1
-                    continue
-                nome_candidato = ""
-                voti_candidato = 0
-                eletto = False
-                for s in split_text:
-                    if self.is_integer(s):
-                        voti_candidato = s
-                    elif "ELETTO" == s.upper():
-                        eletto = True
-                    else:
-                        nome_candidato += (s + " ")
-                if not self.word_not_in_control("NO QUORUM", text):
-                    quorum = False
-                    nome_candidato = nome_candidato.replace("NO QUORUM", " ").strip()
-                if not self.word_not_in_control("EX AEQUO", text):
-                    nome_candidato = nome_candidato.replace("EX AEQUO", " ").strip()
+
+        if "DOTTORANDI" in text[0] and year == "2023":
+            voti_idx = text.index("VOTI") + 1
+            schede_idx = text.index("SCHEDE BIANCHE")
+
+            candidates = [el for el in text[voti_idx:schede_idx] if el != "" ]
+
+            idx = 0
+            while idx < len(candidates)-1:
+                nome_candidato = candidates[idx]
+                voti_candidato = candidates[idx+1]
+
                 e = {
                         "nome_candidato": nome_candidato.strip(),
                         "voti": int(voti_candidato)
                 }
-                if eletto:
+
+                if len(candidates) > idx+2 and candidates[idx+2].upper() == "ELETTO":
+                    idx += 1
                     eletti.append(e)
                 else:
                     non_eletti.append(e)
-            self.i += 1
+
+                idx += 2
+
+            self.i += len(text[voti_idx:schede_idx])
+        else:
+            while self.word_not_in_control("SCHEDE", text):
+                split_text = text[self.i].split()
+
+                # if the line is not empty
+                if len(split_text) > 0:
+
+                    if self.is_integer(split_text[0]):
+                        split_text.pop(0)
+
+                    if len(split_text) <= 0:
+                        self.i += 1
+                        continue
+
+                    nome_candidato = ""
+                    voti_candidato = 0
+                    eletto = False
+                    for s in split_text:
+                        if self.is_integer(s):
+                            voti_candidato = s
+                        elif "ELETTO" == s.upper():
+                            eletto = True
+                        else:
+                            nome_candidato += (s + " ")
+
+                    if not self.word_not_in_control("NO QUORUM", text):
+                        quorum = False
+                        nome_candidato = nome_candidato.replace("NO QUORUM", " ").strip()
+
+                    if not self.word_not_in_control("EX AEQUO", text):
+                        nome_candidato = nome_candidato.replace("EX AEQUO", " ").strip()
+
+                    e = {
+                            "nome_candidato": nome_candidato.strip(),
+                            "voti": int(voti_candidato)
+                    }
+
+                    if eletto:
+                        eletti.append(e)
+                    else:
+                        non_eletti.append(e)
+
+                self.i += 1
+
         return quorum
 
     def __get_type(self, word, text) -> float:
@@ -89,18 +153,39 @@ def __get_type(self, word, text) -> float:
                 return 0.0
 
     def __operation(self, text) -> object:
+        year = self.__get_year(text)
+
         quorum = [True]
         nome_dipartimento = self.__find_name_department(text, quorum)
-        seggi = self.__seggi_da_assegnare(text)
+
+        if year == "2023":
+            seggi = self.__seggi_da_assegnare_v2(text)
+        else:
+            seggi = self.__seggi_da_assegnare(text)
+
         eletti = []
         non_eletti = []
         quorum[0] = self.__get_candidati(text, eletti, non_eletti, quorum[0])
-        schede_bianche = int(self.__get_type("BIANCHE", text))
-        schede_nulle = int(self.__get_type("NULLE", text))
-        schede_contestate = int(self.__get_type("CONTESTATE", text))
-        totale_voti = int(self.__get_type("VOTI", text))
-        aventi_diritto = int(self.__get_type("DIRITTO", text))
-        perc_votanti = self.__get_type("VOTANTI", text)
+
+        schede_bianche = int(self.__get_type("BIANCHE", text)) # TODO use v2 for 2023
+
+        if year != "2023":
+            schede_nulle = int(self.__get_type("NULLE", text))
+            schede_contestate = int(self.__get_type("CONTESTATE", text))
+        else:
+            schede_nulle = 0
+            schede_contestate = 0
+
+        if year == "2023":
+            totale_voti = int(self.__get_type_v2("TOTALE VOTI", text))
+            aventi_diritto = int(self.__get_type_v2("AVENTI DIRITTO", text))
+            perc_votanti = float(self.__get_type_v2("% VOTANTI", text))
+        else:
+            totale_voti = int(self.__get_type("VOTI", text))
+            aventi_diritto = int(self.__get_type("DIRITTO", text))
+            perc_votanti = self.__get_type("VOTANTI", text)
+
+
         file_json = {
             "dipartimento": nome_dipartimento,
             "quorum": quorum[0],
@@ -119,6 +204,10 @@ def __operation(self, text) -> object:
         self.i += 1
         file = json.dumps(file_json)
         parsed = json.loads(file)
+
+        if year != "2018":
+            self.i = len(text)-1
+
         return json.dumps(parsed, indent=4, sort_keys=False)
 
 

diff --git a/jsonParser/README.md b/jsonParser/README.md
@@ -71,7 +71,7 @@ If you don't want to install anything go **[here](#usage-with-docker)**.
 
   · `0` if you want to extract departments and CdL with a number of student greater than 500.
 
-  · `1` if you want to extract CdL with a number of student fewer than 500.
+  · `1` if you want to extract CdL with fewer than 500 students, or PhD student elections.
 
   · `2` if you want to extract Medicine election.
 

diff --git a/jsonParser/create-json.py b/jsonParser/create-json.py
@@ -15,58 +15,69 @@
 end_parser = ["</ul>", "</ul></div></div></div>", "</span></div></div></div>", "</div></div></div>"]
 
 match1 = ["CONSIGLIO", "SENATO", "NUCLEO", "COMITATO"]
-match2 = ["DOTTORANDI", "CDL_B", "INFERIORE"]
+match2 = ["DOTT", "DOTTORANDI", "CDL_B", "INFERIORE"]
 match3 = ["COORDIMAMENTO", "COORDINAMENTO"]
 
-def is_file(filename) -> bool:
+def is_file(filename: str) -> bool:
     file = filename.split(".")
     return(bool(len(file) > 1 and file[len(file)-1] == "pdf"))
 
-def create_json(pathname, option, command) -> None:
+def create_json(pathname: str, option: str, command: str) -> None:
     print("Create JSON: " + pathname)
     status = os.system("python3 " + command + "parser.py " + "\"" + pathname + "\" " + option)
+
     if os.WEXITSTATUS(status) > 0:
         print("I try to create the file again: " + pathname)
         os.system("python3 " + command + "parser.py " + "\"" + pathname + "\" 0")
 
-def sub_url(url, directory, command) -> None:
+def sub_url(url: str, directory: str, command: str) -> None:
     try:
         os.makedirs(directory, mode = 0o777, exist_ok = True)
     except ValueError:
         pass
+
     x = requests.get(url)
+
     if x.status_code != 200:
         print("ERROR", x.status_code, ":", x.text)
         sys.exit(-1)
+
     index1 = -1
     max = -1
     for el in start_parser:
         if max < x.text.find(el):
             max = x.text.find(el)
+
     index1 = max
     if index1 < 0:
         print("ERROR in find substring 1")
         sys.exit(-1)
+
     text = x.text[index1:]
     min = sys.maxsize
     for el in end_parser:
         if min > text.find(el) and text.find(el) > 0:
             min = text.find(el)
+
     index2 = min
     if index2 < 0:
         print("ERROR in find substring 2")
         sys.exit(-1)
+
     text = text[0:index2]
     webpage = html.fromstring(text)
     for link in webpage.xpath('//a/@href'):
         if link.find("https://") < 0 and link.find("http://") < 0:
             link = "https://www.unict.it" + link
+
         file = link.split("/")
         f = file[len(file)-1]
         if is_file(f):
             f = f.replace("%20", "_")
             f = f.replace("%2", "_")
+
             open(directory + "/" + f, "wb").write(requests.get(link).content)
+
             if any(s in f.upper() for s in match1):
                 create_json(directory + "/" + f, "other", command)
             elif any(s in f.upper() for s in match3):
@@ -83,10 +94,13 @@ def main(argv) -> None:
     if len(argv) != 3:
         print("USAGE: python3 create-json.py <url> <start_directory> <command_parser_directory>")
         sys.exit(0)
+
     if argv[1][len(argv[1])-1] == "/":
         argv[1] = argv[1][:-1]
+
     if argv[2][len(argv[2])-1] != "/":
         argv[2] += "/"
+
     sub_url(argv[0], argv[1], argv[2])
 
 if __name__ == "__main__":

diff --git a/jsonParser/fix-json-names.py b/jsonParser/fix-json-names.py
@@ -0,0 +1,28 @@
+import json
+
+def create_file_name(s, path) -> str:
+    input_path = ""
+
+    file_name = ''
+    with open(s, "r") as f:
+        file_name = f.readlines()[1].replace('    "dipartimento": "', '').replace('",', '')
+
+    file_name = file_name.replace(",", '').replace(" ", "_").replace("/", "_").strip().lower().split("\n")[0] + ".json"
+    tmp = ""
+    for p in path:
+        tmp += p
+        if p == "/":
+            input_path = tmp
+    file_path = input_path + file_name
+    file_path = file_path.replace("dipartimento_di_", "")
+    file_path = file_path[2].upper() + file_path[3:]
+    return file_path
+
+import os
+files = [f for f in os.listdir('.') if os.path.isfile(f)]
+for f in files:
+    if ".json" in f:
+        print(f)
+        file_name = create_file_name(f, './')
+        print(file_name)
+        os.rename(f, file_name)
diff --git a/jsonParser/parser.py b/jsonParser/parser.py
@@ -3,6 +3,7 @@
 from re import split
 import sys
 import json
+from typing import List
 from FormatPDF import FormatPDF
 from Target import Target
 from SelectTarget import SelectTarget
@@ -58,7 +59,7 @@ def create_file_name(s, path) -> str:
             input_path = tmp
     return input_path + file_name
 
-def main(argv) -> None:
+def main(argv: List[str]) -> None:
     error_start(len(argv))
 
     """ ONLY for add SCRUTINATI """
@@ -71,8 +72,10 @@ def main(argv) -> None:
     # print_pars(formatted_text)
     # print(len(formatted_text))
     # print(argv[0])
+
     str_json = target.scrape_list(formatted_text)
     # print(str_json)
+
     if isinstance(str_json, list):
         for s in str_json:
             if len(str_json) > 1:
@@ -84,4 +87,4 @@ def main(argv) -> None:
         save_json(str_json, argv[0])
 
 if __name__ == "__main__":
-    main(sys.argv[1:])
+    main(sys.argv[1:])