Skip to content

Commit

Permalink
feat: 2023-2025 dottorandi (#132)
Browse files Browse the repository at this point in the history
* fix: indentation

* feat: add dottorandi 2023

* chore: minor update

* fix: lint
  • Loading branch information
Helias authored Nov 4, 2023
1 parent d6133a9 commit 14e0229
Show file tree
Hide file tree
Showing 29 changed files with 31,996 additions and 6,125 deletions.
5 changes: 4 additions & 1 deletion .eslintrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,10 @@
"named": "never",
"asyncArrow": "always"
}
]
],
"multiline-ternary": "off",
"comma-dangle": "off",
"indent": "off"
},
"settings": {
"react": {
Expand Down
157 changes: 123 additions & 34 deletions jsonParser/CdlInf.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,42 @@
from sys import flags
from typing import List
from Target import Target
import json

@Target.register
class CdlInf(Target):
class CdlInf(Target): # or PhD

def __control_quorum(self, text, quorum) -> bool:
    """Re-evaluate the quorum flag against the current line of *text*.

    Args:
        text: list of extracted lines; the entry at ``self.i`` may be
            rewritten in place.
        quorum: the quorum flag computed so far.

    Returns:
        ``False`` when *quorum* is already falsy or when the "NO QUORUM"
        check fails, ``True`` otherwise.
    """
    marker = "NO QUORUM"
    if quorum:
        # NOTE(review): word_not_in_control is inherited; presumably it is
        # True when the marker is absent from text[self.i] -- confirm.
        if self.word_not_in_control(marker, text):
            return True
        # Marker detected: scrub it from the current line before reporting.
        current_line = text[self.i]
        text[self.i] = current_line.replace(marker, " ").strip()
    return False

def __find_name_department(self, text, quorum) -> str:
    """Collect the (possibly multi-line) name that follows "ELEZIONI".

    The cursor ``self.i`` is advanced past the name, stopping at the line for
    which the "BIENNIO" check stops holding.

    Args:
        text: list of extracted lines.
        quorum: one-element list used as an in/out quorum flag; slot 0 is
            refreshed for every line that contributes to the name.

    Returns:
        The name lines joined with newlines, stripped of outer whitespace.
    """
    # NOTE(review): word_not_in_update presumably moves self.i to the line
    # containing the word -- confirm against the Target base class.
    self.word_not_in_update("ELEZIONI", text)
    self.i += 1

    # The quorum check may rewrite text[self.i], so it runs before the read.
    quorum[0] = self.__control_quorum(text, quorum[0])
    name_lines = [text[self.i]]
    self.i += 1

    while self.word_not_in_control("BIENNIO", text):
        # Lines of five characters or fewer are not treated as name parts.
        if len(text[self.i]) > 5:
            quorum[0] = self.__control_quorum(text, quorum[0])
            name_lines.append(text[self.i])
        self.i += 1

    return "\n".join(name_lines).strip()

def __seggi_da_assegnare_v2(self, text: List[str]) -> int:
    """Return the first integer entry that appears before the "VOTI" entry.

    Raises:
        ValueError: if *text* has no "VOTI" entry.
        IndexError: if no entry before "VOTI" passes ``self.is_integer``.
    """
    header_end = text.index("VOTI")
    numeric_entries = list(filter(self.is_integer, text[:header_end]))
    return int(numeric_entries[0])

def __seggi_da_assegnare(self, text) -> int:
split_text = text[self.i].split()
try:
Expand All @@ -38,42 +49,95 @@ def __seggi_da_assegnare(self, text) -> int:
split_text = text[self.i].split()
return int(split_text[len(split_text)-1])

def __get_candidati(self, text, eletti, non_eletti, current_quorum) -> bool:
def __get_year(self, text: List[str]) -> str:
    """Return the first year of the "BIENNIO" header stored at ``text[2]``.

    Args:
        text: list of extracted lines; the third entry is expected to look
            like ``"BIENNIO 2023/2025"``.

    Returns:
        The part of the header before the first "/", with the "BIENNIO "
        prefix removed. When the header has no "/", the whole remaining
        string is returned.

    Raises:
        IndexError: if *text* has fewer than three entries.
    """
    biennio = text[2].replace("BIENNIO ", "")
    # partition() keeps the full string when "/" is missing; slicing with the
    # -1 returned by find() would silently drop the last character instead.
    return biennio.partition("/")[0]

def __remove_perc_or_comma(self, value: str) -> str:
    """Normalise a numeric cell: drop spaces and "%", turn "," into "."."""
    # One translation pass: "," maps to ".", while " " and "%" are deleted.
    cleanup_table = str.maketrans(",", ".", " %")
    return value.translate(cleanup_table)

def __get_type_v2(self, type: str, text: List[str]) -> str:
    """Return the cleaned value stored two entries after the *type* label.

    Args:
        type: exact label to look up in *text* (e.g. "TOTALE VOTI").
        text: list of extracted lines.

    Raises:
        ValueError: if *type* is not an entry of *text*.
        IndexError: if fewer than two entries follow the label.
    """
    label_pos = text.index(type)
    raw_value = text[label_pos + 2]
    return self.__remove_perc_or_comma(raw_value)

def __get_candidati(self, text: List[str], eletti, non_eletti, current_quorum) -> bool:
year = self.__get_year(text)

self.word_not_in_update("CANDIDATI", text)
self.i += 1
quorum = current_quorum
while self.word_not_in_control("SCHEDE", text):
split_text = text[self.i].split()
if len(split_text) > 0:
if self.is_integer(split_text[0]):
split_text.pop(0)
if len(split_text) <= 0:
self.i += 1
continue
nome_candidato = ""
voti_candidato = 0
eletto = False
for s in split_text:
if self.is_integer(s):
voti_candidato = s
elif "ELETTO" == s.upper():
eletto = True
else:
nome_candidato += (s + " ")
if not self.word_not_in_control("NO QUORUM", text):
quorum = False
nome_candidato = nome_candidato.replace("NO QUORUM", " ").strip()
if not self.word_not_in_control("EX AEQUO", text):
nome_candidato = nome_candidato.replace("EX AEQUO", " ").strip()

if "DOTTORANDI" in text[0] and year == "2023":
voti_idx = text.index("VOTI") + 1
schede_idx = text.index("SCHEDE BIANCHE")

candidates = [el for el in text[voti_idx:schede_idx] if el != "" ]

idx = 0
while idx < len(candidates)-1:
nome_candidato = candidates[idx]
voti_candidato = candidates[idx+1]

e = {
"nome_candidato": nome_candidato.strip(),
"voti": int(voti_candidato)
}
if eletto:

if len(candidates) > idx+2 and candidates[idx+2].upper() == "ELETTO":
idx += 1
eletti.append(e)
else:
non_eletti.append(e)
self.i += 1

idx += 2

self.i += len(text[voti_idx:schede_idx])
else:
while self.word_not_in_control("SCHEDE", text):
split_text = text[self.i].split()

# if the line is not empty
if len(split_text) > 0:

if self.is_integer(split_text[0]):
split_text.pop(0)

if len(split_text) <= 0:
self.i += 1
continue

nome_candidato = ""
voti_candidato = 0
eletto = False
for s in split_text:
if self.is_integer(s):
voti_candidato = s
elif "ELETTO" == s.upper():
eletto = True
else:
nome_candidato += (s + " ")

if not self.word_not_in_control("NO QUORUM", text):
quorum = False
nome_candidato = nome_candidato.replace("NO QUORUM", " ").strip()

if not self.word_not_in_control("EX AEQUO", text):
nome_candidato = nome_candidato.replace("EX AEQUO", " ").strip()

e = {
"nome_candidato": nome_candidato.strip(),
"voti": int(voti_candidato)
}

if eletto:
eletti.append(e)
else:
non_eletti.append(e)

self.i += 1

return quorum

def __get_type(self, word, text) -> float:
Expand All @@ -89,18 +153,39 @@ def __get_type(self, word, text) -> float:
return 0.0

def __operation(self, text) -> object:
year = self.__get_year(text)

quorum = [True]
nome_dipartimento = self.__find_name_department(text, quorum)
seggi = self.__seggi_da_assegnare(text)

if year == "2023":
seggi = self.__seggi_da_assegnare_v2(text)
else:
seggi = self.__seggi_da_assegnare(text)

eletti = []
non_eletti = []
quorum[0] = self.__get_candidati(text, eletti, non_eletti, quorum[0])
schede_bianche = int(self.__get_type("BIANCHE", text))
schede_nulle = int(self.__get_type("NULLE", text))
schede_contestate = int(self.__get_type("CONTESTATE", text))
totale_voti = int(self.__get_type("VOTI", text))
aventi_diritto = int(self.__get_type("DIRITTO", text))
perc_votanti = self.__get_type("VOTANTI", text)

schede_bianche = int(self.__get_type("BIANCHE", text)) # TODO use v2 for 2023

if year != "2023":
schede_nulle = int(self.__get_type("NULLE", text))
schede_contestate = int(self.__get_type("CONTESTATE", text))
else:
schede_nulle = 0
schede_contestate = 0

if year == "2023":
totale_voti = int(self.__get_type_v2("TOTALE VOTI", text))
aventi_diritto = int(self.__get_type_v2("AVENTI DIRITTO", text))
perc_votanti = float(self.__get_type_v2("% VOTANTI", text))
else:
totale_voti = int(self.__get_type("VOTI", text))
aventi_diritto = int(self.__get_type("DIRITTO", text))
perc_votanti = self.__get_type("VOTANTI", text)


file_json = {
"dipartimento": nome_dipartimento,
"quorum": quorum[0],
Expand All @@ -119,6 +204,10 @@ def __operation(self, text) -> object:
self.i += 1
file = json.dumps(file_json)
parsed = json.loads(file)

if year != "2018":
self.i = len(text)-1

return json.dumps(parsed, indent=4, sort_keys=False)


Expand Down
2 changes: 1 addition & 1 deletion jsonParser/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ If you don't want to install anything go **[here](#usage-with-docker)**.

· `0` if you want to extract departments and CdL with more than 500 students.

· `1` if you want to extract CdL with a number of student fewer than 500.
· `1` if you want to extract CdL with fewer than 500 students, or PhD student elections.

· `2` if you want to extract Medicine election.

Expand Down
22 changes: 18 additions & 4 deletions jsonParser/create-json.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,58 +15,69 @@
end_parser = ["</ul>", "</ul></div></div></div>", "</span></div></div></div>", "</div></div></div>"]

match1 = ["CONSIGLIO", "SENATO", "NUCLEO", "COMITATO"]
match2 = ["DOTTORANDI", "CDL_B", "INFERIORE"]
match2 = ["DOTT", "DOTTORANDI", "CDL_B", "INFERIORE"]
match3 = ["COORDIMAMENTO", "COORDINAMENTO"]

def is_file(filename) -> bool:
def is_file(filename: str) -> bool:
    """Tell whether *filename* carries a lowercase ``pdf`` extension."""
    # "Has at least one dot and the piece after the last dot is 'pdf'".
    _stem, dot, extension = filename.rpartition(".")
    return dot == "." and extension == "pdf"

def create_json(pathname, option, command) -> None:
def create_json(pathname: str, option: str, command: str) -> None:
    """Run ``parser.py`` on *pathname*, retrying once with option "0".

    The parser is started with an argument list instead of a shell string:
    *pathname* comes from links scraped off a web page, so quotes, spaces or
    shell metacharacters in it must never reach a shell.

    Args:
        pathname: path of the downloaded PDF to convert.
        option: parser mode, forwarded as the second command-line argument.
        command: directory prefix prepended verbatim to ``parser.py``.
    """
    import subprocess
    import sys

    print("Create JSON: " + pathname)

    # Reuse the running interpreter so the parser sees the same packages.
    interpreter = sys.executable or "python3"
    script = command + "parser.py"

    status = subprocess.run([interpreter, script, pathname, option]).returncode

    # A positive exit code means the parser failed; fall back to option "0".
    if status > 0:
        print("I try to create the file again: " + pathname)
        subprocess.run([interpreter, script, pathname, "0"])

def sub_url(url, directory, command) -> None:
def sub_url(url: str, directory: str, command: str) -> None:
try:
os.makedirs(directory, mode = 0o777, exist_ok = True)
except ValueError:
pass

x = requests.get(url)

if x.status_code != 200:
print("ERROR", x.status_code, ":", x.text)
sys.exit(-1)

index1 = -1
max = -1
for el in start_parser:
if max < x.text.find(el):
max = x.text.find(el)

index1 = max
if index1 < 0:
print("ERROR in find substring 1")
sys.exit(-1)

text = x.text[index1:]
min = sys.maxsize
for el in end_parser:
if min > text.find(el) and text.find(el) > 0:
min = text.find(el)

index2 = min
if index2 < 0:
print("ERROR in find substring 2")
sys.exit(-1)

text = text[0:index2]
webpage = html.fromstring(text)
for link in webpage.xpath('//a/@href'):
if link.find("https://") < 0 and link.find("http://") < 0:
link = "https://www.unict.it" + link

file = link.split("/")
f = file[len(file)-1]
if is_file(f):
f = f.replace("%20", "_")
f = f.replace("%2", "_")

open(directory + "/" + f, "wb").write(requests.get(link).content)

if any(s in f.upper() for s in match1):
create_json(directory + "/" + f, "other", command)
elif any(s in f.upper() for s in match3):
Expand All @@ -83,10 +94,13 @@ def main(argv) -> None:
if len(argv) != 3:
print("USAGE: python3 create-json.py <url> <start_directory> <command_parser_directory>")
sys.exit(0)

if argv[1][len(argv[1])-1] == "/":
argv[1] = argv[1][:-1]

if argv[2][len(argv[2])-1] != "/":
argv[2] += "/"

sub_url(argv[0], argv[1], argv[2])

if __name__ == "__main__":
Expand Down
28 changes: 28 additions & 0 deletions jsonParser/fix-json-names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import json

def create_file_name(s, path) -> str:
    """Build the new name of JSON file *s* from its "dipartimento" value.

    The value is read with the JSON parser, so the result does not depend on
    the file's indentation, key order or string escapes.

    Args:
        s: path of the JSON file to inspect.
        path: directory prefix for the result; only the part up to and
            including the last "/" is used, and a leading "./" is dropped.

    Returns:
        ``<directory><Name>.json`` where the name is the department value
        with commas removed, whitespace and "/" turned into "_", lowercased,
        stripped of "dipartimento_di_", and with its first letter uppercased.

    Raises:
        OSError: if *s* cannot be opened.
        ValueError: if *s* is not valid JSON.
        KeyError: if the document has no "dipartimento" key.
    """
    with open(s, "r", encoding="utf-8") as f:
        department = json.load(f)["dipartimento"]

    cleaned = department.replace(",", "").replace("/", "_")
    # split() also swallows the newlines of multi-line department names.
    stem = "_".join(cleaned.split()).lower()
    stem = stem.replace("dipartimento_di_", "")
    file_name = stem[:1].upper() + stem[1:] + ".json"

    # Keep everything up to the last "/" and drop a leading "./" explicitly,
    # rather than slicing off two characters whatever they are.
    directory = path[:path.rfind("/") + 1]
    if directory.startswith("./"):
        directory = directory[2:]
    return directory + file_name

import os

# Rename every JSON file of the current directory after its department.
files = [f for f in os.listdir('.') if os.path.isfile(f)]
for f in files:
    # Test the real extension: a substring check would also pick up names
    # such as "x.json.bak".
    if not f.endswith(".json"):
        continue
    print(f)
    file_name = create_file_name(f, './')
    print(file_name)
    # os.rename silently replaces an existing target on POSIX, so never
    # clobber a different file that already owns the new name.
    if os.path.exists(file_name) and not os.path.samefile(f, file_name):
        print("Skipping " + f + ": " + file_name + " already exists")
        continue
    os.rename(f, file_name)
7 changes: 5 additions & 2 deletions jsonParser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from re import split
import sys
import json
from typing import List
from FormatPDF import FormatPDF
from Target import Target
from SelectTarget import SelectTarget
Expand Down Expand Up @@ -58,7 +59,7 @@ def create_file_name(s, path) -> str:
input_path = tmp
return input_path + file_name

def main(argv) -> None:
def main(argv: List[str]) -> None:
error_start(len(argv))

""" ONLY for add SCRUTINATI """
Expand All @@ -71,8 +72,10 @@ def main(argv) -> None:
# print_pars(formatted_text)
# print(len(formatted_text))
# print(argv[0])

str_json = target.scrape_list(formatted_text)
# print(str_json)

if isinstance(str_json, list):
for s in str_json:
if len(str_json) > 1:
Expand All @@ -84,4 +87,4 @@ def main(argv) -> None:
save_json(str_json, argv[0])

if __name__ == "__main__":
main(sys.argv[1:])
main(sys.argv[1:])
Loading

0 comments on commit 14e0229

Please sign in to comment.