Exom9434 · January 15, 2025 09:23
diff --git a/법령정보 공동활용_sample_code.py b/법령정보 공동활용_sample_code.py
 import pandas as pd
 import re
 import os
 import xml.etree.ElementTree as ET
 from urllib.request import urlopen
 from tqdm import trange

 # Create the output directory for the per-statute files if it is missing.
 # exist_ok=True replaces the separate os.path.exists() check, which could
 # race with another process creating the directory between check and create.
 output_dir = '법령별 csv 저장'
 os.makedirs(output_dir, exist_ok=True)

 # Load the CSV file that lists the statutes to download
 law_list = pd.read_csv("법령목록.csv") # set the input file name here

 def remove_tag(content):
     """Strip tag-like ``<...>`` substrings from a piece of text.

     Args:
         content: Text that may contain markup, or ``None``. An XML element
             that exists but has no text yields ``None`` for ``.text``, so
             ``None`` is accepted instead of raising ``TypeError``.

     Returns:
         ``content`` with every shortest ``<...>`` span removed, or an empty
         string when ``content`` is ``None``.
     """
     if content is None:
         return ''
     return re.sub(r'<.*?>', '', content)

 ### Base URL; {mst_value} is substituted for each statute
 ### Be sure to check the OC part!
 url_template = "https://www.law.go.kr/DRF/lawService.do?OC=test&target=law&MST={mst_value}&type=XML"


 def _child_text(parent, tag, strip=False):
     """Return the text of ``parent``'s direct child ``tag``, or '' if missing or empty."""
     child = parent.find(tag)
     if child is None or child.text is None:
         return ''
     return child.text.strip() if strip else child.text


 def _collect_rows(root, law_name):
     """Flatten every <조문단위> element under ``root`` into a list of row dicts.

     One row is emitted for the article itself (only when it has body text),
     one per <항>, one per <호> and one per <목>. Each nested row repeats the
     values of its ancestors, so every row carries the same nine columns.

     Args:
         root: Parsed XML root element of one statute.
         law_name: Statute name stored in the '법령명' column of every row.

     Returns:
         List of dicts, in document order; empty if no rows were produced.
     """
     rows = []
     for clause in root.findall(".//조문단위"):
         # Article-level columns shared by every row derived from this article.
         base = {
             '법령명': law_name,
             '조문번호': _child_text(clause, '조문번호'),
             '조문제목': remove_tag(_child_text(clause, '조문제목')),
             '조문내용': remove_tag(_child_text(clause, '조문내용')),
             '조문시행일자': _child_text(clause, '조문시행일자'),
             '조문변경여부': _child_text(clause, '조문변경여부'),
         }

         # The article itself gets a row only when it has body text.
         if base['조문내용']:
             rows.append({**base, '항': '', '호': '', '목': ''})

         for paragraph in clause.findall('항'):
             paragraph_row = {
                 **base,
                 '항': _child_text(paragraph, '항내용', strip=True),
                 '호': '',
                 '목': '',
             }
             rows.append(paragraph_row)

             for item in paragraph.findall('호'):
                 item_number = _child_text(item, '호번호', strip=True)
                 item_body = _child_text(item, '호내용', strip=True)
                 item_row = {**paragraph_row, '호': f"{item_number} {item_body}", '목': ''}
                 rows.append(item_row)

                 for sub_item in item.findall('목'):
                     rows.append({**item_row, '목': _child_text(sub_item, '목내용', strip=True)})
     return rows


 # Process every statute in the list: fetch its XML, flatten it, save one file.
 for i in trange(len(law_list)):
     mst_value = law_list.loc[i, '법령MST']  # MST: unique ID identifying each statute
     법령명 = law_list.loc[i, '법령명'].strip()  # statute name, also used as the file name
     url = url_template.format(mst_value=mst_value)
     # Reset per iteration so the error handler never prints an undefined
     # name (first request fails) or the previous statute's body.
     response = None

     try:
         # Fetch the document; the context manager closes the connection.
         with urlopen(url) as http_response:
             response = http_response.read().decode('utf-8', errors='ignore')
         # Drop self-closing <script .../> tags before parsing.
         response = re.sub(r'<script.*?/>', '', response)

         # Anything that does not start with an XML declaration is skipped.
         if not response.strip().startswith('<?xml'):
             print(f"Non-XML response received for {법령명}")
             continue

         root = ET.fromstring(response)
         all_rows = _collect_rows(root, 법령명)

         if all_rows:
             # Written tab-separated (sep='\t') with a UTF-8 BOM, under a .csv name.
             file_path = os.path.join(output_dir, f"{법령명}.csv")
             pd.DataFrame(all_rows).to_csv(file_path, index=False, encoding='utf-8-sig', sep='\t')
             print(f"Saved {법령명} to {output_dir}")
         else:
             print(f"No data found for {법령명}")

     except Exception as e:
         # Report the failure, show the body if one was received, then stop the run.
         print(f"Error fetching or parsing XML for {법령명} (MST: {mst_value}): {e}")
         if response is not None:
             print("Raw response:")
             print(response)
         break
	import pandas as pd
	import re
	import os
	import xml.etree.ElementTree as ET
	from urllib.request import urlopen
	from tqdm import trange

	# Create the output directory for the per-statute files if it is missing.
	# exist_ok=True replaces the separate os.path.exists() check, which could
	# race with another process creating the directory between check and create.
	output_dir = '법령별 csv 저장'
	os.makedirs(output_dir, exist_ok=True)

	# Load the CSV file that lists the statutes to download
	law_list = pd.read_csv("법령목록.csv") # set the input file name here

	def remove_tag(content):
	    """Strip tag-like ``<...>`` substrings from a piece of text.

	    Args:
	        content: Text that may contain markup, or ``None``. An XML element
	            that exists but has no text yields ``None`` for ``.text``, so
	            ``None`` is accepted instead of raising ``TypeError``.

	    Returns:
	        ``content`` with every shortest ``<...>`` span removed, or an empty
	        string when ``content`` is ``None``.
	    """
	    if content is None:
	        return ''
	    return re.sub(r'<.*?>', '', content)

	### Base URL; {mst_value} is substituted for each statute
	### Be sure to check the OC part!
	url_template = "https://www.law.go.kr/DRF/lawService.do?OC=test&target=law&MST={mst_value}&type=XML"


	def _child_text(parent, tag, strip=False):
	    """Return the text of ``parent``'s direct child ``tag``, or '' if missing or empty."""
	    child = parent.find(tag)
	    if child is None or child.text is None:
	        return ''
	    return child.text.strip() if strip else child.text


	def _collect_rows(root, law_name):
	    """Flatten every <조문단위> element under ``root`` into a list of row dicts.

	    One row is emitted for the article itself (only when it has body text),
	    one per <항>, one per <호> and one per <목>. Each nested row repeats the
	    values of its ancestors, so every row carries the same nine columns.

	    Args:
	        root: Parsed XML root element of one statute.
	        law_name: Statute name stored in the '법령명' column of every row.

	    Returns:
	        List of dicts, in document order; empty if no rows were produced.
	    """
	    rows = []
	    for clause in root.findall(".//조문단위"):
	        # Article-level columns shared by every row derived from this article.
	        base = {
	            '법령명': law_name,
	            '조문번호': _child_text(clause, '조문번호'),
	            '조문제목': remove_tag(_child_text(clause, '조문제목')),
	            '조문내용': remove_tag(_child_text(clause, '조문내용')),
	            '조문시행일자': _child_text(clause, '조문시행일자'),
	            '조문변경여부': _child_text(clause, '조문변경여부'),
	        }

	        # The article itself gets a row only when it has body text.
	        if base['조문내용']:
	            rows.append({**base, '항': '', '호': '', '목': ''})

	        for paragraph in clause.findall('항'):
	            paragraph_row = {
	                **base,
	                '항': _child_text(paragraph, '항내용', strip=True),
	                '호': '',
	                '목': '',
	            }
	            rows.append(paragraph_row)

	            for item in paragraph.findall('호'):
	                item_number = _child_text(item, '호번호', strip=True)
	                item_body = _child_text(item, '호내용', strip=True)
	                item_row = {**paragraph_row, '호': f"{item_number} {item_body}", '목': ''}
	                rows.append(item_row)

	                for sub_item in item.findall('목'):
	                    rows.append({**item_row, '목': _child_text(sub_item, '목내용', strip=True)})
	    return rows


	# Process every statute in the list: fetch its XML, flatten it, save one file.
	for i in trange(len(law_list)):
	    mst_value = law_list.loc[i, '법령MST']  # MST: unique ID identifying each statute
	    법령명 = law_list.loc[i, '법령명'].strip()  # statute name, also used as the file name
	    url = url_template.format(mst_value=mst_value)
	    # Reset per iteration so the error handler never prints an undefined
	    # name (first request fails) or the previous statute's body.
	    response = None

	    try:
	        # Fetch the document; the context manager closes the connection.
	        with urlopen(url) as http_response:
	            response = http_response.read().decode('utf-8', errors='ignore')
	        # Drop self-closing <script .../> tags before parsing.
	        response = re.sub(r'<script.*?/>', '', response)

	        # Anything that does not start with an XML declaration is skipped.
	        if not response.strip().startswith('<?xml'):
	            print(f"Non-XML response received for {법령명}")
	            continue

	        root = ET.fromstring(response)
	        all_rows = _collect_rows(root, 법령명)

	        if all_rows:
	            # Written tab-separated (sep='\t') with a UTF-8 BOM, under a .csv name.
	            file_path = os.path.join(output_dir, f"{법령명}.csv")
	            pd.DataFrame(all_rows).to_csv(file_path, index=False, encoding='utf-8-sig', sep='\t')
	            print(f"Saved {법령명} to {output_dir}")
	        else:
	            print(f"No data found for {법령명}")

	    except Exception as e:
	        # Report the failure, show the body if one was received, then stop the run.
	        print(f"Error fetching or parsing XML for {법령명} (MST: {mst_value}): {e}")
	        if response is not None:
	            print("Raw response:")
	            print(response)
	        break