혈통 파일을 세대별로 정리하기

 

지난 번 프로그램에 animal과 sire가 동일한 것, animal과 dam이 동일 한 것, sire가 dma과 동일한 것과 같은

 

오류를 찾아내는 로직을 추가했다.

 

프로그램

 

PROGRAM pedi_ana

! program name : pedi_ana
! programmer : Park Byoungho
! usage : sort_ana
! purpose : read pedigree file, calculate the generation and write pedigree file with generation
! date : 2009.11.31.
! update : 2009.12.3 - If one animal is over 2 times, see if sire(dam)s are different

IMPLICIT NONE

INTEGER, PARAMETER :: I4 = SELECTED_INT_KIND(9) ! 4 byte 정수 정의

! store animla, sire, dam and etc info
TYPE list_node
CHARACTER(LEN = 32) :: key, sire, dam
INTEGER(KIND = I4) :: code ! 읽은 순서
INTEGER :: gen
INTEGER :: sex
INTEGER :: count
INTEGER :: dup_err
TYPE(list_node), POINTER :: next
END TYPE list_node

! to create array of pointer
TYPE list_head
TYPE(list_node), POINTER :: next
END TYPE list_head

! bucket size
INTEGER, PARAMETER :: HASHSIZE = 65536_I4

! define bucket of hash
TYPE(list_head), DIMENSION(HASHSIZE) :: bucket

!main

! data dictionary
INTEGER(KIND = I4) :: i ! for loop when reading pedigree data
INTEGER :: j ! for loop, maximum iteration for making generation
CHARACTER(LEN = 32) :: key, sire, dam ! for reading each data line
TYPE(list_node), POINTER :: node, ptr
INTEGER :: dup_err ! error check for (1)animal = sire, (2)animal = dam, (3)sire = dam
! (4) same animal has different sire(dam)
CHARACTER(LEN = 256) :: in_filename, out_filename
INTEGER :: status ! I/O status
CHARACTER(LEN = 256) :: error_msg ! error message

CALL init_bucket()

CALL get_command_argument(1, in_filename) ! get pedigree data filename from the command line
CALL get_command_argument(2, out_filename) ! get output data filename from the command line

! open data file and store data to hash
OPEN(UNIT = 20, FILE = in_filename, STATUS = 'OLD', ACTION = 'READ', IOSTAT = status, IOMSG = error_msg)

IF (status == 0) THEN ! file open success
i = 1
DO
READ(UNIT = 20, FMT = *, IOSTAT = status) key, sire, dam
IF (status /= 0) EXIT ! end of file

dup_err = 0 ! no error

IF (key == sire) THEN ! error for animal = sire
dup_err = 1
END IF

IF (key == dam) THEN ! error for animal = dam
dup_err = 2
END IF

IF (sire == dam .AND. sire /= '0') THEN ! error for sire = dam
dup_err = 3
END IF

CALL add_node(key, sire, dam, i, 1, 0, dup_err)
i = i + 1
END DO
ELSE ! data file open fail
WRITE (*,'(1X, A, A)') 'File open failed -- error message : ', error_msg
STOP
END IF

CLOSE(20)
! end of making hash

DO j = 1, 50 ! iteration
DO i = 1, HASHSIZE
IF(ASSOCIATED(bucket(i)%next)) THEN
ptr => bucket(i)%next
DO
IF (ASSOCIATED(ptr)) THEN
IF (ptr%sire /= '0') CALL add_gen(ptr%sire, ptr%gen, 2) ! process sire
IF (ptr%dam /= '0') call add_gen(ptr%dam, ptr%gen, 1) ! process dam
!WRITE(*,*) ptr%key
ptr => ptr%next
ELSE
EXIT
END IF
END DO
END IF
END DO ! end of i
END DO ! end of j

CALL write_bucket(out_filename)

CALL clear_bucket()

CONTAINS

FUNCTION hash(key)
INTEGER(KIND = I4) :: hash
CHARACTER(LEN = 32), INTENT(IN) :: key
INTEGER :: i, keylen, uv
INTEGER(KIND = I4) :: hashval

hashval = 5381
keylen = LEN_TRIM(key)
DO i = 1, keylen
!uv = MOD(hashval * 33, 65536_I4)
hashval = MOD(hashval * 33 + IACHAR(key(i:i)), 65536_I4) + 1
END DO
hash = hashval
END FUNCTION

SUBROUTINE init_bucket()
INTEGER :: i
DO i = 1, HASHSIZE
NULLIFY(bucket(i)%next)
END DO
END SUBROUTINE

SUBROUTINE lookup(key, node)
TYPE(list_node), POINTER, INTENT(OUT) :: node
CHARACTER(LEN = 32), INTENT(IN) :: key

node => bucket(hash(key))%next
DO
IF(.NOT. ASSOCIATED(node)) EXIT
IF(node%key == key) EXIT
node => node%next
END DO
END SUBROUTINE lookup

SUBROUTINE add_node(key, sire, dam, code, gen, sex, dup_err)
CHARACTER(LEN = 32), INTENT(IN) :: key, sire, dam
TYPE(list_node), POINTER :: node, tmp
INTEGER(KIND = I4) :: hv, err
INTEGER(KIND = I4), INTENT(IN) :: code
INTEGER, INTENT(IN) :: gen, sex, dup_err

hv = hash(key)

IF(.NOT. ASSOCIATED(bucket(hv)%next)) THEN ! if bucket(hv) is empty, i.e. no data at bucket(hv)
ALLOCATE(node, STAT = err)
IF(err /= 0) STOP 'Error : no memory'

NULLIFY(node%next)
node%key = key
node%sire = sire
node%dam = dam
node%code = code
node%gen = gen
node%sex = sex
node%count = 1
node%dup_err = dup_err
bucket(hv)%next => node
ELSE ! if bucket(hv) is not empty, i.e. some data at bucket(hv)

node => bucket(hv)%next
DO ! see if animal is in the hash
IF(node%key == key) THEN ! if there is already an animal in the hash

node%count = node%count + 1 ! increase the count

IF (node%sire /= sire .OR. node%dam /= dam) THEN ! if new sire(dam) and sire(dam) in the hash are different
node%dup_err = 4
END IF

EXIT
ELSE ! to reach the end of each bucket
IF(ASSOCIATED(node%next)) THEN
node => node%next
ELSE
ALLOCATE(tmp, STAT = err)
IF(err /= 0) STOP 'Error : no memory'
NULLIFY(tmp%next)

tmp%key = key
tmp%sire = sire
tmp%dam = dam
tmp%code = code
tmp%gen = gen
tmp%sex = sex
tmp%count = 1
tmp%dup_err = dup_err
node%next => tmp

EXIT
END IF
END IF
END DO
END IF
END SUBROUTINE

SUBROUTINE add_gen(parent, gen, sex)
CHARACTER(LEN = 32), INTENT(IN) :: parent ! sire or dam for generation addition
INTEGER, INTENT(IN) :: gen, sex ! generation of offsprint, sex of sire or dam
TYPE(list_node), POINTER :: node
CHARACTER(LEN = 32) :: sire = '0', dam = '0'

CALL lookup(parent, node)
IF(ASSOCIATED(node)) THEN ! if there is a sire in the hash
IF ( node%gen < gen + 1) node%gen = gen + 1
IF (node%sex == 0) THEN ! sex is not defined yet
node%sex = sex
ELSE
IF (node%sex /= sex) node%sex = 3 ! sex is strange
END IF
ELSE ! if there is not a sire in the hash
CALL add_node(parent, sire, dam, 0, 1, sex, 0)
END IF

END SUBROUTINE add_gen

SUBROUTINE write_bucket(filename)
CHARACTER(LEN = 40), INTENT(IN) :: filename
INTEGER :: i
TYPE(list_node), POINTER :: node

OPEN(UNIT = 12, FILE = filename, STATUS = 'REPLACE', ACTION = 'WRITE')

DO i = 1, HASHSIZE
IF(ASSOCIATED(bucket(i)%next)) THEN
node => bucket(i)%next
DO
IF (.NOT. ASSOCIATED(node)) EXIT
WRITE(12,*) node%key, node%sire, node%dam, node%code, node%gen, node%sex, node%count, node%dup_err
node => node%next
END DO
END IF
END DO ! end of i

CLOSE(12)
END SUBROUTINE write_bucket

SUBROUTINE del_list(node)
TYPE(list_node), POINTER :: node, tmp
DO
IF (ASSOCIATED(node)) THEN
!WRITE(*,*) node%key, node%code, node%gen, node%sex
tmp => node%next
DEALLOCATE(node)
node => tmp
ELSE
EXIT
END IF
END DO
END SUBROUTINE del_list

SUBROUTINE clear_bucket()
INTEGER :: i
DO i = 1, HASHSIZE
IF(ASSOCIATED(bucket(i)%next)) THEN
!WRITE(*,*) i
CALL del_list(bucket(i)%next)
NULLIFY(bucket(i)%next)
END IF
END DO
END SUBROUTINE clear_bucket

END PROGRAM pedi_ana

 

샘플 자료

 

5 1 2

50 0

6 3 4
7 5 6
8 7 9
9 10 12
16 13 14
17 14 15
18 16 17
29 18 19
20 19 21
21 22 23
22 24 0
23 24 0
25 0 29
28 26 27
33 31 32
34 33 0
32 34 35
36 36 0
37 0 37
38 36 36

위 자료를 exam_pedi.dat로 저장

 

결과 파일

 

10 0 0 0 3 2 1 0
12 0 0 0 3 1 1 0
13 0 0 0 5 2 1 0
14 0 0 0 5 3 1 0
15 0 0 0 5 1 1 0
16 13 14 7 4 2 1 0
17 14 15 8 4 1 1 0
18 16 17 9 3 2 1 0
19 0 0 0 3 3 1 0
20 19 21 11 1 0 1 0
21 22 23 12 2 1 1 0
22 24 0 13 3 2 1 0
23 24 0 14 3 1 1 0
24 0 0 0 4 2 1 0
25 0 29 15 1 0 1 0
26 0 0 0 2 2 1 0
27 0 0 0 2 1 1 0
28 26 27 16 1 0 1 0
29 18 19 10 2 1 1 0
31 0 0 0 76 2 1 0
32 34 35 19 76 1 1 0
33 31 32 17 76 2 1 0
34 33 0 18 75 2 1 0
35 0 0 0 75 1 1 0
36 36 0 20 51 3 1 1
37 0 37 21 51 1 1 2
38 36 36 22 1 0 1 3
1 0 0 0 4 2 1 0
2 0 0 0 4 1 1 0
3 0 0 0 4 2 1 0
4 0 0 0 4 1 1 0
5 1 2 1 3 2 2 4
6 3 4 3 3 1 1 0
7 5 6 4 2 2 1 0
8 7 9 5 1 0 1 0
9 10 12 6 2 1 1 0

column 순서

1 - animal

2 - sire

3 -dam

4 - 읽은 순서

5 - 세대(세대가 무지 높다면 혈통에 조상이 자식으로 나온다는 얘기, loop)

- 정확히 어느 개체가 문제인지 확인하려면 renumber를 해서 sire, dam이 animal 보다 큰 수가 나오는 것을 찾아야 함

- 여기서는 renumber 하는 것을 생략

6- sex(0 = unknown, 1 = 암, 2 = 수, 3 = error)

- 3이 나왔다는 것은 해당 animal이 sire 컬럼에도 dam 컬럼에도 나온다는 얘기

7 - 몇 번 등장하는 지 횟수

8 - error ( 0: normal, 1: animal = sire, 2 : animal = dam, 3 : sire = dam,

4 : animal existtwice, but has different sire(dam)

 

정렬(유닉스 sort 명령어를 이용할 것)

 

결과 파일을 exam_pedi.gen이라고 할 경우

 

sort -nrk 5 exam_pedi.gen -o exam_pedi.gen.cyc

sort -nrk 6 exam_pedi.gen -o exam_pedi.gen.sex

sort -nrk 7 exam_pedi.gen -o exam_pedi.gen.cnt

sort -nrk 8 exam_pedi.gen -o exam_pedi.gen.dup

 

Batch 작업

 

pedi_ana %1 %2
sort -nrk 5 %2 -o %2.cyc
sort -nrk 6 %2 -o %2.sex
sort -nrk7 %2 -o %2.cnt
sort -nrk 8 %2 -o %2.dup

위 내용을 b_pedi_ana.bat로 저장하면 혈통파일 검사하고, 정렬을 한 번에 함

 

실행은 b_pedi_ana.bat input_file output_file

 

프로그램 컴파일 및 실행화면

 

 

 

관련 파일

1259939700_pedi_ana.zip
다운로드

1259939700_pedi_ana.zip

 

 

 

 

 

+ Recent posts