1. 1 2006 9 5 AWK HTML 2 1 [4] AWK Yahoo! : http://headlines.yahoo.co.jp/hl HTML HTML [4] HTML HTML ( ) HTML 3 2 Yahoo! Yahoo! <ul> </ul> ( ) <ul> <li><a href="[ URL]"> </a><small> (XXX ) - 15 ( )15 35 </small><br> <li><a href="[ URL]"> </a><small> (XXX )" - 15 ( )15 30 </small><br>
3. 2... </ul> (XXX ) 15 ( )15 35 (XXX ) 15 ( )15 30... [4] HTML HTML HTML HTML 1. <!--- CONTENTS_TITLE_TABLE ---> <b><font size=+1>xxx </font></b> <small> - 8 15 ( )15 40 </small></td> 2. <!--- OUTLINE_TABLE ---> (<ul> </ul> ) 3 AWK 1 AWK
3. 3 awk -f [ ] [ 1] [ 2]... [ 1] [ 2] awk cat [ 1] [ 2]... | awk -f [ ] 2 FILENAME : FNR : NR : ARGIND : AWK AWK AWK ( ) C ARGV ARGC ARGC : AWK ARGV : (ARGV[0] ... ARGV[ARGC-1]) awk -f test.awk -v s=3 file1 file2 -f test.awk -v s=3 AWK ARGC = 3 ARGV[0]="awk", ARGV[1]="file1", ARGV[2]="file2"
4. 4 test.awk file1, file2 2,3 # BEGIN{ for(j=1;j<=ARGC;j++) printf "ARGV[%d]=%s\n",j-1,ARGV[j-1] } { printf "(FILENAME,FNR,NR,ARGIND)"; printf "=(%s,%d,%d,%d)\n",FILENAME,FNR,NR,ARGIND } ARGV[0]=awk ARGV[1]=file1 ARGV[2]=file2 (FILENAME,FNR,NR,ARGIND)=(file1,1,1,1) (FILENAME,FNR,NR,ARGIND)=(file1,2,2,1) (FILENAME,FNR,NR,ARGIND)=(file1,3,3,1) (FILENAME,FNR,NR,ARGIND)=(file2,1,4,2) (FILENAME,FNR,NR,ARGIND)=(file2,2,5,2) (FILENAME,FNR,NR,ARGIND)=(file2,3,6,2) ARGIND 1 4 <b><font size=+1> getline sub() getline: $0 1, 0, -1 sub(r,s,c): c ( $0) r s ARGIND 1
5. 5 ##### ##### (ARGIND == 1 && $0 ~ /<b><font size=\+1>/){ sub(/.*<b><font size=\+1>/,"") sub(/<\/font><\/b>.*/,"") title=$0 getline sub(/.*<small> - /,"") sub(/<\/small>.*/,"") date=$0 / / + (1 ) \.*. = 1.* = 1 0 sub(/<\/small>.*/,"") </small> "" 5 2 <li> <ul> <li>...<br><li>...<br><li>...<br><li>...<br><li>...<br><br> <li>...<br><li>...<br><li>...<br><li>...<br><li>...<br><br> <li>...<br><li>...<br><li>...<br><li>...<br><li>...<br><br>... </ul>
5. 6 (... ) (5 ) <br> 2 1 <li> <ul> </ul> ( </ul> ) HTML <ul> </ul> <ul> </ul> getline getline <ul> </ul> [4] AWK ##### ##### ($0 ~ /<ul>/){ sub(/<ul>/,"") if($0!~ /<li>/) getline do{ # (1) 1 <li> # (2) getline } while($0!~ /<\/ul>/) if($0 ~ /<li>/){ sub(/<\/ul>/,"") # (1) 1 <li> # (2) } } ( (1),(2)) 2 1 getline ($0 ~ /<ul>/){ sub(/<ul>/,"") do{ getline
5. 7 # (1) 1 <li> # (2) } while($0!~ /<\/ul>/) } <ul> <li> </ul> (1),(2) (1),(2) 1 : ##### 2 ##### ($0 ~ /<ul>/){ sub(/<ul>/,"") flag=0 do{ if(flag==1 || $0!~ /<li>/) getline flag=1 if($0 ~ /<\/ul>/ && $0!~ /<li>/) break # (1) 1 <li> # (2) } while($0!~ /<\/ul>/) } if <ul> </ul> 1 if 2 if if (1),(2) (2) END
6. 8 6 5 (1) split() split(s, h, r): s r h <li>( )<br><li>( )<br><li>( )<br><br> str N=split(str,h,/<br>/) N h : N=5, h[1]="( )", h[2]="( )", h[3]="( )", h[4]="", h[5]="" 2 <br> split() <li>( )<br><li>( )<br><li>( )<br></ul> split() N=4, h[1]="( )", h[2]="( )", h[3]="( )", h[4]="</ul>" <br> </ul> split() sub(/<br>(<br>|<\/ul>)? *$/,"",str) N=split(str,h,/<br>/) sub()
7. 9 (<br>|<\/ul>) = <br> </ul> (<br>|<\/ul>)? = (<br> </ul>) 0 1 <br>(<br>|<\/ul>)? = <br> <br><br> <br></ul> <br>(<br>|<\/ul>)? *$ = 0 7 5 (2) Yahoo! <li> <li><a href="http://headlines.yahoo.co.jp/..."> </a> <small> (XXX ) - 15 ( )15 35 </small> ( 1 ) (http:// ) HTML URL URL : <li><a href="data/20060815/00003.html"> </a> <small> (XXX ) - 15 ( )15 35 </small> URL HTML http://... <a> <li> <a>
7. 10 <a href="url"> : URL <a name=" "> : <a href="url" target=" "> : URL target 2 WWW WWW target target <a href="url"> a href "URL" > "URL" > match() substr() match(s,r): s r (s ) ( 0 ) RSTART (= r ) RLENGTH (= r ) substr(s,n,len): s n len ( s ) match() a match(str,/<a href=\"[^\"]+\">/) [^\"] = " 1 [^\"]+ = " 1 \"[^\"]+\" = " " (" ) <a href=\"[^\"]+\"> = <a href=" "> RSTART+RLENGTH
8. 11 substr(str,1,RSTART+RLENGTH-2) = a > substr(str,RSTART+RLENGTH-1) = a > str <li> if(match(str,/<a href=\"[^\"]+\">/)==0) print str else{ printf "%s",substr(str,1,RSTART+RLENGTH-2) printf " target=\"targetframe\"" printf "%s\n",substr(str,RSTART+RLENGTH-1) } a target <li> j <li> if(j%5==0) printf "<br><br>\n"; j%5==0 j 5,10,15,... 5 5 if(j%5==0) printf "<br>( %d )<br><br>\n",j; 8 HTML [4] 6 2 7 BEGIN{ if(TARGET=="") TARGET="yahoonews" # if(DIV=="") DIV=5 # N=0 # h[]: }
8. 12 ##### ##### (ARGIND == 1 && $0 ~ /<b><font size=\+1>/){ sub(/.*<b><font size=\+1>/,"") sub(/<\/font><\/b>.*/,"") TITLE=$0 getline sub(/.*<small> - /,"") sub(/<\/small>.*/,"") DATE=$0 } ##### ##### ($0 ~ /<ul>/){ sub(/<ul>/,"") if($0!~ /<li>/) getline do{ # (1) 1 <li> N=divideline($0,h,N) getline } while($0!~ /<\/ul>/) if($0 ~ /<li>/){ sub(/<\/ul>/,"") # (1) 1 <li> N=divideline($0,h,N) } } ##### END ##### END{ putheader(DATE,TITLE,N) print "<ul>" for(j=1;j<=N;j++){ put1list(h[j],TARGET) if(j%DIV==0) printf "<br>( %d )<br><br>\n",j } print "</ul>" putfooter() } ##### ##### # <li> function put1list(str,target) { if(match(str,/<a href=\"[^\"]+\">/)==0) print str
8. 13 else{ printf "%s",substr(str,1,RSTART+RLENGTH-2) printf " target=\"%s\"",target printf "%s\n",substr(str,RSTART+RLENGTH-1) } } # <br> h ( h[1]~h[n] ) function divideline(str,h,n, tmp,j,m) { sub(/<br>(<br>|<\/ul>)? *$/,"",str) m=split(str,tmp,/<br>/) for(j=1;j<=m;j++) if(tmp[j] ~ /<li>/) h[++n]=tmp[j] return n } # function putheader(date,title,n) { printf "<html>\n" printf "<head>\n" printf "<meta http-equiv=\"content-type\"" printf " content=\"text/html; charset=euc-jp\">\n" printf "<title>yahoo News (%s)</title>",title printf "</head>\n" printf "<body>\n" printf "<h2>yahoo News (%s: %s : %d )</h2>\n",title,date,n #printf "<a href=\"%s\" target=\"%s\">(home)</a>\n",url,target printf "<hr>\n" } # function putfooter() { print "<hr>" print "</body>" print "</html>" } (= TARGET) (= DIV) Yahoo! HTML
9. 14 file1.html,file2.html,... yahoo2.awk awk -f yahoo2.awk file?.html > list.html HTML list.html awk -f yahoo2.awk -v TARGET="another" file?.html > list.html TARGET DIV AWK putheader() charset=euc-jp Unix Yahoo! EUC JP MS Windows EUC JP Shift JIS HTML ( MS IE) 9 AWK Yahoo! HTML 2 [1] AWK (2006) [2] AWK (2006) [3] AWK (2006) [4] AWK HTML (2006) [5] A.V. B.W. P.J. ( ) AWK (2004) ( 1989)
9. 15 [6] D.Dougherty A.Robbins ( ) sed & awk (1997) [7] AWK 256 (1993) [8] HTML (1996) [9] HTML & XHTML & CSS (2002) [10] WWW http://www.tohoho-web.com/www.htm